Diffstat (limited to 'storage/innobase')
249 files changed, 21651 insertions, 27685 deletions
diff --git a/storage/innobase/CMakeLists.txt b/storage/innobase/CMakeLists.txt index 08e8e3ff5ab..10be3187072 100644 --- a/storage/innobase/CMakeLists.txt +++ b/storage/innobase/CMakeLists.txt @@ -37,7 +37,6 @@ SET(INNOBASE_SOURCES buf/buf0flu.cc buf/buf0lru.cc buf/buf0rea.cc - buf/buf0mtflu.cc data/data0data.cc data/data0type.cc dict/dict0boot.cc @@ -154,7 +153,7 @@ MYSQL_ADD_PLUGIN(innobase ${INNOBASE_SOURCES} STORAGE_ENGINE DEFAULT RECOMPILE_FOR_EMBEDDED LINK_LIBRARIES ${ZLIB_LIBRARY} - ${CRC32_VPMSUM_LIBRARY} + ${CRC32_LIBRARY} ${NUMA_LIBRARY} ${LIBSYSTEMD} ${LINKER_SCRIPT}) @@ -177,18 +176,22 @@ IF(CMAKE_COMPILER_IS_GNUCXX AND CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64" mtr/mtr0mtr.cc row/row0merge.cc row/row0mysql.cc + row/row0trunc.cc srv/srv0srv.cc COMPILE_FLAGS "-O0" ) ENDIF() IF(MSVC) + IF(CMAKE_SIZEOF_VOID_P EQUAL 8) + ADD_COMPILE_FLAGS( + pars/lexyy.cc + COMPILE_FLAGS "/wd4267") + ENDIF() # silence "switch statement contains 'default' but no 'case' label # on generated file. TARGET_COMPILE_OPTIONS(innobase PRIVATE "/wd4065") ENDIF() -ADD_SUBDIRECTORY(${CMAKE_SOURCE_DIR}/extra/mariabackup ${CMAKE_BINARY_DIR}/extra/mariabackup) - -IF(TARGET innobase) - ADD_DEPENDENCIES(innobase GenError) +IF(NOT (PLUGIN_INNOBASE STREQUAL DYNAMIC)) + ADD_SUBDIRECTORY(${CMAKE_SOURCE_DIR}/extra/mariabackup ${CMAKE_BINARY_DIR}/extra/mariabackup) ENDIF() diff --git a/storage/innobase/btr/btr0btr.cc b/storage/innobase/btr/btr0btr.cc index f7fe4413086..2842f9a6bc0 100644 --- a/storage/innobase/btr/btr0btr.cc +++ b/storage/innobase/btr/btr0btr.cc @@ -2,7 +2,7 @@ Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. -Copyright (c) 2014, 2020, MariaDB Corporation. +Copyright (c) 2014, 2021, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -204,7 +204,7 @@ btr_root_fseg_validate( ut_a(mach_read_from_4(seg_header + FSEG_HDR_SPACE) == space); ut_a(offset >= FIL_PAGE_DATA); - ut_a(offset <= UNIV_PAGE_SIZE - FIL_PAGE_DATA_END); + ut_a(offset <= srv_page_size - FIL_PAGE_DATA_END); return(TRUE); } #endif /* UNIV_BTR_DEBUG */ @@ -220,24 +220,25 @@ btr_root_block_get( or RW_X_LATCH */ mtr_t* mtr) /*!< in: mtr */ { - const ulint space = dict_index_get_space(index); - const page_id_t page_id(space, dict_index_get_page(index)); - const page_size_t page_size(dict_table_page_size(index->table)); + if (!index->table || !index->table->space) { + return NULL; + } - buf_block_t* block = btr_block_get(page_id, page_size, mode, - index, mtr); + buf_block_t* block = btr_block_get( + page_id_t(index->table->space_id, index->page), + page_size_t(index->table->space->flags), mode, + index, mtr); if (!block) { - if (index && index->table) { - index->table->file_unreadable = true; - - ib_push_warning( - static_cast<THD*>(NULL), DB_DECRYPTION_FAILED, - "Table %s in tablespace %lu is encrypted but encryption service or" - " used key_id is not available. " - " Can't continue reading table.", - index->table->name.m_name, space); - } + index->table->file_unreadable = true; + + ib_push_warning( + static_cast<THD*>(NULL), DB_DECRYPTION_FAILED, + "Table %s in file %s is encrypted but encryption service or" + " used key_id is not available. 
" + " Can't continue reading table.", + index->table->name.m_name, + UT_LIST_GET_FIRST(index->table->space->chain)->name); return NULL; } @@ -249,9 +250,9 @@ btr_root_block_get( const page_t* root = buf_block_get_frame(block); ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF - + root, space)); + + root, index->table->space_id)); ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP - + root, space)); + + root, index->table->space_id)); } #endif /* UNIV_BTR_DEBUG */ @@ -299,7 +300,7 @@ btr_height_get( root_block = btr_root_block_get(index, RW_S_LATCH, mtr); if (root_block) { - height = btr_page_get_level(buf_block_get_frame(root_block), mtr); + height = btr_page_get_level(buf_block_get_frame(root_block)); /* Release the S latch on the root page. */ mtr->memo_release(root_block, MTR_MEMO_PAGE_S_FIX); @@ -355,9 +356,8 @@ btr_root_adjust_on_import( buf_block_t* block; page_zip_des_t* page_zip; dict_table_t* table = index->table; - const ulint space_id = dict_index_get_space(index); - const page_id_t page_id(space_id, dict_index_get_page(index)); - const page_size_t page_size(dict_table_page_size(table)); + const page_id_t page_id(table->space_id, index->page); + const page_size_t page_size(table->space->flags); DBUG_EXECUTE_IF("ib_import_trigger_corruption_3", return(DB_CORRUPTION);); @@ -385,10 +385,9 @@ btr_root_adjust_on_import( } else { /* Check that the table flags and the tablespace flags match. */ - ulint flags = dict_tf_to_fsp_flags(table->flags); - ulint fsp_flags = fil_space_get_flags(table->space); - err = flags == fsp_flags - ? DB_SUCCESS : DB_CORRUPTION; + err = (dict_tf_to_fsp_flags(table->flags) + == table->space->flags) + ? DB_SUCCESS : DB_CORRUPTION; } } else { err = DB_SUCCESS; @@ -398,10 +397,10 @@ btr_root_adjust_on_import( if (err == DB_SUCCESS && (!btr_root_fseg_adjust_on_import( FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF - + page, page_zip, space_id) + + page, page_zip, table->space_id) || !btr_root_fseg_adjust_on_import( FIL_PAGE_DATA + PAGE_BTR_SEG_TOP - + page, page_zip, space_id))) { + + page, page_zip, table->space_id))) { err = DB_CORRUPTION; } @@ -425,7 +424,7 @@ btr_page_create( { page_t* page = buf_block_get_frame(block); - ut_ad(mtr_is_block_fix(mtr, block, MTR_MEMO_PAGE_X_FIX, index->table)); + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); if (page_zip) { page_create_zip(block, index, level, 0, NULL, mtr); @@ -467,8 +466,8 @@ btr_page_alloc_for_ibuf( ut_a(node_addr.page != FIL_NULL); new_block = buf_page_get( - page_id_t(dict_index_get_space(index), node_addr.page), - dict_table_page_size(index->table), + page_id_t(index->table->space_id, node_addr.page), + page_size_t(index->table->space->flags), RW_X_LATCH, mtr); new_page = buf_block_get_frame(new_block); @@ -702,7 +701,7 @@ btr_page_free_for_ibuf( { page_t* root; - ut_ad(mtr_is_block_fix(mtr, block, MTR_MEMO_PAGE_X_FIX, index->table)); + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); root = btr_root_get(index, mtr); flst_add_first(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, @@ -721,17 +720,17 @@ btr_page_free_for_ibuf( void btr_page_free(dict_index_t* index, buf_block_t* block, mtr_t* mtr, bool blob) { - ut_ad(mtr_is_block_fix(mtr, block, MTR_MEMO_PAGE_X_FIX, index->table)); + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); #ifdef BTR_CUR_HASH_ADAPT if (block->index && !block->index->freed()) { ut_ad(!blob); ut_ad(page_is_leaf(block->frame)); } #endif - ut_ad(index->space == block->page.id.space()); + ut_ad(index->table->space_id == block->page.id.space()); 
/* The root page is freed by btr_free_root(). */ ut_ad(block->page.id.page_no() != index->page); - ut_ad(mtr->is_named_space(index->space)); + ut_ad(mtr->is_named_space(index->table->space)); /* The page gets invalid for optimistic searches: increment the frame modify clock */ @@ -751,14 +750,12 @@ void btr_page_free(dict_index_t* index, buf_block_t* block, mtr_t* mtr, ? PAGE_HEADER + PAGE_BTR_SEG_LEAF : PAGE_HEADER + PAGE_BTR_SEG_TOP]; fseg_free_page(seg_header, - block->page.id.space(), - block->page.id.page_no(), - mtr); + index->table->space, block->page.id.page_no(), mtr); /* The page was marked free in the allocation bitmap, but it should remain exclusively latched until mtr_t::commit() or until it is explicitly freed from the mini-transaction. */ - ut_ad(mtr_is_block_fix(mtr, block, MTR_MEMO_PAGE_X_FIX, index->table)); + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); if (srv_immediate_scrub_data_uncompressed) { /* In MDEV-15528 this code must be removed and the @@ -766,7 +763,7 @@ void btr_page_free(dict_index_t* index, buf_block_t* block, mtr_t* mtr, should zero out the page after the redo log for this mini-transaction has been durably written. The log would include the 10.4 MLOG_INIT_FREE_PAGE record. */ - fsp_init_file_page(fil_space_get(index->space), block, mtr); + fsp_init_file_page(index->table->space, block, mtr); } } @@ -818,13 +815,14 @@ btr_node_ptr_get_child( mtr_t* mtr) /*!< in: mtr */ { ut_ad(rec_offs_validate(node_ptr, index, offsets)); - - const page_id_t page_id( - page_get_space_id(page_align(node_ptr)), - btr_node_ptr_get_child_page_no(node_ptr, offsets)); - - return(btr_block_get(page_id, dict_table_page_size(index->table), - RW_SX_LATCH, index, mtr)); + ut_ad(index->table->space_id + == page_get_space_id(page_align(node_ptr))); + + return btr_block_get( + page_id_t(index->table->space_id, + btr_node_ptr_get_child_page_no(node_ptr, offsets)), + page_size_t(index->table->space->flags), + RW_SX_LATCH, index, mtr); } /************************************************************//** @@ -867,7 +865,7 @@ btr_page_get_father_node_ptr_func( ut_ad(dict_index_get_page(index) != page_no); - level = btr_page_get_level(btr_cur_get_page(cursor), mtr); + level = btr_page_get_level(btr_cur_get_page(cursor)); user_rec = btr_cur_get_rec(cursor); ut_a(page_rec_is_user_rec(user_rec)); @@ -878,7 +876,7 @@ btr_page_get_father_node_ptr_func( err = btr_cur_search_to_nth_level( index, level + 1, tuple, PAGE_CUR_LE, latch_mode, cursor, 0, - file, line, mtr, 0); + file, line, mtr); if (err != DB_SUCCESS) { ib::warn() << " Error code: " << err @@ -892,7 +890,7 @@ btr_page_get_father_node_ptr_func( node_ptr = btr_cur_get_rec(cursor); - offsets = rec_get_offsets(node_ptr, index, offsets, false, + offsets = rec_get_offsets(node_ptr, index, offsets, 0, ULINT_UNDEFINED, &heap); if (btr_node_ptr_get_child_page_no(node_ptr, offsets) != page_no) { @@ -909,10 +907,11 @@ btr_page_get_father_node_ptr_func( print_rec = page_rec_get_next( page_get_infimum_rec(page_align(user_rec))); offsets = rec_get_offsets(print_rec, index, offsets, - page_rec_is_leaf(user_rec), + page_rec_is_leaf(user_rec) + ? index->n_core_fields : 0, ULINT_UNDEFINED, &heap); page_rec_print(print_rec, offsets); - offsets = rec_get_offsets(node_ptr, index, offsets, false, + offsets = rec_get_offsets(node_ptr, index, offsets, 0, ULINT_UNDEFINED, &heap); page_rec_print(node_ptr, offsets); @@ -1052,8 +1051,7 @@ btr_free_root_check( /** Create the root node for a new index tree. 
@param[in] type type of the index -@param[in] space space where created -@param[in] page_size page size +@param[in,out] space tablespace where created @param[in] index_id index id @param[in] index index, or NULL when applying TRUNCATE log record during recovery @@ -1064,8 +1062,7 @@ record during recovery ulint btr_create( ulint type, - ulint space, - const page_size_t& page_size, + fil_space_t* space, index_id_t index_id, dict_index_t* index, const btr_create_t* btr_redo_create_info, @@ -1128,7 +1125,7 @@ btr_create( if (!fseg_create(space, PAGE_HEADER + PAGE_BTR_SEG_LEAF, mtr, - block)) { + false, block)) { /* Not enough space for new segment, free root segment before return. */ btr_free_root(block, mtr, @@ -1204,7 +1201,7 @@ btr_create( Note: Insert Buffering is disabled for temporary tables given that most temporary tables are smaller in size and short-lived. */ if (!(type & DICT_CLUSTERED) - && (index == NULL || !dict_table_is_temporary(index->table))) { + && (index == NULL || !index->table->is_temporary())) { ibuf_reset_free_bits(block); } @@ -1236,7 +1233,7 @@ btr_free_but_not_root( leaf_loop: mtr_start(&mtr); mtr_set_log_mode(&mtr, log_mode); - mtr.set_named_space(block->page.id.space()); + mtr.set_named_space_id(block->page.id.space()); page_t* root = block->frame; @@ -1266,7 +1263,7 @@ leaf_loop: top_loop: mtr_start(&mtr); mtr_set_log_mode(&mtr, log_mode); - mtr.set_named_space(block->page.id.space()); + mtr.set_named_space_id(block->page.id.space()); root = block->frame; @@ -1280,7 +1277,6 @@ top_loop: mtr_commit(&mtr); if (!finished) { - goto top_loop; } } @@ -1305,7 +1301,7 @@ btr_free_if_exists( } btr_free_but_not_root(root, mtr->get_log_mode()); - mtr->set_named_space(page_id.space()); + mtr->set_named_space_id(page_id.space()); btr_free_root(root, mtr, true); } @@ -1338,28 +1334,22 @@ btr_free( ib_uint64_t btr_read_autoinc(dict_index_t* index) { - ut_ad(dict_index_is_clust(index)); + ut_ad(index->is_primary()); ut_ad(index->table->persistent_autoinc); - ut_ad(!dict_table_is_temporary(index->table)); - - if (fil_space_t* space = fil_space_acquire(index->space)) { - mtr_t mtr; - mtr.start(); - ib_uint64_t autoinc; - if (buf_block_t* block = buf_page_get( - page_id_t(index->space, index->page), - page_size_t(space->flags), - RW_S_LATCH, &mtr)) { - autoinc = page_get_autoinc(block->frame); - } else { - autoinc = 0; - } - mtr.commit(); - fil_space_release(space); - return(autoinc); + ut_ad(!index->table->is_temporary()); + mtr_t mtr; + mtr.start(); + ib_uint64_t autoinc; + if (buf_block_t* block = buf_page_get( + page_id_t(index->table->space_id, index->page), + page_size_t(index->table->space->flags), + RW_S_LATCH, &mtr)) { + autoinc = page_get_autoinc(block->frame); + } else { + autoinc = 0; } - - return(0); + mtr.commit(); + return autoinc; } /** Read the last used AUTO_INCREMENT value from PAGE_ROOT_AUTO_INC, @@ -1372,47 +1362,43 @@ ib_uint64_t btr_read_autoinc_with_fallback(const dict_table_t* table, unsigned col_no) { ut_ad(table->persistent_autoinc); - ut_ad(!dict_table_is_temporary(table)); + ut_ad(!table->is_temporary()); dict_index_t* index = dict_table_get_first_index(table); if (index == NULL) { - } else if (fil_space_t* space = fil_space_acquire(index->space)) { - mtr_t mtr; - mtr.start(); - buf_block_t* block = buf_page_get( - page_id_t(index->space, index->page), - page_size_t(space->flags), - RW_S_LATCH, &mtr); - - ib_uint64_t autoinc = block - ? 
page_get_autoinc(block->frame) : 0; - const bool retry = block && autoinc == 0 - && !page_is_empty(block->frame); - mtr.commit(); - fil_space_release(space); - - if (retry) { - /* This should be an old data file where - PAGE_ROOT_AUTO_INC was initialized to 0. - Fall back to reading MAX(autoinc_col). - There should be an index on it. */ - const dict_col_t* autoinc_col - = dict_table_get_nth_col(table, col_no); - while (index != NULL - && index->fields[0].col != autoinc_col) { - index = dict_table_get_next_index(index); - } + return 0; + } - if (index != NULL && index->space == space->id) { - autoinc = row_search_max_autoinc(index); - } + mtr_t mtr; + mtr.start(); + buf_block_t* block = buf_page_get( + page_id_t(index->table->space_id, index->page), + page_size_t(index->table->space->flags), + RW_S_LATCH, &mtr); + + ib_uint64_t autoinc = block ? page_get_autoinc(block->frame) : 0; + const bool retry = block && autoinc == 0 + && !page_is_empty(block->frame); + mtr.commit(); + + if (retry) { + /* This should be an old data file where + PAGE_ROOT_AUTO_INC was initialized to 0. + Fall back to reading MAX(autoinc_col). + There should be an index on it. */ + const dict_col_t* autoinc_col + = dict_table_get_nth_col(table, col_no); + while (index && index->fields[0].col != autoinc_col) { + index = dict_table_get_next_index(index); } - return(autoinc); + if (index) { + autoinc = row_search_max_autoinc(index); + } } - return(0); + return autoinc; } /** Write the next available AUTO_INCREMENT value to PAGE_ROOT_AUTO_INC. @@ -1424,22 +1410,19 @@ btr_read_autoinc_with_fallback(const dict_table_t* table, unsigned col_no) void btr_write_autoinc(dict_index_t* index, ib_uint64_t autoinc, bool reset) { - ut_ad(dict_index_is_clust(index)); + ut_ad(index->is_primary()); ut_ad(index->table->persistent_autoinc); - ut_ad(!dict_table_is_temporary(index->table)); + ut_ad(!index->table->is_temporary()); - if (fil_space_t* space = fil_space_acquire(index->space)) { - mtr_t mtr; - mtr.start(); - mtr.set_named_space(space); - page_set_autoinc(buf_page_get( - page_id_t(index->space, index->page), - page_size_t(space->flags), - RW_SX_LATCH, &mtr), - index, autoinc, &mtr, reset); - mtr.commit(); - fil_space_release(space); - } + mtr_t mtr; + mtr.start(); + fil_space_t* space = index->table->space; + mtr.set_named_space(space); + page_set_autoinc(buf_page_get(page_id_t(space->id, index->page), + page_size_t(space->flags), + RW_SX_LATCH, &mtr), + index, autoinc, &mtr, reset); + mtr.commit(); } /*************************************************************//** @@ -1482,11 +1465,11 @@ btr_page_reorganize_low( bool log_compressed; bool is_spatial; - ut_ad(mtr_is_block_fix(mtr, block, MTR_MEMO_PAGE_X_FIX, index->table)); + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); btr_assert_not_corrupted(block, index); ut_ad(fil_page_index_page_check(block->frame)); ut_ad(index->is_dummy - || block->page.id.space() == index->space); + || block->page.id.space() == index->table->space->id); ut_ad(index->is_dummy || block->page.id.page_no() != index->page || !page_has_siblings(page)); @@ -1542,7 +1525,7 @@ btr_page_reorganize_low( During redo log apply, dict_index_is_sec_or_ibuf() always holds, even for clustered indexes. 
*/ - ut_ad(recovery || dict_table_is_temporary(index->table) + ut_ad(recovery || index->table->is_temporary() || !page_is_leaf(temp_page) || !dict_index_is_sec_or_ibuf(index) || page_get_max_trx_id(page) != 0); @@ -1572,18 +1555,18 @@ btr_page_reorganize_low( ut_a(!memcmp(PAGE_HEADER + PAGE_N_RECS + page, PAGE_HEADER + PAGE_N_RECS + temp_page, PAGE_DATA - (PAGE_HEADER + PAGE_N_RECS))); - ut_a(!memcmp(UNIV_PAGE_SIZE - FIL_PAGE_DATA_END + page, - UNIV_PAGE_SIZE - FIL_PAGE_DATA_END + temp_page, + ut_a(!memcmp(srv_page_size - FIL_PAGE_DATA_END + page, + srv_page_size - FIL_PAGE_DATA_END + temp_page, FIL_PAGE_DATA_END)); #endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ memcpy(PAGE_HEADER + page, PAGE_HEADER + temp_page, PAGE_N_RECS - PAGE_N_DIR_SLOTS); memcpy(PAGE_DATA + page, PAGE_DATA + temp_page, - UNIV_PAGE_SIZE - PAGE_DATA - FIL_PAGE_DATA_END); + srv_page_size - PAGE_DATA - FIL_PAGE_DATA_END); #if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG - ut_a(!memcmp(page, temp_page, UNIV_PAGE_SIZE)); + ut_a(!memcmp(page, temp_page, srv_page_size)); #endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ goto func_exit; @@ -1621,6 +1604,17 @@ func_exit: #ifdef UNIV_ZIP_DEBUG ut_a(!page_zip || page_zip_validate(page_zip, page, index)); #endif /* UNIV_ZIP_DEBUG */ + + if (!recovery && block->page.id.page_no() == index->page + && fil_page_get_type(temp_page) == FIL_PAGE_TYPE_INSTANT) { + /* Preserve the PAGE_INSTANT information. */ + ut_ad(!page_zip); + ut_ad(index->is_instant()); + memcpy(FIL_PAGE_TYPE + page, FIL_PAGE_TYPE + temp_page, 2); + memcpy(PAGE_HEADER + PAGE_INSTANT + page, + PAGE_HEADER + PAGE_INSTANT + temp_page, 2); + } + buf_block_free(temp_block); /* Restore logging mode */ @@ -1655,6 +1649,19 @@ func_exit: MONITOR_INC(MONITOR_INDEX_REORG_SUCCESSFUL); } + if (UNIV_UNLIKELY(fil_page_get_type(page) == FIL_PAGE_TYPE_INSTANT)) { + /* Log the PAGE_INSTANT information. */ + ut_ad(!page_zip); + ut_ad(index->is_instant()); + ut_ad(!recovery); + mlog_write_ulint(FIL_PAGE_TYPE + page, FIL_PAGE_TYPE_INSTANT, + MLOG_2BYTES, mtr); + mlog_write_ulint(PAGE_HEADER + PAGE_INSTANT + page, + mach_read_from_2(PAGE_HEADER + PAGE_INSTANT + + page), + MLOG_2BYTES, mtr); + } + return(success); } @@ -1753,24 +1760,26 @@ btr_parse_page_reorganize( return(ptr); } -/*************************************************************//** -Empties an index page. @see btr_page_create(). */ -static +/** Empty an index page (possibly the root page). @see btr_page_create(). 
+@param[in,out] block page to be emptied +@param[in,out] page_zip compressed page frame, or NULL +@param[in] index index of the page +@param[in] level B-tree level of the page (0=leaf) +@param[in,out] mtr mini-transaction */ void btr_page_empty( -/*===========*/ - buf_block_t* block, /*!< in: page to be emptied */ - page_zip_des_t* page_zip,/*!< out: compressed page, or NULL */ - dict_index_t* index, /*!< in: index of the page */ - ulint level, /*!< in: the B-tree level of the page */ - mtr_t* mtr) /*!< in: mtr */ + buf_block_t* block, + page_zip_des_t* page_zip, + dict_index_t* index, + ulint level, + mtr_t* mtr) { page_t* page = buf_block_get_frame(block); - ut_ad(mtr_is_block_fix(mtr, block, MTR_MEMO_PAGE_X_FIX, index->table)); + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); ut_ad(page_zip == buf_block_get_page_zip(block)); ut_ad(!index->is_dummy); - ut_ad(index->space == block->page.id.space()); + ut_ad(index->table->space->id == block->page.id.space()); #ifdef UNIV_ZIP_DEBUG ut_a(!page_zip || page_zip_validate(page_zip, page, index)); #endif /* UNIV_ZIP_DEBUG */ @@ -1841,12 +1850,13 @@ btr_root_raise_and_insert( root_page_zip = buf_block_get_page_zip(root_block); ut_ad(!page_is_empty(root)); index = btr_cur_get_index(cursor); + ut_ad(index->n_core_null_bytes <= UT_BITS_IN_BYTES(index->n_nullable)); #ifdef UNIV_ZIP_DEBUG ut_a(!root_page_zip || page_zip_validate(root_page_zip, root, index)); #endif /* UNIV_ZIP_DEBUG */ #ifdef UNIV_BTR_DEBUG if (!dict_index_is_ibuf(index)) { - ulint space = dict_index_get_space(index); + ulint space = index->table->space_id; ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF + root, space)); @@ -1859,14 +1869,13 @@ btr_root_raise_and_insert( ut_ad(mtr_memo_contains_flagged(mtr, dict_index_get_lock(index), MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK)); - ut_ad(mtr_is_block_fix( - mtr, root_block, MTR_MEMO_PAGE_X_FIX, index->table)); + ut_ad(mtr_memo_contains(mtr, root_block, MTR_MEMO_PAGE_X_FIX)); /* Allocate a new page to the tree. Root splitting is done by first moving the root records to the new page, emptying the root, putting a node pointer to the new page, and then splitting the new page. */ - level = btr_page_get_level(root, mtr); + level = btr_page_get_level(root); new_block = btr_page_alloc(index, 0, FSP_NO_DIR, level, mtr, mtr); @@ -1903,19 +1912,16 @@ btr_root_raise_and_insert( root_page_zip, root, index, mtr); /* Update the lock table and possible hash index. */ - - if (!dict_table_is_locking_disabled(index->table)) { - lock_move_rec_list_end(new_block, root_block, - page_get_infimum_rec(root)); - } + lock_move_rec_list_end(new_block, root_block, + page_get_infimum_rec(root)); /* Move any existing predicate locks */ if (dict_index_is_spatial(index)) { lock_prdt_rec_move(new_block, root_block); + } else { + btr_search_move_or_delete_hash_entries( + new_block, root_block); } - - btr_search_move_or_delete_hash_entries(new_block, root_block, - index); } if (dict_index_is_sec_or_ibuf(index)) { @@ -1970,7 +1976,7 @@ btr_root_raise_and_insert( rtr_page_cal_mbr(index, new_block, &new_mbr, *heap); node_ptr = rtr_index_build_node_ptr( - index, &new_mbr, rec, new_page_no, *heap, level); + index, &new_mbr, rec, new_page_no, *heap); } else { node_ptr = dict_index_build_node_ptr( index, rec, new_page_no, *heap, level); @@ -1984,6 +1990,17 @@ btr_root_raise_and_insert( /* Rebuild the root page to get free space */ btr_page_empty(root_block, root_page_zip, index, level + 1, mtr); + /* btr_page_empty() is supposed to zero-initialize the field. 
*/ + ut_ad(!page_get_instant(root_block->frame)); + + if (index->is_instant()) { + ut_ad(!root_page_zip); + byte* page_type = root_block->frame + FIL_PAGE_TYPE; + ut_ad(mach_read_from_2(page_type) == FIL_PAGE_INDEX); + mlog_write_ulint(page_type, FIL_PAGE_TYPE_INSTANT, + MLOG_2BYTES, mtr); + page_set_instant(root_block->frame, index->n_core_fields, mtr); + } ut_ad(!page_has_siblings(root)); @@ -2003,7 +2020,7 @@ btr_root_raise_and_insert( /* We play safe and reset the free bits for the new page */ if (!dict_index_is_clust(index) - && !dict_table_is_temporary(index->table)) { + && !index->table->is_temporary()) { ibuf_reset_free_bits(new_block); } @@ -2041,6 +2058,20 @@ rec_t* btr_page_get_split_rec_to_left(const btr_cur_t* cursor) return NULL; } + /* The metadata record must be present in the leftmost leaf page + of the clustered index, if and only if index->is_instant(). + However, during innobase_instant_try(), index->is_instant() + would already hold when row_ins_clust_index_entry_low() + is being invoked to insert the the metadata record. + So, we can only assert that when the metadata record exists, + index->is_instant() must hold. */ + ut_ad(!page_is_leaf(page) || page_has_prev(page) + || cursor->index->is_instant() + || !(rec_get_info_bits(page_rec_get_next_const( + page_get_infimum_rec(page)), + dict_table_is_comp(cursor->index->table)) + & REC_INFO_MIN_REC_FLAG)); + const rec_t* infimum = page_get_infimum_rec(page); /* If the convergence is in the middle of a page, include also @@ -2149,7 +2180,7 @@ btr_page_get_split_rec( /* free_space is now the free space of a created new page */ total_data = page_get_data_size(page) + insert_size; - total_n_recs = page_get_n_recs(page) + 1; + total_n_recs = ulint(page_get_n_recs(page)) + 1; ut_ad(total_n_recs >= 2); total_space = total_data + page_dir_calc_reserved_space(total_n_recs); @@ -2184,7 +2215,9 @@ btr_page_get_split_rec( incl_data += insert_size; } else { offsets = rec_get_offsets(rec, cursor->index, offsets, - page_is_leaf(page), + page_is_leaf(page) + ? cursor->index->n_core_fields + : 0, ULINT_UNDEFINED, &heap); incl_data += rec_offs_size(offsets); } @@ -2260,7 +2293,7 @@ btr_page_insert_fits( /* free_space is now the free space of a created new page */ total_data = page_get_data_size(page) + insert_size; - total_n_recs = page_get_n_recs(page) + 1; + total_n_recs = ulint(page_get_n_recs(page)) + 1; /* We determine which records (from rec to end_rec, not including end_rec) will end up on the other half page from tuple when it is @@ -2293,7 +2326,9 @@ btr_page_insert_fits( space after rec is removed from page. */ *offsets = rec_get_offsets(rec, cursor->index, *offsets, - page_is_leaf(page), + page_is_leaf(page) + ? 
cursor->index->n_core_fields + : 0, ULINT_UNDEFINED, heap); total_data -= rec_offs_size(*offsets); @@ -2344,7 +2379,7 @@ btr_insert_on_non_leaf_level_func( dberr_t err = btr_cur_search_to_nth_level( index, level, tuple, PAGE_CUR_LE, BTR_CONT_MODIFY_TREE, - &cursor, 0, file, line, mtr, 0); + &cursor, 0, file, line, mtr); if (err != DB_SUCCESS) { ib::warn() << " Error code: " << err @@ -2365,7 +2400,7 @@ btr_insert_on_non_leaf_level_func( btr_cur_search_to_nth_level(index, level, tuple, PAGE_CUR_RTREE_INSERT, BTR_CONT_MODIFY_TREE, - &cursor, 0, file, line, mtr, 0); + &cursor, 0, file, line, mtr); } ut_ad(cursor.flag == BTR_CUR_BINARY); @@ -2432,9 +2467,8 @@ btr_attach_half_pages( buf_block_t* prev_block = NULL; buf_block_t* next_block = NULL; - ut_ad(mtr_is_block_fix(mtr, block, MTR_MEMO_PAGE_X_FIX, index->table)); - ut_ad(mtr_is_block_fix( - mtr, new_block, MTR_MEMO_PAGE_X_FIX, index->table)); + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(mtr_memo_contains(mtr, new_block, MTR_MEMO_PAGE_X_FIX)); /* Create a memory heap where the data tuple is stored */ heap = mem_heap_create(1024); @@ -2492,9 +2526,8 @@ btr_attach_half_pages( } /* Get the level of the split pages */ - level = btr_page_get_level(buf_block_get_frame(block), mtr); - ut_ad(level - == btr_page_get_level(buf_block_get_frame(new_block), mtr)); + level = btr_page_get_level(buf_block_get_frame(block)); + ut_ad(level == btr_page_get_level(buf_block_get_frame(new_block))); /* Build the node pointer (= node key and page address) for the upper half */ @@ -2582,7 +2615,8 @@ btr_page_tuple_smaller( first_rec = page_cur_get_rec(&pcur); *offsets = rec_get_offsets( - first_rec, cursor->index, *offsets, page_is_leaf(block->frame), + first_rec, cursor->index, *offsets, + page_is_leaf(block->frame) ? cursor->index->n_core_fields : 0, n_uniq, heap); return(cmp_dtuple_rec(tuple, first_rec, *offsets) < 0); @@ -2619,8 +2653,7 @@ btr_insert_into_right_sibling( ut_ad(mtr_memo_contains_flagged( mtr, dict_index_get_lock(cursor->index), MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK)); - ut_ad(mtr_is_block_fix( - mtr, block, MTR_MEMO_PAGE_X_FIX, cursor->index->table)); + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); ut_ad(heap); if (next_page_no == FIL_NULL || !page_rec_is_supremum( @@ -2667,7 +2700,7 @@ btr_insert_into_right_sibling( if (is_leaf && next_block->page.size.is_compressed() && !dict_index_is_clust(cursor->index) - && !dict_table_is_temporary(cursor->index->table)) { + && !cursor->index->table->is_temporary()) { /* Reset the IBUF_BITMAP_FREE bits, because page_cur_tuple_insert() will have attempted page reorganize before failing. */ @@ -2678,7 +2711,7 @@ btr_insert_into_right_sibling( ibool compressed; dberr_t err; - ulint level = btr_page_get_level(next_page, mtr); + ulint level = btr_page_get_level(next_page); /* adjust cursor position */ *btr_cur_get_page_cur(cursor) = next_page_cursor; @@ -2709,7 +2742,7 @@ btr_insert_into_right_sibling( if (is_leaf && !dict_index_is_clust(cursor->index) - && !dict_table_is_temporary(cursor->index->table)) { + && !cursor->index->table->is_temporary()) { /* Update the free bits of the B-tree page in the insert buffer bitmap. 
*/ @@ -2765,7 +2798,7 @@ btr_page_split_and_insert( ulint n_iterations = 0; ulint n_uniq; - if (dict_index_is_spatial(cursor->index)) { + if (cursor->index->is_spatial()) { /* Split rtree page and update parent */ return(rtr_page_split_and_insert(flags, cursor, offsets, heap, tuple, n_ext, mtr)); @@ -2792,8 +2825,7 @@ func_start: page = buf_block_get_frame(block); page_zip = buf_block_get_page_zip(block); - ut_ad(mtr_is_block_fix( - mtr, block, MTR_MEMO_PAGE_X_FIX, cursor->index->table)); + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); ut_ad(!page_is_empty(page)); /* try to insert to the next page if possible before split */ @@ -2843,7 +2875,7 @@ func_start: /* 2. Allocate a new page to the index */ new_block = btr_page_alloc(cursor->index, hint_page_no, direction, - btr_page_get_level(page, mtr), mtr, mtr); + btr_page_get_level(page), mtr, mtr); if (!new_block) { return(NULL); @@ -2852,7 +2884,7 @@ func_start: new_page = buf_block_get_frame(new_block); new_page_zip = buf_block_get_page_zip(new_block); btr_page_create(new_block, new_page_zip, cursor->index, - btr_page_get_level(page, mtr), mtr); + btr_page_get_level(page), mtr); /* Only record the leaf level page splits. */ if (page_is_leaf(page)) { cursor->index->stat_defrag_n_page_split ++; @@ -2868,7 +2900,9 @@ func_start: first_rec = move_limit = split_rec; *offsets = rec_get_offsets(split_rec, cursor->index, *offsets, - page_is_leaf(page), n_uniq, heap); + page_is_leaf(page) + ? cursor->index->n_core_fields : 0, + n_uniq, heap); insert_left = !tuple || cmp_dtuple_rec(tuple, split_rec, *offsets) < 0; @@ -2962,16 +2996,12 @@ insert_empty: ULINT_UNDEFINED, mtr); /* Update the lock table and possible hash index. */ - - if (!dict_table_is_locking_disabled( - cursor->index->table)) { - lock_move_rec_list_start( - new_block, block, move_limit, - new_page + PAGE_NEW_INFIMUM); - } + lock_move_rec_list_start( + new_block, block, move_limit, + new_page + PAGE_NEW_INFIMUM); btr_search_move_or_delete_hash_entries( - new_block, block, cursor->index); + new_block, block); /* Delete the records from the source page. */ @@ -3008,14 +3038,10 @@ insert_empty: cursor->index, mtr); /* Update the lock table and possible hash index. */ - if (!dict_table_is_locking_disabled( - cursor->index->table)) { - lock_move_rec_list_end( - new_block, block, move_limit); - } + lock_move_rec_list_end(new_block, block, move_limit); btr_search_move_or_delete_hash_entries( - new_block, block, cursor->index); + new_block, block); /* Delete the records from the source page. 
*/ @@ -3100,7 +3126,7 @@ insert_empty: insert_failed: /* We play safe and reset the free bits for new_page */ if (!dict_index_is_clust(cursor->index) - && !dict_table_is_temporary(cursor->index->table)) { + && !cursor->index->table->is_temporary()) { ibuf_reset_free_bits(new_block); ibuf_reset_free_bits(block); } @@ -3118,7 +3144,7 @@ func_exit: left and right pages in the same mtr */ if (!dict_index_is_clust(cursor->index) - && !dict_table_is_temporary(cursor->index->table) + && !cursor->index->table->is_temporary() && page_is_leaf(page)) { ibuf_update_free_bits_for_two_pages_low( @@ -3151,7 +3177,7 @@ btr_level_list_remove_func( { ut_ad(page != NULL); ut_ad(mtr != NULL); - ut_ad(mtr_is_page_fix(mtr, page, MTR_MEMO_PAGE_X_FIX, index->table)); + ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX)); ut_ad(space == page_get_space_id(page)); /* Get the previous and next page numbers of page */ @@ -3297,9 +3323,9 @@ btr_lift_page_up( buf_block_t* block_orig = block; ut_ad(!page_has_siblings(page)); - ut_ad(mtr_is_block_fix(mtr, block, MTR_MEMO_PAGE_X_FIX, index->table)); + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); - page_level = btr_page_get_level(page, mtr); + page_level = btr_page_get_level(page); root_page_no = dict_index_get_page(index); { @@ -3307,7 +3333,8 @@ btr_lift_page_up( rec_offs* offsets = NULL; mem_heap_t* heap = mem_heap_create( sizeof(*offsets) - * (REC_OFFS_HEADER_SIZE + 1 + 1 + index->n_fields)); + * (REC_OFFS_HEADER_SIZE + 1 + 1 + + unsigned(index->n_fields))); buf_block_t* b; if (dict_index_is_spatial(index)) { @@ -3361,11 +3388,11 @@ btr_lift_page_up( block = father_block; page = buf_block_get_frame(block); - page_level = btr_page_get_level(page, mtr); + page_level = btr_page_get_level(page); ut_ad(!page_has_siblings(page)); - ut_ad(mtr_is_block_fix( - mtr, block, MTR_MEMO_PAGE_X_FIX, index->table)); + ut_ad(mtr_memo_contains( + mtr, block, MTR_MEMO_PAGE_X_FIX)); father_block = blocks[0]; father_page_zip = buf_block_get_page_zip(father_block); @@ -3379,6 +3406,20 @@ btr_lift_page_up( /* Make the father empty */ btr_page_empty(father_block, father_page_zip, index, page_level, mtr); + /* btr_page_empty() is supposed to zero-initialize the field. */ + ut_ad(!page_get_instant(father_block->frame)); + + if (index->is_instant() + && father_block->page.id.page_no() == root_page_no) { + ut_ad(!father_page_zip); + byte* page_type = father_block->frame + FIL_PAGE_TYPE; + ut_ad(mach_read_from_2(page_type) == FIL_PAGE_INDEX); + mlog_write_ulint(page_type, FIL_PAGE_TYPE_INSTANT, + MLOG_2BYTES, mtr); + page_set_instant(father_block->frame, + index->n_core_fields, mtr); + } + page_level++; /* Copy the records to the father page one by one. */ @@ -3400,18 +3441,16 @@ btr_lift_page_up( /* Update the lock table and possible hash index. 
*/ - if (!dict_table_is_locking_disabled(index->table)) { - lock_move_rec_list_end(father_block, block, - page_get_infimum_rec(page)); - } + lock_move_rec_list_end(father_block, block, + page_get_infimum_rec(page)); /* Also update the predicate locks */ if (dict_index_is_spatial(index)) { lock_prdt_rec_move(father_block, block); + } else { + btr_search_move_or_delete_hash_entries( + father_block, block); } - - btr_search_move_or_delete_hash_entries(father_block, block, - index); } if (!dict_table_is_locking_disabled(index->table)) { @@ -3419,7 +3458,7 @@ btr_lift_page_up( if (dict_index_is_spatial(index)) { lock_mutex_enter(); lock_prdt_page_free_from_discard( - block, lock_sys->prdt_page_hash); + block, lock_sys.prdt_page_hash); lock_mutex_exit(); } lock_update_copy_and_discard(father_block, block); @@ -3430,7 +3469,7 @@ btr_lift_page_up( page_t* page = buf_block_get_frame(blocks[i]); page_zip_des_t* page_zip= buf_block_get_page_zip(blocks[i]); - ut_ad(btr_page_get_level(page, mtr) == page_level + 1); + ut_ad(btr_page_get_level(page) == page_level + 1); btr_page_set_level(page, page_zip, page_level, mtr); #ifdef UNIV_ZIP_DEBUG @@ -3447,7 +3486,7 @@ btr_lift_page_up( /* We play it safe and reset the free bits for the father */ if (!dict_index_is_clust(index) - && !dict_table_is_temporary(index->table)) { + && !index->table->is_temporary()) { ibuf_reset_free_bits(father_block); } ut_ad(page_validate(father_page, index)); @@ -3478,7 +3517,6 @@ btr_compress( mtr_t* mtr) /*!< in/out: mini-transaction */ { dict_index_t* index; - ulint space; ulint left_page_no; ulint right_page_no; buf_block_t* merge_block; @@ -3514,10 +3552,9 @@ btr_compress( } #endif /* UNIV_DEBUG */ - ut_ad(mtr_is_block_fix(mtr, block, MTR_MEMO_PAGE_X_FIX, index->table)); - space = dict_index_get_space(index); + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); - const page_size_t page_size(dict_table_page_size(index->table)); + const page_size_t page_size(index->table->space->flags); MONITOR_INC(MONITOR_INDEX_MERGE_ATTEMPTS); @@ -3636,7 +3673,7 @@ retry: rec_offs* offsets2 = NULL; /* For rtree, we need to update father's mbr. */ - if (dict_index_is_spatial(index)) { + if (index->is_spatial()) { /* We only support merge pages with the same parent page */ if (!rtr_check_same_block( @@ -3654,14 +3691,14 @@ retry: offsets2 = rec_get_offsets( btr_cur_get_rec(&cursor2), index, NULL, - page_is_leaf(cursor2.page_cur.block->frame), + page_is_leaf(cursor2.page_cur.block->frame) + ? 
index->n_fields : 0, ULINT_UNDEFINED, &heap); /* Check if parent entry needs to be updated */ mbr_changed = rtr_merge_mbr_changed( &cursor2, &father_cursor, - offsets2, offsets, &new_mbr, - merge_block, block, index); + offsets2, offsets, &new_mbr); } rec_t* orig_pred = page_copy_rec_list_start( @@ -3675,7 +3712,8 @@ retry: btr_search_drop_page_hash_index(block); /* Remove the page from the level list */ - btr_level_list_remove(space, page_size, page, index, mtr); + btr_level_list_remove(index->table->space_id, + page_size, page, index, mtr); if (dict_index_is_spatial(index)) { rec_t* my_rec = father_cursor.page_cur.rec; @@ -3705,14 +3743,13 @@ retry: merge_page, &new_mbr, NULL, mtr); #endif } else { - rtr_node_ptr_delete( - index, &father_cursor, block, mtr); + rtr_node_ptr_delete(&father_cursor, mtr); } /* No GAP lock needs to be worrying about */ lock_mutex_enter(); lock_prdt_page_free_from_discard( - block, lock_sys->prdt_page_hash); + block, lock_sys.prdt_page_hash); lock_rec_free_all_from_discard_page(block); lock_mutex_exit(); } else { @@ -3768,9 +3805,7 @@ retry: #ifdef UNIV_BTR_DEBUG memcpy(fil_page_prev, merge_page + FIL_PAGE_PREV, 4); #endif /* UNIV_BTR_DEBUG */ -#if FIL_NULL != 0xffffffff -# error "FIL_NULL != 0xffffffff" -#endif + compile_time_assert(FIL_NULL == 0xffffffffU); memset(merge_page + FIL_PAGE_PREV, 0xff, 4); } @@ -3807,7 +3842,8 @@ retry: #endif /* UNIV_BTR_DEBUG */ /* Remove the page from the level list */ - btr_level_list_remove(space, page_size, (page_t*)page, index, mtr); + btr_level_list_remove(index->table->space_id, + page_size, page, index, mtr); ut_ad(btr_node_ptr_get_child_page_no( btr_cur_get_rec(&father_cursor), offsets) @@ -3830,13 +3866,14 @@ retry: #endif /* UNIV_DEBUG */ /* For rtree, we need to update father's mbr. */ - if (dict_index_is_spatial(index)) { + if (index->is_spatial()) { rec_offs* offsets2; ulint rec_info; offsets2 = rec_get_offsets( btr_cur_get_rec(&cursor2), index, NULL, - page_is_leaf(cursor2.page_cur.block->frame), + page_is_leaf(cursor2.page_cur.block->frame) + ? index->n_fields : 0, ULINT_UNDEFINED, &heap); ut_ad(btr_node_ptr_get_child_page_no( @@ -3853,9 +3890,7 @@ retry: rtr_merge_and_update_mbr(&father_cursor, &cursor2, offsets, offsets2, - merge_page, - merge_block, - block, index, mtr); + merge_page, mtr); } else { /* Otherwise, we will keep the node ptr of merge page and delete the father node ptr. @@ -3864,13 +3899,11 @@ retry: rtr_merge_and_update_mbr(&cursor2, &father_cursor, offsets2, offsets, - merge_page, - merge_block, - block, index, mtr); + merge_page, mtr); } lock_mutex_enter(); lock_prdt_page_free_from_discard( - block, lock_sys->prdt_page_hash); + block, lock_sys.prdt_page_hash); lock_rec_free_all_from_discard_page(block); lock_mutex_exit(); } else { @@ -3895,7 +3928,7 @@ retry: } if (!dict_index_is_clust(index) - && !dict_table_is_temporary(index->table) + && !index->table->is_temporary() && page_is_leaf(merge_page)) { /* Update the free bits of the B-tree page in the insert buffer bitmap. This has to be done in a @@ -3932,7 +3965,7 @@ retry: write the bits accurately in a separate mini-transaction. 
*/ ibuf_update_free_bits_if_full(merge_block, - UNIV_PAGE_SIZE, + srv_page_size, ULINT_UNDEFINED); } } @@ -4019,10 +4052,10 @@ btr_discard_only_page_on_level( const page_t* page = buf_block_get_frame(block); ut_a(page_get_n_recs(page) == 1); - ut_a(page_level == btr_page_get_level(page, mtr)); + ut_a(page_level == btr_page_get_level(page)); ut_a(!page_has_siblings(page)); ut_ad(fil_page_index_page_check(page)); - ut_ad(block->page.id.space() == index->space); + ut_ad(block->page.id.space() == index->table->space->id); ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); btr_search_drop_page_hash_index(block); @@ -4054,7 +4087,7 @@ btr_discard_only_page_on_level( #ifdef UNIV_BTR_DEBUG if (!dict_index_is_ibuf(index)) { const page_t* root = buf_block_get_frame(block); - const ulint space = dict_index_get_space(index); + const ulint space = index->table->space_id; ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF + root, space)); ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP @@ -4064,9 +4097,14 @@ btr_discard_only_page_on_level( btr_page_empty(block, buf_block_get_page_zip(block), index, 0, mtr); ut_ad(page_is_leaf(buf_block_get_frame(block))); - - if (!dict_index_is_clust(index) - && !dict_table_is_temporary(index->table)) { + /* btr_page_empty() is supposed to zero-initialize the field. */ + ut_ad(!page_get_instant(block->frame)); + + if (index->is_primary()) { + /* Concurrent access is prevented by the root_block->lock + X-latch, so this should be safe. */ + index->remove_instant(); + } else if (!index->table->is_temporary()) { /* We play it safe and reset the free bits for the root */ ibuf_reset_free_bits(block); @@ -4106,9 +4144,7 @@ btr_discard_page( ut_ad(mtr_memo_contains_flagged(mtr, dict_index_get_lock(index), MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK)); - ut_ad(mtr_is_block_fix(mtr, block, MTR_MEMO_PAGE_X_FIX, index->table)); - - const ulint space = dict_index_get_space(index); + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); MONITOR_INC(MONITOR_INDEX_DISCARD); @@ -4123,12 +4159,12 @@ btr_discard_page( left_page_no = btr_page_get_prev(buf_block_get_frame(block)); right_page_no = btr_page_get_next(buf_block_get_frame(block)); - const page_size_t page_size(dict_table_page_size(index->table)); + const page_size_t page_size(index->table->space->flags); ut_d(bool parent_is_different = false); if (left_page_no != FIL_NULL) { merge_block = btr_block_get( - page_id_t(space, left_page_no), page_size, - RW_X_LATCH, index, mtr); + page_id_t(index->table->space_id, left_page_no), + page_size, RW_X_LATCH, index, mtr); merge_page = buf_block_get_frame(merge_block); #ifdef UNIV_BTR_DEBUG @@ -4143,8 +4179,8 @@ btr_discard_page( == btr_cur_get_rec(&parent_cursor))); } else if (right_page_no != FIL_NULL) { merge_block = btr_block_get( - page_id_t(space, right_page_no), page_size, - RW_X_LATCH, index, mtr); + page_id_t(index->table->space_id, right_page_no), + page_size, RW_X_LATCH, index, mtr); merge_page = buf_block_get_frame(merge_block); #ifdef UNIV_BTR_DEBUG @@ -4180,13 +4216,14 @@ btr_discard_page( } if (dict_index_is_spatial(index)) { - rtr_node_ptr_delete(index, &parent_cursor, block, mtr); + rtr_node_ptr_delete(&parent_cursor, mtr); } else { btr_cur_node_ptr_delete(&parent_cursor, mtr); } /* Remove the page from the level list */ - btr_level_list_remove(space, page_size, page, index, mtr); + btr_level_list_remove(index->table->space_id, page_size, + page, index, mtr); #ifdef UNIV_ZIP_DEBUG { @@ -4287,9 +4324,9 @@ btr_print_recursive( ulint i = 0; mtr_t 
mtr2; - ut_ad(mtr_is_block_fix(mtr, block, MTR_MEMO_PAGE_SX_FIX, index->table)); + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_SX_FIX)); - ib::info() << "NODE ON LEVEL " << btr_page_get_level(page, mtr) + ib::info() << "NODE ON LEVEL " << btr_page_get_level(page) << " page " << block->page.id; page_print(block, index, width, width); @@ -4314,7 +4351,7 @@ btr_print_recursive( node_ptr = page_cur_get_rec(&cursor); *offsets = rec_get_offsets( - node_ptr, index, *offsets, false, + node_ptr, index, *offsets, 0, ULINT_UNDEFINED, heap); btr_print_recursive(index, btr_node_ptr_get_child(node_ptr, @@ -4381,7 +4418,7 @@ btr_check_node_ptr( btr_cur_t cursor; page_t* page = buf_block_get_frame(block); - ut_ad(mtr_is_block_fix(mtr, block, MTR_MEMO_PAGE_X_FIX, index->table)); + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); if (dict_index_get_page(index) == block->page.id.page_no()) { @@ -4405,7 +4442,7 @@ btr_check_node_ptr( tuple = dict_index_build_node_ptr( index, page_rec_get_next(page_get_infimum_rec(page)), 0, heap, - btr_page_get_level(page, mtr)); + btr_page_get_level(page)); /* For spatial index, the MBR in the parent rec could be different with that of first rec of child, their relationship should be @@ -4455,8 +4492,6 @@ btr_index_rec_validate( and page on error */ { ulint len; - ulint n; - ulint i; const page_t* page; mem_heap_t* heap = NULL; rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; @@ -4465,7 +4500,9 @@ btr_index_rec_validate( page = page_align(rec); - if (dict_index_is_ibuf(index)) { + ut_ad(index->n_core_fields); + + if (index->is_ibuf()) { /* The insert buffer index tree can contain records from any other index: we cannot check the number of fields or their length */ @@ -4487,31 +4524,35 @@ btr_index_rec_validate( return(FALSE); } - n = dict_index_get_n_fields(index); - - if (!page_is_comp(page) - && (rec_get_n_fields_old(rec) != n - /* a record for older SYS_INDEXES table - (missing merge_threshold column) is acceptable. */ - && !(index->id == DICT_INDEXES_ID - && rec_get_n_fields_old(rec) == n - 1))) { - btr_index_rec_validate_report(page, rec, index); + if (!page_is_comp(page)) { + const ulint n_rec_fields = rec_get_n_fields_old(rec); + if (n_rec_fields == DICT_FLD__SYS_INDEXES__MERGE_THRESHOLD + && index->id == DICT_INDEXES_ID) { + /* A record for older SYS_INDEXES table + (missing merge_threshold column) is acceptable. */ + } else if (n_rec_fields < index->n_core_fields + || n_rec_fields > index->n_fields) { + btr_index_rec_validate_report(page, rec, index); - ib::error() << "Has " << rec_get_n_fields_old(rec) - << " fields, should have " << n; + ib::error() << "Has " << rec_get_n_fields_old(rec) + << " fields, should have " + << index->n_core_fields << ".." + << index->n_fields; - if (dump_on_error) { - fputs("InnoDB: corrupt record ", stderr); - rec_print_old(stderr, rec); - putc('\n', stderr); + if (dump_on_error) { + fputs("InnoDB: corrupt record ", stderr); + rec_print_old(stderr, rec); + putc('\n', stderr); + } + return(FALSE); } - return(FALSE); } - offsets = rec_get_offsets(rec, index, offsets, page_is_leaf(page), + offsets = rec_get_offsets(rec, index, offsets, page_is_leaf(page) + ? index->n_core_fields : 0, ULINT_UNDEFINED, &heap); - for (i = 0; i < n; i++) { + for (unsigned i = 0; i < index->n_fields; i++) { dict_field_t* field = dict_index_get_nth_field(index, i); ulint fixed_size = dict_col_get_fixed_size( dict_field_get_col(field), @@ -4538,14 +4579,10 @@ btr_index_rec_validate( length. 
When fixed_size == 0, prefix_len is the maximum length of the prefix index column. */ - if ((field->prefix_len == 0 - && len != UNIV_SQL_NULL && fixed_size - && len != fixed_size) - || (field->prefix_len > 0 - && len != UNIV_SQL_NULL - && len - > field->prefix_len)) { - + if (len_is_stored(len) + && (field->prefix_len + ? len > field->prefix_len + : (fixed_size && len != fixed_size))) { btr_index_rec_validate_report(page, rec, index); ib::error error; @@ -4716,20 +4753,20 @@ btr_validate_level( ulint parent_right_page_no = FIL_NULL; bool rightmost_child = false; - mtr_start(&mtr); + mtr.start(); if (!srv_read_only_mode) { if (lockout) { - mtr_x_lock(dict_index_get_lock(index), &mtr); + mtr_x_lock_index(index, &mtr); } else { - mtr_sx_lock(dict_index_get_lock(index), &mtr); + mtr_sx_lock_index(index, &mtr); } } block = btr_root_block_get(index, RW_SX_LATCH, &mtr); page = buf_block_get_frame(block); - fil_space_t* space = fil_space_get(index->space); + fil_space_t* space = index->table->space; const page_size_t table_page_size( dict_table_page_size(index->table)); const page_size_t space_page_size(space->flags); @@ -4744,7 +4781,7 @@ btr_validate_level( return(false); } - while (level != btr_page_get_level(page, &mtr)) { + while (level != btr_page_get_level(page)) { const rec_t* node_ptr; if (fseg_page_is_free(space, block->page.id.page_no())) { @@ -4756,8 +4793,8 @@ btr_validate_level( ret = false; } - ut_a(index->space == block->page.id.space()); - ut_a(index->space == page_get_space_id(page)); + ut_a(index->table->space_id == block->page.id.space()); + ut_a(block->page.id.space() == page_get_space_id(page)); #ifdef UNIV_ZIP_DEBUG page_zip = buf_block_get_page_zip(block); ut_a(!page_zip || page_zip_validate(page_zip, page, index)); @@ -4768,7 +4805,7 @@ btr_validate_level( page_cur_move_to_next(&cursor); node_ptr = page_cur_get_rec(&cursor); - offsets = rec_get_offsets(node_ptr, index, offsets, false, + offsets = rec_get_offsets(node_ptr, index, offsets, 0, ULINT_UNDEFINED, &heap); savepoint2 = mtr_set_savepoint(&mtr); @@ -4785,8 +4822,6 @@ btr_validate_level( left_page_no = btr_page_get_prev(page); while (left_page_no != FIL_NULL) { - page_id_t left_page_id( - index->space, left_page_no); /* To obey latch order of tree blocks, we should release the right_block once to obtain lock of the uncle block. 
*/ @@ -4795,7 +4830,8 @@ btr_validate_level( savepoint2 = mtr_set_savepoint(&mtr); block = btr_block_get( - left_page_id, + page_id_t(index->table->space_id, + left_page_no), table_page_size, RW_SX_LATCH, index, &mtr); page = buf_block_get_frame(block); @@ -4812,9 +4848,9 @@ loop: offsets = offsets2 = NULL; if (!srv_read_only_mode) { if (lockout) { - mtr_x_lock(dict_index_get_lock(index), &mtr); + mtr_x_lock_index(index, &mtr); } else { - mtr_sx_lock(dict_index_get_lock(index), &mtr); + mtr_sx_lock_index(index, &mtr); } } @@ -4823,7 +4859,7 @@ loop: ut_a(!page_zip || page_zip_validate(page_zip, page, index)); #endif /* UNIV_ZIP_DEBUG */ - ut_a(block->page.id.space() == index->space); + ut_a(block->page.id.space() == index->table->space_id); if (fseg_page_is_free(space, block->page.id.page_no())) { @@ -4852,7 +4888,7 @@ loop: ret = false; } - ut_a(btr_page_get_level(page, &mtr) == level); + ut_a(btr_page_get_level(page) == level); right_page_no = btr_page_get_next(page); left_page_no = btr_page_get_prev(page); @@ -4866,7 +4902,7 @@ loop: savepoint = mtr_set_savepoint(&mtr); right_block = btr_block_get( - page_id_t(index->space, right_page_no), + page_id_t(index->table->space_id, right_page_no), table_page_size, RW_SX_LATCH, index, &mtr); @@ -4893,10 +4929,12 @@ loop: right_rec = page_rec_get_next(page_get_infimum_rec( right_page)); offsets = rec_get_offsets(rec, index, offsets, - page_is_leaf(page), + page_is_leaf(page) + ? index->n_core_fields : 0, ULINT_UNDEFINED, &heap); offsets2 = rec_get_offsets(right_rec, index, offsets2, - page_is_leaf(right_page), + page_is_leaf(right_page) + ? index->n_core_fields : 0, ULINT_UNDEFINED, &heap); /* For spatial index, we cannot guarantee the key ordering @@ -4995,7 +5033,7 @@ loop: node_ptr_tuple = dict_index_build_node_ptr( index, page_rec_get_next(page_get_infimum_rec(page)), - 0, heap, btr_page_get_level(page, &mtr)); + 0, heap, btr_page_get_level(page)); if (cmp_dtuple_rec(node_ptr_tuple, node_ptr, offsets)) { @@ -5042,14 +5080,15 @@ loop: if (parent_right_page_no != FIL_NULL) { btr_block_get( - page_id_t(index->space, + page_id_t(index->table + ->space_id, parent_right_page_no), table_page_size, RW_SX_LATCH, index, &mtr); } right_block = btr_block_get( - page_id_t(index->space, + page_id_t(index->table->space_id, right_page_no), table_page_size, RW_SX_LATCH, index, &mtr); @@ -5114,27 +5153,27 @@ node_ptr_fails: /* Commit the mini-transaction to release the latch on 'page'. Re-acquire the latch on right_page, which will become 'page' on the next loop. The page has already been checked. */ - mtr_commit(&mtr); + mtr.commit(); if (trx_is_interrupted(trx)) { /* On interrupt, return the current status. 
*/ } else if (right_page_no != FIL_NULL) { - mtr_start(&mtr); + mtr.start(); if (!lockout) { if (rightmost_child) { if (parent_right_page_no != FIL_NULL) { btr_block_get( page_id_t( - index->space, + index->table->space_id, parent_right_page_no), table_page_size, RW_SX_LATCH, index, &mtr); } } else if (parent_page_no != FIL_NULL) { btr_block_get( - page_id_t(index->space, + page_id_t(index->table->space_id, parent_page_no), table_page_size, RW_SX_LATCH, index, &mtr); @@ -5142,7 +5181,7 @@ node_ptr_fails: } block = btr_block_get( - page_id_t(index->space, right_page_no), + page_id_t(index->table->space_id, right_page_no), table_page_size, RW_SX_LATCH, index, &mtr); @@ -5170,12 +5209,12 @@ btr_validate_spatial_index( mtr_t mtr; bool ok = true; - mtr_start(&mtr); + mtr.start(); - mtr_x_lock(dict_index_get_lock(index), &mtr); + mtr_x_lock_index(index, &mtr); page_t* root = btr_root_get(index, &mtr); - ulint n = btr_page_get_level(root, &mtr); + ulint n = btr_page_get_level(root); #ifdef UNIV_RTR_DEBUG fprintf(stderr, "R-tree level is %lu\n", n); @@ -5192,7 +5231,7 @@ btr_validate_spatial_index( } } - mtr_commit(&mtr); + mtr.commit(); return(ok); } @@ -5228,9 +5267,9 @@ btr_validate_index( if (!srv_read_only_mode) { if (lockout) { - mtr_x_lock(dict_index_get_lock(index), &mtr); + mtr_x_lock_index(index, &mtr); } else { - mtr_sx_lock(dict_index_get_lock(index), &mtr); + mtr_sx_lock_index(index, &mtr); } } @@ -5242,13 +5281,12 @@ btr_validate_index( return err; } - ulint n = btr_page_get_level(root, &mtr); + ulint n = btr_page_get_level(root); for (ulint i = 0; i <= n; ++i) { if (!btr_validate_level(index, trx, n - i, lockout)) { err = DB_CORRUPTION; - break; } } @@ -5288,8 +5326,8 @@ btr_can_merge_with_page( index = btr_cur_get_index(cursor); page = btr_cur_get_page(cursor); - const page_id_t page_id(dict_index_get_space(index), page_no); - const page_size_t page_size(dict_table_page_size(index->table)); + const page_id_t page_id(index->table->space_id, page_no); + const page_size_t page_size(index->table->space->flags); mblock = btr_block_get(page_id, page_size, RW_X_LATCH, index, mtr); mpage = buf_block_get_frame(mblock); diff --git a/storage/innobase/btr/btr0bulk.cc b/storage/innobase/btr/btr0bulk.cc index 100e48cf537..c05cf2a7b7a 100644 --- a/storage/innobase/btr/btr0bulk.cc +++ b/storage/innobase/btr/btr0bulk.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 2014, 2019, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2020, MariaDB Corporation. +Copyright (c) 2017, 2021, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -33,7 +33,7 @@ Created 03/11/2014 Shaohua Wang #include "trx0trx.h" /** Innodb B-tree index fill factor for bulk load. */ -long innobase_fill_factor; +uint innobase_fill_factor; /** whether to reduce redo logging during ALTER TABLE */ my_bool innodb_log_optimize_ddl; @@ -57,7 +57,7 @@ PageBulk::init() m_mtr.set_log_mode(MTR_LOG_NO_REDO); m_mtr.set_flush_observer(m_flush_observer); } else { - m_mtr.set_named_space(m_index->space); + m_index->set_modified(m_mtr); } if (m_page_no == FIL_NULL) { @@ -68,11 +68,12 @@ PageBulk::init() the allocation order, and we will always generate redo log for page allocation, even when creating a new tablespace. 
*/ alloc_mtr.start(); - alloc_mtr.set_named_space(m_index->space); + m_index->set_modified(alloc_mtr); ulint n_reserved; bool success; - success = fsp_reserve_free_extents(&n_reserved, m_index->space, + success = fsp_reserve_free_extents(&n_reserved, + m_index->table->space, 1, FSP_NORMAL, &alloc_mtr); if (!success) { alloc_mtr.commit(); @@ -84,10 +85,7 @@ PageBulk::init() new_block = btr_page_alloc(m_index, 0, FSP_UP, m_level, &alloc_mtr, &m_mtr); - if (n_reserved > 0) { - fil_space_release_free_extents(m_index->space, - n_reserved); - } + m_index->table->space->release_free_extents(n_reserved); alloc_mtr.commit(); @@ -122,11 +120,10 @@ PageBulk::init() m_index->id, &m_mtr); } } else { - page_id_t page_id(dict_index_get_space(m_index), m_page_no); - page_size_t page_size(dict_table_page_size(m_index->table)); - - new_block = btr_block_get(page_id, page_size, - RW_X_LATCH, m_index, &m_mtr); + new_block = btr_block_get( + page_id_t(m_index->table->space_id, m_page_no), + page_size_t(m_index->table->space->flags), + RW_X_LATCH, m_index, &m_mtr); new_page = buf_block_get_frame(new_block); new_page_zip = buf_block_get_page_zip(new_block); @@ -156,16 +153,18 @@ PageBulk::init() m_reserved_space = dict_index_get_space_reserve(); } else { m_reserved_space = - UNIV_PAGE_SIZE * (100 - innobase_fill_factor) / 100; + srv_page_size * (100 - innobase_fill_factor) / 100; } m_padding_space = - UNIV_PAGE_SIZE - dict_index_zip_pad_optimal_page_size(m_index); + srv_page_size - dict_index_zip_pad_optimal_page_size(m_index); m_heap_top = page_header_get_ptr(new_page, PAGE_HEAP_TOP); m_rec_no = page_header_get_field(new_page, PAGE_N_RECS); - ut_ad(page_header_get_field(m_page, PAGE_DIRECTION) - == PAGE_NO_DIRECTION); - page_header_set_field(m_page, NULL, PAGE_DIRECTION, 0); + /* Temporarily reset PAGE_DIRECTION_B from PAGE_NO_DIRECTION to 0, + without writing redo log, to ensure that needs_finish() will hold + on an empty page. */ + ut_ad(m_page[PAGE_HEADER + PAGE_DIRECTION_B] == PAGE_NO_DIRECTION); + m_page[PAGE_HEADER + PAGE_DIRECTION_B] = 0; ut_d(m_total_data = 0); /* See page_copy_rec_list_end_to_created_page() */ ut_d(page_header_set_field(m_page, NULL, PAGE_HEAP_TOP, @@ -187,13 +186,15 @@ PageBulk::insert( ut_ad(m_heap != NULL); rec_size = rec_offs_size(offsets); + ut_d(const bool is_leaf = page_rec_is_leaf(m_cur_rec)); #ifdef UNIV_DEBUG /* Check whether records are in order. */ if (!page_rec_is_infimum_low(page_offset(m_cur_rec))) { rec_t* old_rec = m_cur_rec; rec_offs* old_offsets = rec_get_offsets( - old_rec, m_index, NULL, page_rec_is_leaf(old_rec), + old_rec, m_index, NULL, is_leaf + ? m_index->n_core_fields : 0, ULINT_UNDEFINED, &m_heap); ut_ad(cmp_rec_rec(rec, old_rec, offsets, old_offsets, m_index) @@ -205,7 +206,7 @@ PageBulk::insert( /* 1. Copy the record to page. */ rec_t* insert_rec = rec_copy(m_heap_top, rec, offsets); - rec_offs_make_valid(insert_rec, m_index, offsets); + rec_offs_make_valid(insert_rec, m_index, is_leaf, offsets); /* 2. Insert the record in the linked list. */ /* 3. 
Set the n_owned field in the inserted record to zero, @@ -234,7 +235,7 @@ PageBulk::insert( - page_dir_calc_reserved_space(m_rec_no); ut_ad(m_free_space >= rec_size + slot_size); - ut_ad(m_heap_top + rec_size < m_page + UNIV_PAGE_SIZE); + ut_ad(m_heap_top + rec_size < m_page + srv_page_size); m_free_space -= rec_size + slot_size; m_heap_top += rec_size; @@ -254,10 +255,10 @@ inline bool PageBulk::needs_finish() const { ut_ad(page_align(m_cur_rec) == m_block->frame); ut_ad(m_page == m_block->frame); - ulint n_heap= page_header_get_field(m_page, PAGE_N_HEAP); - if (!n_heap || !page_header_get_field(m_page, PAGE_DIRECTION)) + if (!m_page[PAGE_HEADER + PAGE_DIRECTION_B]) return true; - ulint heap_no; + ulint heap_no, n_heap= page_header_get_field(m_page, PAGE_N_HEAP); + ut_ad((n_heap & 0x7fff) >= PAGE_HEAP_NO_USER_LOW); if (n_heap & 0x8000) { n_heap&= 0x7fff; @@ -346,9 +347,13 @@ PageBulk::finish() page_dir_slot_set_rec(slot, page_get_supremum_rec(m_page)); page_dir_slot_set_n_owned(slot, NULL, count + 1); + ut_ad(!page_get_instant(m_page)); + if (!m_rec_no) { - page_header_set_field(m_page, NULL, PAGE_DIRECTION, - PAGE_NO_DIRECTION); + /* Restore PAGE_DIRECTION_B from 0 to + PAGE_NO_DIRECTION like it should be on an empty page, + again without writing redo log. */ + m_page[PAGE_HEADER + PAGE_DIRECTION_B] = PAGE_NO_DIRECTION; } else if (!m_flush_observer && !m_page_zip) { mlog_write_ulint(PAGE_HEADER + PAGE_N_DIR_SLOTS + m_page, 2 + slot_index, MLOG_2BYTES, &m_mtr); @@ -364,7 +369,7 @@ PageBulk::finish() mlog_write_ulint(PAGE_HEADER + PAGE_LAST_INSERT + m_page, ulint(m_cur_rec - m_page), MLOG_2BYTES, &m_mtr); - mlog_write_ulint(PAGE_HEADER + PAGE_DIRECTION + m_page, + mlog_write_ulint(PAGE_HEADER + PAGE_DIRECTION_B - 1 + m_page, PAGE_RIGHT, MLOG_2BYTES, &m_mtr); mlog_write_ulint(PAGE_HEADER + PAGE_N_DIRECTION + m_page, 0, MLOG_2BYTES, &m_mtr); @@ -381,7 +386,7 @@ PageBulk::finish() mach_write_to_2(PAGE_HEADER + PAGE_N_RECS + m_page, m_rec_no); mach_write_to_2(PAGE_HEADER + PAGE_LAST_INSERT + m_page, ulint(m_cur_rec - m_page)); - mach_write_to_2(PAGE_HEADER + PAGE_DIRECTION + m_page, + mach_write_to_2(PAGE_HEADER + PAGE_DIRECTION_B - 1 + m_page, PAGE_RIGHT); mach_write_to_2(PAGE_HEADER + PAGE_N_DIRECTION + m_page, 0); } @@ -443,6 +448,7 @@ PageBulk::getSplitRec() ut_ad(m_page_zip != NULL); ut_ad(m_rec_no >= 2); + ut_ad(!m_index->is_instant()); ut_ad(page_get_free_space_of_empty(m_is_comp) > m_free_space); total_used_size = page_get_free_space_of_empty(m_is_comp) @@ -452,13 +458,13 @@ PageBulk::getSplitRec() n_recs = 0; offsets = NULL; rec = page_get_infimum_rec(m_page); + const ulint n_core = page_is_leaf(m_page) ? m_index->n_core_fields : 0; do { rec = page_rec_get_next(rec); ut_ad(page_rec_is_user_rec(rec)); - offsets = rec_get_offsets(rec, m_index, offsets, - page_is_leaf(m_page), + offsets = rec_get_offsets(rec, m_index, offsets, n_core, ULINT_UNDEFINED, &m_heap); total_recs_size += rec_offs_size(offsets); n_recs++; @@ -487,9 +493,11 @@ PageBulk::copyIn( ut_ad(m_rec_no == 0); ut_ad(page_rec_is_user_rec(rec)); + const ulint n_core = page_rec_is_leaf(rec) + ? 
m_index->n_core_fields : 0; + do { - offsets = rec_get_offsets(rec, m_index, offsets, - page_rec_is_leaf(split_rec), + offsets = rec_get_offsets(rec, m_index, offsets, n_core, ULINT_UNDEFINED, &m_heap); insert(rec, offsets); @@ -530,8 +538,10 @@ PageBulk::copyOut( /* Set last record's next in page */ rec_offs* offsets = NULL; rec = page_rec_get_prev(split_rec); - offsets = rec_get_offsets(rec, m_index, offsets, - page_rec_is_leaf(split_rec), + const ulint n_core = page_rec_is_leaf(split_rec) + ? m_index->n_core_fields : 0; + + offsets = rec_get_offsets(rec, m_index, offsets, n_core, ULINT_UNDEFINED, &m_heap); page_rec_set_next(rec, page_get_supremum_rec(m_page)); @@ -539,19 +549,17 @@ PageBulk::copyOut( m_cur_rec = rec; m_heap_top = rec_get_end(rec, offsets); - offsets = rec_get_offsets(last_rec, m_index, offsets, - page_rec_is_leaf(split_rec), + offsets = rec_get_offsets(last_rec, m_index, offsets, n_core, ULINT_UNDEFINED, &m_heap); - m_free_space += rec_get_end(last_rec, offsets) - - m_heap_top + m_free_space += ulint(rec_get_end(last_rec, offsets) - m_heap_top) + page_dir_calc_reserved_space(m_rec_no) - page_dir_calc_reserved_space(n); - ut_ad(m_free_space > 0); + ut_ad(lint(m_free_space) > 0); m_rec_no = n; #ifdef UNIV_DEBUG - m_total_data -= rec_get_end(last_rec, offsets) - m_heap_top; + m_total_data -= ulint(rec_get_end(last_rec, offsets) - m_heap_top); #endif /* UNIV_DEBUG */ } @@ -692,7 +700,7 @@ PageBulk::latch() m_mtr.set_log_mode(MTR_LOG_NO_REDO); m_mtr.set_flush_observer(m_flush_observer); } else { - m_mtr.set_named_space(m_index->space); + m_index->set_modified(m_mtr); } ut_ad(m_block->page.buf_fix_count); @@ -700,7 +708,7 @@ PageBulk::latch() /* In case the block is S-latched by page_cleaner. */ if (!buf_page_optimistic_get(RW_X_LATCH, m_block, m_modify_clock, __FILE__, __LINE__, &m_mtr)) { - m_block = buf_page_get_gen(page_id_t(m_index->space, + m_block = buf_page_get_gen(page_id_t(m_index->table->space_id, m_page_no), univ_page_size, RW_X_LATCH, m_block, BUF_GET_IF_IN_POOL, @@ -822,7 +830,7 @@ BtrBulk::pageCommit( /** Log free check */ inline void BtrBulk::logFreeCheck() { - if (log_sys->check_flush_or_checkpoint) { + if (log_sys.check_flush_or_checkpoint) { release(); log_free_check(); @@ -972,7 +980,8 @@ BtrBulk::insert( /* Convert tuple to rec. */ rec = rec_convert_dtuple_to_rec(static_cast<byte*>(mem_heap_alloc( page_bulk->m_heap, rec_size)), m_index, tuple, n_ext); - offsets = rec_get_offsets(rec, m_index, offsets, !level, + offsets = rec_get_offsets(rec, m_index, offsets, level + ? 0 : m_index->n_core_fields, ULINT_UNDEFINED, &page_bulk->m_heap); page_bulk->insert(rec, offsets); @@ -1014,7 +1023,7 @@ BtrBulk::finish(dberr_t err) { ulint last_page_no = FIL_NULL; - ut_ad(!dict_table_is_temporary(m_index->table)); + ut_ad(!m_index->table->is_temporary()); if (m_page_bulks.size() == 0) { /* The table is empty. 
The root page of the index tree @@ -1046,30 +1055,27 @@ BtrBulk::finish(dberr_t err) rec_t* first_rec; mtr_t mtr; buf_block_t* last_block; - page_t* last_page; - page_id_t page_id(dict_index_get_space(m_index), - last_page_no); - page_size_t page_size(dict_table_page_size(m_index->table)); - ulint root_page_no = dict_index_get_page(m_index); PageBulk root_page_bulk(m_index, m_trx->id, - root_page_no, m_root_level, + m_index->page, m_root_level, m_flush_observer); - mtr_start(&mtr); - mtr.set_named_space(dict_index_get_space(m_index)); - mtr_x_lock(dict_index_get_lock(m_index), &mtr); + mtr.start(); + m_index->set_modified(mtr); + mtr_x_lock_index(m_index, &mtr); ut_ad(last_page_no != FIL_NULL); - last_block = btr_block_get(page_id, page_size, - RW_X_LATCH, m_index, &mtr); - last_page = buf_block_get_frame(last_block); - first_rec = page_rec_get_next(page_get_infimum_rec(last_page)); + last_block = btr_block_get( + page_id_t(m_index->table->space_id, last_page_no), + page_size_t(m_index->table->space->flags), + RW_X_LATCH, m_index, &mtr); + first_rec = page_rec_get_next( + page_get_infimum_rec(last_block->frame)); ut_ad(page_rec_is_user_rec(first_rec)); /* Copy last page to root page. */ err = root_page_bulk.init(); if (err != DB_SUCCESS) { - mtr_commit(&mtr); + mtr.commit(); return(err); } root_page_bulk.copyIn(first_rec); @@ -1081,7 +1087,7 @@ BtrBulk::finish(dberr_t err) /* Do not flush the last page. */ last_block->page.flush_observer = NULL; - mtr_commit(&mtr); + mtr.commit(); err = pageCommit(&root_page_bulk, NULL, false); ut_ad(err == DB_SUCCESS); diff --git a/storage/innobase/btr/btr0cur.cc b/storage/innobase/btr/btr0cur.cc index 3bbddb79a0c..66e680ae494 100644 --- a/storage/innobase/btr/btr0cur.cc +++ b/storage/innobase/btr/btr0cur.cc @@ -80,18 +80,14 @@ enum btr_op_t { BTR_DELMARK_OP /*!< Mark a record for deletion */ }; -/** Modification types for the B-tree operation. */ +/** Modification types for the B-tree operation. + Note that the order must be DELETE, BOTH, INSERT !! + */ enum btr_intention_t { BTR_INTENTION_DELETE, BTR_INTENTION_BOTH, BTR_INTENTION_INSERT }; -#if BTR_INTENTION_DELETE > BTR_INTENTION_BOTH -#error "BTR_INTENTION_DELETE > BTR_INTENTION_BOTH" -#endif -#if BTR_INTENTION_BOTH > BTR_INTENTION_INSERT -#error "BTR_INTENTION_BOTH > BTR_INTENTION_INSERT" -#endif /** For the index->lock scalability improvement, only possibility of clear performance regression observed was caused by grown huge history list length. @@ -127,7 +123,7 @@ uint btr_cur_limit_optimistic_insert_debug; /** In the optimistic insert, if the insert does not fit, but this much space can be released by page reorganize, then it is reorganized */ -#define BTR_CUR_PAGE_REORGANIZE_LIMIT (UNIV_PAGE_SIZE / 32) +#define BTR_CUR_PAGE_REORGANIZE_LIMIT (srv_page_size / 32) /** The structure of a BLOB part header */ /* @{ */ @@ -214,16 +210,6 @@ btr_rec_free_externally_stored_fields( /*==================== B-TREE SEARCH =========================*/ -#if MTR_MEMO_PAGE_S_FIX != RW_S_LATCH -#error "MTR_MEMO_PAGE_S_FIX != RW_S_LATCH" -#endif -#if MTR_MEMO_PAGE_X_FIX != RW_X_LATCH -#error "MTR_MEMO_PAGE_X_FIX != RW_X_LATCH" -#endif -#if MTR_MEMO_PAGE_SX_FIX != RW_SX_LATCH -#error "MTR_MEMO_PAGE_SX_FIX != RW_SX_LATCH" -#endif - /** Latches the leaf page or pages requested. 
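The deleted preprocessor checks above (#if/#error on the BTR_INTENTION_* ordering and, in the next hunk, on the MTR_MEMO_PAGE_*_FIX values) are replaced by a comment plus compile_time_assert() calls inside btr_cur_latch_leaves(). A self-contained illustration of the same idiom, written with standard static_assert and demo enumerators instead of the server's own macro and constants:

enum btr_intention_demo {		/* demo stand-in for btr_intention_t */
	DEMO_INTENTION_DELETE,
	DEMO_INTENTION_BOTH,
	DEMO_INTENTION_INSERT
};

/* The order DELETE < BOTH < INSERT is relied upon by comparisons such as
   lock_intention <= BTR_INTENTION_BOTH, so it is pinned down at compile time. */
static_assert(DEMO_INTENTION_DELETE < DEMO_INTENTION_BOTH,
	      "order must be DELETE, BOTH, INSERT");
static_assert(DEMO_INTENTION_BOTH < DEMO_INTENTION_INSERT,
	      "order must be DELETE, BOTH, INSERT");

int main() { return 0; }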
@param[in] block leaf page where the search converged @param[in] page_id page id of the leaf @@ -248,6 +234,10 @@ btr_cur_latch_leaves( bool spatial; btr_latch_leaves_t latch_leaves = {{NULL, NULL, NULL}, {0, 0, 0}}; + compile_time_assert(int(MTR_MEMO_PAGE_S_FIX) == int(RW_S_LATCH)); + compile_time_assert(int(MTR_MEMO_PAGE_X_FIX) == int(RW_X_LATCH)); + compile_time_assert(int(MTR_MEMO_PAGE_SX_FIX) == int(RW_SX_LATCH)); + spatial = dict_index_is_spatial(cursor->index) && cursor->rtr_info; ut_ad(buf_page_in_file(&block->page)); @@ -397,6 +387,240 @@ btr_cur_latch_leaves( return(latch_leaves); } +/** Load the instant ALTER TABLE metadata from the clustered index +when loading a table definition. +@param[in,out] index clustered index definition +@param[in,out] mtr mini-transaction +@return error code +@retval DB_SUCCESS if no error occurred +@retval DB_CORRUPTION if any corruption was noticed */ +static dberr_t btr_cur_instant_init_low(dict_index_t* index, mtr_t* mtr) +{ + ut_ad(index->is_primary()); + ut_ad(index->n_core_null_bytes == dict_index_t::NO_CORE_NULL_BYTES); + ut_ad(index->table->supports_instant()); + ut_ad(index->table->is_readable()); + + const fil_space_t* space = index->table->space; + if (!space) { +unreadable: + ib::error() << "Table " << index->table->name + << " has an unreadable root page"; + index->table->corrupted = true; + return DB_CORRUPTION; + } + + page_t* root = btr_root_get(index, mtr); + + if (!root || btr_cur_instant_root_init(index, root)) { + goto unreadable; + } + + ut_ad(index->n_core_null_bytes != dict_index_t::NO_CORE_NULL_BYTES); + + if (fil_page_get_type(root) == FIL_PAGE_INDEX) { + ut_ad(!index->is_instant()); + return DB_SUCCESS; + } + + btr_cur_t cur; + dberr_t err = btr_cur_open_at_index_side(true, index, BTR_SEARCH_LEAF, + &cur, 0, mtr); + if (err != DB_SUCCESS) { + index->table->corrupted = true; + return err; + } + + ut_ad(page_cur_is_before_first(&cur.page_cur)); + ut_ad(page_is_leaf(cur.page_cur.block->frame)); + + page_cur_move_to_next(&cur.page_cur); + + const rec_t* rec = cur.page_cur.rec; + const ulint comp = dict_table_is_comp(index->table); + const ulint info_bits = rec_get_info_bits(rec, comp); + + if (page_rec_is_supremum(rec) + || !(info_bits & REC_INFO_MIN_REC_FLAG)) { + if (!index->is_instant()) { + /* The FIL_PAGE_TYPE_INSTANT and PAGE_INSTANT may be + assigned even if instant ADD COLUMN was not + committed. Changes to these page header fields are not + undo-logged, but changes to the hidden metadata record + are. If the server is killed and restarted, the page + header fields could remain set even though no metadata + record is present. */ + return DB_SUCCESS; + } + + ib::error() << "Table " << index->table->name + << " is missing instant ALTER metadata"; + index->table->corrupted = true; + return DB_CORRUPTION; + } + + if (info_bits != REC_INFO_MIN_REC_FLAG + || (comp && rec_get_status(rec) != REC_STATUS_COLUMNS_ADDED)) { +incompatible: + ib::error() << "Table " << index->table->name + << " contains unrecognizable instant ALTER metadata"; + index->table->corrupted = true; + return DB_CORRUPTION; + } + + /* Read the metadata. We can get here on server restart + or when the table was evicted from the data dictionary cache + and is now being accessed again. + + Here, READ COMMITTED and REPEATABLE READ should be equivalent. + Committing the ADD COLUMN operation would acquire + MDL_EXCLUSIVE and LOCK_X|LOCK_TABLE, which would prevent any + concurrent operations on the table, including table eviction + from the cache. 
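Right after this comment, the function walks the fields of the metadata record beyond n_core_fields and turns each one into the default value of an instantly added column (NULL, zero-length, inline, or externally stored). A simplified standalone model of that decoding, with a hypothetical payload instead of a real record:

#include <cstdio>
#include <optional>
#include <string>
#include <vector>

int main() {
	/* Hypothetical metadata-record fields beyond n_core_fields:
	   a NULL default, an empty default, and a literal default. */
	std::vector<std::optional<std::string>> metadata_fields = {
		std::nullopt, "", "42"
	};

	for (std::size_t i = 0; i < metadata_fields.size(); i++) {
		const std::optional<std::string>& f = metadata_fields[i];
		if (!f)
			std::printf("instant col %zu: default NULL\n", i);
		else if (f->empty())
			std::printf("instant col %zu: zero-length default\n", i);
		else
			std::printf("instant col %zu: default '%s'\n", i,
				    f->c_str());
	}
	return 0;
}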
*/ + + mem_heap_t* heap = NULL; + rec_offs* offsets = rec_get_offsets(rec, index, NULL, + index->n_core_fields, + ULINT_UNDEFINED, &heap); + if (rec_offs_any_default(offsets)) { +inconsistent: + mem_heap_free(heap); + goto incompatible; + } + + /* In fact, because we only ever append fields to the metadata + record, it is also OK to perform READ UNCOMMITTED and + then ignore any extra fields, provided that + trx_sys.is_registered(DB_TRX_ID). */ + if (rec_offs_n_fields(offsets) > index->n_fields + && !trx_sys.is_registered(current_trx(), + row_get_rec_trx_id(rec, index, + offsets))) { + goto inconsistent; + } + + for (unsigned i = index->n_core_fields; i < index->n_fields; i++) { + ulint len; + const byte* data = rec_get_nth_field(rec, offsets, i, &len); + dict_col_t* col = index->fields[i].col; + ut_ad(!col->is_instant()); + ut_ad(!col->def_val.data); + col->def_val.len = len; + switch (len) { + case UNIV_SQL_NULL: + continue; + case 0: + col->def_val.data = field_ref_zero; + continue; + } + ut_ad(len != UNIV_SQL_DEFAULT); + if (!rec_offs_nth_extern(offsets, i)) { + col->def_val.data = mem_heap_dup( + index->table->heap, data, len); + } else if (len < BTR_EXTERN_FIELD_REF_SIZE + || !memcmp(data + len - BTR_EXTERN_FIELD_REF_SIZE, + field_ref_zero, + BTR_EXTERN_FIELD_REF_SIZE)) { + col->def_val.len = UNIV_SQL_DEFAULT; + goto inconsistent; + } else { + col->def_val.data = btr_copy_externally_stored_field( + &col->def_val.len, data, + dict_table_page_size(index->table), + len, index->table->heap); + } + } + + mem_heap_free(heap); + return DB_SUCCESS; +} + +/** Load the instant ALTER TABLE metadata from the clustered index +when loading a table definition. +@param[in,out] table table definition from the data dictionary +@return error code +@retval DB_SUCCESS if no error occurred */ +dberr_t +btr_cur_instant_init(dict_table_t* table) +{ + mtr_t mtr; + dict_index_t* index = dict_table_get_first_index(table); + mtr.start(); + dberr_t err = index + ? btr_cur_instant_init_low(index, &mtr) + : DB_CORRUPTION; + mtr.commit(); + return(err); +} + +/** Initialize the n_core_null_bytes on first access to a clustered +index root page. +@param[in] index clustered index that is on its first access +@param[in] page clustered index root page +@return whether the page is corrupted */ +bool btr_cur_instant_root_init(dict_index_t* index, const page_t* page) +{ + ut_ad(!index->is_dummy); + ut_ad(fil_page_index_page_check(page)); + ut_ad(!page_has_siblings(page)); + ut_ad(page_get_space_id(page) == index->table->space_id); + ut_ad(page_get_page_no(page) == index->page); + ut_ad(!page_is_comp(page) == !dict_table_is_comp(index->table)); + ut_ad(index->is_primary()); + ut_ad(!index->is_instant()); + ut_ad(index->table->supports_instant()); + /* This is normally executed as part of btr_cur_instant_init() + when dict_load_table_one() is loading a table definition. + Other threads should not access or modify the n_core_null_bytes, + n_core_fields before dict_load_table_one() returns. + + This can also be executed during IMPORT TABLESPACE, where the + table definition is exclusively locked. */ + + switch (fil_page_get_type(page)) { + default: + ut_ad(!"wrong page type"); + return true; + case FIL_PAGE_INDEX: + /* The field PAGE_INSTANT is guaranteed 0 on clustered + index root pages of ROW_FORMAT=COMPACT or + ROW_FORMAT=DYNAMIC when instant ADD COLUMN is not used. 
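btr_cur_instant_root_init() just below accepts the PAGE_INSTANT field only if it covers at least the key plus DB_TRX_ID,DB_ROLL_PTR and no more fields than the dictionary knows about, and then derives n_core_null_bytes from the number of nullable core fields. A small standalone sketch of those two calculations (the literal 2 stands for the DB_TRX_ID and DB_ROLL_PTR columns; the values are illustrative):

#include <cstdio>

/* UT_BITS_IN_BYTES(): how many bytes the NULL flags of n_bits columns need. */
static unsigned bits_in_bytes(unsigned n_bits) { return (n_bits + 7) / 8; }

static bool page_instant_valid(unsigned n, unsigned n_uniq, unsigned n_fields)
{
	return n >= n_uniq + 2 && n <= n_fields;
}

int main() {
	const unsigned n_core = 5;	/* PAGE_INSTANT read from the root page */
	const unsigned n_uniq = 1, n_fields = 7, n_nullable_in_core = 3;

	if (page_instant_valid(n_core, n_uniq, n_fields))
		std::printf("n_core_fields=%u n_core_null_bytes=%u\n",
			    n_core, bits_in_bytes(n_nullable_in_core));
	else
		std::printf("corrupted PAGE_INSTANT value %u\n", n_core);
	return 0;
}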
*/ + ut_ad(!page_is_comp(page) || !page_get_instant(page)); + index->n_core_null_bytes = UT_BITS_IN_BYTES( + unsigned(index->n_nullable)); + return false; + case FIL_PAGE_TYPE_INSTANT: + break; + } + + const uint16_t n = page_get_instant(page); + + if (n < index->n_uniq + DATA_ROLL_PTR || n > index->n_fields) { + /* The PRIMARY KEY (or hidden DB_ROW_ID) and + DB_TRX_ID,DB_ROLL_PTR columns must always be present + as 'core' fields. All fields, including those for + instantly added columns, must be present in the data + dictionary. */ + return true; + } + + if (memcmp(page_get_infimum_rec(page), "infimum", 8) + || memcmp(page_get_supremum_rec(page), "supremum", 8)) { + /* In a later format, these fields in a FIL_PAGE_TYPE_INSTANT + root page could be repurposed for something else. */ + return true; + } + + index->n_core_fields = n; + ut_ad(!index->is_dummy); + ut_d(index->is_dummy = true); + index->n_core_null_bytes = n == index->n_fields + ? UT_BITS_IN_BYTES(unsigned(index->n_nullable)) + : UT_BITS_IN_BYTES(index->get_n_nullable(n)); + ut_d(index->is_dummy = false); + return false; +} + /** Optimistically latches the leaf page or pages requested. @param[in] block guessed buffer block @param[in] modify_clock modify clock value @@ -444,14 +668,12 @@ btr_cur_optimistic_latch_leaves( rw_lock_s_unlock(&block->lock); if (left_page_no != FIL_NULL) { - const page_id_t page_id( - dict_index_get_space(cursor->index), - left_page_no); dberr_t err = DB_SUCCESS; - cursor->left_block = buf_page_get_gen( - page_id, - dict_table_page_size(cursor->index->table), + page_id_t(cursor->index->table->space_id, + left_page_no), + page_size_t(cursor->index->table->space + ->flags), mode, NULL, BUF_GET_POSSIBLY_FREED, __FILE__, __LINE__, mtr, &err); @@ -459,8 +681,7 @@ btr_cur_optimistic_latch_leaves( cursor->index->table->file_unreadable = true; } - if (btr_page_get_next(buf_block_get_frame( - cursor->left_block)) + if (btr_page_get_next(cursor->left_block->frame) != curr_page_no) { /* release the left block */ btr_leaf_page_release( @@ -528,7 +749,7 @@ btr_cur_get_and_clear_intention( /* both or unknown */ intention = BTR_INTENTION_BOTH; } - *latch_mode &= ~(BTR_LATCH_FOR_INSERT | BTR_LATCH_FOR_DELETE); + *latch_mode &= ulint(~(BTR_LATCH_FOR_INSERT | BTR_LATCH_FOR_DELETE)); return(intention); } @@ -611,7 +832,7 @@ btr_cur_will_modify_tree( ulint margin = rec_size; if (lock_intention == BTR_INTENTION_BOTH) { - ulint level = btr_page_get_level(page, mtr); + ulint level = btr_page_get_level(page); /* This value is the worst expectation for the node_ptr records to be deleted from this page. It is used to @@ -896,18 +1117,15 @@ btr_cur_search_to_nth_level_func( cursor->left_block is used to store a pointer to the left neighbor page, in the cases BTR_SEARCH_PREV and BTR_MODIFY_PREV; - NOTE that if has_search_latch - is != 0, we maybe do not have a latch set - on the cursor page, we assume - the caller uses his search latch - to protect the record! */ + NOTE that if ahi_latch, we might not have a + cursor page latch, we assume that ahi_latch + protects the record! */ btr_cur_t* cursor, /*!< in/out: tree cursor; the cursor page is s- or x-latched, but see also above! 
*/ #ifdef BTR_CUR_HASH_ADAPT - ulint has_search_latch, - /*!< in: info on the latch mode the - caller currently has on search system: - RW_S_LATCH, or 0 */ + rw_lock_t* ahi_latch, + /*!< in: currently held btr_search_latch + (in RW_S_LATCH mode), or NULL */ #endif /* BTR_CUR_HASH_ADAPT */ const char* file, /*!< in: file name */ unsigned line, /*!< in: line where called */ @@ -929,7 +1147,7 @@ btr_cur_search_to_nth_level_func( page_cur_mode_t search_mode = PAGE_CUR_UNSUPP; ulint buf_mode; ulint estimate; - ulint node_ptr_max_size = UNIV_PAGE_SIZE / 2; + ulint node_ptr_max_size = srv_page_size / 2; page_cur_t* page_cursor; btr_op_t btr_op; ulint root_height = 0; /* remove warning */ @@ -978,12 +1196,10 @@ btr_cur_search_to_nth_level_func( ut_ad(!(index->type & DICT_FTS)); ut_ad(index->page != FIL_NULL); -#ifdef HAVE_valgrind_or_MSAN MEM_UNDEFINED(&cursor->up_match, sizeof cursor->up_match); MEM_UNDEFINED(&cursor->up_bytes, sizeof cursor->up_bytes); MEM_UNDEFINED(&cursor->low_match, sizeof cursor->low_match); MEM_UNDEFINED(&cursor->low_bytes, sizeof cursor->low_bytes); -#endif /* HAVE_valgrind_or_MSAN */ #ifdef UNIV_DEBUG cursor->up_match = ULINT_UNDEFINED; cursor->low_match = ULINT_UNDEFINED; @@ -1032,7 +1248,7 @@ btr_cur_search_to_nth_level_func( /* Operations on the clustered index cannot be buffered. */ ut_ad(btr_op == BTR_NO_OP || !dict_index_is_clust(index)); /* Operations on the temporary table(indexes) cannot be buffered. */ - ut_ad(btr_op == BTR_NO_OP || !dict_table_is_temporary(index->table)); + ut_ad(btr_op == BTR_NO_OP || !index->table->is_temporary()); /* Operation on the spatial index cannot be buffered. */ ut_ad(btr_op == BTR_NO_OP || !dict_index_is_spatial(index)); @@ -1068,7 +1284,6 @@ btr_cur_search_to_nth_level_func( guess = info->root_guess; #ifdef BTR_CUR_HASH_ADAPT - rw_lock_t* const search_latch = btr_get_search_latch(index); # ifdef UNIV_SEARCH_PERF_STAT info->n_searches++; @@ -1084,15 +1299,15 @@ btr_cur_search_to_nth_level_func( && mode != PAGE_CUR_LE_OR_EXTENDS # endif /* PAGE_CUR_LE_OR_EXTENDS */ && !dict_index_is_spatial(index) - /* If !has_search_latch, we do a dirty read of + /* If !ahi_latch, we do a dirty read of btr_search_enabled below, and btr_search_guess_on_hash() will have to check it again. */ && btr_search_enabled && !modify_external - && rw_lock_get_writer(search_latch) == RW_LOCK_NOT_LOCKED + && !(tuple->info_bits & REC_INFO_MIN_REC_FLAG) && btr_search_guess_on_hash(index, info, tuple, mode, latch_mode, cursor, - has_search_latch, mtr)) { + ahi_latch, mtr)) { /* Search using the hash index succeeded */ @@ -1108,15 +1323,15 @@ btr_cur_search_to_nth_level_func( } # endif /* BTR_CUR_HASH_ADAPT */ #endif /* BTR_CUR_ADAPT */ - btr_cur_n_non_sea++; + my_atomic_addlint(&btr_cur_n_non_sea, 1); /* If the hash search did not succeed, do binary search down the tree */ #ifdef BTR_CUR_HASH_ADAPT - if (has_search_latch) { + if (ahi_latch) { /* Release possible search latch to obey latching order */ - rw_lock_s_unlock(search_latch); + rw_lock_s_unlock(ahi_latch); } #endif /* BTR_CUR_HASH_ADAPT */ @@ -1131,18 +1346,18 @@ btr_cur_search_to_nth_level_func( Free blocks and read IO bandwidth should be prior for them, when the history list is glowing huge. 
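The latch selection that follows gives purge priority when the history list has grown very large: a delete intention combined with a long history list and pending read I/O takes an X latch on the whole index, an R-tree that may need to lock upward does the same, and everything else settles for SX. A standalone sketch of that decision (the threshold is illustrative, standing in for BTR_CUR_FINE_HISTORY_LENGTH):

#include <cstdio>

enum class index_latch { SX, X };

static index_latch pick_index_latch(bool delete_intention,
				    unsigned long history_len,
				    unsigned long pending_read_ios,
				    bool spatial_may_delete)
{
	const unsigned long fine_history_length = 100000;	/* illustrative */

	if (delete_intention && history_len > fine_history_length
	    && pending_read_ios)
		return index_latch::X;	/* let purge catch up first */
	if (spatial_may_delete)
		return index_latch::X;	/* R-tree may need to lock upward */
	return index_latch::SX;
}

int main() {
	std::printf("%s\n",
		    pick_index_latch(true, 200000, 4, false) == index_latch::X
		    ? "X" : "SX");
	return 0;
}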
*/ if (lock_intention == BTR_INTENTION_DELETE - && trx_sys->rseg_history_len > BTR_CUR_FINE_HISTORY_LENGTH + && trx_sys.history_size() > BTR_CUR_FINE_HISTORY_LENGTH && buf_get_n_pending_read_ios()) { - mtr_x_lock(dict_index_get_lock(index), mtr); - } else if (dict_index_is_spatial(index) +x_latch_index: + mtr_x_lock_index(index, mtr); + } else if (index->is_spatial() && lock_intention <= BTR_INTENTION_BOTH) { /* X lock the if there is possibility of pessimistic delete on spatial index. As we could lock upward for the tree */ - - mtr_x_lock(dict_index_get_lock(index), mtr); + goto x_latch_index; } else { - mtr_sx_lock(dict_index_get_lock(index), mtr); + mtr_sx_lock_index(index, mtr); } upper_rw_latch = RW_X_LATCH; break; @@ -1174,10 +1389,10 @@ btr_cur_search_to_nth_level_func( BTR_ALREADY_S_LATCHED */ ut_ad(latch_mode != BTR_SEARCH_TREE); - mtr_s_lock(dict_index_get_lock(index), mtr); + mtr_s_lock_index(index, mtr); } else { /* BTR_MODIFY_EXTERNAL needs to be excluded */ - mtr_sx_lock(dict_index_get_lock(index), mtr); + mtr_sx_lock_index(index, mtr); } upper_rw_latch = RW_S_LATCH; } else { @@ -1188,11 +1403,10 @@ btr_cur_search_to_nth_level_func( page_cursor = btr_cur_get_page_cur(cursor); - const ulint space = dict_index_get_space(index); - const page_size_t page_size(dict_table_page_size(index->table)); + const page_size_t page_size(index->table->space->flags); /* Start with the root page. */ - page_id_t page_id(space, dict_index_get_page(index)); + page_id_t page_id(index->table->space_id, index->page); if (root_leaf_rw_latch == RW_X_LATCH) { node_ptr_max_size = btr_node_ptr_max_size(index); @@ -1474,10 +1688,10 @@ retry_page_get: ut_ad(fil_page_index_page_check(page)); ut_ad(index->id == btr_page_get_index_id(page)); - if (UNIV_UNLIKELY(height == ULINT_UNDEFINED)) { + if (height == ULINT_UNDEFINED) { /* We are in the root node */ - height = btr_page_get_level(page, mtr); + height = btr_page_get_level(page); root_height = height; cursor->tree_height = root_height + 1; @@ -1642,6 +1856,7 @@ retry_page_get: } #ifdef BTR_CUR_HASH_ADAPT } else if (height == 0 && btr_search_enabled + && !(tuple->info_bits & REC_INFO_MIN_REC_FLAG) && !dict_index_is_spatial(index)) { /* The adaptive hash index is only used when searching for leaf pages (height==0), but not in r-trees. 
@@ -1666,8 +1881,7 @@ retry_page_get: /* If this is the desired level, leave the loop */ - ut_ad(height == btr_page_get_level(page_cur_get_page(page_cursor), - mtr)); + ut_ad(height == btr_page_get_level(page_cur_get_page(page_cursor))); /* Add Predicate lock if it is serializable isolation and only if it is in the search case */ @@ -1690,7 +1904,7 @@ retry_page_get: } lock_prdt_lock(block, &prdt, index, LOCK_S, - LOCK_PREDICATE, cursor->thr, mtr); + LOCK_PREDICATE, cursor->thr); if (rw_latch == RW_NO_LATCH && height != 0) { rw_lock_s_unlock(&(block->lock)); @@ -1707,7 +1921,7 @@ retry_page_get: node_ptr = page_cur_get_rec(page_cursor); - offsets = rec_get_offsets(node_ptr, index, offsets, false, + offsets = rec_get_offsets(node_ptr, index, offsets, 0, ULINT_UNDEFINED, &heap); /* If the rec is the first or last in the page for @@ -1737,7 +1951,7 @@ need_opposite_intention: lock_intention = BTR_INTENTION_BOTH; - page_id = page_id_t(space, dict_index_get_page(index)); + page_id.set_page_no(index->page); up_match = 0; low_match = 0; height = ULINT_UNDEFINED; @@ -1838,7 +2052,7 @@ need_opposite_intention: offsets2 = rec_get_offsets( first_rec, index, offsets2, - false, ULINT_UNDEFINED, &heap); + 0, ULINT_UNDEFINED, &heap); cmp_rec_rec(node_ptr, first_rec, offsets, offsets2, index, false, &matched_fields); @@ -1856,7 +2070,7 @@ need_opposite_intention: offsets2 = rec_get_offsets( last_rec, index, offsets2, - false, ULINT_UNDEFINED, &heap); + 0, ULINT_UNDEFINED, &heap); cmp_rec_rec( node_ptr, last_rec, offsets, offsets2, index, @@ -1953,8 +2167,7 @@ need_opposite_intention: ulint idx = n_blocks - (leftmost_from_level - 1); - page_id = page_id_t( - space, + page_id.set_page_no( tree_blocks[idx]->page.id.page_no()); for (ulint i = n_blocks @@ -1988,8 +2201,7 @@ need_opposite_intention: } /* Go to the child node */ - page_id = page_id_t( - space, + page_id.set_page_no( btr_node_ptr_get_child_page_no(node_ptr, offsets)); n_blocks++; @@ -2027,7 +2239,7 @@ need_opposite_intention: offsets = rec_get_offsets( my_node_ptr, index, offsets, - false, ULINT_UNDEFINED, &heap); + 0, ULINT_UNDEFINED, &heap); ulint my_page_no = btr_node_ptr_get_child_page_no( @@ -2127,11 +2339,20 @@ need_opposite_intention: will properly check btr_search_enabled again in btr_search_build_page_hash_index() before building a page hash index, while holding search latch. */ - if (btr_search_enabled + if (!btr_search_enabled) { # ifdef MYSQL_INDEX_DISABLE_AHI - && !index->disable_ahi + } else if (index->disable_ahi) { # endif - ) { + } else if (tuple->info_bits & REC_INFO_MIN_REC_FLAG) { + ut_ad(index->is_instant()); + /* This may be a search tuple for + btr_pcur_restore_position(). */ + ut_ad(tuple->info_bits == REC_INFO_METADATA + || tuple->info_bits == REC_INFO_MIN_REC_FLAG); + } else if (rec_is_metadata(btr_cur_get_rec(cursor), index)) { + /* Only user records belong in the adaptive + hash index. 
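The branching above only feeds ordinary user records into the adaptive hash index: a disabled AHI, a search tuple carrying REC_INFO_MIN_REC_FLAG (as used by btr_pcur_restore_position()), and the instant-ALTER metadata record are all skipped. The same filter written as a tiny standalone predicate:

#include <cstdio>

static bool ahi_should_update(bool search_enabled, bool min_rec_tuple,
			      bool rec_is_metadata)
{
	if (!search_enabled)
		return false;
	if (min_rec_tuple)	/* restore-position or metadata search tuple */
		return false;
	if (rec_is_metadata)	/* hidden metadata record, not a user record */
		return false;
	return true;
}

int main() {
	std::printf("user rec: %d, metadata rec: %d\n",
		    ahi_should_update(true, false, false),
		    ahi_should_update(true, false, true));
	return 0;
}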
*/ + } else { btr_search_info_update(index, cursor); } #endif /* BTR_CUR_HASH_ADAPT */ @@ -2169,17 +2390,17 @@ func_exit: ut_free(prev_tree_savepoints); } -#ifdef BTR_CUR_HASH_ADAPT - if (has_search_latch) { - rw_lock_s_lock(search_latch); - } -#endif /* BTR_CUR_HASH_ADAPT */ - if (mbr_adj) { /* remember that we will need to adjust parent MBR */ cursor->rtr_info->mbr_adj = true; } +#ifdef BTR_CUR_HASH_ADAPT + if (ahi_latch) { + rw_lock_s_lock(ahi_latch); + } +#endif /* BTR_CUR_HASH_ADAPT */ + DBUG_RETURN(err); } @@ -2200,7 +2421,7 @@ btr_cur_open_at_index_side_func( mtr_t* mtr) /*!< in/out: mini-transaction */ { page_cur_t* page_cursor; - ulint node_ptr_max_size = UNIV_PAGE_SIZE / 2; + ulint node_ptr_max_size = srv_page_size / 2; ulint height; ulint root_height = 0; /* remove warning */ rec_t* node_ptr; @@ -2220,14 +2441,14 @@ btr_cur_open_at_index_side_func( rec_offs_init(offsets_); estimate = latch_mode & BTR_ESTIMATE; - latch_mode &= ~BTR_ESTIMATE; + latch_mode &= ulint(~BTR_ESTIMATE); ut_ad(level != ULINT_UNDEFINED); bool s_latch_by_caller; s_latch_by_caller = latch_mode & BTR_ALREADY_S_LATCHED; - latch_mode &= ~BTR_ALREADY_S_LATCHED; + latch_mode &= ulint(~BTR_ALREADY_S_LATCHED); lock_intention = btr_cur_get_and_clear_intention(&latch_mode); @@ -2255,11 +2476,11 @@ btr_cur_open_at_index_side_func( Free blocks and read IO bandwidth should be prior for them, when the history list is glowing huge. */ if (lock_intention == BTR_INTENTION_DELETE - && trx_sys->rseg_history_len > BTR_CUR_FINE_HISTORY_LENGTH + && trx_sys.history_size() > BTR_CUR_FINE_HISTORY_LENGTH && buf_get_n_pending_read_ios()) { - mtr_x_lock(dict_index_get_lock(index), mtr); + mtr_x_lock_index(index, mtr); } else { - mtr_sx_lock(dict_index_get_lock(index), mtr); + mtr_sx_lock_index(index, mtr); } upper_rw_latch = RW_X_LATCH; break; @@ -2275,7 +2496,7 @@ btr_cur_open_at_index_side_func( BTR_ALREADY_S_LATCHED */ ut_ad(latch_mode != BTR_SEARCH_TREE); - mtr_s_lock(dict_index_get_lock(index), mtr); + mtr_s_lock_index(index, mtr); } upper_rw_latch = RW_S_LATCH; } else { @@ -2287,9 +2508,8 @@ btr_cur_open_at_index_side_func( page_cursor = btr_cur_get_page_cur(cursor); cursor->index = index; - page_id_t page_id(dict_index_get_space(index), - dict_index_get_page(index)); - const page_size_t& page_size = dict_table_page_size(index->table); + page_id_t page_id(index->table->space_id, index->page); + const page_size_t page_size(index->table->space->flags); if (root_leaf_rw_latch == RW_X_LATCH) { node_ptr_max_size = btr_node_ptr_max_size(index); @@ -2357,12 +2577,12 @@ btr_cur_open_at_index_side_func( if (height == ULINT_UNDEFINED) { /* We are in the root node */ - height = btr_page_get_level(page, mtr); + height = btr_page_get_level(page); root_height = height; ut_a(height >= level); } else { /* TODO: flag the index corrupted if this fails */ - ut_ad(height == btr_page_get_level(page, mtr)); + ut_ad(height == btr_page_get_level(page)); } if (height == level) { @@ -2471,7 +2691,7 @@ btr_cur_open_at_index_side_func( node_ptr = page_cur_get_rec(page_cursor); offsets = rec_get_offsets(node_ptr, cursor->index, offsets, - false, ULINT_UNDEFINED, &heap); + 0, ULINT_UNDEFINED, &heap); /* If the rec is the first or last in the page for pessimistic delete intention, it might cause node_ptr insert @@ -2572,7 +2792,7 @@ btr_cur_open_at_rnd_pos_func( mtr_t* mtr) /*!< in: mtr */ { page_cur_t* page_cursor; - ulint node_ptr_max_size = UNIV_PAGE_SIZE / 2; + ulint node_ptr_max_size = srv_page_size / 2; ulint height; rec_t* node_ptr; ulint savepoint; @@ 
-2587,7 +2807,7 @@ btr_cur_open_at_rnd_pos_func( rec_offs* offsets = offsets_; rec_offs_init(offsets_); - ut_ad(!dict_index_is_spatial(index)); + ut_ad(!index->is_spatial()); lock_intention = btr_cur_get_and_clear_intention(&latch_mode); @@ -2601,11 +2821,11 @@ btr_cur_open_at_rnd_pos_func( Free blocks and read IO bandwidth should be prior for them, when the history list is glowing huge. */ if (lock_intention == BTR_INTENTION_DELETE - && trx_sys->rseg_history_len > BTR_CUR_FINE_HISTORY_LENGTH + && trx_sys.history_size() > BTR_CUR_FINE_HISTORY_LENGTH && buf_get_n_pending_read_ios()) { - mtr_x_lock(dict_index_get_lock(index), mtr); + mtr_x_lock_index(index, mtr); } else { - mtr_sx_lock(dict_index_get_lock(index), mtr); + mtr_sx_lock_index(index, mtr); } upper_rw_latch = RW_X_LATCH; break; @@ -2621,7 +2841,7 @@ btr_cur_open_at_rnd_pos_func( /* fall through */ default: if (!srv_read_only_mode) { - mtr_s_lock(dict_index_get_lock(index), mtr); + mtr_s_lock_index(index, mtr); upper_rw_latch = RW_S_LATCH; } else { upper_rw_latch = RW_NO_LATCH; @@ -2645,9 +2865,8 @@ btr_cur_open_at_rnd_pos_func( page_cursor = btr_cur_get_page_cur(cursor); cursor->index = index; - page_id_t page_id(dict_index_get_space(index), - dict_index_get_page(index)); - const page_size_t& page_size = dict_table_page_size(index->table); + page_id_t page_id(index->table->space_id, index->page); + const page_size_t page_size(index->table->space->flags); dberr_t err = DB_SUCCESS; if (root_leaf_rw_latch == RW_X_LATCH) { @@ -2717,7 +2936,7 @@ btr_cur_open_at_rnd_pos_func( if (height == ULINT_UNDEFINED) { /* We are in the root node */ - height = btr_page_get_level(page, mtr); + height = btr_page_get_level(page); } if (height == 0) { @@ -2767,7 +2986,7 @@ btr_cur_open_at_rnd_pos_func( node_ptr = page_cur_get_rec(page_cursor); offsets = rec_get_offsets(node_ptr, cursor->index, offsets, - false, ULINT_UNDEFINED, &heap); + 0, ULINT_UNDEFINED, &heap); /* If the rec is the first or last in the page for pessimistic delete intention, it might cause node_ptr insert @@ -2884,9 +3103,8 @@ btr_cur_insert_if_possible( ut_ad(dtuple_check_typed(tuple)); - ut_ad(mtr_is_block_fix( - mtr, btr_cur_get_block(cursor), - MTR_MEMO_PAGE_X_FIX, cursor->index->table)); + ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor), + MTR_MEMO_PAGE_X_FIX)); page_cursor = btr_cur_get_page_cur(cursor); /* Now, try the insert */ @@ -2921,7 +3139,7 @@ btr_cur_ins_lock_and_undo( dtuple_t* entry, /*!< in/out: entry to insert */ que_thr_t* thr, /*!< in: query thread or NULL */ mtr_t* mtr, /*!< in/out: mini-transaction */ - ibool* inherit)/*!< out: TRUE if the inserted new record maybe + bool* inherit)/*!< out: true if the inserted new record maybe should inherit LOCK_GAP type locks from the successor record */ { @@ -2939,7 +3157,7 @@ btr_cur_ins_lock_and_undo( ut_ad(!dict_index_is_online_ddl(index) || dict_index_is_clust(index) || (flags & BTR_CREATE_FLAG)); - ut_ad(mtr->is_named_space(index->space)); + ut_ad(mtr->is_named_space(index->table->space)); /* Check if there is predicate or GAP lock preventing the insertion */ if (!(flags & BTR_NO_LOCKING_FLAG)) { @@ -2997,23 +3215,21 @@ btr_cur_ins_lock_and_undo( if (flags & BTR_NO_UNDO_LOG_FLAG) { roll_ptr = roll_ptr_t(1) << ROLL_PTR_INSERT_FLAG_POS; + if (!(flags & BTR_KEEP_SYS_FLAG)) { +upd_sys: + row_upd_index_entry_sys_field(entry, index, + DATA_ROLL_PTR, roll_ptr); + } } else { err = trx_undo_report_row_operation(thr, index, entry, NULL, 0, NULL, NULL, &roll_ptr); - if (err != DB_SUCCESS) { - return(err); + if (err == 
DB_SUCCESS) { + goto upd_sys; } } - /* Now we can fill in the roll ptr field in entry */ - if (!(flags & BTR_KEEP_SYS_FLAG)) { - - row_upd_index_entry_sys_field(entry, index, - DATA_ROLL_PTR, roll_ptr); - } - - return(DB_SUCCESS); + return(err); } /** @@ -3087,9 +3303,9 @@ btr_cur_optimistic_insert( buf_block_t* block; page_t* page; rec_t* dummy; - ibool leaf; - ibool reorg __attribute__((unused)); - ibool inherit = TRUE; + bool leaf; + bool reorg __attribute__((unused)); + bool inherit = true; ulint rec_size; dberr_t err; @@ -3100,7 +3316,7 @@ btr_cur_optimistic_insert( page = buf_block_get_frame(block); index = cursor->index; - ut_ad(mtr_is_block_fix(mtr, block, MTR_MEMO_PAGE_X_FIX, index->table)); + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); ut_ad(!dict_index_is_online_ddl(index) || dict_index_is_clust(index) || (flags & BTR_CREATE_FLAG)); @@ -3108,12 +3324,12 @@ btr_cur_optimistic_insert( const page_size_t& page_size = block->page.size; -#ifdef HAVE_valgrind_or_MSAN +#ifdef HAVE_valgrind if (page_size.is_compressed()) { MEM_CHECK_DEFINED(page, page_size.logical()); MEM_CHECK_DEFINED(block->page.zip.data, page_size.physical()); } -#endif /* HAVE_valgrind_or_MSAN */ +#endif /* HAVE_valgrind */ leaf = page_is_leaf(page); @@ -3237,7 +3453,10 @@ fail_err: ut_ad(trx_id[1].len == DATA_ROLL_PTR_LEN); ut_ad(*static_cast<const byte*> (trx_id[1].data) & 0x80); - if (!(flags & BTR_NO_UNDO_LOG_FLAG)) { + if (flags & BTR_NO_UNDO_LOG_FLAG) { + ut_ad(!memcmp(trx_id->data, reset_trx_id, + DATA_TRX_ID_LEN)); + } else { ut_ad(thr->graph->trx->id); ut_ad(thr->graph->trx->id == trx_read_trx_id( @@ -3257,7 +3476,7 @@ fail_err: if (*rec) { } else if (page_size.is_compressed()) { - ut_ad(!dict_table_is_temporary(index->table)); + ut_ad(!index->table->is_temporary()); /* Reset the IBUF_BITMAP_FREE bits, because page_cur_tuple_insert() will have attempted page reorganize before failing. */ @@ -3296,10 +3515,18 @@ fail_err: # ifdef MYSQL_INDEX_DISABLE_AHI } else if (index->disable_ahi) { # endif - } else if (!reorg && cursor->flag == BTR_CUR_HASH) { - btr_search_update_hash_node_on_insert(cursor); + } else if (entry->info_bits & REC_INFO_MIN_REC_FLAG) { + ut_ad(entry->info_bits == REC_INFO_METADATA); + ut_ad(index->is_instant()); + ut_ad(flags == BTR_NO_LOCKING_FLAG); } else { - btr_search_update_hash_on_insert(cursor); + rw_lock_t* ahi_latch = btr_get_search_latch(index); + if (!reorg && cursor->flag == BTR_CUR_HASH) { + btr_search_update_hash_node_on_insert( + cursor, ahi_latch); + } else { + btr_search_update_hash_on_insert(cursor, ahi_latch); + } } #endif /* BTR_CUR_HASH_ADAPT */ @@ -3310,7 +3537,7 @@ fail_err: if (leaf && !dict_index_is_clust(index) - && !dict_table_is_temporary(index->table)) { + && !index->table->is_temporary()) { /* Update the free bits of the B-tree page in the insert buffer bitmap. 
*/ @@ -3376,7 +3603,7 @@ btr_cur_pessimistic_insert( dict_index_t* index = cursor->index; big_rec_t* big_rec_vec = NULL; dberr_t err; - ibool inherit = FALSE; + bool inherit = false; bool success; ulint n_reserved = 0; @@ -3388,9 +3615,8 @@ btr_cur_pessimistic_insert( ut_ad(mtr_memo_contains_flagged( mtr, dict_index_get_lock(btr_cur_get_index(cursor)), MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK)); - ut_ad(mtr_is_block_fix( - mtr, btr_cur_get_block(cursor), - MTR_MEMO_PAGE_X_FIX, cursor->index->table)); + ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor), + MTR_MEMO_PAGE_X_FIX)); ut_ad(!dict_index_is_online_ddl(index) || dict_index_is_clust(index) || (flags & BTR_CREATE_FLAG)); @@ -3414,7 +3640,8 @@ btr_cur_pessimistic_insert( ulint n_extents = cursor->tree_height / 16 + 3; - success = fsp_reserve_free_extents(&n_reserved, index->space, + success = fsp_reserve_free_extents(&n_reserved, + index->table->space, n_extents, FSP_NORMAL, mtr); if (!success) { return(DB_OUT_OF_FILE_SPACE); @@ -3439,10 +3666,7 @@ btr_cur_pessimistic_insert( if (big_rec_vec == NULL) { - if (n_reserved > 0) { - fil_space_release_free_extents(index->space, - n_reserved); - } + index->table->space->release_free_extents(n_reserved); return(DB_TOO_BIG_RECORD); } } @@ -3466,7 +3690,7 @@ btr_cur_pessimistic_insert( || dict_index_is_spatial(index)); if (!(flags & BTR_NO_LOCKING_FLAG)) { - ut_ad(!dict_table_is_temporary(index->table)); + ut_ad(!index->table->is_temporary()); if (dict_index_is_spatial(index)) { /* Do nothing */ } else { @@ -3484,7 +3708,7 @@ btr_cur_pessimistic_insert( || !page_has_prev(btr_cur_get_page(cursor))) { /* split and inserted need to call lock_update_insert() always. */ - inherit = TRUE; + inherit = true; } } } @@ -3496,7 +3720,15 @@ btr_cur_pessimistic_insert( # ifdef MYSQL_INDEX_DISABLE_AHI if (index->disable_ahi); else # endif - btr_search_update_hash_on_insert(cursor); + if (entry->info_bits & REC_INFO_MIN_REC_FLAG) { + ut_ad(entry->info_bits == REC_INFO_METADATA); + ut_ad(index->is_instant()); + ut_ad(flags & BTR_NO_LOCKING_FLAG); + ut_ad(!(flags & BTR_CREATE_FLAG)); + } else { + btr_search_update_hash_on_insert( + cursor, btr_get_search_latch(index)); + } #endif /* BTR_CUR_HASH_ADAPT */ if (inherit && !(flags & BTR_NO_LOCKING_FLAG)) { @@ -3504,10 +3736,7 @@ btr_cur_pessimistic_insert( } } - if (n_reserved > 0) { - fil_space_release_free_extents(index->space, n_reserved); - } - + index->table->space->release_free_extents(n_reserved); *big_rec = big_rec_vec; return(DB_SUCCESS); @@ -3543,7 +3772,7 @@ btr_cur_upd_lock_and_undo( index = cursor->index; ut_ad(rec_offs_validate(rec, index, offsets)); - ut_ad(mtr->is_named_space(index->space)); + ut_ad(mtr->is_named_space(index->table->space)); if (!dict_index_is_clust(index)) { ut_ad(dict_index_is_online_ddl(index) @@ -3680,7 +3909,7 @@ btr_cur_parse_update_in_place( rec_offset = mach_read_from_2(ptr); ptr += 2; - ut_a(rec_offset <= UNIV_PAGE_SIZE); + ut_a(rec_offset <= srv_page_size); heap = mem_heap_create(256); @@ -3705,7 +3934,8 @@ btr_cur_parse_update_in_place( flags != (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG | BTR_KEEP_SYS_FLAG) - || page_is_leaf(page), + || page_is_leaf(page) + ? 
index->n_core_fields : 0, ULINT_UNDEFINED, &heap); if (!(flags & BTR_KEEP_SYS_FLAG)) { @@ -3777,7 +4007,8 @@ btr_cur_update_alloc_zip_func( goto out_of_space; } - rec_offs_make_valid(page_cur_get_rec(cursor), index, offsets); + rec_offs_make_valid(page_cur_get_rec(cursor), index, + page_is_leaf(page), offsets); /* After recompressing a page, we must make sure that the free bits in the insert buffer bitmap will not exceed the free @@ -3799,7 +4030,7 @@ out_of_space: /* Out of space: reset the free bits. */ if (!dict_index_is_clust(index) - && !dict_table_is_temporary(index->table) + && !index->table->is_temporary() && page_is_leaf(page)) { ibuf_reset_free_bits(page_cur_get_block(cursor)); } @@ -3857,6 +4088,7 @@ btr_cur_update_in_place( | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG)); ut_ad(fil_page_index_page_check(btr_cur_get_page(cursor))); ut_ad(btr_page_get_index_id(btr_cur_get_page(cursor)) == index->id); + ut_ad(!(update->info_bits & REC_INFO_MIN_REC_FLAG)); DBUG_LOG("ib_cur", "update-in-place " << index->name << " (" << index->id @@ -3868,7 +4100,7 @@ btr_cur_update_in_place( /* Check that enough space is available on the compressed page. */ if (page_zip) { - ut_ad(!dict_table_is_temporary(index->table)); + ut_ad(!index->table->is_temporary()); if (!btr_cur_update_alloc_zip( page_zip, btr_cur_get_page_cur(cursor), @@ -3919,7 +4151,8 @@ btr_cur_update_in_place( if (!dict_index_is_clust(index) || row_upd_changes_ord_field_binary( index, update, thr, NULL, NULL)) { - + ut_ad(!(update->info_bits + & REC_INFO_MIN_REC_FLAG)); /* Remove possible hash index pointer to this record */ btr_search_update_hash_on_delete(cursor); @@ -3961,13 +4194,67 @@ func_exit: && !dict_index_is_clust(index) && page_is_leaf(buf_block_get_frame(block))) { /* Update the free bits in the insert buffer. */ - ut_ad(!dict_table_is_temporary(index->table)); + ut_ad(!index->table->is_temporary()); ibuf_update_free_bits_zip(block, mtr); } return(err); } +/** Trim an update tuple due to instant ADD COLUMN, if needed. +For normal records, the trailing instantly added fields that match +the initial default values are omitted. + +For the special metadata record on a table on which instant +ADD COLUMN has already been executed, both ADD COLUMN and the +rollback of ADD COLUMN need to be handled specially. + +@param[in,out] entry index entry +@param[in] index index +@param[in] update update vector +@param[in] thr execution thread */ +static inline +void +btr_cur_trim( + dtuple_t* entry, + const dict_index_t* index, + const upd_t* update, + const que_thr_t* thr) +{ + if (!index->is_instant()) { + } else if (UNIV_UNLIKELY(update->info_bits == REC_INFO_METADATA)) { + /* We are either updating a metadata record + (instantly adding columns to a table where instant ADD was + already executed) or rolling back such an operation. */ + ut_ad(!upd_get_nth_field(update, 0)->orig_len); + ut_ad(upd_get_nth_field(update, 0)->field_no + > index->n_core_fields); + + if (thr->graph->trx->in_rollback) { + /* This rollback can occur either as part of + ha_innobase::commit_inplace_alter_table() rolling + back after a failed innobase_add_instant_try(), + or as part of crash recovery. Either way, the + table will be in the data dictionary cache, with + the instantly added columns going to be removed + later in the rollback. */ + ut_ad(index->table->cached); + /* The DB_TRX_ID,DB_ROLL_PTR are always last, + and there should be some change to roll back. 
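As the btr_cur_trim() comment above says, a normal update entry can drop its trailing instantly added fields whenever they still carry the initial default value; only the metadata record and its rollback need the special n_fields adjustment. A minimal standalone model of that trimming rule, with toy string fields instead of dfield_t:

#include <cstdio>
#include <string>
#include <vector>

static std::size_t trim_instant_defaults(const std::vector<std::string>& entry,
					 const std::vector<std::string>& defaults,
					 std::size_t n_core_fields)
{
	std::size_t n = entry.size();
	/* Drop trailing fields that still equal their instant-ADD default. */
	while (n > n_core_fields && entry[n - 1] == defaults[n - 1])
		n--;
	return n;	/* number of fields that actually need to be stored */
}

int main() {
	/* Hypothetical row: 3 core fields plus 2 instantly added columns,
	   the last of which is still at its default. */
	std::vector<std::string> entry    = {"pk", "trx", "roll", "x", "0"};
	std::vector<std::string> defaults = {"",   "",    "",     "",  "0"};
	std::printf("store %zu of %zu fields\n",
		    trim_instant_defaults(entry, defaults, 3), entry.size());
	return 0;
}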
+ The first field in the update vector is the + first instantly added column logged by + innobase_add_instant_try(). */ + ut_ad(update->n_fields > 2); + ulint n_fields = upd_get_nth_field(update, 0) + ->field_no; + ut_ad(n_fields + 1 >= entry->n_fields); + entry->n_fields = n_fields; + } + } else { + entry->trim(*index); + } +} + /*************************************************************//** Tries to update a record on a page in an index tree. It is assumed that mtr holds an x-latch on the page. The operation does not succeed if there is too @@ -4022,7 +4309,7 @@ btr_cur_optimistic_update( ut_ad(trx_id > 0 || (flags & BTR_KEEP_SYS_FLAG) || index->table->is_temporary()); ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table)); - ut_ad(mtr_is_block_fix(mtr, block, MTR_MEMO_PAGE_X_FIX, index->table)); + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); /* This is intended only for leaf page updates */ ut_ad(page_is_leaf(page)); /* The insert buffer tree should never be updated in place. */ @@ -4036,14 +4323,18 @@ btr_cur_optimistic_update( ut_ad(fil_page_index_page_check(page)); ut_ad(btr_page_get_index_id(page) == index->id); - *offsets = rec_get_offsets(rec, index, *offsets, true, + *offsets = rec_get_offsets(rec, index, *offsets, index->n_core_fields, ULINT_UNDEFINED, heap); #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG ut_a(!rec_offs_any_null_extern(rec, *offsets) || thr_get_trx(thr) == trx_roll_crash_recv_trx); #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ - if (!row_upd_changes_field_size_or_external(index, *offsets, update)) { + const bool is_metadata = update->info_bits == REC_INFO_METADATA; + + if (UNIV_LIKELY(!is_metadata) + && !row_upd_changes_field_size_or_external(index, *offsets, + update)) { /* The simplest and the most common case: the update does not change the size of any field and none of the updated fields is @@ -4095,7 +4386,8 @@ any_extern: corresponding to new_entry is latched in mtr. Thus the following call is safe. */ row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update, - FALSE, *heap); + *heap); + btr_cur_trim(new_entry, index, update, thr); old_rec_size = rec_offs_size(*offsets); new_rec_size = rec_get_converted_size(index, new_entry, 0); @@ -4105,7 +4397,7 @@ any_extern: #endif /* UNIV_ZIP_DEBUG */ if (page_zip) { - ut_ad(!dict_table_is_temporary(index->table)); + ut_ad(!index->table->is_temporary()); if (page_zip_rec_needs_ext(new_rec_size, page_is_comp(page), dict_index_get_n_fields(index), @@ -4199,7 +4491,16 @@ any_extern: lock_rec_store_on_page_infimum(block, rec); } - btr_search_update_hash_on_delete(cursor); + if (UNIV_UNLIKELY(is_metadata)) { + ut_ad(new_entry->info_bits == REC_INFO_METADATA); + ut_ad(index->is_instant()); + /* This can be innobase_add_instant_try() performing a + subsequent instant ADD COLUMN, or its rollback by + row_undo_mod_clust_low(). */ + ut_ad(flags & BTR_NO_LOCKING_FLAG); + } else { + btr_search_update_hash_on_delete(cursor); + } page_cur_delete_rec(page_cursor, index, *offsets, mtr); @@ -4217,8 +4518,14 @@ any_extern: cursor, new_entry, offsets, heap, 0/*n_ext*/, mtr); ut_a(rec); /* <- We calculated above the insert would fit */ - /* Restore the old explicit lock state on the record */ - if (!dict_table_is_locking_disabled(index->table)) { + if (UNIV_UNLIKELY(is_metadata)) { + /* We must empty the PAGE_FREE list, because if this + was a rollback, the shortened metadata record + would have too many fields, and we would be unable to + know the size of the freed record. 
*/ + btr_page_reorganize(page_cursor, index, mtr); + } else if (!dict_table_is_locking_disabled(index->table)) { + /* Restore the old explicit lock state on the record */ lock_rec_restore_from_page_infimum(block, rec, block); } @@ -4230,9 +4537,9 @@ func_exit: && !dict_index_is_clust(index)) { /* Update the free bits in the insert buffer. */ if (page_zip) { - ut_ad(!dict_table_is_temporary(index->table)); + ut_ad(!index->table->is_temporary()); ibuf_update_free_bits_zip(block, mtr); - } else if (!dict_table_is_temporary(index->table)) { + } else if (!index->table->is_temporary()) { ibuf_update_free_bits_low(block, max_ins_size, mtr); } } @@ -4350,11 +4657,11 @@ btr_cur_pessimistic_update( ut_ad(mtr_memo_contains_flagged(mtr, dict_index_get_lock(index), MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK)); - ut_ad(mtr_is_block_fix(mtr, block, MTR_MEMO_PAGE_X_FIX, index->table)); + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); #ifdef UNIV_ZIP_DEBUG ut_a(!page_zip || page_zip_validate(page_zip, page, index)); #endif /* UNIV_ZIP_DEBUG */ - ut_ad(!page_zip || !dict_table_is_temporary(index->table)); + ut_ad(!page_zip || !index->table->is_temporary()); /* The insert buffer tree should never be updated in place. */ ut_ad(!dict_index_is_ibuf(index)); ut_ad(trx_id > 0 || (flags & BTR_KEEP_SYS_FLAG) @@ -4386,7 +4693,7 @@ btr_cur_pessimistic_update( && optim_err != DB_ZIP_OVERFLOW && !dict_index_is_clust(index) && page_is_leaf(page)) { - ut_ad(!dict_table_is_temporary(index->table)); + ut_ad(!index->table->is_temporary()); ibuf_update_free_bits_zip(block, mtr); } @@ -4410,7 +4717,11 @@ btr_cur_pessimistic_update( purge would also have removed the clustered index record itself. Thus the following call is safe. */ row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update, - FALSE, entry_heap); + entry_heap); + btr_cur_trim(new_entry, index, update, thr); + + const bool is_metadata = new_entry->info_bits + & REC_INFO_MIN_REC_FLAG; /* We have to set appropriate extern storage bits in the new record to be inserted: we have to remember which fields were such */ @@ -4456,11 +4767,7 @@ btr_cur_pessimistic_update( ut_a(!page_zip || page_zip_validate(page_zip, page, index)); #endif /* UNIV_ZIP_DEBUG */ - if (n_reserved > 0) { - fil_space_release_free_extents( - index->space, n_reserved); - } - + index->table->space->release_free_extents(n_reserved); err = DB_TOO_BIG_RECORD; goto err_exit; } @@ -4487,7 +4794,7 @@ btr_cur_pessimistic_update( ulint n_extents = cursor->tree_height / 16 + 3; if (!fsp_reserve_free_extents( - &n_reserved, index->space, n_extents, + &n_reserved, index->table->space, n_extents, flags & BTR_NO_UNDO_LOG_FLAG ? FSP_CLEANING : FSP_NORMAL, mtr)) { @@ -4508,19 +4815,30 @@ btr_cur_pessimistic_update( page, 1); } - /* Store state of explicit locks on rec on the page infimum record, - before deleting rec. The page infimum acts as a dummy carrier of the - locks, taking care also of lock releases, before we can move the locks - back on the actual record. There is a special case: if we are - inserting on the root page and the insert causes a call of - btr_root_raise_and_insert. Therefore we cannot in the lock system - delete the lock structs set on the root page even if the root - page carries just node pointers. 
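The comment above describes how explicit record locks are parked on the page infimum across the delete and re-insert of the updated record, and restored afterwards. A toy standalone model of that parking dance (string keys stand in for heap numbers and lock_t lists):

#include <cstdio>
#include <string>
#include <unordered_map>
#include <vector>

int main() {
	std::unordered_map<std::string, std::vector<std::string>> locks;
	locks["old record"] = {"trx1 X", "trx2 S,GAP"};

	/* lock_rec_store_on_page_infimum(): park the locks on the infimum. */
	locks["infimum"] = std::move(locks["old record"]);
	locks.erase("old record");

	/* ... the record is deleted and the updated version re-inserted ... */

	/* lock_rec_restore_from_page_infimum(): move the locks back. */
	locks["new record"] = std::move(locks["infimum"]);
	locks.erase("infimum");

	for (const std::string& l : locks["new record"])
		std::printf("restored: %s\n", l.c_str());
	return 0;
}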
*/ - if (!dict_table_is_locking_disabled(index->table)) { - lock_rec_store_on_page_infimum(block, rec); - } + if (UNIV_UNLIKELY(is_metadata)) { + ut_ad(new_entry->info_bits == REC_INFO_METADATA); + ut_ad(index->is_instant()); + /* This can be innobase_add_instant_try() performing a + subsequent instant ADD COLUMN, or its rollback by + row_undo_mod_clust_low(). */ + ut_ad(flags & BTR_NO_LOCKING_FLAG); + } else { + btr_search_update_hash_on_delete(cursor); - btr_search_update_hash_on_delete(cursor); + /* Store state of explicit locks on rec on the page + infimum record, before deleting rec. The page infimum + acts as a dummy carrier of the locks, taking care also + of lock releases, before we can move the locks back on + the actual record. There is a special case: if we are + inserting on the root page and the insert causes a + call of btr_root_raise_and_insert. Therefore we cannot + in the lock system delete the lock structs set on the + root page even if the root page carries just node + pointers. */ + if (!dict_table_is_locking_disabled(index->table)) { + lock_rec_store_on_page_infimum(block, rec); + } + } #ifdef UNIV_ZIP_DEBUG ut_a(!page_zip || page_zip_validate(page_zip, page, index)); @@ -4537,7 +4855,15 @@ btr_cur_pessimistic_update( if (rec) { page_cursor->rec = rec; - if (!dict_table_is_locking_disabled(index->table)) { + if (UNIV_UNLIKELY(is_metadata)) { + /* We must empty the PAGE_FREE list, because if this + was a rollback, the shortened metadata record + would have too many fields, and we would be unable to + know the size of the freed record. */ + btr_page_reorganize(page_cursor, index, mtr); + rec = page_cursor->rec; + rec_offs_make_valid(rec, index, true, *offsets); + } else if (!dict_table_is_locking_disabled(index->table)) { lock_rec_restore_from_page_infimum( btr_cur_get_block(cursor), rec, block); } @@ -4554,11 +4880,12 @@ btr_cur_pessimistic_update( } bool adjust = big_rec_vec && (flags & BTR_KEEP_POS_FLAG); + ut_ad(!adjust || page_is_leaf(page)); if (btr_cur_compress_if_useful(cursor, adjust, mtr)) { if (adjust) { - rec_offs_make_valid( - page_cursor->rec, index, *offsets); + rec_offs_make_valid(page_cursor->rec, index, + true, *offsets); } } else if (!dict_index_is_clust(index) && page_is_leaf(page)) { @@ -4566,9 +4893,9 @@ btr_cur_pessimistic_update( This is the same block which was skipped by BTR_KEEP_IBUF_BITMAP. */ if (page_zip) { - ut_ad(!dict_table_is_temporary(index->table)); + ut_ad(!index->table->is_temporary()); ibuf_update_free_bits_zip(block, mtr); - } else if (!dict_table_is_temporary(index->table)) { + } else if (!index->table->is_temporary()) { ibuf_update_free_bits_low(block, max_ins_size, mtr); } @@ -4600,7 +4927,7 @@ btr_cur_pessimistic_update( This is the same block which was skipped by BTR_KEEP_IBUF_BITMAP. */ if (!dict_index_is_clust(index) - && !dict_table_is_temporary(index->table) + && !index->table->is_temporary() && page_is_leaf(page)) { ibuf_reset_free_bits(block); } @@ -4623,7 +4950,7 @@ btr_cur_pessimistic_update( MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK)); - mtr_sx_lock(dict_index_get_lock(index), mtr); + mtr_sx_lock_index(index, mtr); } /* Was the record to be updated positioned as the first user @@ -4652,7 +4979,7 @@ btr_cur_pessimistic_update( max_trx_id is ignored for temp tables because it not required for MVCC. */ if (dict_index_is_sec_or_ibuf(index) - && !dict_table_is_temporary(index->table)) { + && !index->table->is_temporary()) { /* Update PAGE_MAX_TRX_ID in the index page header. 
It was not updated by btr_cur_pessimistic_insert() because of BTR_NO_LOCKING_FLAG. */ @@ -4684,7 +5011,14 @@ btr_cur_pessimistic_update( ut_ad(row_get_rec_trx_id(rec, index, *offsets)); } - if (!dict_table_is_locking_disabled(index->table)) { + if (UNIV_UNLIKELY(is_metadata)) { + /* We must empty the PAGE_FREE list, because if this + was a rollback, the shortened metadata record + would have too many fields, and we would be unable to + know the size of the freed record. */ + btr_page_reorganize(page_cursor, index, mtr); + rec = page_cursor->rec; + } else if (!dict_table_is_locking_disabled(index->table)) { lock_rec_restore_from_page_infimum( btr_cur_get_block(cursor), rec, block); } @@ -4704,12 +5038,8 @@ return_after_reservations: ut_a(!page_zip || page_zip_validate(page_zip, page, index)); #endif /* UNIV_ZIP_DEBUG */ - if (n_reserved > 0) { - fil_space_release_free_extents(index->space, n_reserved); - } - + index->table->space->release_free_extents(n_reserved); *big_rec = big_rec_vec; - return(err); } @@ -4731,7 +5061,7 @@ btr_cur_del_mark_set_clust_rec_log( byte* log_ptr; ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table)); - ut_ad(mtr->is_named_space(index->space)); + ut_ad(mtr->is_named_space(index->table->space)); log_ptr = mlog_open_and_write_index(mtr, rec, index, page_rec_is_comp(rec) @@ -4805,7 +5135,7 @@ btr_cur_parse_del_mark_set_clust_rec( offset = mach_read_from_2(ptr); ptr += 2; - ut_a(offset <= UNIV_PAGE_SIZE); + ut_a(offset <= srv_page_size); /* In delete-marked records, DB_TRX_ID must always refer to an existing undo log record. */ @@ -4833,7 +5163,8 @@ btr_cur_parse_del_mark_set_clust_rec( if (!(flags & BTR_KEEP_SYS_FLAG)) { row_upd_rec_sys_fields_in_recovery( rec, page_zip, - rec_get_offsets(rec, index, offsets, true, + rec_get_offsets(rec, index, offsets, + index->n_core_fields, pos + 2, &heap), pos, trx_id, roll_ptr); } else { @@ -4842,7 +5173,8 @@ btr_cur_parse_del_mark_set_clust_rec( ut_ad(memcmp(rec_get_nth_field( rec, rec_get_offsets(rec, index, - offsets, true, + offsets, index + ->n_core_fields, pos, &heap), pos, &offset), field_ref_zero, DATA_TRX_ID_LEN)); @@ -4885,7 +5217,7 @@ btr_cur_del_mark_set_clust_rec( ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table)); ut_ad(buf_block_get_frame(block) == page_align(rec)); ut_ad(page_rec_is_leaf(rec)); - ut_ad(mtr->is_named_space(index->space)); + ut_ad(mtr->is_named_space(index->table->space)); if (rec_get_deleted_flag(rec, rec_offs_comp(offsets))) { /* We may already have delete-marked this record @@ -4999,7 +5331,7 @@ btr_cur_parse_del_mark_set_sec_rec( offset = mach_read_from_2(ptr); ptr += 2; - ut_a(offset <= UNIV_PAGE_SIZE); + ut_a(offset <= srv_page_size); if (page) { rec = page + offset; @@ -5110,9 +5442,8 @@ btr_cur_compress_if_useful( ut_ad(mtr_memo_contains_flagged( mtr, dict_index_get_lock(btr_cur_get_index(cursor)), MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK)); - ut_ad(mtr_is_block_fix( - mtr, btr_cur_get_block(cursor), - MTR_MEMO_PAGE_X_FIX, cursor->index->table)); + ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor), + MTR_MEMO_PAGE_X_FIX)); if (dict_index_is_spatial(cursor->index)) { const page_t* page = btr_cur_get_page(cursor); @@ -5158,41 +5489,109 @@ btr_cur_optimistic_delete_func( mem_heap_t* heap = NULL; rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; rec_offs* offsets = offsets_; - ibool no_compress_needed; rec_offs_init(offsets_); ut_ad(flags == 0 || flags == BTR_CREATE_FLAG); ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor), MTR_MEMO_PAGE_X_FIX)); - 
ut_ad(mtr->is_named_space(cursor->index->space)); + ut_ad(mtr->is_named_space(cursor->index->table->space)); ut_ad(!cursor->index->is_dummy); /* This is intended only for leaf page deletions */ block = btr_cur_get_block(cursor); - ut_ad(block->page.id.space() == cursor->index->space); + ut_ad(block->page.id.space() == cursor->index->table->space->id); ut_ad(page_is_leaf(buf_block_get_frame(block))); ut_ad(!dict_index_is_online_ddl(cursor->index) || dict_index_is_clust(cursor->index) || (flags & BTR_CREATE_FLAG)); rec = btr_cur_get_rec(cursor); - offsets = rec_get_offsets(rec, cursor->index, offsets, true, + + offsets = rec_get_offsets(rec, cursor->index, offsets, + cursor->index->n_core_fields, ULINT_UNDEFINED, &heap); - no_compress_needed = !rec_offs_any_extern(offsets) + const ibool no_compress_needed = !rec_offs_any_extern(offsets) && btr_cur_can_delete_without_compress( cursor, rec_offs_size(offsets), mtr); - if (no_compress_needed) { + if (!no_compress_needed) { + /* prefetch siblings of the leaf for the pessimistic + operation. */ + btr_cur_prefetch_siblings(block); + goto func_exit; + } + + if (UNIV_UNLIKELY(block->page.id.page_no() == cursor->index->page + && page_get_n_recs(block->frame) == 1 + + (cursor->index->is_instant() + && !rec_is_metadata(rec, cursor->index)))) { + /* The whole index (and table) becomes logically empty. + Empty the whole page. That is, if we are deleting the + only user record, also delete the metadata record + if one exists (it exists if and only if is_instant()). + If we are deleting the metadata record and the + table becomes empty, clean up the whole page. */ + dict_index_t* index = cursor->index; + ut_ad(!index->is_instant() + || rec_is_metadata( + page_rec_get_next_const( + page_get_infimum_rec(block->frame)), + index)); + if (UNIV_UNLIKELY(rec_get_info_bits(rec, page_rec_is_comp(rec)) + & REC_INFO_MIN_REC_FLAG)) { + /* This should be rolling back instant ADD COLUMN. + If this is a recovered transaction, then + index->is_instant() will hold until the + insert into SYS_COLUMNS is rolled back. */ + ut_ad(index->table->supports_instant()); + ut_ad(index->is_primary()); + } else { + lock_update_delete(block, rec); + } + btr_page_empty(block, buf_block_get_page_zip(block), + index, 0, mtr); + page_cur_set_after_last(block, btr_cur_get_page_cur(cursor)); + + if (index->is_primary()) { + /* Concurrent access is prevented by + root_block->lock X-latch, so this should be + safe. */ + index->remove_instant(); + } + + goto func_exit; + } + { page_t* page = buf_block_get_frame(block); page_zip_des_t* page_zip= buf_block_get_page_zip(block); - lock_update_delete(block, rec); + if (UNIV_UNLIKELY(rec_get_info_bits(rec, page_rec_is_comp(rec)) + & REC_INFO_MIN_REC_FLAG)) { + /* This should be rolling back instant ADD COLUMN. + If this is a recovered transaction, then + index->is_instant() will hold until the + insert into SYS_COLUMNS is rolled back. */ + ut_ad(cursor->index->table->supports_instant()); + ut_ad(cursor->index->is_primary()); + ut_ad(!page_zip); + page_cur_delete_rec(btr_cur_get_page_cur(cursor), + cursor->index, offsets, mtr); + /* We must empty the PAGE_FREE list, because + after rollback, this deleted metadata record + would have too many fields, and we would be + unable to know the size of the freed record. 
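Both delete paths above special-case the hidden metadata record of an instant index: instead of linking the deleted record into the page's PAGE_FREE list for later reuse, they reorganize the page, because after the rollback the record format shrinks and the size of the garbage record could no longer be recomputed when the free list is consulted. A minimal model of the two strategies (invented types, not InnoDB code):

#include <cstddef>
#include <list>

struct toy_rec { std::size_t size; };

struct toy_page {
    std::list<toy_rec> live;
    std::list<toy_rec> page_free;       // deleted records kept for reuse

    // Normal delete: park the record on the free list.  Reusing it later
    // requires that its size can still be derived from what is stored,
    // which is exactly what breaks for the shortened metadata record.
    void delete_to_free_list(std::list<toy_rec>::iterator it) {
        page_free.splice(page_free.end(), live, it);
    }

    // Reorganize: rebuild the page from the live records only, leaving no
    // garbage whose size would have to be known later.
    void reorganize() {
        page_free.clear();
    }
};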
*/ + btr_page_reorganize(btr_cur_get_page_cur(cursor), + cursor->index, mtr); + goto func_exit; + } else { + lock_update_delete(block, rec); - btr_search_update_hash_on_delete(cursor); + btr_search_update_hash_on_delete(cursor); + } if (page_zip) { #ifdef UNIV_ZIP_DEBUG @@ -5221,17 +5620,14 @@ btr_cur_optimistic_delete_func( into non-leaf pages, into clustered indexes, or into the change buffer. */ if (!dict_index_is_clust(cursor->index) - && !dict_table_is_temporary(cursor->index->table) + && !cursor->index->table->is_temporary() && !dict_index_is_ibuf(cursor->index)) { ibuf_update_free_bits_low(block, max_ins, mtr); } } - } else { - /* prefetch siblings of the leaf for the pessimistic - operation. */ - btr_cur_prefetch_siblings(block); } +func_exit: if (UNIV_LIKELY_NULL(heap)) { mem_heap_free(heap); } @@ -5294,9 +5690,9 @@ btr_cur_pessimistic_delete( MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK)); ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); - ut_ad(mtr->is_named_space(index->space)); + ut_ad(mtr->is_named_space(index->table->space)); ut_ad(!index->is_dummy); - ut_ad(block->page.id.space() == index->space); + ut_ad(block->page.id.space() == index->table->space->id); if (!has_reserved_extents) { /* First reserve enough free space for the file segments @@ -5306,7 +5702,7 @@ btr_cur_pessimistic_delete( ulint n_extents = cursor->tree_height / 32 + 1; success = fsp_reserve_free_extents(&n_reserved, - index->space, + index->table->space, n_extents, FSP_CLEANING, mtr); if (!success) { @@ -5323,7 +5719,8 @@ btr_cur_pessimistic_delete( ut_a(!page_zip || page_zip_validate(page_zip, page, index)); #endif /* UNIV_ZIP_DEBUG */ - offsets = rec_get_offsets(rec, index, NULL, page_is_leaf(page), + offsets = rec_get_offsets(rec, index, NULL, page_is_leaf(page) + ? index->n_core_fields : 0, ULINT_UNDEFINED, &heap); if (rec_offs_any_extern(offsets)) { @@ -5339,29 +5736,78 @@ btr_cur_pessimistic_delete( bool min_mark_next_rec = false; if (page_is_leaf(page)) { - ut_ad(!(rec_get_info_bits(rec, page_rec_is_comp(rec)) - & REC_INFO_MIN_REC_FLAG)); - if (flags == 0) { + const bool is_metadata = rec_get_info_bits( + rec, page_rec_is_comp(rec)) & REC_INFO_MIN_REC_FLAG; + if (UNIV_UNLIKELY(is_metadata)) { + /* This should be rolling back instant ADD COLUMN. + If this is a recovered transaction, then + index->is_instant() will hold until the + insert into SYS_COLUMNS is rolled back. */ + ut_ad(rollback); + ut_ad(index->table->supports_instant()); + ut_ad(index->is_primary()); + } else if (flags == 0) { lock_update_delete(block, rec); } - } - if (UNIV_UNLIKELY(page_get_n_recs(page) < 2) - && UNIV_UNLIKELY(dict_index_get_page(index) - != block->page.id.page_no())) { + if (block->page.id.page_no() != index->page) { + if (page_get_n_recs(page) < 2) { + goto discard_page; + } + } else if (page_get_n_recs(page) == 1 + + (index->is_instant() + && !rec_is_metadata(rec, index))) { + /* The whole index (and table) becomes logically empty. + Empty the whole page. That is, if we are deleting the + only user record, also delete the metadata record + if one exists (it exists if and only if is_instant()). + If we are deleting the metadata record and the + table becomes empty, clean up the whole page. 
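Note the off-by-one in the new emptiness test above: a root leaf page of an instant index always carries one hidden metadata record in addition to the user records, so the page becomes logically empty when it holds 1 + 1 records and a user record is deleted, or just 1 record when the metadata record itself is being deleted. The same predicate, restated as a standalone helper that mirrors the condition in the diff:

// Hypothetical helper; the real check compares page_get_n_recs() against
// 1 + (index->is_instant() && !rec_is_metadata(rec, index)).
static bool becomes_logically_empty(unsigned long n_recs_on_root_leaf,
                                    bool index_is_instant,
                                    bool deleting_metadata_rec)
{
    unsigned long hidden =
        (index_is_instant && !deleting_metadata_rec) ? 1 : 0;
    return n_recs_on_root_leaf == 1 + hidden;
}

When the predicate fires, the page is emptied wholesale and, for a primary index, index->remove_instant() drops the instant-ADD state, as the surrounding hunks show.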
*/ + ut_ad(!index->is_instant() + || rec_is_metadata( + page_rec_get_next_const( + page_get_infimum_rec(page)), + index)); + btr_page_empty(block, page_zip, index, 0, mtr); + page_cur_set_after_last(block, + btr_cur_get_page_cur(cursor)); + if (index->is_primary()) { + /* Concurrent access is prevented by + index->lock and root_block->lock + X-latch, so this should be safe. */ + index->remove_instant(); + } + ret = TRUE; + goto return_after_reservations; + } - /* If there is only one record, drop the whole page in - btr_discard_page, if this is not the root page */ + if (UNIV_LIKELY(!is_metadata)) { + btr_search_update_hash_on_delete(cursor); + } else { + page_cur_delete_rec(btr_cur_get_page_cur(cursor), + index, offsets, mtr); + /* We must empty the PAGE_FREE list, because + after rollback, this deleted metadata record + would carry too many fields, and we would be + unable to know the size of the freed record. */ + btr_page_reorganize(btr_cur_get_page_cur(cursor), + index, mtr); + ut_ad(!ret); + goto return_after_reservations; + } + } else if (UNIV_UNLIKELY(page_rec_is_first(rec, page))) { + if (page_rec_is_last(rec, page)) { +discard_page: + ut_ad(page_get_n_recs(page) == 1); + /* If there is only one record, drop + the whole page. */ - btr_discard_page(cursor, mtr); + btr_discard_page(cursor, mtr); - ret = TRUE; - goto return_after_reservations; - } + ret = TRUE; + goto return_after_reservations; + } - if (page_is_leaf(page)) { - btr_search_update_hash_on_delete(cursor); - } else if (UNIV_UNLIKELY(page_rec_is_first(rec, page))) { next_rec = page_rec_get_next(rec); if (!page_has_prev(page)) { @@ -5370,7 +5816,7 @@ btr_cur_pessimistic_delete( pointer as the predefined minimum record */ min_mark_next_rec = true; - } else if (dict_index_is_spatial(index)) { + } else if (index->is_spatial()) { /* For rtree, if delete the leftmost node pointer, we need to update parent page. */ rtr_mbr_t father_mbr; @@ -5385,7 +5831,7 @@ btr_cur_pessimistic_delete( &father_cursor); offsets = rec_get_offsets( btr_cur_get_rec(&father_cursor), index, NULL, - false, ULINT_UNDEFINED, &heap); + 0, ULINT_UNDEFINED, &heap); father_rec = btr_cur_get_rec(&father_cursor); rtr_read_mbr(rec_get_nth_field( @@ -5408,11 +5854,10 @@ btr_cur_pessimistic_delete( on a page, we have to change the parent node pointer so that it is equal to the new leftmost node pointer on the page */ - ulint level = btr_page_get_level(page, mtr); - btr_cur_t cursor; btr_page_get_father(index, block, mtr, &cursor); btr_cur_node_ptr_delete(&cursor, mtr); + const ulint level = btr_page_get_level(page); // FIXME: reuse the node_ptr from above dtuple_t* node_ptr = dict_index_build_node_ptr( index, next_rec, block->page.id.page_no(), @@ -5480,10 +5925,7 @@ return_after_reservations: has segment header and already modified in most of cases.*/ } - if (n_reserved > 0) { - fil_space_release_free_extents(index->space, n_reserved); - } - + index->table->space->release_free_extents(n_reserved); return(ret); } @@ -5546,7 +5988,7 @@ btr_cur_add_path_info( slot->nth_rec = page_rec_get_n_recs_before(rec); slot->n_recs = page_get_n_recs(page); slot->page_no = page_get_page_no(page); - slot->page_level = btr_page_get_level_low(page); + slot->page_level = btr_page_get_level(page); } /*******************************************************************//** @@ -5562,31 +6004,28 @@ the number of pages between slot1->page and slot2->page (which is n_rows_on_prev_level). In this case we set is_n_rows_exact to FALSE. 
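btr_estimate_n_rows_in_range_on_level(), whose return type changes here from int64_t to ha_rows, walks the pages between the two border slots but gives up after N_PAGES_READ_LIMIT (10) pages. In that case it extrapolates: the number of pages on this level is approximated by n_rows_on_prev_level, multiplied by the average number of records per page seen so far, which is the same formula visible further down in this hunk. A worked sketch of that extrapolation:

#include <cstdint>
#include <cstdio>

// Inexact estimate: rows-per-page average so far, times the estimated
// number of pages on this level (== rows counted on the previous level).
static uint64_t estimate_rows_on_level(uint64_t n_rows_on_prev_level,
                                       uint64_t n_rows_counted,
                                       uint64_t n_pages_read,
                                       bool* is_exact)
{
    *is_exact = false;                  // we did not reach slot2->page
    return n_pages_read
        ? n_rows_on_prev_level * n_rows_counted / n_pages_read
        : n_rows_on_prev_level;         // nothing scanned; toy fallback only
}

int main()
{
    bool exact;
    // e.g. 40 pages expected on this level, 900 records seen in 10 pages
    std::printf("%llu\n", (unsigned long long)
                estimate_rows_on_level(40, 900, 10, &exact));  // prints 3600
}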
@return number of rows, not including the borders (exact or estimated) */ static -int64_t +ha_rows btr_estimate_n_rows_in_range_on_level( /*==================================*/ dict_index_t* index, /*!< in: index */ btr_path_t* slot1, /*!< in: left border */ btr_path_t* slot2, /*!< in: right border */ - int64_t n_rows_on_prev_level, /*!< in: number of rows + ha_rows n_rows_on_prev_level, /*!< in: number of rows on the previous level for the same descend paths; used to determine the number of pages on this level */ - ibool* is_n_rows_exact) /*!< out: TRUE if the returned + bool* is_n_rows_exact) /*!< out: TRUE if the returned value is exact i.e. not an estimation */ { - int64_t n_rows; - ulint n_pages_read; + ha_rows n_rows = 0; + uint n_pages_read = 0; ulint level; - n_rows = 0; - n_pages_read = 0; - /* Assume by default that we will scan all pages between slot1->page_no and slot2->page_no. */ - *is_n_rows_exact = TRUE; + *is_n_rows_exact = true; /* Add records from slot1->page_no which are to the right of the record which serves as a left border of the range, if any @@ -5611,10 +6050,8 @@ btr_estimate_n_rows_in_range_on_level( average from the pages scanned so far. */ # define N_PAGES_READ_LIMIT 10 - page_id_t page_id( - dict_index_get_space(index), slot1->page_no); - const fil_space_t* space = fil_space_get(index->space); - ut_ad(space); + const fil_space_t* space = index->table->space; + page_id_t page_id(space->id, slot1->page_no); const page_size_t page_size(space->flags); level = slot1->page_level; @@ -5663,7 +6100,7 @@ btr_estimate_n_rows_in_range_on_level( reuses them. */ if (!fil_page_index_page_check(page) || btr_page_get_index_id(page) != index->id - || btr_page_get_level_low(page) != level) { + || btr_page_get_level(page) != level) { /* The page got reused for something else */ mtr_commit(&mtr); @@ -5704,7 +6141,7 @@ btr_estimate_n_rows_in_range_on_level( inexact: - *is_n_rows_exact = FALSE; + *is_n_rows_exact = false; /* We did interrupt before reaching slot2->page */ @@ -5712,8 +6149,7 @@ inexact: /* The number of pages on this level is n_rows_on_prev_level, multiply it by the average number of recs per page so far */ - n_rows = n_rows_on_prev_level - * n_rows / n_pages_read; + n_rows = n_rows_on_prev_level * n_rows / n_pages_read; } else { /* The tree changed before we could even start with slot1->page_no */ @@ -5732,7 +6168,7 @@ static const unsigned rows_in_range_max_retries = 4; /** We pretend that a range has that many records if the tree keeps changing for rows_in_range_max_retries retries while we try to estimate the records in a given range. */ -static const int64_t rows_in_range_arbitrary_ret_val = 10; +static const ha_rows rows_in_range_arbitrary_ret_val = 10; /** Estimates the number of rows in a given index range. @param[in] index index @@ -5749,7 +6185,7 @@ rows_in_range_arbitrary_ret_val as a result (if nth_attempt >= rows_in_range_max_retries and the tree is modified between the two dives). 
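The retry machinery documented above is bounded: btr_estimate_n_rows_in_range_low() dives to both range borders in separate mini-transactions, and if the right border record disappears between the two dives it simply tries again. After rows_in_range_max_retries (4) attempts it returns rows_in_range_arbitrary_ret_val (10) rather than looping indefinitely on a heavily modified tree. A sketch of that envelope, with estimate_once() standing in for the real double dive:

#include <cstdint>
#include <functional>

static const unsigned max_retries = 4;           // rows_in_range_max_retries
static const std::uint64_t arbitrary_rows = 10;  // rows_in_range_arbitrary_ret_val

// estimate_once() sets *ok to false when the tree changed between the two
// dives and the attempt has to be repeated from scratch.
static std::uint64_t estimate_with_retries(
    const std::function<std::uint64_t(bool* ok)>& estimate_once)
{
    for (unsigned attempt = 1; attempt <= max_retries; attempt++) {
        bool ok = true;
        std::uint64_t n = estimate_once(&ok);
        if (ok) {
            return n;
        }
    }
    return arbitrary_rows;   // give up and pretend the range has 10 rows
}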
*/ static -int64_t +ha_rows btr_estimate_n_rows_in_range_low( dict_index_t* index, const dtuple_t* tuple1, @@ -5763,14 +6199,14 @@ btr_estimate_n_rows_in_range_low( btr_cur_t cursor; btr_path_t* slot1; btr_path_t* slot2; - ibool diverged; - ibool diverged_lot; + bool diverged; + bool diverged_lot; ulint divergence_level; - int64_t n_rows; - ibool is_n_rows_exact; + ha_rows n_rows; + bool is_n_rows_exact; ulint i; mtr_t mtr; - int64_t table_n_rows; + ha_rows table_n_rows; table_n_rows = dict_table_get_n_rows(index->table); @@ -5794,7 +6230,7 @@ btr_estimate_n_rows_in_range_low( btr_cur_search_to_nth_level(index, 0, tuple1, mode1, BTR_SEARCH_LEAF | BTR_ESTIMATE, &cursor, 0, - __FILE__, __LINE__, &mtr, 0); + __FILE__, __LINE__, &mtr); ut_ad(!page_rec_is_infimum(btr_cur_get_rec(&cursor))); @@ -5848,7 +6284,7 @@ btr_estimate_n_rows_in_range_low( btr_cur_search_to_nth_level(index, 0, tuple2, mode2, BTR_SEARCH_LEAF | BTR_ESTIMATE, &cursor, 0, - __FILE__, __LINE__, &mtr, 0); + __FILE__, __LINE__, &mtr); const rec_t* rec = btr_cur_get_rec(&cursor); @@ -5906,16 +6342,16 @@ btr_estimate_n_rows_in_range_low( /* We have the path information for the range in path1 and path2 */ n_rows = 0; - is_n_rows_exact = TRUE; + is_n_rows_exact = true; /* This becomes true when the two paths do not pass through the same pages anymore. */ - diverged = FALSE; + diverged = false; /* This becomes true when the paths are not the same or adjacent any more. This means that they pass through the same or neighboring-on-the-same-level pages only. */ - diverged_lot = FALSE; + diverged_lot = false; /* This is the level where paths diverged a lot. */ divergence_level = 1000000; @@ -6038,15 +6474,12 @@ btr_estimate_n_rows_in_range_low( return(rows_in_range_arbitrary_ret_val); } - const int64_t ret = - btr_estimate_n_rows_in_range_low( - index, tuple1, mode1, - tuple2, mode2, nth_attempt + 1); - - return(ret); + return btr_estimate_n_rows_in_range_low( + index, tuple1, mode1, + tuple2, mode2, nth_attempt + 1); } - diverged = TRUE; + diverged = true; if (slot1->nth_rec < slot2->nth_rec) { /* We do not count the borders (nor the left @@ -6059,7 +6492,7 @@ btr_estimate_n_rows_in_range_low( and slot2, so on the level below the slots will point to non-adjacent pages. */ - diverged_lot = TRUE; + diverged_lot = true; divergence_level = i; } } else { @@ -6081,7 +6514,7 @@ btr_estimate_n_rows_in_range_low( if (slot1->nth_rec < slot1->n_recs || slot2->nth_rec > 1) { - diverged_lot = TRUE; + diverged_lot = true; divergence_level = i; n_rows = 0; @@ -6111,7 +6544,7 @@ btr_estimate_n_rows_in_range_low( @param[in] tuple2 range end, may also be empty tuple @param[in] mode2 search mode for range end @return estimated number of rows */ -int64_t +ha_rows btr_estimate_n_rows_in_range( dict_index_t* index, const dtuple_t* tuple1, @@ -6119,10 +6552,8 @@ btr_estimate_n_rows_in_range( const dtuple_t* tuple2, page_cur_mode_t mode2) { - const int64_t ret = btr_estimate_n_rows_in_range_low( - index, tuple1, mode1, tuple2, mode2, 1 /* first attempt */); - - return(ret); + return btr_estimate_n_rows_in_range_low( + index, tuple1, mode1, tuple2, mode2, 1); } /*******************************************************************//** @@ -6322,12 +6753,13 @@ btr_estimate_number_of_different_key_vals(dict_index_t* index) page = btr_cur_get_page(&cursor); rec = page_rec_get_next(page_get_infimum_rec(page)); - ut_d(const bool is_leaf = page_is_leaf(page)); + const ulint n_core = page_is_leaf(page) + ? 
index->n_core_fields : 0; if (!page_rec_is_supremum(rec)) { not_empty_flag = 1; offsets_rec = rec_get_offsets(rec, index, offsets_rec, - is_leaf, + n_core, ULINT_UNDEFINED, &heap); if (n_not_null != NULL) { @@ -6348,7 +6780,7 @@ btr_estimate_number_of_different_key_vals(dict_index_t* index) offsets_next_rec = rec_get_offsets(next_rec, index, offsets_next_rec, - is_leaf, + n_core, ULINT_UNDEFINED, &heap); @@ -6473,7 +6905,7 @@ btr_rec_get_field_ref_offs( ut_a(rec_offs_nth_extern(offsets, n)); field_ref_offs = rec_get_nth_field_offs(offsets, n, &local_len); - ut_a(local_len != UNIV_SQL_NULL); + ut_a(len_is_stored(local_len)); ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); return(field_ref_offs + local_len - BTR_EXTERN_FIELD_REF_SIZE); @@ -6515,12 +6947,12 @@ btr_rec_get_externally_stored_len( btr_rec_get_field_ref(rec, offsets, i) + BTR_EXTERN_LEN + 4); - total_extern_len += ut_calc_align(extern_len, - UNIV_PAGE_SIZE); + total_extern_len += ut_calc_align( + extern_len, ulint(srv_page_size)); } } - return(total_extern_len / UNIV_PAGE_SIZE); + return total_extern_len >> srv_page_size_shift; } /*******************************************************************//** @@ -6667,15 +7099,14 @@ static void btr_blob_free( /*==========*/ - dict_index_t* index, /*!< in: index */ buf_block_t* block, /*!< in: buffer block */ ibool all, /*!< in: TRUE=remove also the compressed page if there is one */ mtr_t* mtr) /*!< in: mini-transaction to commit */ { buf_pool_t* buf_pool = buf_pool_from_block(block); - const page_id_t page_id = block->page.id; - ut_ad(mtr_is_block_fix(mtr, block, MTR_MEMO_PAGE_X_FIX, index->table)); + const page_id_t page_id(block->page.id); + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); mtr_commit(mtr); buf_pool_mutex_enter(buf_pool); @@ -6765,22 +7196,19 @@ struct btr_blob_log_check_t { const mtr_log_t log_mode = m_mtr->get_log_mode(); m_mtr->start(); m_mtr->set_log_mode(log_mode); - m_mtr->set_named_space(index->space); + index->set_modified(*m_mtr); m_mtr->set_flush_observer(observer); if (UNIV_UNLIKELY(m_op == BTR_STORE_INSERT_BULK)) { - page_id_t page_id(dict_index_get_space(index), - page_no); - page_size_t page_size(dict_table_page_size( - index->table)); - page_cur_t* page_cur = &m_pcur->btr_cur.page_cur; - - page_cur->block = btr_block_get( - page_id, page_size, RW_X_LATCH, index, m_mtr); - page_cur->rec = buf_block_get_frame(page_cur->block) + m_pcur->btr_cur.page_cur.block = btr_block_get( + page_id_t(index->table->space_id, page_no), + page_size_t(index->table->space->flags), + RW_X_LATCH, index, m_mtr); + m_pcur->btr_cur.page_cur.rec + = m_pcur->btr_cur.page_cur.block->frame + offs; - buf_block_buf_fix_dec(page_cur->block); + buf_block_buf_fix_dec(m_pcur->btr_cur.page_cur.block); } else { ut_ad(m_pcur->rel_pos == BTR_PCUR_ON); bool ret = btr_pcur_restore_position( @@ -6793,8 +7221,8 @@ struct btr_blob_log_check_t { *m_block = btr_pcur_get_block(m_pcur); *m_rec = btr_pcur_get_rec(m_pcur); - ut_d(rec_offs_make_valid( - *m_rec, index, const_cast<rec_offs*>(m_offsets))); + rec_offs_make_valid(*m_rec, index, true, + const_cast<rec_offs*>(m_offsets)); ut_ad(m_mtr->memo_contains_page_flagged( *m_rec, @@ -6861,8 +7289,7 @@ btr_store_big_rec_extern_fields( || mtr_memo_contains_flagged(btr_mtr, &index->lock, MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK)); - ut_ad(mtr_is_block_fix( - btr_mtr, rec_block, MTR_MEMO_PAGE_X_FIX, index->table)); + ut_ad(mtr_memo_contains(btr_mtr, rec_block, MTR_MEMO_PAGE_X_FIX)); ut_ad(buf_block_get_frame(rec_block) == page_align(rec)); 
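btr_rec_get_externally_stored_len() above now uses srv_page_size / srv_page_size_shift instead of the old UNIV_PAGE_SIZE constant, but the arithmetic is unchanged: every externally stored column occupies whole pages, so each length is rounded up to the page size and the byte total is converted to a page count with a shift. A standalone version of that calculation (assuming, as InnoDB does, a power-of-two page size):

#include <cstdint>

static std::uint64_t blob_page_count(const std::uint64_t* extern_lens, int n,
                                     std::uint64_t page_size,
                                     unsigned page_size_shift)
{
    std::uint64_t total = 0;
    for (int i = 0; i < n; i++) {
        // round up to a multiple of the page size (ut_calc_align equivalent)
        total += (extern_lens[i] + page_size - 1) & ~(page_size - 1);
    }
    return total >> page_size_shift;    // bytes -> pages
}

For a 16 KiB page, two columns of 100 and 40000 bytes count as 1 + 3 = 4 pages.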
ut_a(dict_index_is_clust(index)); @@ -6888,7 +7315,7 @@ btr_store_big_rec_extern_fields( heap = mem_heap_create(250000); page_zip_set_alloc(&c_stream, heap); - err = deflateInit2(&c_stream, page_zip_level, + err = deflateInit2(&c_stream, int(page_zip_level), Z_DEFLATED, 15, 7, Z_DEFAULT_STRATEGY); ut_a(err == Z_OK); } @@ -6968,8 +7395,8 @@ btr_store_big_rec_extern_fields( rec_page_no = rec_block->page.id.page_no(); } - mtr_start(&mtr); - mtr.set_named_space(index->space); + mtr.start(); + index->set_modified(mtr); mtr.set_log_mode(btr_mtr->get_log_mode()); mtr.set_flush_observer(btr_mtr->get_flush_observer()); @@ -6985,18 +7412,19 @@ btr_store_big_rec_extern_fields( mtr_t *alloc_mtr; if (UNIV_UNLIKELY(op == BTR_STORE_INSERT_BULK)) { - mtr_start(&mtr_bulk); + mtr_bulk.start(); mtr_bulk.set_spaces(mtr); alloc_mtr = &mtr_bulk; } else { alloc_mtr = &mtr; } - if (!fsp_reserve_free_extents(&r_extents, space_id, 1, + if (!fsp_reserve_free_extents(&r_extents, + index->table->space, 1, FSP_BLOB, alloc_mtr, 1)) { - mtr_commit(alloc_mtr); + alloc_mtr->commit(); error = DB_OUT_OF_FILE_SPACE; goto func_exit; } @@ -7004,10 +7432,10 @@ btr_store_big_rec_extern_fields( block = btr_page_alloc(index, hint_page_no, FSP_NO_DIR, 0, alloc_mtr, &mtr); - alloc_mtr->release_free_extents(r_extents); + index->table->space->release_free_extents(r_extents); if (UNIV_UNLIKELY(op == BTR_STORE_INSERT_BULK)) { - mtr_commit(&mtr_bulk); + mtr_bulk.commit(); } ut_a(block != NULL); @@ -7146,7 +7574,7 @@ next_zip_page: /* Commit mtr and release the uncompressed page frame to save memory. */ - btr_blob_free(index, block, FALSE, &mtr); + btr_blob_free(block, FALSE, &mtr); if (err == Z_STREAM_END) { break; @@ -7207,7 +7635,7 @@ next_zip_page: prev_page_no = page_no; - mtr_commit(&mtr); + mtr.commit(); if (extern_len == 0) { break; @@ -7273,11 +7701,11 @@ btr_check_blob_fil_page_type( ulint flags = fil_space_get_flags(space_id); #ifndef UNIV_DEBUG /* Improve debug test coverage */ - if (dict_tf_get_format(flags) == UNIV_FORMAT_A) { + if (!DICT_TF_HAS_ATOMIC_BLOBS(flags)) { /* Old versions of InnoDB did not initialize FIL_PAGE_TYPE on BLOB pages. Do not print anything about the type mismatch when reading - a BLOB page that is in Antelope format.*/ + a BLOB page that may be from old versions. 
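The storage loop touched by these hunks writes one BLOB page per iteration inside its own mini-transaction, chains it to the previously written page, commits (btr_blob_free() releases the frame to save memory), and stops once extern_len has been consumed. A toy version of the resulting page chain, ignoring compression, redo logging and the real page format (all names invented):

#include <cstddef>
#include <cstdint>
#include <vector>

struct toy_blob_page {
    std::vector<std::uint8_t> payload;
    std::size_t next_page_no;           // SIZE_MAX plays the role of FIL_NULL
};

static std::vector<toy_blob_page> store_blob(
    const std::vector<std::uint8_t>& column, std::size_t capacity)
{
    std::vector<toy_blob_page> pages;
    std::size_t extern_len = column.size(), pos = 0;
    while (extern_len > 0) {
        std::size_t store = extern_len < capacity ? extern_len : capacity;
        toy_blob_page page;
        page.payload.assign(column.begin() + pos, column.begin() + pos + store);
        page.next_page_no = SIZE_MAX;   // no next page yet
        pages.push_back(page);
        if (pages.size() > 1) {         // link from the previously written page
            pages[pages.size() - 2].next_page_no = pages.size() - 1;
        }
        pos += store;
        extern_len -= store;
    }
    return pages;
}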
*/ return; } #endif /* !UNIV_DEBUG */ @@ -7328,11 +7756,11 @@ btr_free_externally_stored_field( ulint next_page_no; mtr_t mtr; - ut_ad(dict_index_is_clust(index)); + ut_ad(index->is_primary()); ut_ad(mtr_memo_contains_flagged(local_mtr, dict_index_get_lock(index), MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK)); - ut_ad(mtr_is_page_fix( - local_mtr, field_ref, MTR_MEMO_PAGE_X_FIX, index->table)); + ut_ad(mtr_memo_contains_page(local_mtr, field_ref, + MTR_MEMO_PAGE_X_FIX)); ut_ad(!rec || rec_offs_validate(rec, index, offsets)); ut_ad(!rec || field_ref == btr_rec_get_field_ref(rec, offsets, i)); ut_ad(local_mtr->is_named_space( @@ -7350,7 +7778,8 @@ btr_free_externally_stored_field( ut_ad(!(mach_read_from_4(field_ref + BTR_EXTERN_LEN) & ~((BTR_EXTERN_OWNER_FLAG | BTR_EXTERN_INHERITED_FLAG) << 24))); - ut_ad(space_id == index->space); + ut_ad(space_id == index->table->space->id); + ut_ad(space_id == index->table->space_id); const page_size_t ext_page_size(dict_table_page_size(index->table)); const page_size_t& rec_page_size(rec == NULL @@ -7371,7 +7800,7 @@ btr_free_externally_stored_field( mtr.set_spaces(*local_mtr); mtr.set_log_mode(local_mtr->get_log_mode()); - ut_ad(!dict_table_is_temporary(index->table) + ut_ad(!index->table->is_temporary() || local_mtr->get_log_mode() == MTR_LOG_NO_REDO); const page_t* p = page_align(field_ref); @@ -7459,7 +7888,7 @@ btr_free_externally_stored_field( MLOG_4BYTES, &mtr); /* Zero out the BLOB length. If the server crashes during the execution of this function, - trx_rollback_or_clean_all_recovered() could + trx_rollback_all_recovered() could dereference the half-deleted BLOB, fetching a wrong prefix for the BLOB. */ mlog_write_ulint(field_ref + BTR_EXTERN_LEN + 4, @@ -7468,7 +7897,7 @@ btr_free_externally_stored_field( } /* Commit mtr and release the BLOB block to save memory. */ - btr_blob_free(index, ext_block, TRUE, &mtr); + btr_blob_free(ext_block, TRUE, &mtr); } } @@ -7493,8 +7922,8 @@ btr_rec_free_externally_stored_fields( ulint i; ut_ad(rec_offs_validate(rec, index, offsets)); - ut_ad(mtr_is_page_fix(mtr, rec, MTR_MEMO_PAGE_X_FIX, index->table)); - ut_ad(dict_index_is_clust(index)); + ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX)); + ut_ad(index->is_primary()); ut_ad(page_rec_is_leaf(rec)); /* Free possible externally stored fields in the record */ @@ -7532,7 +7961,7 @@ btr_rec_free_updated_extern_fields( ulint i; ut_ad(rec_offs_validate(rec, index, offsets)); - ut_ad(mtr_is_page_fix(mtr, rec, MTR_MEMO_PAGE_X_FIX, index->table)); + ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX)); /* Free possible externally stored fields in the record */ diff --git a/storage/innobase/btr/btr0defragment.cc b/storage/innobase/btr/btr0defragment.cc index 645334cbf4d..99b15965281 100644 --- a/storage/innobase/btr/btr0defragment.cc +++ b/storage/innobase/btr/btr0defragment.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (C) 2012, 2014 Facebook, Inc. All Rights Reserved. -Copyright (C) 2014, 2019, MariaDB Corporation. +Copyright (C) 2014, 2021, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -39,9 +39,6 @@ Modified 30/07/2014 Jan Lindström jan.lindstrom@mariadb.com #include <list> -using std::list; -using std::min; - /* When there's no work, either because defragment is disabled, or because no query is submitted, thread checks state every BTR_DEFRAGMENT_SLEEP_IN_USECS.*/ #define BTR_DEFRAGMENT_SLEEP_IN_USECS 1000000 @@ -160,14 +157,14 @@ btr_defragment_add_index( dberr_t* err) /*!< out: error code */ { mtr_t mtr; - ulint page_no = dict_index_get_page(index); *err = DB_SUCCESS; mtr_start(&mtr); // Load index rood page. - const page_id_t page_id(dict_index_get_space(index), page_no); - const page_size_t page_size(dict_table_page_size(index->table)); - buf_block_t* block = btr_block_get(page_id, page_size, RW_NO_LATCH, index, &mtr); + buf_block_t* block = btr_block_get( + page_id_t(index->table->space_id, index->page), + page_size_t(index->table->space->flags), + RW_NO_LATCH, index, &mtr); page_t* page = NULL; if (block) { @@ -311,7 +308,7 @@ btr_defragment_save_defrag_stats_if_needed( dict_index_t* index) /*!< in: index */ { if (srv_defragment_stats_accuracy != 0 // stats tracking disabled - && dict_index_get_space(index) != 0 // do not track system tables + && index->table->space_id != 0 // do not track system tables && index->stat_defrag_modified_counter >= srv_defragment_stats_accuracy) { dict_stats_defrag_pool_add(index); @@ -343,12 +340,12 @@ btr_defragment_calc_n_recs_for_size( ulint size = 0; page_cur_t cur; + const ulint n_core = page_is_leaf(page) ? index->n_core_fields : 0; page_cur_set_before_first(block, &cur); page_cur_move_to_next(&cur); while (page_cur_get_rec(&cur) != page_get_supremum_rec(page)) { rec_t* cur_rec = page_cur_get_rec(&cur); - offsets = rec_get_offsets(cur_rec, index, offsets, - page_is_leaf(page), + offsets = rec_get_offsets(cur_rec, index, offsets, n_core, ULINT_UNDEFINED, &heap); ulint rec_size = rec_offs_size(offsets); size += rec_size; @@ -360,6 +357,9 @@ btr_defragment_calc_n_recs_for_size( page_cur_move_to_next(&cur); } *n_recs_size = size; + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } return n_recs; } @@ -383,8 +383,7 @@ btr_defragment_merge_pages( { page_t* from_page = buf_block_get_frame(from_block); page_t* to_page = buf_block_get_frame(to_block); - ulint space = dict_index_get_space(index); - ulint level = btr_page_get_level(from_page, mtr); + ulint level = btr_page_get_level(from_page); ulint n_recs = page_get_n_recs(from_page); ulint new_data_size = page_get_data_size(to_page); ulint max_ins_size = @@ -403,7 +402,7 @@ btr_defragment_merge_pages( // Estimate how many records can be moved from the from_page to // the to_page. if (page_size.is_compressed()) { - ulint page_diff = UNIV_PAGE_SIZE - *max_data_size; + ulint page_diff = srv_page_size - *max_data_size; max_ins_size_to_use = (max_ins_size_to_use > page_diff) ? 
max_ins_size_to_use - page_diff : 0; } @@ -476,7 +475,7 @@ btr_defragment_merge_pages( } else { ibuf_update_free_bits_if_full( to_block, - UNIV_PAGE_SIZE, + srv_page_size, ULINT_UNDEFINED); } } @@ -487,7 +486,9 @@ btr_defragment_merge_pages( lock_update_merge_left(to_block, orig_pred, from_block); btr_search_drop_page_hash_index(from_block); - btr_level_list_remove(space, page_size, (page_t*)from_page, index, mtr); + btr_level_list_remove( + index->table->space_id, + page_size, from_page, index, mtr); btr_page_get_father(index, from_block, mtr, &parent); btr_cur_node_ptr_delete(&parent, mtr); /* btr_blob_dbg_remove(from_page, index, @@ -541,7 +542,6 @@ btr_defragment_n_pages( uint n_pages,/*!< in: number of pages to defragment */ mtr_t* mtr) /*!< in/out: mini-transaction */ { - ulint space; /* We will need to load the n+1 block because if the last page is freed and we need to modify the prev_page_no of that block. */ buf_block_t* blocks[BTR_DEFRAGMENT_MAX_N_PAGES + 1]; @@ -552,7 +552,6 @@ btr_defragment_n_pages( ulint data_size_per_rec; ulint optimal_page_size; ulint reserved_space; - ulint level; ulint max_data_size = 0; uint n_defragmented = 0; uint n_new_slots; @@ -562,8 +561,11 @@ btr_defragment_n_pages( /* It doesn't make sense to call this function with n_pages = 1. */ ut_ad(n_pages > 1); - space = dict_index_get_space(index); - if (space == 0) { + if (!page_is_leaf(block->frame)) { + return NULL; + } + + if (!index->table->space || !index->table->space_id) { /* Ignore space 0. */ return NULL; } @@ -573,12 +575,7 @@ btr_defragment_n_pages( } first_page = buf_block_get_frame(block); - level = btr_page_get_level(first_page, mtr); - const page_size_t page_size(dict_table_page_size(index->table)); - - if (level != 0) { - return NULL; - } + const page_size_t page_size(index->table->space->flags); /* 1. Load the pages and calculate the total data size. */ blocks[0] = block; @@ -593,9 +590,8 @@ btr_defragment_n_pages( break; } - const page_id_t page_id(dict_index_get_space(index), page_no); - - blocks[i] = btr_block_get(page_id, page_size, + blocks[i] = btr_block_get(page_id_t(index->table->space_id, + page_no), page_size, RW_X_LATCH, index, mtr); } @@ -623,7 +619,7 @@ btr_defragment_n_pages( // For compressed pages, we take compression failures into account. if (page_size.is_compressed()) { ulint size = 0; - int i = 0; + uint i = 0; // We estimate the optimal data size of the index use samples of // data size. These samples are taken when pages failed to // compress due to insertion on the page. We use the average @@ -637,7 +633,7 @@ btr_defragment_n_pages( size += index->stat_defrag_data_size_sample[i]; } if (i != 0) { - size = size / i; + size /= i; optimal_page_size = ut_min(optimal_page_size, size); } max_data_size = optimal_page_size; @@ -750,10 +746,10 @@ DECLARE_THREAD(btr_defragment_thread)(void*) mtr_start(&mtr); cursor = btr_pcur_get_btr_cur(pcur); index = btr_cur_get_index(cursor); - mtr.set_named_space(index->space); + index->set_modified(mtr); /* To follow the latching order defined in WL#6326, acquire index->lock X-latch. This entitles us to acquire page latches in any order for the index. */ - mtr_x_lock(&index->lock, &mtr); + mtr_x_lock_index(index, &mtr); /* This will acquire index->lock SX-latch, which per WL#6363 is allowed when we are already holding the X-latch. 
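btr_defragment_calc_n_recs_for_size(), adjusted above to pass the core-field count and to free its offset heap, answers the question "how many of this page's records still fit into a given size?", which the caller then combines with optimal_page_size (capped by the average of the compression-failure samples) to plan the merge. A standalone sketch of that counting step (invented names; like the real function, the accumulated size is also reported through an out parameter):

#include <cstddef>
#include <vector>

// Walk the records in order and report how many fit into target_size.
static std::size_t n_recs_for_size(const std::vector<std::size_t>& rec_sizes,
                                   std::size_t target_size,
                                   std::size_t* used_size)
{
    std::size_t n = 0, total = 0;
    for (std::size_t s : rec_sizes) {
        if (total + s > target_size) {
            break;
        }
        total += s;
        n++;
    }
    *used_size = total;
    return n;
}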
*/ btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, &mtr); diff --git a/storage/innobase/btr/btr0pcur.cc b/storage/innobase/btr/btr0pcur.cc index 560774c715e..ef626e12cf4 100644 --- a/storage/innobase/btr/btr0pcur.cc +++ b/storage/innobase/btr/btr0pcur.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2016, 2020, MariaDB Corporation. +Copyright (c) 2016, 2021, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -61,6 +61,7 @@ btr_pcur_reset( cursor->btr_cur.index = NULL; cursor->btr_cur.page_cur.rec = NULL; cursor->old_rec = NULL; + cursor->old_n_core_fields = 0; cursor->old_n_fields = 0; cursor->old_stored = false; @@ -128,18 +129,19 @@ btr_pcur_store_position( cursor->old_stored = true; if (page_is_empty(block->frame)) { + ut_ad(block->page.id.page_no() == index->page); +empty_table: /* It must be an empty index tree; NOTE that in this case we do not store the modify_clock, but always do a search if we restore the cursor position */ ut_a(!page_has_siblings(block->frame)); ut_ad(page_is_leaf(block->frame)); - ut_ad(block->page.id.page_no() == index->page); if (page_rec_is_supremum_low(offs)) { - cursor->rel_pos = BTR_PCUR_AFTER_LAST_IN_TREE; } else { +before_first: cursor->rel_pos = BTR_PCUR_BEFORE_FIRST_IN_TREE; } @@ -147,23 +149,58 @@ btr_pcur_store_position( } if (page_rec_is_supremum_low(offs)) { - rec = page_rec_get_prev(rec); - cursor->rel_pos = BTR_PCUR_AFTER; + ut_ad(!page_rec_is_infimum(rec)); - } else if (page_rec_is_infimum_low(offs)) { + if (UNIV_UNLIKELY(rec_is_metadata(rec, index))) { + /* The table may be empty such that it only + contains a metadata record, in a leaf page + that is not the root page. */ + ut_ad(index->is_primary()); + ut_ad(block->page.id.page_no() != index->page); + goto empty_table; + } + cursor->rel_pos = BTR_PCUR_AFTER; + } else if (page_rec_is_infimum_low(offs)) { rec = page_rec_get_next(rec); + if (rec_is_metadata(rec, index)) { + ut_ad(!page_has_prev(block->frame)); + rec = page_rec_get_next(rec); + if (page_rec_is_supremum(rec)) { + goto before_first; + } + } + cursor->rel_pos = BTR_PCUR_BEFORE; } else { cursor->rel_pos = BTR_PCUR_ON; } - cursor->old_rec = dict_index_copy_rec_order_prefix( - index, rec, &cursor->old_n_fields, - &cursor->old_rec_buf, &cursor->buf_size); + UNIV_PREFETCH_R(rec); + + if (index->is_ibuf()) { + cursor->old_n_fields = uint16(rec_get_n_fields_old(rec)); + } else { + cursor->old_n_fields = static_cast<uint16>( + dict_index_get_n_unique_in_tree(index)); + if (index->is_spatial() && !page_rec_is_leaf(rec)) { + ut_ad(dict_index_get_n_unique_in_tree_nonleaf(index) + == DICT_INDEX_SPATIAL_NODEPTR_SIZE); + /* For R-tree, we have to compare + the child page numbers as well. 
*/ + cursor->old_n_fields + = DICT_INDEX_SPATIAL_NODEPTR_SIZE + 1; + } + } + + cursor->old_n_core_fields = index->n_core_fields; + cursor->old_rec = rec_copy_prefix_to_buf(rec, index, + cursor->old_n_fields, + &cursor->old_rec_buf, + &cursor->buf_size); cursor->block_when_stored.store(block); @@ -195,6 +232,7 @@ btr_pcur_copy_stored_position( + (pcur_donate->old_rec - pcur_donate->old_rec_buf); } + pcur_receive->old_n_core_fields = pcur_donate->old_n_core_fields; pcur_receive->old_n_fields = pcur_donate->old_n_fields; } @@ -286,6 +324,8 @@ btr_pcur_restore_position_func( } ut_a(cursor->old_rec); + ut_a(cursor->old_n_core_fields); + ut_a(cursor->old_n_core_fields <= index->n_core_fields); ut_a(cursor->old_n_fields); switch (latch_mode) { @@ -319,11 +359,16 @@ btr_pcur_restore_position_func( rec_offs_init(offsets2_); heap = mem_heap_create(256); + ut_ad(cursor->old_n_core_fields + == index->n_core_fields); + offsets1 = rec_get_offsets( - cursor->old_rec, index, offsets1, true, + cursor->old_rec, index, offsets1, + cursor->old_n_core_fields, cursor->old_n_fields, &heap); offsets2 = rec_get_offsets( - rec, index, offsets2, true, + rec, index, offsets2, + index->n_core_fields, cursor->old_n_fields, &heap); ut_ad(!cmp_rec_rec(cursor->old_rec, @@ -348,8 +393,14 @@ btr_pcur_restore_position_func( heap = mem_heap_create(256); - tuple = dict_index_build_data_tuple(cursor->old_rec, index, true, - cursor->old_n_fields, heap); + tuple = dtuple_create(heap, cursor->old_n_fields); + + dict_index_copy_types(tuple, index, cursor->old_n_fields); + + rec_copy_prefix_to_dtuple(tuple, cursor->old_rec, index, + cursor->old_n_core_fields, + cursor->old_n_fields, heap); + ut_ad(dtuple_check_typed(tuple)); /* Save the old search mode of the cursor */ old_mode = cursor->search_mode; @@ -369,9 +420,10 @@ btr_pcur_restore_position_func( mode = PAGE_CUR_UNSUPP; } - btr_pcur_open_with_no_init_func(index, tuple, mode, latch_mode, cursor, + btr_pcur_open_with_no_init_func(index, tuple, mode, latch_mode, + cursor, #ifdef BTR_CUR_HASH_ADAPT - 0, + NULL, #endif /* BTR_CUR_HASH_ADAPT */ file, line, mtr); @@ -387,7 +439,8 @@ btr_pcur_restore_position_func( && btr_pcur_is_on_user_rec(cursor) && !cmp_dtuple_rec(tuple, btr_pcur_get_rec(cursor), rec_get_offsets(btr_pcur_get_rec(cursor), - index, offsets, true, + index, offsets, + index->n_core_fields, ULINT_UNDEFINED, &heap))) { /* We have to store the NEW value for the modify clock, @@ -508,7 +561,7 @@ btr_pcur_move_backward_from_page( ut_ad(cursor->latch_mode != BTR_NO_LATCHES); ut_ad(btr_pcur_is_before_first_on_page(cursor)); - ut_ad(!btr_pcur_is_before_first_in_tree(cursor, mtr)); + ut_ad(!btr_pcur_is_before_first_in_tree(cursor)); latch_mode = cursor->latch_mode; @@ -580,7 +633,7 @@ btr_pcur_move_to_prev( if (btr_pcur_is_before_first_on_page(cursor)) { - if (btr_pcur_is_before_first_in_tree(cursor, mtr)) { + if (btr_pcur_is_before_first_in_tree(cursor)) { return(FALSE); } diff --git a/storage/innobase/btr/btr0scrub.cc b/storage/innobase/btr/btr0scrub.cc index 60d9c90c310..6a550739121 100644 --- a/storage/innobase/btr/btr0scrub.cc +++ b/storage/innobase/btr/btr0scrub.cc @@ -133,7 +133,7 @@ btr_scrub_lock_dict_func(ulint space_id, bool lock_to_close_table, if (lock_to_close_table) { } else if (fil_space_t* space = fil_space_acquire(space_id)) { bool stopping = space->is_stopping(); - fil_space_release(space); + space->release(); if (stopping) { return false; } @@ -209,7 +209,7 @@ btr_scrub_table_close_for_thread( btr_scrub_table_close(scrub_data->current_table); 
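btr_pcur_store_position() now copies a prefix of the current record together with two counts: old_n_fields (how many fields the prefix has) and the new old_n_core_fields (how many core fields the index had when the copy was taken), and the restore path passes both to rec_copy_prefix_to_dtuple() so the prefix is parsed with the right geometry even after a later instant ADD COLUMN. When the optimistic check fails, that prefix becomes the search tuple for a fresh descent. A condensed standalone model of the store/restore contract (invented types, not the InnoDB structures):

#include <cstdint>
#include <string>
#include <vector>

// What the persistent cursor remembers about its position.
struct stored_position {
    std::vector<std::string> prefix;    // copy of the first old_n_fields fields
    std::uint16_t old_n_fields;
    std::uint16_t old_n_core_fields;    // core-field count at copy time
    std::uint64_t modify_clock;         // page version at copy time
};

// Optimistic restore: if the page was not modified since the position was
// stored, the cached position is still valid; otherwise re-search the tree
// using the stored prefix as the search tuple.
template <typename SearchFn>
bool restore_position(const stored_position& pos,
                      std::uint64_t current_modify_clock,
                      SearchFn search_with_prefix)
{
    if (current_modify_clock == pos.modify_clock) {
        return true;                    // cheap path: nothing changed
    }
    search_with_prefix(pos.prefix);     // pessimistic path
    return false;
}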
mutex_exit(&dict_sys->mutex); } - fil_space_release(space); + space->release(); } scrub_data->current_table = NULL; @@ -423,7 +423,7 @@ btr_pessimistic_scrub( * so that splitting won't fail due to this */ ulint n_extents = 3; ulint n_reserved = 0; - if (!fsp_reserve_free_extents(&n_reserved, index->space, + if (!fsp_reserve_free_extents(&n_reserved, index->table->space, n_extents, FSP_NORMAL, mtr)) { log_scrub_failure(index, scrub_data, block, DB_OUT_OF_FILE_SPACE); @@ -432,12 +432,9 @@ btr_pessimistic_scrub( /* read block variables */ const ulint page_no = mach_read_from_4(page + FIL_PAGE_OFFSET); - const page_id_t page_id(dict_index_get_space(index), page_no); const uint32_t left_page_no = btr_page_get_prev(page); const uint32_t right_page_no = btr_page_get_next(page); - const page_id_t lpage_id(dict_index_get_space(index), left_page_no); - const page_id_t rpage_id(dict_index_get_space(index), right_page_no); - const page_size_t page_size(dict_table_page_size(index->table)); + const page_size_t page_size(index->table->space->flags); /** * When splitting page, we need X-latches on left/right brothers @@ -453,15 +450,15 @@ btr_pessimistic_scrub( mtr->release_block_at_savepoint(scrub_data->savepoint, block); buf_block_t* get_block __attribute__((unused)) = btr_block_get( - lpage_id, page_size, - RW_X_LATCH, index, mtr); + page_id_t(index->table->space_id, left_page_no), + page_size, RW_X_LATCH, index, mtr); /** * Refetch block and re-initialize page */ block = btr_block_get( - page_id, page_size, - RW_X_LATCH, index, mtr); + page_id_t(index->table->space_id, page_no), + page_size, RW_X_LATCH, index, mtr); page = buf_block_get_frame(block); @@ -474,8 +471,8 @@ btr_pessimistic_scrub( if (right_page_no != FIL_NULL) { buf_block_t* get_block __attribute__((unused))= btr_block_get( - rpage_id, page_size, - RW_X_LATCH, index, mtr); + page_id_t(index->table->space_id, right_page_no), + page_size, RW_X_LATCH, index, mtr); } /* arguments to btr_page_split_and_insert */ @@ -522,10 +519,7 @@ btr_pessimistic_scrub( mem_heap_free(heap); } - if (n_reserved > 0) { - fil_space_release_free_extents(index->space, n_reserved); - } - + index->table->space->release_free_extents(n_reserved); scrub_data->scrub_stat.page_splits++; return DB_SUCCESS; } @@ -669,7 +663,7 @@ btr_scrub_free_page( * it will be found by scrubbing thread again */ memset(buf_block_get_frame(block) + PAGE_HEADER, 0, - UNIV_PAGE_SIZE - PAGE_HEADER); + srv_page_size - PAGE_HEADER); mach_write_to_2(buf_block_get_frame(block) + FIL_PAGE_TYPE, FIL_PAGE_TYPE_ALLOCATED); @@ -748,7 +742,7 @@ btr_scrub_recheck_page( } mtr_start(mtr); - mtr_x_lock(dict_index_get_lock(scrub_data->current_index), mtr); + mtr_x_lock_index(scrub_data->current_index, mtr); /** set savepoint for X-latch of block */ scrub_data->savepoint = mtr_set_savepoint(mtr); return BTR_SCRUB_PAGE; @@ -787,13 +781,14 @@ btr_scrub_page( /* check that table/index still match now that they are loaded */ - if (scrub_data->current_table->space != scrub_data->space) { + if (!scrub_data->current_table->space + || scrub_data->current_table->space_id != scrub_data->space) { /* this is truncate table */ mtr_commit(mtr); return BTR_SCRUB_SKIP_PAGE_AND_CLOSE_TABLE; } - if (scrub_data->current_index->space != scrub_data->space) { + if (scrub_data->current_index->table != scrub_data->current_table) { /* this is truncate table */ mtr_commit(mtr); return BTR_SCRUB_SKIP_PAGE_AND_CLOSE_TABLE; diff --git a/storage/innobase/btr/btr0sea.cc b/storage/innobase/btr/btr0sea.cc index 7b80d22c778..1b8375d60fe 
100644 --- a/storage/innobase/btr/btr0sea.cc +++ b/storage/innobase/btr/btr0sea.cc @@ -45,10 +45,10 @@ Created 2/17/1996 Heikki Tuuri /** Is search system enabled. Search system is protected by array of latches. */ -char btr_search_enabled = true; +char btr_search_enabled; /** Number of adaptive hash index partition. */ -ulong btr_ahi_parts = 8; +ulong btr_ahi_parts; #ifdef UNIV_SEARCH_PERF_STAT /** Number of successful adaptive hash index lookups */ @@ -84,7 +84,74 @@ is then built on the page, assuming the global limit has been reached */ /** The global limit for consecutive potentially successful hash searches, before hash index building is started */ -#define BTR_SEARCH_BUILD_LIMIT 100 +#define BTR_SEARCH_BUILD_LIMIT 100U + +/** Compute a hash value of a record in a page. +@param[in] rec index record +@param[in] offsets return value of rec_get_offsets() +@param[in] n_fields number of complete fields to fold +@param[in] n_bytes number of bytes to fold in the last field +@param[in] index_id index tree ID +@return the hash value */ +static inline +ulint +rec_fold( + const rec_t* rec, + const rec_offs* offsets, + ulint n_fields, + ulint n_bytes, + index_id_t tree_id) +{ + ulint i; + const byte* data; + ulint len; + ulint fold; + ulint n_fields_rec; + + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_ad(rec_validate(rec, offsets)); + ut_ad(page_rec_is_leaf(rec)); + ut_ad(!page_rec_is_metadata(rec)); + ut_ad(n_fields > 0 || n_bytes > 0); + + n_fields_rec = rec_offs_n_fields(offsets); + ut_ad(n_fields <= n_fields_rec); + ut_ad(n_fields < n_fields_rec || n_bytes == 0); + + if (n_fields > n_fields_rec) { + n_fields = n_fields_rec; + } + + if (n_fields == n_fields_rec) { + n_bytes = 0; + } + + fold = ut_fold_ull(tree_id); + + for (i = 0; i < n_fields; i++) { + data = rec_get_nth_field(rec, offsets, i, &len); + + if (len != UNIV_SQL_NULL) { + fold = ut_fold_ulint_pair(fold, + ut_fold_binary(data, len)); + } + } + + if (n_bytes > 0) { + data = rec_get_nth_field(rec, offsets, i, &len); + + if (len != UNIV_SQL_NULL) { + if (len > n_bytes) { + len = n_bytes; + } + + fold = ut_fold_ulint_pair(fold, + ut_fold_binary(data, len)); + } + } + + return(fold); +} /** Determine the number of accessed key fields. @param[in] n_fields number of complete fields @@ -110,23 +177,6 @@ btr_search_get_n_fields( return(btr_search_get_n_fields(cursor->n_fields, cursor->n_bytes)); } -/********************************************************************//** -Builds a hash index on a page with the given parameters. If the page already -has a hash index with different parameters, the old hash index is removed. -If index is non-NULL, this function checks if n_fields and n_bytes are -sensible values, and does not build a hash index if not. */ -static -void -btr_search_build_page_hash_index( -/*=============================*/ - dict_index_t* index, /*!< in: index for which to build, or NULL if - not known */ - buf_block_t* block, /*!< in: index page, s- or x-latched */ - ulint n_fields,/*!< in: hash this many full fields */ - ulint n_bytes,/*!< in: hash this many bytes from the next - field */ - ibool left_side);/*!< in: hash for searches from left side? */ - /** This function should be called before reserving any btr search mutex, if the intended operation might add nodes to the search system hash table. Because of the latching order, once we have reserved the btr search system @@ -139,7 +189,7 @@ will not guarantee success. 
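The rec_fold() helper added at the top of btr0sea.cc folds the index tree id, the first n_fields complete fields of a record and at most n_bytes of the following field into a single hash value, skipping SQL NULL fields. A self-contained approximation of that structure, with std::hash standing in for ut_fold_ull()/ut_fold_binary():

#include <cstdint>
#include <functional>
#include <string>
#include <vector>

static std::size_t fold_pair(std::size_t a, std::size_t b)
{
    return a * 1000003u ^ b;            // toy combiner, not ut_fold_ulint_pair()
}

static std::size_t toy_rec_fold(const std::vector<std::string>& fields,
                                std::size_t n_fields, std::size_t n_bytes,
                                std::uint64_t tree_id)
{
    std::size_t fold = std::hash<std::uint64_t>()(tree_id);
    std::size_t i = 0;
    for (; i < n_fields && i < fields.size(); i++) {
        fold = fold_pair(fold, std::hash<std::string>()(fields[i]));
    }
    if (n_bytes > 0 && i < fields.size()) {
        // only a prefix of the next field participates in the fold
        fold = fold_pair(fold, std::hash<std::string>()(
                                   fields[i].substr(0, n_bytes)));
    }
    return fold;
}

The prefix parameters (n_fields, n_bytes, left_side) are kept per block, which is why the drop/rebuild logic later in this file recomputes every fold whenever they change.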
@param[in] index index handler */ static void -btr_search_check_free_space_in_heap(dict_index_t* index) +btr_search_check_free_space_in_heap(const dict_index_t* index) { /* Note that we peek the value of heap->free_block without reserving the latch: this is ok, because we will not guarantee that there will @@ -171,8 +221,7 @@ func_exit: /** Creates and initializes the adaptive search system at a database start. @param[in] hash_size hash table size. */ -void -btr_search_sys_create(ulint hash_size) +void btr_search_sys_create(ulint hash_size) { /* Search System is divided into n parts. Each part controls access to distinct set of hash buckets from @@ -205,6 +254,12 @@ btr_search_sys_create(ulint hash_size) /** Frees the adaptive search system at a database shutdown. */ void btr_search_sys_free() { + if (!btr_search_sys) + { + ut_ad(!btr_search_latches); + return; + } + ut_ad(btr_search_sys); ut_ad(btr_search_latches); @@ -371,9 +426,8 @@ void btr_search_disable() } /** Enable the adaptive hash search system. -@param[in] resize Flag to indicate call during buf_pool_resize() */ -void -btr_search_enable(bool resize) +@param resize whether buf_pool_resize() is the caller */ +void btr_search_enable(bool resize) { if (!resize) { buf_pool_mutex_enter_all(); @@ -424,8 +478,8 @@ btr_search_info_update_hash( ulint n_unique; int cmp; - ut_ad(!rw_lock_own_flagged(btr_get_search_latch(index), - RW_LOCK_FLAG_X | RW_LOCK_FLAG_S)); + ut_ad(!btr_search_own_any(RW_LOCK_S)); + ut_ad(!btr_search_own_any(RW_LOCK_X)); if (dict_index_is_ibuf(index)) { /* So many deletes are performed on an insert buffer tree @@ -529,17 +583,12 @@ block->n_hash_helps, n_fields, n_bytes, left_side are NOT protected by any semaphore, to save CPU time! Do not assume the fields are consistent. @return TRUE if building a (new) hash index on the block is recommended @param[in,out] info search info -@param[in,out] block buffer block -@param[in] cursor cursor */ +@param[in,out] block buffer block */ static -ibool -btr_search_update_block_hash_info( - btr_search_t* info, - buf_block_t* block, - const btr_cur_t* cursor) +bool +btr_search_update_block_hash_info(btr_search_t* info, buf_block_t* block) { - ut_ad(!rw_lock_own_flagged(btr_get_search_latch(cursor->index), - RW_LOCK_FLAG_X | RW_LOCK_FLAG_S)); + ut_ad(!btr_search_own_any()); ut_ad(rw_lock_own_flagged(&block->lock, RW_LOCK_FLAG_X | RW_LOCK_FLAG_S)); @@ -586,11 +635,11 @@ btr_search_update_block_hash_info( /* Build a new hash index on the page */ - return(TRUE); + return(true); } } - return(FALSE); + return(false); } /** Updates a hash node reference when it has been unsuccessfully used in a @@ -630,8 +679,8 @@ btr_search_update_hash_ref( return; } - ut_ad(block->page.id.space() == index->space); - ut_a(index == cursor->index); + ut_ad(block->page.id.space() == index->table->space_id); + ut_ad(index == cursor->index); ut_ad(!dict_index_is_ibuf(index)); rw_lock_t* const latch = btr_get_search_latch(index); rw_lock_x_lock(latch); @@ -654,7 +703,8 @@ btr_search_update_hash_ref( ulint fold = rec_fold( rec, - rec_get_offsets(rec, index, offsets_, true, + rec_get_offsets(rec, index, offsets_, + index->n_core_fields, ULINT_UNDEFINED, &heap), block->curr_n_fields, block->curr_n_bytes, index->id); @@ -672,56 +722,6 @@ func_exit: rw_lock_x_unlock(latch); } -/** Updates the search info. 
-@param[in,out] info search info -@param[in] cursor cursor which was just positioned */ -void -btr_search_info_update_slow( - btr_search_t* info, - btr_cur_t* cursor) -{ - buf_block_t* block; - ibool build_index; - - ut_ad(!rw_lock_own_flagged(btr_get_search_latch(cursor->index), - RW_LOCK_FLAG_X | RW_LOCK_FLAG_S)); - - block = btr_cur_get_block(cursor); - - /* NOTE that the following two function calls do NOT protect - info or block->n_fields etc. with any semaphore, to save CPU time! - We cannot assume the fields are consistent when we return from - those functions! */ - - btr_search_info_update_hash(info, cursor); - - build_index = btr_search_update_block_hash_info(info, block, cursor); - - if (build_index || (cursor->flag == BTR_CUR_HASH_FAIL)) { - - btr_search_check_free_space_in_heap(cursor->index); - } - - if (cursor->flag == BTR_CUR_HASH_FAIL) { - /* Update the hash node reference, if appropriate */ - -#ifdef UNIV_SEARCH_PERF_STAT - btr_search_n_hash_fail++; -#endif /* UNIV_SEARCH_PERF_STAT */ - btr_search_update_hash_ref(info, block, cursor); - } - - if (build_index) { - /* Note that since we did not protect block->n_fields etc. - with any semaphore, the values can be inconsistent. We have - to check inside the function call that they make sense. */ - btr_search_build_page_hash_index(cursor->index, block, - block->n_fields, - block->n_bytes, - block->left_side); - } -} - /** Checks if a guessed position for a tree cursor is right. Note that if mode is PAGE_CUR_LE, which is used in inserts, and the function returns TRUE, then cursor->up_match and cursor->low_match both have sensible values. @@ -735,16 +735,14 @@ TRUE, then cursor->up_match and cursor->low_match both have sensible values. previous record to check our guess! @param[in] tuple data tuple @param[in] mode PAGE_CUR_L, PAGE_CUR_LE, PAGE_CUR_G, PAGE_CUR_GE -@param[in] mtr mini transaction -@return TRUE if success */ +@return whether a match was found */ static -ibool +bool btr_search_check_guess( btr_cur_t* cursor, - ibool can_only_compare_to_cursor_rec, + bool can_only_compare_to_cursor_rec, const dtuple_t* tuple, - ulint mode, - mtr_t* mtr) + ulint mode) { rec_t* rec; ulint n_unique; @@ -765,7 +763,8 @@ btr_search_check_guess( match = 0; - offsets = rec_get_offsets(rec, cursor->index, offsets, true, + offsets = rec_get_offsets(rec, cursor->index, offsets, + cursor->index->n_core_fields, n_unique, &heap); cmp = cmp_dtuple_rec_with_match(tuple, rec, offsets, &match); @@ -806,11 +805,9 @@ btr_search_check_guess( match = 0; if ((mode == PAGE_CUR_G) || (mode == PAGE_CUR_GE)) { - rec_t* prev_rec; - ut_ad(!page_rec_is_infimum(rec)); - prev_rec = page_rec_get_prev(rec); + const rec_t* prev_rec = page_rec_get_prev(rec); if (page_rec_is_infimum(prev_rec)) { success = !page_has_prev(page_align(prev_rec)); @@ -818,7 +815,8 @@ btr_search_check_guess( } offsets = rec_get_offsets(prev_rec, cursor->index, offsets, - true, n_unique, &heap); + cursor->index->n_core_fields, + n_unique, &heap); cmp = cmp_dtuple_rec_with_match( tuple, prev_rec, offsets, &match); if (mode == PAGE_CUR_GE) { @@ -826,14 +824,10 @@ btr_search_check_guess( } else { success = cmp >= 0; } - - goto exit_func; } else { - rec_t* next_rec; - ut_ad(!page_rec_is_supremum(rec)); - next_rec = page_rec_get_next(rec); + const rec_t* next_rec = page_rec_get_next(rec); if (page_rec_is_supremum(next_rec)) { if (!page_has_next(page_align(next_rec))) { @@ -845,7 +839,8 @@ btr_search_check_guess( } offsets = rec_get_offsets(next_rec, cursor->index, offsets, - true, n_unique, &heap); 
+ cursor->index->n_core_fields, + n_unique, &heap); cmp = cmp_dtuple_rec_with_match( tuple, next_rec, offsets, &match); if (mode == PAGE_CUR_LE) { @@ -893,12 +888,11 @@ both have sensible values. we assume the caller uses his search latch to protect the record! @param[out] cursor tree cursor -@param[in] has_search_latch - latch mode the caller currently has on - search system: RW_S/X_LATCH or 0 +@param[in] ahi_latch the adaptive hash index latch being held, + or NULL @param[in] mtr mini transaction -@return TRUE if succeeded */ -ibool +@return whether the search succeeded */ +bool btr_search_guess_on_hash( dict_index_t* index, btr_search_t* info, @@ -906,7 +900,7 @@ btr_search_guess_on_hash( ulint mode, ulint latch_mode, btr_cur_t* cursor, - ulint has_search_latch, + rw_lock_t* ahi_latch, mtr_t* mtr) { ulint fold; @@ -915,13 +909,16 @@ btr_search_guess_on_hash( btr_cur_t cursor2; btr_pcur_t pcur; #endif + ut_ad(!ahi_latch || rw_lock_own_flagged( + ahi_latch, RW_LOCK_FLAG_X | RW_LOCK_FLAG_S)); if (!btr_search_enabled) { - return(FALSE); + return false; } ut_ad(index && info && tuple && cursor && mtr); ut_ad(!dict_index_is_ibuf(index)); + ut_ad(!ahi_latch || ahi_latch == btr_get_search_latch(index)); ut_ad((latch_mode == BTR_SEARCH_LEAF) || (latch_mode == BTR_MODIFY_LEAF)); @@ -932,16 +929,14 @@ btr_search_guess_on_hash( any latch here! */ if (info->n_hash_potential == 0) { - - return(FALSE); + return false; } cursor->n_fields = info->n_fields; cursor->n_bytes = info->n_bytes; if (dtuple_get_n_fields(tuple) < btr_search_get_n_fields(cursor)) { - - return(FALSE); + return false; } index_id = index->id; @@ -954,34 +949,36 @@ btr_search_guess_on_hash( cursor->fold = fold; cursor->flag = BTR_CUR_HASH; - rw_lock_t* const latch = btr_get_search_latch(index); + rw_lock_t* use_latch = ahi_latch ? NULL : btr_get_search_latch(index); + const rec_t* rec; - if (!has_search_latch) { - rw_lock_s_lock(latch); + if (use_latch) { + rw_lock_s_lock(use_latch); if (!btr_search_enabled) { -fail: - if (!has_search_latch) { - rw_lock_s_unlock(latch); - } - btr_search_failure(info, cursor); - return(FALSE); + goto fail; } + } else { + ut_ad(btr_search_enabled); + ut_ad(rw_lock_own(ahi_latch, RW_LOCK_S)); } - ut_ad(rw_lock_own(latch, RW_LOCK_S)); - - const rec_t* rec = static_cast<const rec_t*>( + rec = static_cast<const rec_t*>( ha_search_and_get_data(btr_get_search_table(index), fold)); if (!rec) { - goto fail; + if (use_latch) { +fail: + rw_lock_s_unlock(use_latch); + } + + btr_search_failure(info, cursor); + return false; } buf_block_t* block = buf_block_from_ahi(rec); - if (!has_search_latch) { - + if (use_latch) { if (!buf_page_get_known_nowait( latch_mode, block, BUF_MAKE_YOUNG, __FILE__, __LINE__, mtr)) { @@ -991,7 +988,7 @@ fail: const bool fail = index != block->index && index_id == block->index->id; ut_a(!fail || block->index->freed()); - rw_lock_s_unlock(latch); + rw_lock_s_unlock(use_latch); buf_block_dbg_add_level(block, SYNC_TREE_NODE_FROM_HASH); if (UNIV_UNLIKELY(fail)) { @@ -1006,13 +1003,14 @@ fail: if (buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) { ut_ad(buf_block_get_state(block) == BUF_BLOCK_REMOVE_HASH); + fail_and_release_page: - if (!has_search_latch) { + if (!ahi_latch) { btr_leaf_page_release(block, latch_mode, mtr); } btr_search_failure(info, cursor); - return(FALSE); + return false; } ut_ad(page_rec_is_user_rec(rec)); @@ -1027,9 +1025,7 @@ fail_and_release_page: record to determine if our guess for the cursor position is right. 
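btr_search_check_guess(), which loses its mtr parameter above, validates a hash-guessed position: besides comparing the search tuple with the candidate record, it must also compare with the previous record (for PAGE_CUR_G/GE) or the next record (for PAGE_CUR_L/LE) so that the position provably matches what a full B-tree descent would return. Ignoring the page-boundary and full-unique-match shortcuts, the acceptance conditions reduce to roughly the following (cmp_* are three-way results of cmp_dtuple_rec_with_match(), negative when the tuple sorts first):

enum toy_search_mode { CUR_G, CUR_GE, CUR_L, CUR_LE };

// cmp_rec  = cmp(tuple, candidate record)
// cmp_prev = cmp(tuple, previous record)   (modes G / GE)
// cmp_next = cmp(tuple, next record)       (modes L / LE)
static bool guess_is_consistent(int cmp_rec, int cmp_prev, int cmp_next,
                                toy_search_mode mode)
{
    switch (mode) {
    case CUR_G:  return cmp_rec <  0 && cmp_prev >= 0;
    case CUR_GE: return cmp_rec <= 0 && cmp_prev >  0;
    case CUR_L:  return cmp_rec >  0 && cmp_next <= 0;
    case CUR_LE: return cmp_rec >= 0 && cmp_next <  0;
    }
    return false;
}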
*/ if (index_id != btr_page_get_index_id(block->frame) - || !btr_search_check_guess(cursor, - has_search_latch, - tuple, mode, mtr)) { + || !btr_search_check_guess(cursor, !!ahi_latch, tuple, mode)) { goto fail_and_release_page; } @@ -1045,7 +1041,7 @@ fail_and_release_page: info->last_hash_succ = FALSE; /* Currently, does not work if the following fails: */ - ut_ad(!has_search_latch); + ut_ad(!ahi_latch); btr_leaf_page_release(block, latch_mode, mtr); @@ -1078,7 +1074,7 @@ fail_and_release_page: #ifdef UNIV_SEARCH_PERF_STAT btr_search_n_succ++; #endif - if (!has_search_latch && buf_page_peek_if_too_old(&block->page)) { + if (!ahi_latch && buf_page_peek_if_too_old(&block->page)) { buf_page_make_young(&block->page); } @@ -1091,7 +1087,7 @@ fail_and_release_page: ++buf_pool->stat.n_page_gets; } - return(TRUE); + return true; } /** Drop any adaptive hash index entries that point to an index page. @@ -1100,8 +1096,7 @@ fail_and_release_page: block->buf_fix_count == 0 or it is an index page which has already been removed from the buf_pool->page_hash i.e.: it is in state BUF_BLOCK_REMOVE_HASH */ -void -btr_search_drop_page_hash_index(buf_block_t* block) +void btr_search_drop_page_hash_index(buf_block_t* block) { ulint n_fields; ulint n_bytes; @@ -1121,6 +1116,8 @@ retry: /* This debug check uses a dirty read that could theoretically cause false positives while buf_pool_clear_hash_index() is executing. */ assert_block_ahi_valid(block); + ut_ad(!btr_search_own_any(RW_LOCK_S)); + ut_ad(!btr_search_own_any(RW_LOCK_X)); if (!block->index) { return; @@ -1146,8 +1143,6 @@ retry: latch = btr_search_latches[ahi_slot]; dict_index_t* index = block->index; - ut_ad(!btr_search_own_any(RW_LOCK_S)); - ut_ad(!btr_search_own_any(RW_LOCK_X)); bool is_freed = index && index->freed(); if (is_freed) { @@ -1172,8 +1167,7 @@ retry: #endif ut_ad(btr_search_enabled); - ut_ad(index->space == FIL_NULL - || block->page.id.space() == index->space); + ut_ad(block->page.id.space() == index->table->space_id); ut_a(index_id == index->id); ut_ad(!dict_index_is_ibuf(index)); @@ -1201,6 +1195,9 @@ retry: rec = page_get_infimum_rec(page); rec = page_rec_get_next_low(rec, page_is_comp(page)); + if (rec_is_metadata(rec, index)) { + rec = page_rec_get_next_low(rec, page_is_comp(page)); + } prev_fold = 0; @@ -1209,7 +1206,7 @@ retry: while (!page_rec_is_supremum(rec)) { offsets = rec_get_offsets( - rec, index, offsets, true, + rec, index, offsets, index->n_core_fields, btr_search_get_n_fields(n_fields, n_bytes), &heap); fold = rec_fold(rec, offsets, n_fields, n_bytes, index_id); @@ -1334,6 +1331,7 @@ If index is non-NULL, this function checks if n_fields and n_bytes are sensible, and does not build a hash index if not. @param[in,out] index index for which to build. @param[in,out] block index page, s-/x- latched. 
+@param[in,out] ahi_latch the adaptive search latch @param[in] n_fields hash this many full fields @param[in] n_bytes hash this many bytes of the next field @param[in] left_side hash for searches from left side */ @@ -1342,21 +1340,19 @@ void btr_search_build_page_hash_index( dict_index_t* index, buf_block_t* block, + rw_lock_t* ahi_latch, ulint n_fields, ulint n_bytes, ibool left_side) { - hash_table_t* table; - page_t* page; - rec_t* rec; - rec_t* next_rec; + const rec_t* rec; + const rec_t* next_rec; ulint fold; ulint next_fold; ulint n_cached; ulint n_recs; ulint* folds; - rec_t** recs; - ulint i; + const rec_t** recs; mem_heap_t* heap = NULL; rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; rec_offs* offsets = offsets_; @@ -1369,8 +1365,9 @@ btr_search_build_page_hash_index( } rec_offs_init(offsets_); + ut_ad(ahi_latch == btr_get_search_latch(index)); ut_ad(index); - ut_ad(block->page.id.space() == index->space); + ut_ad(block->page.id.space() == index->table->space_id); ut_ad(!dict_index_is_ibuf(index)); ut_ad(page_is_leaf(block->frame)); @@ -1378,24 +1375,21 @@ btr_search_build_page_hash_index( RW_LOCK_FLAG_X | RW_LOCK_FLAG_S)); ut_ad(block->page.id.page_no() >= 3); - rw_lock_t* const latch = btr_get_search_latch(index); - rw_lock_s_lock(latch); + rw_lock_s_lock(ahi_latch); - if (!btr_search_enabled) { - rw_lock_s_unlock(latch); - return; - } + const bool enabled = btr_search_enabled; + const bool rebuild = enabled && block->index + && (block->curr_n_fields != n_fields + || block->curr_n_bytes != n_bytes + || block->curr_left_side != left_side); - table = btr_get_search_table(index); - page = buf_block_get_frame(block); + rw_lock_s_unlock(ahi_latch); - const bool must_drop = block->index - && ((block->curr_n_fields != n_fields) - || (block->curr_n_bytes != n_bytes) - || (block->curr_left_side != left_side)); - rw_lock_s_unlock(latch); + if (!enabled) { + return; + } - if (must_drop) { + if (rebuild) { btr_search_drop_page_hash_index(block); } @@ -1411,6 +1405,7 @@ btr_search_build_page_hash_index( return; } + page_t* page = buf_block_get_frame(block); n_recs = page_get_n_recs(page); if (n_recs == 0) { @@ -1418,20 +1413,26 @@ btr_search_build_page_hash_index( return; } + rec = page_rec_get_next_const(page_get_infimum_rec(page)); + + if (rec_is_metadata(rec, index)) { + rec = page_rec_get_next_const(rec); + if (!--n_recs) return; + } + /* Calculate and cache fold values and corresponding records into an array for fast insertion to the hash index */ - folds = (ulint*) ut_malloc_nokey(n_recs * sizeof(ulint)); - recs = (rec_t**) ut_malloc_nokey(n_recs * sizeof(rec_t*)); + folds = static_cast<ulint*>(ut_malloc_nokey(n_recs * sizeof *folds)); + recs = static_cast<const rec_t**>( + ut_malloc_nokey(n_recs * sizeof *recs)); n_cached = 0; ut_a(index->id == btr_page_get_index_id(page)); - rec = page_rec_get_next(page_get_infimum_rec(page)); - offsets = rec_get_offsets( - rec, index, offsets, true, + rec, index, offsets, index->n_core_fields, btr_search_get_n_fields(n_fields, n_bytes), &heap); ut_ad(page_rec_is_supremum(rec) @@ -1447,7 +1448,7 @@ btr_search_build_page_hash_index( } for (;;) { - next_rec = page_rec_get_next(rec); + next_rec = page_rec_get_next_const(rec); if (page_rec_is_supremum(next_rec)) { @@ -1462,7 +1463,7 @@ btr_search_build_page_hash_index( } offsets = rec_get_offsets( - next_rec, index, offsets, true, + next_rec, index, offsets, index->n_core_fields, btr_search_get_n_fields(n_fields, n_bytes), &heap); next_fold = rec_fold(next_rec, offsets, n_fields, n_bytes, index->id); @@ 
-1488,19 +1489,12 @@ btr_search_build_page_hash_index( btr_search_check_free_space_in_heap(index); - rw_lock_x_lock(latch); + rw_lock_x_lock(ahi_latch); if (!btr_search_enabled) { goto exit_func; } - table = btr_get_search_table(index); - if (block->index && ((block->curr_n_fields != n_fields) - || (block->curr_n_bytes != n_bytes) - || (block->curr_left_side != left_side))) { - goto exit_func; - } - /* This counter is decremented every time we drop page hash index entries and is incremented here. Since we can rebuild hash index for a page that is already hashed, we @@ -1509,6 +1503,10 @@ btr_search_build_page_hash_index( if (!block->index) { assert_block_ahi_empty(block); index->search_info->ref_count++; + } else if (block->curr_n_fields != n_fields + || block->curr_n_bytes != n_bytes + || block->curr_left_side != left_side) { + goto exit_func; } block->n_hash_helps = 0; @@ -1518,16 +1516,18 @@ btr_search_build_page_hash_index( block->curr_left_side = unsigned(left_side); block->index = index; - for (i = 0; i < n_cached; i++) { - - ha_insert_for_fold(table, folds[i], block, recs[i]); + { + hash_table_t* table = btr_get_search_table(index); + for (ulint i = 0; i < n_cached; i++) { + ha_insert_for_fold(table, folds[i], block, recs[i]); + } } MONITOR_INC(MONITOR_ADAPTIVE_HASH_PAGE_ADDED); MONITOR_INC_VALUE(MONITOR_ADAPTIVE_HASH_ROW_ADDED, n_cached); exit_func: assert_block_ahi_valid(block); - rw_lock_x_unlock(latch); + rw_lock_x_unlock(ahi_latch); ut_free(folds); ut_free(recs); @@ -1536,50 +1536,100 @@ exit_func: } } -/** Moves or deletes hash entries for moved records. If new_page is already -hashed, then the hash index for page, if any, is dropped. If new_page is not -hashed, and page is hashed, then a new hash index is built to new_page with the -same parameters as page (this often happens when a page is split). -@param[in,out] new_block records are copied to this page. -@param[in,out] block index page from which record are copied, and the - copied records will be deleted from this page. -@param[in,out] index record descriptor */ +/** Updates the search info. +@param[in,out] info search info +@param[in,out] cursor cursor which was just positioned */ void -btr_search_move_or_delete_hash_entries( - buf_block_t* new_block, - buf_block_t* block, - dict_index_t* index) +btr_search_info_update_slow(btr_search_t* info, btr_cur_t* cursor) { -#ifdef MYSQL_INDEX_DISABLE_AHI - if (index->disable_ahi) return; -#endif - if (!btr_search_enabled) { - return; + rw_lock_t* ahi_latch = btr_get_search_latch(cursor->index); + + ut_ad(!rw_lock_own_flagged(ahi_latch, + RW_LOCK_FLAG_X | RW_LOCK_FLAG_S)); + + buf_block_t* block = btr_cur_get_block(cursor); + + /* NOTE that the following two function calls do NOT protect + info or block->n_fields etc. with any semaphore, to save CPU time! + We cannot assume the fields are consistent when we return from + those functions! */ + + btr_search_info_update_hash(info, cursor); + + bool build_index = btr_search_update_block_hash_info(info, block); + + if (build_index || (cursor->flag == BTR_CUR_HASH_FAIL)) { + + btr_search_check_free_space_in_heap(cursor->index); + } + + if (cursor->flag == BTR_CUR_HASH_FAIL) { + /* Update the hash node reference, if appropriate */ + +#ifdef UNIV_SEARCH_PERF_STAT + btr_search_n_hash_fail++; +#endif /* UNIV_SEARCH_PERF_STAT */ + + btr_search_update_hash_ref(info, block, cursor); } + if (build_index) { + /* Note that since we did not protect block->n_fields etc. + with any semaphore, the values can be inconsistent. 
We have + to check inside the function call that they make sense. */ + btr_search_build_page_hash_index(cursor->index, block, + ahi_latch, + block->n_fields, + block->n_bytes, + block->left_side); + } +} + +/** Move or delete hash entries for moved records, usually in a page split. +If new_block is already hashed, then any hash index for block is dropped. +If new_block is not hashed, and block is hashed, then a new hash index is +built to new_block with the same parameters as block. +@param[in,out] new_block destination page +@param[in,out] block source page (subject to deletion later) */ +void +btr_search_move_or_delete_hash_entries( + buf_block_t* new_block, + buf_block_t* block) +{ ut_ad(rw_lock_own(&(block->lock), RW_LOCK_X)); ut_ad(rw_lock_own(&(new_block->lock), RW_LOCK_X)); - rw_lock_t* const latch = btr_get_search_latch(index); - rw_lock_s_lock(latch); + if (!btr_search_enabled) { + return; + } - ut_a(!new_block->index || new_block->index == index); - ut_a(!block->index || block->index->id == index->id); - ut_ad(!(new_block->index || block->index) - || !dict_index_is_ibuf(index)); + dict_index_t* index = block->index; + if (!index) { + index = new_block->index; + } else { + ut_ad(!new_block->index || index == new_block->index); + } assert_block_ahi_valid(block); assert_block_ahi_valid(new_block); + rw_lock_t* ahi_latch = index ? btr_get_search_latch(index) : NULL; + if (new_block->index) { drop_exit: - rw_lock_s_unlock(latch); btr_search_drop_page_hash_index(block); return; } + if (!index) { + return; + } + + rw_lock_s_lock(ahi_latch); + if (block->index) { if (block->index != index) { + rw_lock_s_unlock(ahi_latch); goto drop_exit; } @@ -1591,27 +1641,26 @@ drop_exit: new_block->n_bytes = block->curr_n_bytes; new_block->left_side = left_side; - rw_lock_s_unlock(latch); + rw_lock_s_unlock(ahi_latch); ut_a(n_fields > 0 || n_bytes > 0); btr_search_build_page_hash_index( - index, new_block, n_fields, n_bytes, left_side); + index, new_block, ahi_latch, + n_fields, n_bytes, left_side); ut_ad(n_fields == block->curr_n_fields); ut_ad(n_bytes == block->curr_n_bytes); ut_ad(left_side == block->curr_left_side); return; } - rw_lock_s_unlock(latch); + rw_lock_s_unlock(ahi_latch); } /** Updates the page hash index when a single record is deleted from a page. 
@param[in] cursor cursor which was positioned on the record to delete using btr_cur_search_, the record is not yet deleted.*/ -void -btr_search_update_hash_on_delete(btr_cur_t* cursor) +void btr_search_update_hash_on_delete(btr_cur_t* cursor) { - hash_table_t* table; buf_block_t* block; const rec_t* rec; ulint fold; @@ -1647,58 +1696,60 @@ btr_search_update_hash_on_delete(btr_cur_t* cursor) return; } - ut_ad(block->page.id.space() == index->space); + ut_ad(block->page.id.space() == index->table->space_id); ut_a(index == cursor->index); ut_a(block->curr_n_fields > 0 || block->curr_n_bytes > 0); ut_ad(!dict_index_is_ibuf(index)); rec = btr_cur_get_rec(cursor); - fold = rec_fold(rec, rec_get_offsets(rec, index, offsets_, true, + fold = rec_fold(rec, rec_get_offsets(rec, index, offsets_, + index->n_core_fields, ULINT_UNDEFINED, &heap), block->curr_n_fields, block->curr_n_bytes, index->id); if (UNIV_LIKELY_NULL(heap)) { mem_heap_free(heap); } - rw_lock_t* const latch = btr_get_search_latch(index); - rw_lock_x_lock(latch); + rw_lock_t* ahi_latch = btr_get_search_latch(index); + + rw_lock_x_lock(ahi_latch); assert_block_ahi_valid(block); - if (!btr_search_enabled) { - rw_lock_x_unlock(latch); - return; - } + if (btr_search_enabled) { + hash_table_t* table = btr_get_search_table(index); + if (block->index) { + ut_a(block->index == index); - table = btr_get_search_table(index); - if (block->index) { - ut_a(block->index == index); + if (ha_search_and_delete_if_found(table, fold, rec)) { + MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_REMOVED); + } else { + MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_REMOVE_NOT_FOUND); + } - if (ha_search_and_delete_if_found(table, fold, rec)) { - MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_REMOVED); - } else { - MONITOR_INC( - MONITOR_ADAPTIVE_HASH_ROW_REMOVE_NOT_FOUND); + assert_block_ahi_valid(block); } - - assert_block_ahi_valid(block); } - rw_lock_x_unlock(latch); + rw_lock_x_unlock(ahi_latch); } /** Updates the page hash index when a single record is inserted on a page. @param[in] cursor cursor which was positioned to the place to insert using btr_cur_search_, and the new record has been - inserted next to the cursor. */ + inserted next to the cursor. 
+@param[in] ahi_latch the adaptive hash index latch */ void -btr_search_update_hash_node_on_insert(btr_cur_t* cursor) +btr_search_update_hash_node_on_insert(btr_cur_t* cursor, rw_lock_t* ahi_latch) { hash_table_t* table; buf_block_t* block; dict_index_t* index; rec_t* rec; + ut_ad(ahi_latch == btr_get_search_latch(cursor->index)); + ut_ad(!btr_search_own_any(RW_LOCK_S)); + ut_ad(!btr_search_own_any(RW_LOCK_X)); #ifdef MYSQL_INDEX_DISABLE_AHI if (cursor->index->disable_ahi) return; #endif @@ -1727,9 +1778,7 @@ btr_search_update_hash_node_on_insert(btr_cur_t* cursor) ut_a(cursor->index == index); ut_ad(!dict_index_is_ibuf(index)); - - rw_lock_t* const latch = btr_get_search_latch(index); - rw_lock_x_lock(latch); + rw_lock_x_lock(ahi_latch); if (!block->index || !btr_search_enabled) { @@ -1753,10 +1802,11 @@ btr_search_update_hash_node_on_insert(btr_cur_t* cursor) func_exit: assert_block_ahi_valid(block); - rw_lock_x_unlock(latch); + rw_lock_x_unlock(ahi_latch); } else { - rw_lock_x_unlock(latch); - btr_search_update_hash_on_insert(cursor); + rw_lock_x_unlock(ahi_latch); + + btr_search_update_hash_on_insert(cursor, ahi_latch); } } @@ -1764,9 +1814,10 @@ func_exit: @param[in,out] cursor cursor which was positioned to the place to insert using btr_cur_search_..., and the new record has been inserted next - to the cursor */ + to the cursor +@param[in] ahi_latch the adaptive hash index latch */ void -btr_search_update_hash_on_insert(btr_cur_t* cursor) +btr_search_update_hash_on_insert(btr_cur_t* cursor, rw_lock_t* ahi_latch) { buf_block_t* block; dict_index_t* index; @@ -1783,7 +1834,10 @@ btr_search_update_hash_on_insert(btr_cur_t* cursor) rec_offs* offsets = offsets_; rec_offs_init(offsets_); + ut_ad(ahi_latch == btr_get_search_latch(cursor->index)); ut_ad(page_is_leaf(btr_cur_get_page(cursor))); + ut_ad(!btr_search_own_any(RW_LOCK_S)); + ut_ad(!btr_search_own_any(RW_LOCK_X)); #ifdef MYSQL_INDEX_DISABLE_AHI if (cursor->index->disable_ahi) return; #endif @@ -1803,7 +1857,7 @@ btr_search_update_hash_on_insert(btr_cur_t* cursor) return; } - ut_ad(block->page.id.space() == index->space); + ut_ad(block->page.id.space() == index->table->space_id); btr_search_check_free_space_in_heap(index); rec = btr_cur_get_rec(cursor); @@ -1827,32 +1881,32 @@ btr_search_update_hash_on_insert(btr_cur_t* cursor) ins_rec = page_rec_get_next_const(rec); next_rec = page_rec_get_next_const(ins_rec); - offsets = rec_get_offsets(ins_rec, index, offsets, true, + offsets = rec_get_offsets(ins_rec, index, offsets, + index->n_core_fields, ULINT_UNDEFINED, &heap); ins_fold = rec_fold(ins_rec, offsets, n_fields, n_bytes, index->id); if (!page_rec_is_supremum(next_rec)) { offsets = rec_get_offsets( - next_rec, index, offsets, true, + next_rec, index, offsets, index->n_core_fields, btr_search_get_n_fields(n_fields, n_bytes), &heap); next_fold = rec_fold(next_rec, offsets, n_fields, n_bytes, index->id); } - rw_lock_t* const latch = btr_get_search_latch(index); - /* We must not look up "table" before acquiring the latch. */ + /* We must not look up "table" before acquiring ahi_latch. 
*/ hash_table_t* table = NULL; bool locked = false; - if (!page_rec_is_infimum(rec)) { + if (!page_rec_is_infimum(rec) && !rec_is_metadata(rec, index)) { offsets = rec_get_offsets( - rec, index, offsets, true, + rec, index, offsets, index->n_core_fields, btr_search_get_n_fields(n_fields, n_bytes), &heap); fold = rec_fold(rec, offsets, n_fields, n_bytes, index->id); } else { - locked = left_side; - if (locked) { - rw_lock_x_lock(latch); + if (left_side) { + locked = true; + rw_lock_x_lock(ahi_latch); if (!btr_search_enabled || !block->index) { goto function_exit; @@ -1869,7 +1923,7 @@ btr_search_update_hash_on_insert(btr_cur_t* cursor) if (!locked) { locked = true; - rw_lock_x_lock(latch); + rw_lock_x_lock(ahi_latch); if (!btr_search_enabled || !block->index) { goto function_exit; @@ -1891,7 +1945,7 @@ check_next_rec: if (!left_side) { if (!locked) { locked = true; - rw_lock_x_lock(latch); + rw_lock_x_lock(ahi_latch); if (!btr_search_enabled || !block->index) { goto function_exit; @@ -1909,7 +1963,7 @@ check_next_rec: if (ins_fold != next_fold) { if (!locked) { locked = true; - rw_lock_x_lock(latch); + rw_lock_x_lock(ahi_latch); if (!btr_search_enabled || !block->index) { goto function_exit; @@ -1930,8 +1984,9 @@ function_exit: mem_heap_free(heap); } if (locked) { - rw_lock_x_unlock(latch); + rw_lock_x_unlock(ahi_latch); } + ut_ad(!rw_lock_own(ahi_latch, RW_LOCK_X)); } #if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG @@ -2045,12 +2100,14 @@ btr_search_hash_table_validate(ulint hash_table_id) } ut_ad(!dict_index_is_ibuf(block->index)); - ut_ad(block->page.id.space() == block->index->space); + ut_ad(block->page.id.space() + == block->index->table->space_id); page_index_id = btr_page_get_index_id(block->frame); offsets = rec_get_offsets( - node->data, block->index, offsets, true, + node->data, block->index, offsets, + block->index->n_core_fields, btr_search_get_n_fields(block->curr_n_fields, block->curr_n_bytes), &heap); diff --git a/storage/innobase/buf/buf0buddy.cc b/storage/innobase/buf/buf0buddy.cc index 80d82b9e78c..593adf700b3 100644 --- a/storage/innobase/buf/buf0buddy.cc +++ b/storage/innobase/buf/buf0buddy.cc @@ -73,10 +73,6 @@ list. This value is stamped at BUF_BUDDY_STAMP_OFFSET offset */ value by the consumer of the block */ #define BUF_BUDDY_STAMP_NONFREE 0XFFFFFFFFUL -#if BUF_BUDDY_STAMP_FREE >= BUF_BUDDY_STAMP_NONFREE -# error "BUF_BUDDY_STAMP_FREE >= BUF_BUDDY_STAMP_NONFREE" -#endif - /** Return type of buf_buddy_is_free() */ enum buf_buddy_state_t { BUF_BUDDY_STATE_FREE, /*!< If the buddy to completely free */ @@ -109,6 +105,7 @@ buf_buddy_stamp_is_free( /*====================*/ const buf_buddy_free_t* buf) /*!< in: block to check */ { + compile_time_assert(BUF_BUDDY_STAMP_FREE < BUF_BUDDY_STAMP_NONFREE); return(mach_read_from_4(buf->stamp.bytes + BUF_BUDDY_STAMP_OFFSET) == BUF_BUDDY_STAMP_FREE); } @@ -133,13 +130,12 @@ buf_buddy_stamp_free( Stamps a buddy nonfree. 
@param[in,out] buf block to stamp @param[in] i block size */ -#define buf_buddy_stamp_nonfree(buf, i) do { \ - buf_buddy_mem_invalid(buf, i); \ - memset(buf->stamp.bytes + BUF_BUDDY_STAMP_OFFSET, 0xff, 4); \ -} while (0) -#if BUF_BUDDY_STAMP_NONFREE != 0xffffffff -# error "BUF_BUDDY_STAMP_NONFREE != 0xffffffff" -#endif +static inline void buf_buddy_stamp_nonfree(buf_buddy_free_t* buf, ulint i) +{ + buf_buddy_mem_invalid(buf, i); + compile_time_assert(BUF_BUDDY_STAMP_NONFREE == 0xffffffffU); + memset(buf->stamp.bytes + BUF_BUDDY_STAMP_OFFSET, 0xff, 4); +} /**********************************************************************//** Get the offset of the buddy of a compressed page frame. @@ -155,7 +151,7 @@ buf_buddy_get( ut_ad(size >= BUF_BUDDY_LOW); ut_ad(BUF_BUDDY_LOW <= UNIV_ZIP_SIZE_MIN); ut_ad(size < BUF_BUDDY_HIGH); - ut_ad(BUF_BUDDY_HIGH == UNIV_PAGE_SIZE); + ut_ad(BUF_BUDDY_HIGH == srv_page_size); ut_ad(!ut_align_offset(page, size)); if (((ulint) page) & size) { @@ -369,7 +365,7 @@ buf_buddy_alloc_zip( } /**********************************************************************//** -Deallocate a buffer frame of UNIV_PAGE_SIZE. */ +Deallocate a buffer frame of srv_page_size. */ static void buf_buddy_block_free( @@ -383,7 +379,7 @@ buf_buddy_block_free( ut_ad(buf_pool_mutex_own(buf_pool)); ut_ad(!mutex_own(&buf_pool->zip_mutex)); - ut_a(!ut_align_offset(buf, UNIV_PAGE_SIZE)); + ut_a(!ut_align_offset(buf, srv_page_size)); HASH_SEARCH(hash, buf_pool->zip_hash, fold, buf_page_t*, bpage, ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_MEMORY @@ -397,9 +393,7 @@ buf_buddy_block_free( HASH_DELETE(buf_page_t, hash, buf_pool->zip_hash, fold, bpage); ut_d(memset(buf, 0, srv_page_size)); -#ifdef HAVE_valgrind_or_MSAN MEM_UNDEFINED(buf, srv_page_size); -#endif /* HAVE_valgrind_or_MSAN */ block = (buf_block_t*) bpage; buf_page_mutex_enter(block); @@ -427,7 +421,7 @@ buf_buddy_block_register( buf_block_set_state(block, BUF_BLOCK_MEMORY); ut_a(block->frame); - ut_a(!ut_align_offset(block->frame, UNIV_PAGE_SIZE)); + ut_a(!ut_align_offset(block->frame, srv_page_size)); ut_ad(!block->page.in_page_hash); ut_ad(!block->page.in_zip_hash); @@ -484,8 +478,8 @@ buf_buddy_alloc_low( buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */ ulint i, /*!< in: index of buf_pool->zip_free[], or BUF_BUDDY_SIZES */ - ibool* lru) /*!< in: pointer to a variable that - will be assigned TRUE if storage was + bool* lru) /*!< in: pointer to a variable that + will be assigned true if storage was allocated from the LRU list and buf_pool->mutex was temporarily released */ @@ -516,7 +510,7 @@ buf_buddy_alloc_low( /* Try replacing an uncompressed page in the buffer pool. */ buf_pool_mutex_exit(buf_pool); block = buf_LRU_get_free_block(buf_pool); - *lru = TRUE; + *lru = true; buf_pool_mutex_enter(buf_pool); alloc_big: @@ -758,7 +752,7 @@ func_exit: @param[in] buf_pool buffer pool instance @param[in] buf block to be reallocated, must be pointed to by the buffer pool -@param[in] size block size, up to UNIV_PAGE_SIZE +@param[in] size block size, up to srv_page_size @retval false if failed because of no free blocks. */ bool buf_buddy_realloc( diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc index 070789288b6..4f30417e239 100644 --- a/storage/innobase/buf/buf0buf.cc +++ b/storage/innobase/buf/buf0buf.cc @@ -264,8 +264,8 @@ reachable via buf_pool->chunks[]. 
 The chains of free memory blocks (buf_pool->zip_free[]) are used by
 the buddy allocator (buf0buddy.cc) to keep track of currently unused
-memory blocks of size sizeof(buf_page_t)..UNIV_PAGE_SIZE / 2. These
-blocks are inside the UNIV_PAGE_SIZE-sized memory blocks of type
+memory blocks of size sizeof(buf_page_t)..srv_page_size / 2. These
+blocks are inside the srv_page_size-sized memory blocks of type
 BUF_BLOCK_MEMORY that the buddy allocator requests from the buffer
 pool. The buddy allocator is solely used for allocating control
 blocks for compressed pages (buf_page_t) and compressed page frames.
@@ -494,7 +494,7 @@ static bool buf_tmp_page_decrypt(byte* tmp_frame, byte* src_frame)
 @return whether the operation was successful */
 static bool buf_page_decrypt_after_read(buf_page_t* bpage, fil_space_t* space)
 {
-        ut_ad(space->n_pending_ios > 0);
+        ut_ad(space->pending_io());
         ut_ad(space->id == bpage->id.space());
 
         byte* dst_frame = bpage->zip.data ? bpage->zip.data :
@@ -544,7 +544,7 @@ decompress_with_slot:
 
                 slot->release();
                 ut_ad(!write_size || fil_page_type_validate(dst_frame));
-                ut_ad(space->n_pending_ios > 0);
+                ut_ad(space->pending_io());
                 return write_size != 0;
         }
 
@@ -587,13 +587,10 @@ decrypt_failed:
                 goto decompress;
         }
 
-        ut_ad(space->n_pending_ios > 0);
+        ut_ad(space->pending_io());
         return true;
 }
 
-/* prototypes for new functions added to ha_innodb.cc */
-trx_t* innobase_get_trx();
-
 /********************************************************************//**
 Gets the smallest oldest_modification lsn for any page in the pool.
 Returns zero if all modified pages have been flushed to disk.
@@ -693,7 +690,8 @@ buf_get_total_list_size_in_bytes(
                 for statistics purpose */
                 buf_pools_list_size->LRU_bytes += buf_pool->stat.LRU_bytes;
                 buf_pools_list_size->unzip_LRU_bytes +=
-                        UT_LIST_GET_LEN(buf_pool->unzip_LRU) * UNIV_PAGE_SIZE;
+                        UT_LIST_GET_LEN(buf_pool->unzip_LRU)
+                        << srv_page_size_shift;
                 buf_pools_list_size->flush_list_bytes +=
                         buf_pool->stat.flush_list_bytes;
         }
@@ -1063,9 +1061,7 @@ buf_page_is_corrupted(
         checksum_field2 = mach_read_from_4(
                 read_buf + page_size.logical() - FIL_PAGE_END_LSN_OLD_CHKSUM);
 
-#if FIL_PAGE_LSN % 8
-#error "FIL_PAGE_LSN must be 64 bit aligned"
-#endif
+        compile_time_assert(!(FIL_PAGE_LSN % 8));
 
         /* A page filled with NUL bytes is considered not corrupted. The
         FIL_PAGE_FILE_FLUSH_LSN field may be written nonzero for
@@ -1223,6 +1219,57 @@ buf_page_is_corrupted(
 }
 
 #ifndef UNIV_INNOCHECKSUM
+
+#if defined(DBUG_OFF) && defined(HAVE_MADVISE) && defined(MADV_DODUMP)
+/** Enable buffers to be dumped to core files
+
+A convience function, not called anyhwere directly however
+it is left available for gdb or any debugger to call
+in the event that you want all of the memory to be dumped
+to a core file.
+
+Returns number of errors found in madvise calls.
*/ +int +buf_madvise_do_dump() +{ + int ret= 0; + buf_pool_t* buf_pool; + buf_chunk_t* chunk; + + /* mirrors allocation in log_t::create() */ + if (log_sys.buf) { + ret += madvise(log_sys.buf, + srv_log_buffer_size, + MADV_DODUMP); + ret += madvise(log_sys.flush_buf, + srv_log_buffer_size, + MADV_DODUMP); + } + /* mirrors recv_sys_init() */ + if (recv_sys->buf) + { + ret+= madvise(recv_sys->buf, recv_sys->len, MADV_DODUMP); + } + + buf_pool_mutex_enter_all(); + + for (ulong i= 0; i < srv_buf_pool_instances; i++) + { + buf_pool = buf_pool_from_array(i); + chunk = buf_pool->chunks; + + for (int n = buf_pool->n_chunks; n--; chunk++) + { + ret+= madvise(chunk->mem, chunk->mem_size(), MADV_DODUMP); + } + } + + buf_pool_mutex_exit_all(); + + return ret; +} +#endif + /** Dump a page to stderr. @param[in] read_buf database page @param[in] page_size page size */ @@ -1344,20 +1391,10 @@ buf_page_print(const byte* read_buf, const page_size_t& page_size) read_buf + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); } - if (mach_read_from_2(read_buf + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE) - == TRX_UNDO_INSERT) { - fprintf(stderr, - "InnoDB: Page may be an insert undo log page\n"); - } else if (mach_read_from_2(read_buf + TRX_UNDO_PAGE_HDR - + TRX_UNDO_PAGE_TYPE) - == TRX_UNDO_UPDATE) { - fprintf(stderr, - "InnoDB: Page may be an update undo log page\n"); - } - switch (fil_page_get_type(read_buf)) { index_id_t index_id; case FIL_PAGE_INDEX: + case FIL_PAGE_TYPE_INSTANT: case FIL_PAGE_RTREE: index_id = btr_page_get_index_id(read_buf); ib::info() << "Page may be an index page where" @@ -1371,6 +1408,9 @@ buf_page_print(const byte* read_buf, const page_size_t& page_size) << " in table " << index->table->name; } break; + case FIL_PAGE_UNDO_LOG: + fputs("InnoDB: Page may be an undo log page\n", stderr); + break; case FIL_PAGE_INODE: fputs("InnoDB: Page may be an 'inode' page\n", stderr); break; @@ -1556,15 +1596,16 @@ buf_chunk_init( /* Round down to a multiple of page size, although it already should be. */ - mem_size = ut_2pow_round(mem_size, UNIV_PAGE_SIZE); + mem_size = ut_2pow_round<ulint>(mem_size, srv_page_size); /* Reserve space for the block descriptors. */ - mem_size += ut_2pow_round((mem_size / UNIV_PAGE_SIZE) * (sizeof *block) - + (UNIV_PAGE_SIZE - 1), UNIV_PAGE_SIZE); + mem_size += ut_2pow_round<ulint>((mem_size >> srv_page_size_shift) + * (sizeof *block) + + (srv_page_size - 1), + srv_page_size); DBUG_EXECUTE_IF("ib_buf_chunk_init_fails", return(NULL);); - chunk->mem = buf_pool->allocator.allocate_large(mem_size, - &chunk->mem_pfx); + chunk->mem = buf_pool->allocator.allocate_large_dontdump(mem_size, &chunk->mem_pfx); if (UNIV_UNLIKELY(chunk->mem == NULL)) { @@ -1594,12 +1635,12 @@ buf_chunk_init( chunk->blocks = (buf_block_t*) chunk->mem; /* Align a pointer to the first frame. Note that when - opt_large_page_size is smaller than UNIV_PAGE_SIZE, + opt_large_page_size is smaller than srv_page_size, we may allocate one fewer block than requested. When it is bigger, we may allocate more blocks than requested. */ - frame = (byte*) ut_align(chunk->mem, UNIV_PAGE_SIZE); - chunk->size = chunk->mem_pfx.m_size / UNIV_PAGE_SIZE + frame = (byte*) ut_align(chunk->mem, srv_page_size); + chunk->size = (chunk->mem_pfx.m_size >> srv_page_size_shift) - (frame != chunk->mem); /* Subtract the space needed for block descriptors. 
*/ @@ -1607,7 +1648,7 @@ buf_chunk_init( ulint size = chunk->size; while (frame < (byte*) (chunk->blocks + size)) { - frame += UNIV_PAGE_SIZE; + frame += srv_page_size; size--; } @@ -1632,7 +1673,7 @@ buf_chunk_init( ut_ad(buf_pool_from_block(block) == buf_pool); block++; - frame += UNIV_PAGE_SIZE; + frame += srv_page_size; } buf_pool_register_chunk(chunk); @@ -1858,8 +1899,8 @@ buf_pool_init_instance( &block->debug_latch)); } - buf_pool->allocator.deallocate_large( - chunk->mem, &chunk->mem_pfx); + buf_pool->allocator.deallocate_large_dodump( + chunk->mem, &chunk->mem_pfx, chunk->mem_size()); } ut_free(buf_pool->chunks); buf_pool_mutex_exit(buf_pool); @@ -1879,7 +1920,8 @@ buf_pool_init_instance( ut_min(BUF_READ_AHEAD_PAGES, ut_2_power_up(buf_pool->curr_size / BUF_READ_AHEAD_PORTION)); - buf_pool->curr_pool_size = buf_pool->curr_size * UNIV_PAGE_SIZE; + buf_pool->curr_pool_size = buf_pool->curr_size + << srv_page_size_shift; buf_pool->old_size = buf_pool->curr_size; buf_pool->n_chunks_new = buf_pool->n_chunks; @@ -2009,8 +2051,8 @@ buf_pool_free_instance( ut_d(rw_lock_free(&block->debug_latch)); } - buf_pool->allocator.deallocate_large( - chunk->mem, &chunk->mem_pfx); + buf_pool->allocator.deallocate_large_dodump( + chunk->mem, &chunk->mem_pfx, chunk->mem_size()); } for (ulint i = BUF_FLUSH_LRU; i < BUF_FLUSH_N_TYPES; ++i) { @@ -2326,7 +2368,7 @@ buf_frame_will_withdrawn( while (chunk < echunk) { if (ptr >= chunk->blocks->frame && ptr < (chunk->blocks + chunk->size - 1)->frame - + UNIV_PAGE_SIZE) { + + srv_page_size) { return(true); } ++chunk; @@ -2643,7 +2685,7 @@ buf_pool_resize() ut_ad(srv_buf_pool_chunk_unit > 0); new_instance_size = srv_buf_pool_size / srv_buf_pool_instances; - new_instance_size /= UNIV_PAGE_SIZE; + new_instance_size >>= srv_page_size_shift; buf_resize_status("Resizing buffer pool from " ULINTPF " to " ULINTPF " (unit=" ULINTPF ").", @@ -2662,7 +2704,8 @@ buf_pool_resize() buf_pool->curr_size = new_instance_size; - buf_pool->n_chunks_new = new_instance_size * UNIV_PAGE_SIZE + buf_pool->n_chunks_new = + (new_instance_size << srv_page_size_shift) / srv_buf_pool_chunk_unit; buf_pool_mutex_exit(buf_pool); @@ -2748,11 +2791,11 @@ withdraw_retry: } lock_mutex_enter(); - trx_sys_mutex_enter(); + mutex_enter(&trx_sys.mutex); bool found = false; - for (trx_t* trx = UT_LIST_GET_FIRST(trx_sys->mysql_trx_list); + for (trx_t* trx = UT_LIST_GET_FIRST(trx_sys.trx_list); trx != NULL; - trx = UT_LIST_GET_NEXT(mysql_trx_list, trx)) { + trx = UT_LIST_GET_NEXT(trx_list, trx)) { if (trx->state != TRX_STATE_NOT_STARTED && trx->mysql_thd != NULL && withdraw_started > trx->start_time) { @@ -2771,7 +2814,7 @@ withdraw_retry: stderr, trx, current_time); } } - trx_sys_mutex_exit(); + mutex_exit(&trx_sys.mutex); lock_mutex_exit(); withdraw_started = current_time; @@ -2849,6 +2892,22 @@ withdraw_retry: while (chunk < echunk) { buf_block_t* block = chunk->blocks; + /* buf_LRU_block_free_non_file_page() + invokes MEM_NOACCESS() on any blocks + that are in free_list. We must + cancel the effect of that. In MemorySanitizer, + MEM_NOACCESS() is no-op, so we must not do + anything special for it here. 
*/ +#ifdef HAVE_valgrind +# if !__has_feature(memory_sanitizer) + MEM_MAKE_DEFINED(chunk->mem, + chunk->mem_size()); +# endif +#else + MEM_MAKE_ADDRESSABLE(chunk->mem, + chunk->mem_size()); +#endif + for (ulint j = chunk->size; j--; block++) { mutex_free(&block->mutex); @@ -2858,8 +2917,8 @@ withdraw_retry: &block->debug_latch)); } - buf_pool->allocator.deallocate_large( - chunk->mem, &chunk->mem_pfx); + buf_pool->allocator.deallocate_large_dodump( + chunk->mem, &chunk->mem_pfx, chunk->mem_size()); sum_freed += chunk->size; @@ -2997,7 +3056,7 @@ calc_buf_pool_size: ut_2_power_up(buf_pool->curr_size / BUF_READ_AHEAD_PORTION)); buf_pool->curr_pool_size - = buf_pool->curr_size * UNIV_PAGE_SIZE; + = buf_pool->curr_size << srv_page_size_shift; curr_size += buf_pool->curr_pool_size; buf_pool->old_size = buf_pool->curr_size; } @@ -3049,8 +3108,9 @@ calc_buf_pool_size: buf_resize_status("Resizing also other hash tables."); /* normalize lock_sys */ - srv_lock_table_size = 5 * (srv_buf_pool_size / UNIV_PAGE_SIZE); - lock_sys_resize(srv_lock_table_size); + srv_lock_table_size = 5 + * (srv_buf_pool_size >> srv_page_size_shift); + lock_sys.resize(srv_lock_table_size); /* normalize dict_sys */ dict_resize(); @@ -3876,7 +3936,7 @@ buf_zip_decompress( if (page_zip_decompress(&block->page.zip, block->frame, TRUE)) { if (space) { - fil_space_release_for_io(space); + space->release_for_io(); } return(TRUE); } @@ -3895,7 +3955,7 @@ buf_zip_decompress( /* Copy to uncompressed storage. */ memcpy(block->frame, frame, block->page.size.physical()); if (space) { - fil_space_release_for_io(space); + space->release_for_io(); } return(TRUE); @@ -3910,13 +3970,16 @@ err_exit: if (encrypted) { ib::info() << "Row compressed page could be encrypted" " with key_version " << key_version; - dict_set_encrypted_by_space(block->page.id.space()); - } else { - dict_set_corrupted_by_space(block->page.id.space()); } if (space) { - fil_space_release_for_io(space); + if (encrypted) { + dict_set_encrypted_by_space(space); + } else { + dict_set_corrupted_by_space(space); + } + + space->release_for_io(); } return(FALSE); @@ -3947,16 +4010,16 @@ buf_block_from_ahi(const byte* ptr) chunk = (--it)->second; } - ulint offs = ptr - chunk->blocks->frame; + ulint offs = ulint(ptr - chunk->blocks->frame); - offs >>= UNIV_PAGE_SIZE_SHIFT; + offs >>= srv_page_size_shift; ut_a(offs < chunk->size); buf_block_t* block = &chunk->blocks[offs]; /* The function buf_chunk_init() invokes buf_block_init() so that - block[n].frame == block->frame + n * UNIV_PAGE_SIZE. Check it. */ + block[n].frame == block->frame + n * srv_page_size. Check it. */ ut_ad(block->frame == page_align(ptr)); /* Read the state of the block without holding a mutex. A state transition from BUF_BLOCK_FILE_PAGE to @@ -4358,9 +4421,16 @@ loop: /* Try to set table as corrupted instead of asserting. 
*/ - if (page_id.space() != TRX_SYS_SPACE && - dict_set_corrupted_by_space(page_id.space())) { - return (NULL); + if (page_id.space() == TRX_SYS_SPACE) { + } else if (page_id.space() == SRV_TMP_SPACE_ID) { + } else if (fil_space_t* space + = fil_space_acquire_for_io( + page_id.space())) { + bool set = dict_set_corrupted_by_space(space); + space->release_for_io(); + if (set) { + return NULL; + } } ib::fatal() << "Unable to read page " << page_id @@ -4373,9 +4443,7 @@ loop: } #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG - ut_a(fsp_skip_sanity_check(page_id.space()) - || ++buf_dbg_counter % 5771 - || buf_validate()); + ut_a(++buf_dbg_counter % 5771 || buf_validate()); #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ goto loop; } else { @@ -4761,9 +4829,7 @@ evict_from_pool: } #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG - ut_a(fsp_skip_sanity_check(page_id.space()) - || ++buf_dbg_counter % 5771 - || buf_validate()); + ut_a(++buf_dbg_counter % 5771 || buf_validate()); ut_a(buf_block_get_state(fix_block) == BUF_BLOCK_FILE_PAGE); #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ @@ -4925,9 +4991,7 @@ buf_page_optimistic_get( mtr_memo_push(mtr, block, fix_type); #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG - ut_a(fsp_skip_sanity_check(block->page.id.space()) - || ++buf_dbg_counter % 5771 - || buf_validate()); + ut_a(++buf_dbg_counter % 5771 || buf_validate()); ut_a(block->page.buf_fix_count > 0); ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ @@ -5122,9 +5186,7 @@ buf_page_try_get_func( mtr_memo_push(mtr, block, fix_type); #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG - ut_a(fsp_skip_sanity_check(block->page.id.space()) - || ++buf_dbg_counter % 5771 - || buf_validate()); + ut_a(++buf_dbg_counter % 5771 || buf_validate()); ut_a(block->page.buf_fix_count > 0); ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ @@ -5261,7 +5323,7 @@ buf_page_init_for_read( buf_page_t* watch_page; rw_lock_t* hash_lock; mtr_t mtr; - ibool lru = FALSE; + bool lru = false; void* data; buf_pool_t* buf_pool = buf_pool_get(page_id); @@ -5643,7 +5705,7 @@ loop: if (page_size.is_compressed()) { void* data; - ibool lru; + bool lru; /* Prevent race conditions during buf_buddy_alloc(), which may release and reacquire buf_pool->mutex, @@ -5735,13 +5797,14 @@ buf_page_monitor( switch (fil_page_get_type(frame)) { ulint level; - + case FIL_PAGE_TYPE_INSTANT: case FIL_PAGE_INDEX: case FIL_PAGE_RTREE: - level = btr_page_get_level_low(frame); + level = btr_page_get_level(frame); /* Check if it is an index page for insert buffer */ - if (btr_page_get_index_id(frame) + if (fil_page_get_type(frame) == FIL_PAGE_INDEX + && btr_page_get_index_id(frame) == (index_id_t)(DICT_IBUF_ID_MIN + IBUF_SPACE_ID)) { if (level == 0) { counter = MONITOR_RW_COUNTER( @@ -5826,9 +5889,9 @@ static void buf_mark_space_corrupt(buf_page_t* bpage, const fil_space_t& space) are marked unusable later e.g. in ::open(). */ if (!space.crypt_data || space.crypt_data->type == CRYPT_SCHEME_UNENCRYPTED) { - dict_set_corrupted_by_space(bpage->id.space()); + dict_set_corrupted_by_space(&space); } else { - dict_set_encrypted_by_space(bpage->id.space()); + dict_set_encrypted_by_space(&space); } } @@ -5892,7 +5955,7 @@ after decryption normal page checksum does not match. 
@retval DB_TABLESPACE_DELETED if accessed tablespace is not found */ static dberr_t buf_page_check_corrupt(buf_page_t* bpage, fil_space_t* space) { - ut_ad(space->n_pending_ios > 0); + ut_ad(space->pending_io()); byte* dst_frame = (bpage->zip.data) ? bpage->zip.data : ((buf_block_t*) bpage)->frame; @@ -6000,7 +6063,7 @@ buf_page_io_complete(buf_page_t* bpage, bool dblwr, bool evict) my_atomic_addlint(&buf_pool->n_pend_unzip, 1); ibool ok = buf_zip_decompress((buf_block_t*) bpage, FALSE); - my_atomic_addlint(&buf_pool->n_pend_unzip, -1); + my_atomic_addlint(&buf_pool->n_pend_unzip, ulint(-1)); if (!ok) { ib::info() << "Page " @@ -6054,7 +6117,7 @@ database_corrupted: buf_corrupt_page_release(bpage, space); ib::info() << "Simulated IMPORT " "corruption"; - fil_space_release_for_io(space); + space->release_for_io(); return(err); } err = DB_SUCCESS; @@ -6101,7 +6164,7 @@ database_corrupted: } buf_corrupt_page_release(bpage, space); - fil_space_release_for_io(space); + space->release_for_io(); return(err); } } @@ -6119,7 +6182,7 @@ database_corrupted: recv_recover_corrupt_page(corrupt_page_id); } - fil_space_release_for_io(space); + space->release_for_io(); return err; } @@ -6142,7 +6205,7 @@ database_corrupted: bpage->size); } - fil_space_release_for_io(space); + space->release_for_io(); } else { /* io_type == BUF_IO_WRITE */ if (bpage->slot) { @@ -7360,7 +7423,7 @@ buf_page_encrypt_before_write( byte* src_frame) { ut_ad(space->id == bpage->id.space()); - bpage->real_size = UNIV_PAGE_SIZE; + bpage->real_size = srv_page_size; fil_page_type_validate(src_frame); @@ -7426,7 +7489,7 @@ not_compressed: src_frame, dst_frame); } - bpage->real_size = UNIV_PAGE_SIZE; + bpage->real_size = srv_page_size; slot->out_buf = dst_frame = tmp; ut_d(fil_page_type_validate(tmp)); diff --git a/storage/innobase/buf/buf0checksum.cc b/storage/innobase/buf/buf0checksum.cc index 9e81b0384c6..70ad5ed600b 100644 --- a/storage/innobase/buf/buf0checksum.cc +++ b/storage/innobase/buf/buf0checksum.cc @@ -112,7 +112,7 @@ buf_calc_page_new_checksum(const byte* page) FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION - FIL_PAGE_OFFSET) + ut_fold_binary(page + FIL_PAGE_DATA, - UNIV_PAGE_SIZE - FIL_PAGE_DATA + srv_page_size - FIL_PAGE_DATA - FIL_PAGE_END_LSN_OLD_CHKSUM); return(static_cast<uint32_t>(checksum)); } diff --git a/storage/innobase/buf/buf0dblwr.cc b/storage/innobase/buf/buf0dblwr.cc index 30f24d57391..973f780c16a 100644 --- a/storage/innobase/buf/buf0dblwr.cc +++ b/storage/innobase/buf/buf0dblwr.cc @@ -150,11 +150,11 @@ buf_dblwr_init( ut_zalloc_nokey(buf_size * sizeof(bool))); buf_dblwr->write_buf_unaligned = static_cast<byte*>( - ut_malloc_nokey((1 + buf_size) * UNIV_PAGE_SIZE)); + ut_malloc_nokey((1 + buf_size) << srv_page_size_shift)); buf_dblwr->write_buf = static_cast<byte*>( ut_align(buf_dblwr->write_buf_unaligned, - UNIV_PAGE_SIZE)); + srv_page_size)); buf_dblwr->buf_block_arr = static_cast<buf_page_t**>( ut_zalloc_nokey(buf_size * sizeof(void*))); @@ -200,24 +200,19 @@ start_again: buf_dblwr_being_created = FALSE; return(true); } else { - fil_space_t* space = fil_space_acquire(TRX_SYS_SPACE); - const bool fail = UT_LIST_GET_FIRST(space->chain)->size - < 3 * FSP_EXTENT_SIZE; - fil_space_release(space); - - if (fail) { + if (UT_LIST_GET_FIRST(fil_system.sys_space->chain)->size + < 3 * FSP_EXTENT_SIZE) { goto too_small; } } trx_sys_block = buf_page_get( page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO), - page_size_t(srv_page_size, srv_page_size, 0), RW_X_LATCH, - &mtr); + univ_page_size, RW_X_LATCH, &mtr); - block2 = 
fseg_create(TRX_SYS_SPACE, - TRX_SYS_DOUBLEWRITE - + TRX_SYS_DOUBLEWRITE_FSEG, &mtr, trx_sys_block); + block2 = fseg_create(fil_system.sys_space, + TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG, + &mtr, false, trx_sys_block); if (block2 == NULL) { too_small: @@ -225,7 +220,8 @@ too_small: << "Cannot create doublewrite buffer: " "the first file in innodb_data_file_path" " must be at least " - << (3 * (FSP_EXTENT_SIZE * UNIV_PAGE_SIZE) >> 20) + << (3 * (FSP_EXTENT_SIZE + >> (20U - srv_page_size_shift))) << "M."; mtr.commit(); return(false); @@ -382,10 +378,10 @@ buf_dblwr_init_or_load_pages( /* We do the file i/o past the buffer pool */ unaligned_read_buf = static_cast<byte*>( - ut_malloc_nokey(3 * UNIV_PAGE_SIZE)); + ut_malloc_nokey(3U << srv_page_size_shift)); read_buf = static_cast<byte*>( - ut_align(unaligned_read_buf, UNIV_PAGE_SIZE)); + ut_align(unaligned_read_buf, srv_page_size)); /* Read the trx sys header to check if we are using the doublewrite buffer */ @@ -395,8 +391,8 @@ buf_dblwr_init_or_load_pages( err = os_file_read( read_request, - file, read_buf, TRX_SYS_PAGE_NO * UNIV_PAGE_SIZE, - UNIV_PAGE_SIZE); + file, read_buf, TRX_SYS_PAGE_NO << srv_page_size_shift, + srv_page_size); if (err != DB_SUCCESS) { @@ -444,8 +440,8 @@ buf_dblwr_init_or_load_pages( /* Read the pages from the doublewrite buffer to memory */ err = os_file_read( read_request, - file, buf, block1 * UNIV_PAGE_SIZE, - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE); + file, buf, block1 << srv_page_size_shift, + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE << srv_page_size_shift); if (err != DB_SUCCESS) { @@ -461,9 +457,9 @@ buf_dblwr_init_or_load_pages( err = os_file_read( read_request, file, - buf + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE, - block2 * UNIV_PAGE_SIZE, - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE); + buf + (TRX_SYS_DOUBLEWRITE_BLOCK_SIZE << srv_page_size_shift), + block2 << srv_page_size_shift, + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE << srv_page_size_shift); if (err != DB_SUCCESS) { @@ -503,8 +499,8 @@ buf_dblwr_init_or_load_pages( err = os_file_write( write_request, path, file, page, - source_page_no * UNIV_PAGE_SIZE, - UNIV_PAGE_SIZE); + source_page_no << srv_page_size_shift, + srv_page_size); if (err != DB_SUCCESS) { ib::error() @@ -522,7 +518,7 @@ buf_dblwr_init_or_load_pages( recv_dblwr.add(page); } - page += univ_page_size.physical(); + page += srv_page_size; } if (reset_space_ids) { @@ -609,7 +605,7 @@ buf_dblwr_process() << " (" << space->size << " pages)"; } next_page: - fil_space_release_for_io(space); + space->release_for_io(); continue; } @@ -789,14 +785,14 @@ buf_dblwr_check_page_lsn( } if (memcmp(page + (FIL_PAGE_LSN + 4), - page + (UNIV_PAGE_SIZE + page + (srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM + 4), 4)) { const ulint lsn1 = mach_read_from_4( page + FIL_PAGE_LSN + 4); const ulint lsn2 = mach_read_from_4( - page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM + page + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM + 4); ib::error() << "The page to be written seems corrupt!" 
@@ -837,6 +833,7 @@ buf_dblwr_check_block( switch (fil_page_get_type(block->frame)) { case FIL_PAGE_INDEX: + case FIL_PAGE_TYPE_INSTANT: case FIL_PAGE_RTREE: if (page_is_comp(block->frame)) { if (page_simple_validate_new(block->frame)) { @@ -869,7 +866,6 @@ buf_dblwr_check_block( case FIL_PAGE_TYPE_ALLOCATED: /* empty pages should never be flushed */ return; - break; } buf_dblwr_assert_on_corrupt_block(block); @@ -996,7 +992,7 @@ try_again: for (ulint len2 = 0, i = 0; i < buf_dblwr->first_free; - len2 += UNIV_PAGE_SIZE, i++) { + len2 += srv_page_size, i++) { const buf_block_t* block; @@ -1019,8 +1015,8 @@ try_again: } /* Write out the first block of the doublewrite buffer */ - len = ut_min(TRX_SYS_DOUBLEWRITE_BLOCK_SIZE, - buf_dblwr->first_free) * UNIV_PAGE_SIZE; + len = std::min<ulint>(TRX_SYS_DOUBLEWRITE_BLOCK_SIZE, + buf_dblwr->first_free) << srv_page_size_shift; fil_io(IORequestWrite, true, page_id_t(TRX_SYS_SPACE, buf_dblwr->block1), univ_page_size, @@ -1033,10 +1029,10 @@ try_again: /* Write out the second block of the doublewrite buffer. */ len = (buf_dblwr->first_free - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) - * UNIV_PAGE_SIZE; + << srv_page_size_shift; write_buf = buf_dblwr->write_buf - + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE; + + (TRX_SYS_DOUBLEWRITE_BLOCK_SIZE << srv_page_size_shift); fil_io(IORequestWrite, true, page_id_t(TRX_SYS_SPACE, buf_dblwr->block2), univ_page_size, @@ -1119,7 +1115,7 @@ try_again: } byte* p = buf_dblwr->write_buf - + univ_page_size.physical() * buf_dblwr->first_free; + + srv_page_size * buf_dblwr->first_free; /* We request frame here to get correct buffer in case of encryption and/or page compression */ @@ -1132,7 +1128,7 @@ try_again: memcpy(p, frame, bpage->size.physical()); memset(p + bpage->size.physical(), 0x0, - univ_page_size.physical() - bpage->size.physical()); + srv_page_size - bpage->size.physical()); } else { ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE); MEM_CHECK_DEFINED(frame, bpage->size.logical()); @@ -1259,20 +1255,20 @@ retry: void * frame = buf_page_get_frame(bpage); if (bpage->size.is_compressed()) { - memcpy(buf_dblwr->write_buf + univ_page_size.physical() * i, + memcpy(buf_dblwr->write_buf + srv_page_size * i, frame, bpage->size.physical()); - memset(buf_dblwr->write_buf + univ_page_size.physical() * i + memset(buf_dblwr->write_buf + srv_page_size * i + bpage->size.physical(), 0x0, - univ_page_size.physical() - bpage->size.physical()); + srv_page_size - bpage->size.physical()); fil_io(IORequestWrite, true, page_id_t(TRX_SYS_SPACE, offset), univ_page_size, 0, - univ_page_size.physical(), - (void *)(buf_dblwr->write_buf + univ_page_size.physical() * i), + srv_page_size, + (void *)(buf_dblwr->write_buf + srv_page_size * i), NULL); } else { /* It is a regular page. 
Write it directly to the @@ -1282,7 +1278,7 @@ retry: page_id_t(TRX_SYS_SPACE, offset), univ_page_size, 0, - univ_page_size.physical(), + srv_page_size, (void*) frame, NULL); } diff --git a/storage/innobase/buf/buf0dump.cc b/storage/innobase/buf/buf0dump.cc index b3ef58ecb55..eb7d085ba57 100644 --- a/storage/innobase/buf/buf0dump.cc +++ b/storage/innobase/buf/buf0dump.cc @@ -437,6 +437,11 @@ buf_dump( buf_dump_status(STATUS_INFO, "Buffer pool(s) dump completed at %s", now); + + /* Though dumping doesn't related to an incomplete load, + we reset this to 0 here to indicate that a shutdown can also perform + a dump */ + export_vars.innodb_buffer_pool_load_incomplete = 0; } /*****************************************************************//** @@ -535,7 +540,7 @@ buf_load() buf_load_status(STATUS_INFO, "Loading buffer pool(s) from %s", full_filename); - f = fopen(full_filename, "r"); + f = fopen(full_filename, "r" STR_O_CLOEXEC); if (f == NULL) { buf_load_status(STATUS_INFO, "Cannot open '%s' for reading: %s", @@ -600,6 +605,8 @@ buf_load() rewind(f); + export_vars.innodb_buffer_pool_load_incomplete = 1; + for (i = 0; i < dump_n && !SHUTTING_DOWN(); i++) { fscanf_ret = fscanf(f, ULINTPF "," ULINTPF, &space_id, &page_no); @@ -648,7 +655,7 @@ buf_load() ut_sprintf_timestamp(now); buf_load_status(STATUS_INFO, "Buffer pool(s) load completed at %s" - " (%s was empty)", now, full_filename); + " (%s was empty or had errors)", now, full_filename); return; } @@ -688,7 +695,7 @@ buf_load() if (this_space_id != cur_space_id) { if (space != NULL) { - fil_space_release(space); + space->release(); } cur_space_id = this_space_id; @@ -720,7 +727,7 @@ buf_load() if (buf_load_abort_flag) { if (space != NULL) { - fil_space_release(space); + space->release(); } buf_load_abort_flag = FALSE; ut_free(dump); @@ -742,18 +749,39 @@ buf_load() buf_load_throttle_if_needed( &last_check_time, &last_activity_cnt, i); + +#ifdef UNIV_DEBUG + if ((i+1) >= srv_buf_pool_load_pages_abort) { + buf_load_abort_flag = 1; + } +#endif } if (space != NULL) { - fil_space_release(space); + space->release(); } ut_free(dump); ut_sprintf_timestamp(now); - buf_load_status(STATUS_INFO, + if (i == dump_n) { + buf_load_status(STATUS_INFO, "Buffer pool(s) load completed at %s", now); + export_vars.innodb_buffer_pool_load_incomplete = 0; + } else if (!buf_load_abort_flag) { + buf_load_status(STATUS_INFO, + "Buffer pool(s) load aborted due to user instigated abort at %s", + now); + /* intentionally don't reset innodb_buffer_pool_load_incomplete + as we don't want a shutdown to save the buffer pool */ + } else { + buf_load_status(STATUS_INFO, + "Buffer pool(s) load aborted due to shutdown at %s", + now); + /* intentionally don't reset innodb_buffer_pool_load_incomplete + as we want to abort without saving the buffer pool */ + } /* Make sure that estimated = completed when we end. 
*/ /* mysql_stage_set_work_completed(pfs_stage_progress, dump_n); */ @@ -822,15 +850,16 @@ DECLARE_THREAD(buf_dump_thread)(void*) } if (srv_buffer_pool_dump_at_shutdown && srv_fast_shutdown != 2) { + if (export_vars.innodb_buffer_pool_load_incomplete) { + buf_dump_status(STATUS_INFO, + "Dumping of buffer pool not started" + " as load was incomplete"); #ifdef WITH_WSREP - if (!wsrep_recovery) { + } else if (wsrep_recovery) { #endif /* WITH_WSREP */ - - buf_dump(FALSE /* ignore shutdown down flag, - keep going even if we are in a shutdown state */); -#ifdef WITH_WSREP + } else { + buf_dump(FALSE/* do complete dump at shutdown */); } -#endif /* WITH_WSREP */ } srv_buf_dump_thread_active = false; diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc index 26610337d0d..6f56a09eda2 100644 --- a/storage/innobase/buf/buf0flu.cc +++ b/storage/innobase/buf/buf0flu.cc @@ -31,7 +31,6 @@ Created 11/11/1995 Heikki Tuuri #include "buf0flu.h" #include "buf0buf.h" -#include "buf0mtflu.h" #include "buf0checksum.h" #include "srv0start.h" #include "srv0srv.h" @@ -151,6 +150,8 @@ struct page_cleaner_t { threads. */ os_event_t is_finished; /*!< event to signal that all slots were finished. */ + os_event_t is_started; /*!< event to signal that + thread is started/exiting */ volatile ulint n_workers; /*!< number of worker threads in existence */ bool requested; /*!< true if requested pages @@ -830,7 +831,7 @@ buf_flush_init_for_writing( ut_ad(block == NULL || block->frame == page); ut_ad(block == NULL || page_zip_ == NULL || &block->page.zip == page_zip_); - ut_ad(!srv_safe_truncate || !block || newest_lsn); + ut_ad(!block || newest_lsn); ut_ad(page); #if 0 /* MDEV-15528 TODO: reinstate this check */ /* innodb_immediate_scrub_data_uncompressed=ON would cause @@ -838,7 +839,7 @@ buf_flush_init_for_writing( cause them to be written as almost-all-zeroed. In MDEV-15528 we should change that implement an option to make freed pages appear all-zero, bypassing this code. */ - ut_ad(!srv_safe_truncate || !newest_lsn || fil_page_get_type(page)); + ut_ad(!newest_lsn || fil_page_get_type(page)); #endif if (page_zip_) { @@ -884,7 +885,7 @@ buf_flush_init_for_writing( /* Write the newest modification lsn to the page header and trailer */ mach_write_to_8(page + FIL_PAGE_LSN, newest_lsn); - mach_write_to_8(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM, + mach_write_to_8(page + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM, newest_lsn); if (block && srv_page_size == 16384) { @@ -915,6 +916,7 @@ buf_flush_init_for_writing( default: switch (page_type) { case FIL_PAGE_INDEX: + case FIL_PAGE_TYPE_INSTANT: case FIL_PAGE_RTREE: case FIL_PAGE_UNDO_LOG: case FIL_PAGE_INODE: @@ -978,7 +980,7 @@ buf_flush_init_for_writing( new enum is added and not handled here */ } - mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM, + mach_write_to_4(page + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM, checksum); } @@ -1003,7 +1005,7 @@ buf_flush_write_block_low( || space->purpose == FIL_TYPE_IMPORT || space->purpose == FIL_TYPE_TABLESPACE); ut_ad((space->purpose == FIL_TYPE_TEMPORARY) - == fsp_is_system_temporary(space->id)); + == (space == fil_system.temp_space)); page_t* frame = NULL; #ifdef UNIV_DEBUG buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); @@ -1111,7 +1113,7 @@ buf_flush_write_block_low( ut_ad(err == DB_SUCCESS); } - fil_space_release_for_io(space); + space->release_for_io(); /* Increment the counter of I/O operations used for selecting LRU policy. 
*/ @@ -1393,7 +1395,7 @@ buf_flush_try_neighbors( if (fil_space_t *s = fil_space_acquire_for_io(page_id.space())) { high = s->max_page_number_for_io(high); - fil_space_release_for_io(s); + s->release_for_io(); } else { return 0; } @@ -1833,6 +1835,7 @@ not guaranteed that the actual number is that big, though) @param[in] lsn_limit in the case of BUF_FLUSH_LIST all blocks whose oldest_modification is smaller than this should be flushed (if their number does not exceed min_n), otherwise ignored */ +static void buf_flush_batch( buf_pool_t* buf_pool, @@ -1872,6 +1875,7 @@ Gather the aggregated stats for both flush list and LRU list flushing. @param page_count_flush number of pages flushed from the end of the flush_list @param page_count_LRU number of pages flushed from the end of the LRU list */ +static void buf_flush_stats( /*============*/ @@ -1888,6 +1892,7 @@ buf_flush_stats( /******************************************************************//** Start a buffer flush batch for LRU or flush list */ +static ibool buf_flush_start( /*============*/ @@ -1919,22 +1924,8 @@ buf_flush_start( } /******************************************************************//** -Gather the aggregated stats for both flush list and LRU list flushing */ -void -buf_flush_common( -/*=============*/ - buf_flush_t flush_type, /*!< in: type of flush */ - ulint page_count) /*!< in: number of pages flushed */ -{ - buf_dblwr_flush_buffered_writes(); - - ut_a(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST); - - srv_stats.buf_pool_flushed.add(page_count); -} - -/******************************************************************//** End a buffer flush batch for LRU or flush list */ +static void buf_flush_end( /*==========*/ @@ -2106,10 +2097,6 @@ buf_flush_lists( ulint n_flushed = 0; bool success = true; - if (buf_mtflu_init_done()) { - return(buf_mtflu_flush_list(min_n, lsn_limit, n_processed)); - } - if (n_processed) { *n_processed = 0; } @@ -2268,11 +2255,6 @@ buf_flush_LRU_list( memset(&n, 0, sizeof(flush_counters_t)); - if(buf_mtflu_init_done()) - { - return(buf_mtflu_flush_LRU_tail()); - } - ut_ad(buf_pool); /* srv_LRU_scan_depth can be arbitrarily large value. We cap it with current LRU size. */ @@ -2434,7 +2416,7 @@ page_cleaner_flush_pages_recommendation( cur_lsn = log_get_lsn_nowait(); - /* log_get_lsn_nowait tries to get log_sys->mutex with + /* log_get_lsn_nowait tries to get log_sys.mutex with mutex_enter_nowait, if this does not succeed function returns 0, do not use that value to update stats. */ if (cur_lsn == 0) { @@ -2698,7 +2680,7 @@ buf_flush_page_cleaner_init(void) page_cleaner.is_requested = os_event_create("pc_is_requested"); page_cleaner.is_finished = os_event_create("pc_is_finished"); - + page_cleaner.is_started = os_event_create("pc_is_started"); page_cleaner.n_slots = static_cast<ulint>(srv_buf_pool_instances); ut_d(page_cleaner.n_disabled_debug = 0); @@ -2773,8 +2755,8 @@ pc_flush_slot(void) { ulint lru_tm = 0; ulint list_tm = 0; - int lru_pass = 0; - int list_pass = 0; + ulint lru_pass = 0; + ulint list_pass = 0; mutex_enter(&page_cleaner.mutex); @@ -2978,17 +2960,10 @@ buf_flush_page_cleaner_disabled_loop(void) } /** Disables page cleaner threads (coordinator and workers). -It's used by: SET GLOBAL innodb_page_cleaner_disabled_debug = 1 (0). 
-@param[in]	thd		thread handle
-@param[in]	var		pointer to system variable
-@param[out]	var_ptr		where the formal string goes
 @param[in]	save		immediate result from check function */
-void
-buf_flush_page_cleaner_disabled_debug_update(
-        THD*				thd,
-        struct st_mysql_sys_var*	var,
-        void*				var_ptr,
-        const void*			save)
+void buf_flush_page_cleaner_disabled_debug_update(THD*,
+                                                   st_mysql_sys_var*, void*,
+                                                   const void* save)
 {
         if (!page_cleaner.is_running) {
                 return;
@@ -3418,6 +3393,7 @@ thread_exit:
 
         os_event_destroy(page_cleaner.is_finished);
         os_event_destroy(page_cleaner.is_requested);
+        os_event_destroy(page_cleaner.is_started);
 
         buf_page_cleaner_is_active = false;
 
@@ -3429,6 +3405,35 @@ thread_exit:
         OS_THREAD_DUMMY_RETURN;
 }
 
+/** Adjust thread count for page cleaner workers.
+@param[in]	new_cnt		Number of threads to be used */
+void
+buf_flush_set_page_cleaner_thread_cnt(ulong new_cnt)
+{
+        mutex_enter(&page_cleaner.mutex);
+
+        srv_n_page_cleaners = new_cnt;
+        if (new_cnt > page_cleaner.n_workers) {
+                /* User has increased the number of page
+                cleaner threads. */
+                ulint add = new_cnt - page_cleaner.n_workers;
+                for (ulint i = 0; i < add; i++) {
+                        os_thread_id_t cleaner_thread_id;
+                        os_thread_create(buf_flush_page_cleaner_worker, NULL, &cleaner_thread_id);
+                }
+        }
+
+        mutex_exit(&page_cleaner.mutex);
+
+        /* Wait until defined number of workers has started. */
+        while (page_cleaner.is_running &&
+               page_cleaner.n_workers != (srv_n_page_cleaners - 1)) {
+                os_event_set(page_cleaner.is_requested);
+                os_event_reset(page_cleaner.is_started);
+                os_event_wait_time(page_cleaner.is_started, 1000000);
+        }
+}
+
 /******************************************************************//**
 Worker thread of page_cleaner.
 @return a dummy parameter */
@@ -3441,9 +3446,18 @@ DECLARE_THREAD(buf_flush_page_cleaner_worker)(
                         os_thread_create */
 {
         my_thread_init();
+#ifndef DBUG_OFF
+        os_thread_id_t cleaner_thread_id = os_thread_get_curr_id();
+#endif
 
         mutex_enter(&page_cleaner.mutex);
-        page_cleaner.n_workers++;
+        ulint thread_no = page_cleaner.n_workers++;
+
+        DBUG_LOG("ib_buf", "Thread " << cleaner_thread_id
+                 << " started; n_workers=" << page_cleaner.n_workers);
+
+        /* Signal that we have started */
+        os_event_set(page_cleaner.is_started);
         mutex_exit(&page_cleaner.mutex);
 
 #ifdef UNIV_LINUX
@@ -3466,11 +3480,31 @@ DECLARE_THREAD(buf_flush_page_cleaner_worker)(
                         break;
                 }
 
+                ut_ad(srv_n_page_cleaners >= 1);
+
+                /* If number of page cleaner threads is decreased
+                exit those that are not anymore needed. */
+                if (srv_shutdown_state == SRV_SHUTDOWN_NONE &&
+                    thread_no >= (srv_n_page_cleaners - 1)) {
+                        DBUG_LOG("ib_buf", "Exiting "
+                                << thread_no
+                                << " page cleaner worker thread_id "
+                                << os_thread_pf(cleaner_thread_id)
+                                << " total threads " << srv_n_page_cleaners << ".");
+                        break;
+                }
+
                 pc_flush_slot();
         }
 
         mutex_enter(&page_cleaner.mutex);
         page_cleaner.n_workers--;
+
+        DBUG_LOG("ib_buf", "Thread " << cleaner_thread_id
+                 << " exiting; n_workers=" << page_cleaner.n_workers);
+
+        /* Signal that we have stopped */
+        os_event_set(page_cleaner.is_started);
         mutex_exit(&page_cleaner.mutex);
 
         my_thread_end();
@@ -3675,17 +3709,17 @@ buf_flush_get_dirty_pages_count(
 }
 
 /** FlushObserver constructor
-@param[in]	space_id	table space id
+@param[in]	space		tablespace
 @param[in]	trx		trx instance
 @param[in]	stage		performance schema accounting object,
 used by ALTER TABLE. It is passed to log_preflush_pool_modified_pages()
 for accounting.
*/ FlushObserver::FlushObserver( - ulint space_id, + fil_space_t* space, trx_t* trx, ut_stage_alter_t* stage) : - m_space_id(space_id), + m_space(space), m_trx(trx), m_stage(stage), m_interrupted(false) @@ -3704,7 +3738,7 @@ FlushObserver::FlushObserver( /** FlushObserver deconstructor */ FlushObserver::~FlushObserver() { - ut_ad(buf_flush_get_dirty_pages_count(m_space_id, this) == 0); + ut_ad(buf_flush_get_dirty_pages_count(m_space->id, this) == 0); UT_DELETE(m_flushed); UT_DELETE(m_removed); @@ -3762,10 +3796,10 @@ FlushObserver::flush() if (!m_interrupted && m_stage) { m_stage->begin_phase_flush(buf_flush_get_dirty_pages_count( - m_space_id, this)); + m_space->id, this)); } - buf_LRU_flush_or_remove_pages(m_space_id, this); + buf_LRU_flush_or_remove_pages(m_space->id, this); /* Wait for all dirty pages were flushed. */ for (ulint i = 0; i < srv_buf_pool_instances; i++) { diff --git a/storage/innobase/buf/buf0lru.cc b/storage/innobase/buf/buf0lru.cc index 290bb1a737e..d3bba9a1130 100644 --- a/storage/innobase/buf/buf0lru.cc +++ b/storage/innobase/buf/buf0lru.cc @@ -58,9 +58,6 @@ static const ulint BUF_LRU_OLD_TOLERANCE = 20; (that is, when there are more than BUF_LRU_OLD_MIN_LEN blocks). @see buf_LRU_old_adjust_len */ #define BUF_LRU_NON_OLD_MIN_LEN 5 -#if BUF_LRU_NON_OLD_MIN_LEN >= BUF_LRU_OLD_MIN_LEN -# error "BUF_LRU_NON_OLD_MIN_LEN >= BUF_LRU_OLD_MIN_LEN" -#endif #ifdef BTR_CUR_HASH_ADAPT /** When dropping the search hash index entries before deleting an ibd @@ -809,7 +806,7 @@ buf_LRU_get_free_only( assert_block_ahi_empty(block); buf_block_set_state(block, BUF_BLOCK_READY_FOR_USE); - MEM_UNDEFINED(block->frame, srv_page_size); + MEM_MAKE_ADDRESSABLE(block->frame, srv_page_size); ut_ad(buf_pool_from_block(block) == buf_pool); @@ -856,7 +853,7 @@ buf_LRU_check_size_of_non_data_objects( " Check that your transactions do not set too many" " row locks, or review if" " innodb_buffer_pool_size=" - << (buf_pool->curr_size >> (20 - UNIV_PAGE_SIZE_SHIFT)) + << (buf_pool->curr_size >> (20U - srv_page_size_shift)) << "M could be bigger."; } else if (!recv_recovery_is_on() && buf_pool->curr_size == buf_pool->old_size @@ -879,7 +876,7 @@ buf_LRU_check_size_of_non_data_objects( " set too many row locks." " innodb_buffer_pool_size=" << (buf_pool->curr_size >> - (20 - UNIV_PAGE_SIZE_SHIFT)) << "M." + (20U - srv_page_size_shift)) << "M." " Starting the InnoDB Monitor to print" " diagnostics."; @@ -1065,9 +1062,11 @@ buf_LRU_old_adjust_len( ut_ad(buf_pool_mutex_own(buf_pool)); ut_ad(buf_pool->LRU_old_ratio >= BUF_LRU_OLD_RATIO_MIN); ut_ad(buf_pool->LRU_old_ratio <= BUF_LRU_OLD_RATIO_MAX); -#if BUF_LRU_OLD_RATIO_MIN * BUF_LRU_OLD_MIN_LEN <= BUF_LRU_OLD_RATIO_DIV * (BUF_LRU_OLD_TOLERANCE + 5) -# error "BUF_LRU_OLD_RATIO_MIN * BUF_LRU_OLD_MIN_LEN <= BUF_LRU_OLD_RATIO_DIV * (BUF_LRU_OLD_TOLERANCE + 5)" -#endif + compile_time_assert(BUF_LRU_OLD_RATIO_MIN * BUF_LRU_OLD_MIN_LEN + > BUF_LRU_OLD_RATIO_DIV + * (BUF_LRU_OLD_TOLERANCE + 5)); + compile_time_assert(BUF_LRU_NON_OLD_MIN_LEN < BUF_LRU_OLD_MIN_LEN); + #ifdef UNIV_LRU_DEBUG /* buf_pool->LRU_old must be the first item in the LRU list whose "old" flag is set. 
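Turning the old #if ... #error guards in buf0lru.cc into compile_time_assert() keeps the same sanity checks but lets them be written as ordinary C++ constant expressions, placed next to the code they protect. A tiny illustration with placeholder constants (the names below are not the real BUF_LRU_* values, and compile_time_assert itself is the server's portable wrapper for this kind of check):

// Preprocessor form: only works when every operand expands to a literal.
#define NON_OLD_MIN_LEN 5
#define OLD_MIN_LEN 512
#if NON_OLD_MIN_LEN >= OLD_MIN_LEN
# error "NON_OLD_MIN_LEN >= OLD_MIN_LEN"
#endif

// Compile-time assertion: accepts any integral constant expression
// (enums, sizeof, constexpr values) and carries its own message.
static_assert(NON_OLD_MIN_LEN < OLD_MIN_LEN,
              "the 'new' sublist minimum must stay below the 'old' minimum");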
*/ @@ -1609,13 +1608,9 @@ func_exit: order to avoid bogus Valgrind or MSAN warnings.*/ buf_block_t* block = reinterpret_cast<buf_block_t*>(bpage); -#ifdef HAVE_valgrind_or_MSAN MEM_MAKE_DEFINED(block->frame, srv_page_size); -#endif /* HAVE_valgrind_or_MSAN */ btr_search_drop_page_hash_index(block); -#ifdef HAVE_valgrind_or_MSAN MEM_UNDEFINED(block->frame, srv_page_size); -#endif /* HAVE_valgrind_or_MSAN */ buf_pool_mutex_enter(buf_pool); @@ -1777,11 +1772,11 @@ buf_LRU_block_remove_hashed( break; case FIL_PAGE_INDEX: case FIL_PAGE_RTREE: -#ifdef UNIV_ZIP_DEBUG +#if defined UNIV_ZIP_DEBUG && defined BTR_CUR_HASH_ADAPT ut_a(page_zip_validate( &bpage->zip, page, ((buf_block_t*) bpage)->index)); -#endif /* UNIV_ZIP_DEBUG */ +#endif /* UNIV_ZIP_DEBUG && BTR_CUR_HASH_ADAPT */ break; default: ib::error() << "The compressed page to be" diff --git a/storage/innobase/buf/buf0mtflu.cc b/storage/innobase/buf/buf0mtflu.cc deleted file mode 100644 index aae90e48168..00000000000 --- a/storage/innobase/buf/buf0mtflu.cc +++ /dev/null @@ -1,736 +0,0 @@ -/***************************************************************************** - -Copyright (C) 2013, 2014, Fusion-io. All Rights Reserved. -Copyright (C) 2013, 2017, MariaDB Corporation. All Rights Reserved. - -This program is free software; you can redistribute it and/or modify it under -the terms of the GNU General Public License as published by the Free Software -Foundation; version 2 of the License. - -This program is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - -You should have received a copy of the GNU General Public License along with -this program; if not, write to the Free Software Foundation, Inc., -51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA - -*****************************************************************************/ - -/******************************************************************//** -@file buf/buf0mtflu.cc -Multi-threaded flush method implementation - -Created 06/11/2013 Dhananjoy Das DDas@fusionio.com -Modified 12/12/2013 Jan Lindström jan.lindstrom@skysql.com -Modified 03/02/2014 Dhananjoy Das DDas@fusionio.com -Modified 06/02/2014 Jan Lindström jan.lindstrom@skysql.com -***********************************************************************/ - -#include "buf0buf.h" -#include "buf0flu.h" -#include "buf0mtflu.h" -#include "buf0checksum.h" -#include "srv0start.h" -#include "srv0srv.h" -#include "page0zip.h" -#include "ut0byte.h" -#include "ut0lst.h" -#include "page0page.h" -#include "fil0fil.h" -#include "buf0lru.h" -#include "buf0rea.h" -#include "ibuf0ibuf.h" -#include "log0log.h" -#include "os0file.h" -#include "trx0sys.h" -#include "srv0mon.h" -#include "mysql/plugin.h" -#include "mysql/service_thd_wait.h" -#include "fil0pagecompress.h" - -#define MT_COMP_WATER_MARK 50 -/** Time to wait for a message. 
*/ -#define MT_WAIT_IN_USECS 5000000 - -/* Work item status */ -typedef enum wrk_status { - WRK_ITEM_UNSET=0, /*!< Work item is not set */ - WRK_ITEM_START=1, /*!< Processing of work item has started */ - WRK_ITEM_DONE=2, /*!< Processing is done usually set to - SUCCESS/FAILED */ - WRK_ITEM_SUCCESS=2, /*!< Work item successfully processed */ - WRK_ITEM_FAILED=3, /*!< Work item process failed */ - WRK_ITEM_EXIT=4, /*!< Exiting */ - WRK_ITEM_SET=5, /*!< Work item is set */ - WRK_ITEM_STATUS_UNDEFINED -} wrk_status_t; - -/* Work item task type */ -typedef enum mt_wrk_tsk { - MT_WRK_NONE=0, /*!< Exit queue-wait */ - MT_WRK_WRITE=1, /*!< Flush operation */ - MT_WRK_READ=2, /*!< Read operation */ - MT_WRK_UNDEFINED -} mt_wrk_tsk_t; - -/* Work thread status */ -typedef enum wthr_status { - WTHR_NOT_INIT=0, /*!< Work thread not initialized */ - WTHR_INITIALIZED=1, /*!< Work thread initialized */ - WTHR_SIG_WAITING=2, /*!< Work thread wating signal */ - WTHR_RUNNING=3, /*!< Work thread running */ - WTHR_NO_WORK=4, /*!< Work thread has no work */ - WTHR_KILL_IT=5, /*!< Work thread should exit */ - WTHR_STATUS_UNDEFINED -} wthr_status_t; - -/* Write work task */ -typedef struct wr_tsk { - buf_pool_t *buf_pool; /*!< buffer-pool instance */ - buf_flush_t flush_type; /*!< flush-type for buffer-pool - flush operation */ - ulint min; /*!< minimum number of pages - requested to be flushed */ - lsn_t lsn_limit; /*!< lsn limit for the buffer-pool - flush operation */ -} wr_tsk_t; - -/* Read work task */ -typedef struct rd_tsk { - buf_pool_t *page_pool; /*!< list of pages to decompress; */ -} rd_tsk_t; - -/* Work item */ -typedef struct wrk_itm -{ - mt_wrk_tsk_t tsk; /*!< Task type. Based on task-type - one of the entries wr_tsk/rd_tsk - will be used */ - wr_tsk_t wr; /*!< Flush page list */ - rd_tsk_t rd; /*!< Decompress page list */ - ulint n_flushed; /*!< Number of flushed pages */ - ulint n_evicted; /*!< Number of evicted pages */ - os_thread_id_t id_usr; /*!< Thread-id currently working */ - wrk_status_t wi_status; /*!< Work item status */ - mem_heap_t *wheap; /*!< Heap were to allocate memory - for queue nodes */ - mem_heap_t *rheap; -} wrk_t; - -struct thread_data_t -{ - os_thread_id_t wthread_id; /*!< Identifier */ - wthr_status_t wt_status; /*!< Worker thread status */ -}; - -/** Flush dirty pages when multi-threaded flush is used. 
*/ -extern "C" UNIV_INTERN -os_thread_ret_t -DECLARE_THREAD(mtflush_io_thread)(void* arg); - -/** Thread syncronization data */ -struct thread_sync_t -{ - /** Constructor */ - thread_sync_t(ulint n_threads, mem_heap_t* wheap, mem_heap_t* rheap) : - thread_global_mtx(), n_threads(n_threads), - wq(ib_wqueue_create()), - wr_cq(ib_wqueue_create()), - rd_cq(ib_wqueue_create()), - wheap(wheap), rheap(rheap), gwt_status(), - thread_data(static_cast<thread_data_t*>( - mem_heap_zalloc(wheap, n_threads - * sizeof *thread_data))) - { - ut_a(wq); - ut_a(wr_cq); - ut_a(rd_cq); - ut_a(thread_data); - - mutex_create(LATCH_ID_MTFLUSH_THREAD_MUTEX, - &thread_global_mtx); - - /* Create threads for page-compression-flush */ - for(ulint i = 0; i < n_threads; i++) { - thread_data[i].wt_status = WTHR_INITIALIZED; - os_thread_create(mtflush_io_thread, this, - &thread_data[i].wthread_id); - } - } - - /** Destructor */ - ~thread_sync_t() - { - ut_a(ib_wqueue_is_empty(wq)); - ut_a(ib_wqueue_is_empty(wr_cq)); - ut_a(ib_wqueue_is_empty(rd_cq)); - - /* Free all queues */ - ib_wqueue_free(wq); - ib_wqueue_free(wr_cq); - ib_wqueue_free(rd_cq); - - mutex_free(&thread_global_mtx); - - mem_heap_free(rheap); - mem_heap_free(wheap); - } - - /* Global variables used by all threads */ - ib_mutex_t thread_global_mtx; /*!< Mutex used protecting below - variables */ - ulint n_threads; /*!< Number of threads */ - ib_wqueue_t *wq; /*!< Work Queue */ - ib_wqueue_t *wr_cq; /*!< Write Completion Queue */ - ib_wqueue_t *rd_cq; /*!< Read Completion Queue */ - mem_heap_t* wheap; /*!< Work heap where memory - is allocated */ - mem_heap_t* rheap; /*!< Work heap where memory - is allocated */ - wthr_status_t gwt_status; /*!< Global thread status */ - - /* Variables used by only one thread at a time */ - thread_data_t* thread_data; /*!< Thread specific data */ -}; - -static thread_sync_t* mtflush_ctx; -static ib_mutex_t mtflush_mtx; - -/******************************************************************//** -Return true if multi-threaded flush is initialized -@return true if initialized */ -bool -buf_mtflu_init_done(void) -/*=====================*/ -{ - return(mtflush_ctx != NULL); -} - -/******************************************************************//** -Fush buffer pool instance. -@return number of flushed pages, or 0 if error happened -*/ -static -ulint -buf_mtflu_flush_pool_instance( -/*==========================*/ - wrk_t *work_item) /*!< inout: work item to be flushed */ -{ - flush_counters_t n; - ut_a(work_item != NULL); - ut_a(work_item->wr.buf_pool != NULL); - - if (!buf_flush_start(work_item->wr.buf_pool, work_item->wr.flush_type)) { - /* We have two choices here. If lsn_limit was - specified then skipping an instance of buffer - pool means we cannot guarantee that all pages - up to lsn_limit has been flushed. We can - return right now with failure or we can try - to flush remaining buffer pools up to the - lsn_limit. We attempt to flush other buffer - pools based on the assumption that it will - help in the retry which will follow the - failure. */ -#ifdef UNIV_MTFLUSH_DEBUG - fprintf(stderr, "InnoDB: Note: buf flush start failed there is already active flush for this buffer pool.\n"); -#endif - return 0; - } - - memset(&n, 0, sizeof(flush_counters_t)); - - if (work_item->wr.flush_type == BUF_FLUSH_LRU) { - /* srv_LRU_scan_depth can be arbitrarily large value. - * We cap it with current LRU size. 
- */ - buf_pool_mutex_enter(work_item->wr.buf_pool); - work_item->wr.min = UT_LIST_GET_LEN(work_item->wr.buf_pool->LRU); - buf_pool_mutex_exit(work_item->wr.buf_pool); - work_item->wr.min = ut_min((ulint)srv_LRU_scan_depth,(ulint)work_item->wr.min); - } - - buf_flush_batch(work_item->wr.buf_pool, - work_item->wr.flush_type, - work_item->wr.min, - work_item->wr.lsn_limit, - &n); - - buf_flush_end(work_item->wr.buf_pool, work_item->wr.flush_type); - buf_flush_common(work_item->wr.flush_type, n.flushed); - work_item->n_flushed = n.flushed; - work_item->n_evicted = n.evicted; - - return work_item->n_flushed; -} - -/******************************************************************//** -Worker function to wait for work items and processing them and -sending reply back. -*/ -static -void -mtflush_service_io( -/*===============*/ - thread_sync_t* mtflush_io, /*!< inout: multi-threaded flush - syncronization data */ - thread_data_t* thread_data) /* Thread status data */ -{ - wrk_t *work_item = NULL; - ulint n_flushed=0; - - ut_a(mtflush_io != NULL); - ut_a(thread_data != NULL); - - thread_data->wt_status = WTHR_SIG_WAITING; - - work_item = (wrk_t *)ib_wqueue_nowait(mtflush_io->wq); - - if (work_item == NULL) { - work_item = (wrk_t *)ib_wqueue_wait(mtflush_io->wq); - } - - if (work_item) { - thread_data->wt_status = WTHR_RUNNING; - } else { - /* Thread did not get any work */ - thread_data->wt_status = WTHR_NO_WORK; - return; - } - - if (work_item->wi_status != WRK_ITEM_EXIT) { - work_item->wi_status = WRK_ITEM_SET; - } - -#ifdef UNIV_MTFLUSH_DEBUG - ut_a(work_item->id_usr == 0); -#endif - work_item->id_usr = os_thread_get_curr_id(); - - /* This works as a producer/consumer model, where in tasks are - * inserted into the work-queue (wq) and completions are based - * on the type of operations performed and as a result the WRITE/ - * compression/flush operation completions get posted to wr_cq. - * And READ/decompress operations completions get posted to rd_cq. - * in future we may have others. - */ - - switch(work_item->tsk) { - case MT_WRK_NONE: - ut_a(work_item->wi_status == WRK_ITEM_EXIT); - work_item->wi_status = WRK_ITEM_EXIT; - ib_wqueue_add(mtflush_io->wr_cq, work_item, work_item->rheap); - thread_data->wt_status = WTHR_KILL_IT; - break; - - case MT_WRK_WRITE: - ut_a(work_item->wi_status == WRK_ITEM_SET); - work_item->wi_status = WRK_ITEM_START; - /* Process work item */ - if (0 == (n_flushed = buf_mtflu_flush_pool_instance(work_item))) { - work_item->wi_status = WRK_ITEM_FAILED; - } - work_item->wi_status = WRK_ITEM_SUCCESS; - ib_wqueue_add(mtflush_io->wr_cq, work_item, work_item->rheap); - break; - - case MT_WRK_READ: - ut_a(0); - break; - - default: - /* None other than Write/Read handling planned */ - ut_a(0); - break; - } -} - -/** Flush dirty pages when multi-threaded flush is used. */ -extern "C" UNIV_INTERN -os_thread_ret_t -DECLARE_THREAD(mtflush_io_thread)(void* arg) -{ - thread_sync_t *mtflush_io = ((thread_sync_t *)arg); - thread_data_t *this_thread_data = NULL; - ulint i; - - /* Find correct slot for this thread */ - mutex_enter(&(mtflush_io->thread_global_mtx)); - for(i=0; i < mtflush_io->n_threads; i ++) { - if (mtflush_io->thread_data[i].wthread_id == os_thread_get_curr_id()) { - break; - } - } - - ut_a(i <= mtflush_io->n_threads); - this_thread_data = &mtflush_io->thread_data[i]; - mutex_exit(&(mtflush_io->thread_global_mtx)); - - while (TRUE) { - -#ifdef UNIV_MTFLUSH_DEBUG - fprintf(stderr, "InnoDB: Note. 
Thread %lu work queue len %lu return queue len %lu\n", - os_thread_get_curr_id(), - ib_wqueue_len(mtflush_io->wq), - ib_wqueue_len(mtflush_io->wr_cq)); -#endif /* UNIV_MTFLUSH_DEBUG */ - - mtflush_service_io(mtflush_io, this_thread_data); - - - if (this_thread_data->wt_status == WTHR_KILL_IT) { - break; - } - } - - os_thread_exit(); - OS_THREAD_DUMMY_RETURN; -} - -/******************************************************************//** -Add exit work item to work queue to signal multi-threded flush -threads that they should exit. -*/ -void -buf_mtflu_io_thread_exit(void) -/*==========================*/ -{ - ulint i; - thread_sync_t* mtflush_io = mtflush_ctx; - wrk_t* work_item = NULL; - - ut_a(mtflush_io != NULL); - - /* Allocate work items for shutdown message */ - work_item = (wrk_t*)mem_heap_alloc(mtflush_io->wheap, sizeof(wrk_t)*srv_mtflush_threads); - - /* Confirm if the io-thread KILL is in progress, bailout */ - if (mtflush_io->gwt_status == WTHR_KILL_IT) { - return; - } - - mtflush_io->gwt_status = WTHR_KILL_IT; - - /* This lock is to safequard against timing bug: flush request take - this mutex before sending work items to be processed by flush - threads. Inside flush thread we assume that work queue contains only - a constant number of items. Thus, we may not install new work items - below before all previous ones are processed. This mutex is released - by flush request after all work items sent to flush threads have - been processed. Thus, we can get this mutex if and only if work - queue is empty. */ - - mutex_enter(&mtflush_mtx); - - /* Make sure the work queue is empty */ - ut_a(ib_wqueue_is_empty(mtflush_io->wq)); - - /* Send one exit work item/thread */ - for (i=0; i < (ulint)srv_mtflush_threads; i++) { - work_item[i].tsk = MT_WRK_NONE; - work_item[i].wi_status = WRK_ITEM_EXIT; - work_item[i].wheap = mtflush_io->wheap; - work_item[i].rheap = mtflush_io->rheap; - work_item[i].id_usr = 0; - - ib_wqueue_add(mtflush_io->wq, - (void *)&(work_item[i]), - mtflush_io->wheap); - } - - /* Requests sent */ - mutex_exit(&mtflush_mtx); - - /* Wait until all work items on a work queue are processed */ - while(!ib_wqueue_is_empty(mtflush_io->wq)) { - /* Wait */ - os_thread_sleep(MT_WAIT_IN_USECS); - } - - ut_a(ib_wqueue_is_empty(mtflush_io->wq)); - - /* Collect all work done items */ - for (i=0; i < (ulint)srv_mtflush_threads;) { - wrk_t* work_item = NULL; - - work_item = (wrk_t *)ib_wqueue_timedwait(mtflush_io->wr_cq, MT_WAIT_IN_USECS); - - /* If we receive reply to work item and it's status is exit, - thead has processed this message and existed */ - if (work_item && work_item->wi_status == WRK_ITEM_EXIT) { - i++; - } - } - - /* Wait about 1/2 sec to allow threads really exit */ - os_thread_sleep(MT_WAIT_IN_USECS); - - /* Make sure that work queue is empty */ - while(!ib_wqueue_is_empty(mtflush_io->wq)) - { - ib_wqueue_nowait(mtflush_io->wq); - } - - mtflush_ctx->~thread_sync_t(); - mtflush_ctx = NULL; - - mutex_free(&mtflush_mtx); -} - -/******************************************************************//** -Initialize multi-threaded flush thread syncronization data. -@return Initialized multi-threaded flush thread syncroniztion data. */ -void* -buf_mtflu_handler_init( -/*===================*/ - ulint n_threads, /*!< in: Number of threads to create */ - ulint wrk_cnt) /*!< in: Number of work items */ -{ - mem_heap_t* mtflush_heap; - mem_heap_t* mtflush_heap2; - - /* Create heap, work queue, write completion queue, read - completion queue for multi-threaded flush, and init - handler. 
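The deleted code above describes itself as a producer/consumer model: the caller packages one wrk_t per buffer pool instance, pushes the items onto the work queue wq, a fixed set of worker threads flush them, and the finished items are posted to the completion queue wr_cq from which the caller collects the per-instance counts. Below is a minimal, self-contained sketch of that round trip in plain C++11, for readers who want the shape of what is being removed; Queue, WorkItem and the exit marker are stand-ins for ib_wqueue_t, wrk_t and MT_WRK_NONE, and no real flushing happens.

#include <condition_variable>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>

struct WorkItem { unsigned instance; bool exit_marker; unsigned flushed; };

class Queue {                                    // the role of ib_wqueue_t
public:
    void push(WorkItem item) {
        { std::lock_guard<std::mutex> lk(mtx); q.push(item); }
        cv.notify_one();
    }
    WorkItem pop() {                             // blocking, like ib_wqueue_wait()
        std::unique_lock<std::mutex> lk(mtx);
        cv.wait(lk, [this] { return !q.empty(); });
        WorkItem item = q.front();
        q.pop();
        return item;
    }
private:
    std::mutex mtx;
    std::condition_variable cv;
    std::queue<WorkItem> q;
};

int main() {
    const unsigned n_instances = 4;              // one work item per buffer pool
    Queue work, done;                            // wq and wr_cq in the deleted code

    std::vector<std::thread> workers;
    for (int t = 0; t < 2; t++)
        workers.emplace_back([&] {
            for (;;) {
                WorkItem wi = work.pop();
                if (wi.exit_marker) break;       // MT_WRK_NONE / WRK_ITEM_EXIT
                wi.flushed = 1;                  // stand-in for flushing one instance
                done.push(wi);
            }
        });

    for (unsigned i = 0; i < n_instances; i++)   // producer: post the requests
        work.push(WorkItem{i, false, 0});

    unsigned total = 0;
    for (unsigned i = 0; i < n_instances; i++)   // collect the completions
        total += done.pop().flushed;

    for (size_t t = 0; t < workers.size(); t++)  // one exit item per worker
        work.push(WorkItem{0, true, 0});
    for (auto& w : workers) w.join();

    return total == n_instances ? 0 : 1;
}

The deleted implementation adds two details that the sketch leaves out: a separate read completion queue (rd_cq) for read work that was never implemented, and the mtflush_mtx serialization that guarantees the work queue is empty before the shutdown items are queued.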
*/ - mtflush_heap = mem_heap_create(0); - ut_a(mtflush_heap != NULL); - mtflush_heap2 = mem_heap_create(0); - ut_a(mtflush_heap2 != NULL); - - mutex_create(LATCH_ID_MTFLUSH_MUTEX, &mtflush_mtx); - - mtflush_ctx = new (mem_heap_zalloc(mtflush_heap, sizeof *mtflush_ctx)) - thread_sync_t(n_threads, mtflush_heap, mtflush_heap2); - - return((void *)mtflush_ctx); -} - -/******************************************************************//** -Flush buffer pool instances. -@return number of pages flushed. */ -ulint -buf_mtflu_flush_work_items( -/*=======================*/ - ulint buf_pool_inst, /*!< in: Number of buffer pool instances */ - flush_counters_t *per_pool_cnt, /*!< out: Number of pages - flushed or evicted /instance */ - buf_flush_t flush_type, /*!< in: Type of flush */ - ulint min_n, /*!< in: Wished minimum number of - blocks to be flushed */ - lsn_t lsn_limit) /*!< in: All blocks whose - oldest_modification is smaller than - this should be flushed (if their - number does not exceed min_n) */ -{ - ulint n_flushed=0, i; - mem_heap_t* work_heap; - mem_heap_t* reply_heap; - wrk_t work_item[MTFLUSH_MAX_WORKER]; - - if (mtflush_ctx->gwt_status == WTHR_KILL_IT) { - return 0; - } - - /* Allocate heap where all work items used and queue - node items areallocated */ - work_heap = mem_heap_create(0); - reply_heap = mem_heap_create(0); - - - for(i=0;i<buf_pool_inst; i++) { - work_item[i].tsk = MT_WRK_WRITE; - work_item[i].wr.buf_pool = buf_pool_from_array(i); - work_item[i].wr.flush_type = flush_type; - work_item[i].wr.min = min_n; - work_item[i].wr.lsn_limit = lsn_limit; - work_item[i].wi_status = WRK_ITEM_UNSET; - work_item[i].wheap = work_heap; - work_item[i].rheap = reply_heap; - work_item[i].n_flushed = 0; - work_item[i].n_evicted = 0; - work_item[i].id_usr = 0; - - ib_wqueue_add(mtflush_ctx->wq, - (void *)(work_item + i), - work_heap); - } - - /* wait on the completion to arrive */ - for(i=0; i< buf_pool_inst;) { - wrk_t *done_wi = NULL; - done_wi = (wrk_t *)ib_wqueue_wait(mtflush_ctx->wr_cq); - - if (done_wi != NULL) { - per_pool_cnt[i].flushed = done_wi->n_flushed; - per_pool_cnt[i].evicted = done_wi->n_evicted; - -#ifdef UNIV_MTFLUSH_DEBUG - if((int)done_wi->id_usr == 0 && - (done_wi->wi_status == WRK_ITEM_SET || - done_wi->wi_status == WRK_ITEM_UNSET)) { - fprintf(stderr, - "**Set/Unused work_item[%lu] flush_type=%d\n", - i, - done_wi->wr.flush_type); - ut_a(0); - } -#endif - - n_flushed+= done_wi->n_flushed+done_wi->n_evicted; - i++; - } - } - - /* Release used work_items and queue nodes */ - mem_heap_free(work_heap); - mem_heap_free(reply_heap); - - return(n_flushed); -} - -/*******************************************************************//** -Multi-threaded version of buf_flush_list -*/ -bool -buf_mtflu_flush_list( -/*=================*/ - ulint min_n, /*!< in: wished minimum mumber of blocks - flushed (it is not guaranteed that the - actual number is that big, though) */ - lsn_t lsn_limit, /*!< in the case BUF_FLUSH_LIST all - blocks whose oldest_modification is - smaller than this should be flushed - (if their number does not exceed - min_n), otherwise ignored */ - ulint* n_processed) /*!< out: the number of pages - which were processed is passed - back to caller. Ignored if NULL */ - -{ - ulint i; - bool success = true; - flush_counters_t cnt[MTFLUSH_MAX_WORKER]; - - if (n_processed) { - *n_processed = 0; - } - - if (min_n != ULINT_MAX) { - /* Ensure that flushing is spread evenly amongst the - buffer pool instances. 
When min_n is ULINT_MAX - we need to flush everything up to the lsn limit - so no limit here. */ - min_n = (min_n + srv_buf_pool_instances - 1) - / srv_buf_pool_instances; - } - - /* This lock is to safequard against re-entry if any. */ - mutex_enter(&mtflush_mtx); - buf_mtflu_flush_work_items(srv_buf_pool_instances, - cnt, BUF_FLUSH_LIST, - min_n, lsn_limit); - mutex_exit(&mtflush_mtx); - - for (i = 0; i < srv_buf_pool_instances; i++) { - if (n_processed) { - *n_processed += cnt[i].flushed+cnt[i].evicted; - } - - if (cnt[i].flushed) { - MONITOR_INC_VALUE_CUMULATIVE( - MONITOR_FLUSH_BATCH_TOTAL_PAGE, - MONITOR_FLUSH_BATCH_COUNT, - MONITOR_FLUSH_BATCH_PAGES, - cnt[i].flushed); - } - - if(cnt[i].evicted) { - MONITOR_INC_VALUE_CUMULATIVE( - MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE, - MONITOR_LRU_BATCH_EVICT_COUNT, - MONITOR_LRU_BATCH_EVICT_PAGES, - cnt[i].evicted); - } - } -#ifdef UNIV_MTFLUSH_DEBUG - fprintf(stderr, "%s: [1] [*n_processed: (min:%lu)%lu ]\n", - __FUNCTION__, (min_n * srv_buf_pool_instances), *n_processed); -#endif - return(success); -} - -/*********************************************************************//** -Clears up tail of the LRU lists: -* Put replaceable pages at the tail of LRU to the free list -* Flush dirty pages at the tail of LRU to the disk -The depth to which we scan each buffer pool is controlled by dynamic -config parameter innodb_LRU_scan_depth. -@return total pages flushed */ -UNIV_INTERN -ulint -buf_mtflu_flush_LRU_tail(void) -/*==========================*/ -{ - ulint total_flushed=0, i; - flush_counters_t cnt[MTFLUSH_MAX_WORKER]; - - ut_a(buf_mtflu_init_done()); - - /* At shutdown do not send requests anymore */ - if (!mtflush_ctx || mtflush_ctx->gwt_status == WTHR_KILL_IT) { - return (total_flushed); - } - - /* This lock is to safeguard against re-entry if any */ - mutex_enter(&mtflush_mtx); - buf_mtflu_flush_work_items(srv_buf_pool_instances, - cnt, BUF_FLUSH_LRU, srv_LRU_scan_depth, 0); - mutex_exit(&mtflush_mtx); - - for (i = 0; i < srv_buf_pool_instances; i++) { - total_flushed += cnt[i].flushed+cnt[i].evicted; - - if (cnt[i].flushed) { - MONITOR_INC_VALUE_CUMULATIVE( - MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE, - MONITOR_LRU_BATCH_FLUSH_COUNT, - MONITOR_LRU_BATCH_FLUSH_PAGES, - cnt[i].flushed); - } - - if(cnt[i].evicted) { - MONITOR_INC_VALUE_CUMULATIVE( - MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE, - MONITOR_LRU_BATCH_EVICT_COUNT, - MONITOR_LRU_BATCH_EVICT_PAGES, - cnt[i].evicted); - } - } - -#if UNIV_MTFLUSH_DEBUG - fprintf(stderr, "[1] [*n_processed: (min:%lu)%lu ]\n", ( - srv_LRU_scan_depth * srv_buf_pool_instances), total_flushed); -#endif - - return(total_flushed); -} - -/*********************************************************************//** -Set correct thread identifiers to io thread array based on -information we have. 
*/ -void -buf_mtflu_set_thread_ids( -/*=====================*/ - ulint n_threads, /*!<in: Number of threads to fill */ - void* ctx, /*!<in: thread context */ - os_thread_id_t* thread_ids) /*!<in: thread id array */ -{ - thread_sync_t *mtflush_io = ((thread_sync_t *)ctx); - ulint i; - ut_a(mtflush_io != NULL); - ut_a(thread_ids != NULL); - - for(i = 0; i < n_threads; i++) { - thread_ids[i] = mtflush_io->thread_data[i].wthread_id; - } -} diff --git a/storage/innobase/buf/buf0rea.cc b/storage/innobase/buf/buf0rea.cc index ad583e577c4..9e3daa5e40e 100644 --- a/storage/innobase/buf/buf0rea.cc +++ b/storage/innobase/buf/buf0rea.cc @@ -281,7 +281,7 @@ buf_read_ahead_random( if (fil_space_t* space = fil_space_acquire(page_id.space())) { high = space->max_page_number_for_io(high); - fil_space_release(space); + space->release(); } else { return(0); } @@ -561,7 +561,7 @@ buf_read_ahead_linear( if (fil_space_t* space = fil_space_acquire(page_id.space())) { space_size = space->committed_size; - fil_space_release(space); + space->release(); if (high > space_size) { /* The area is not whole */ @@ -789,7 +789,7 @@ buf_read_ibuf_merge_pages( in the arrays */ { #ifdef UNIV_IBUF_DEBUG - ut_a(n_stored < UNIV_PAGE_SIZE); + ut_a(n_stored < srv_page_size); #endif for (ulint i = 0; i < n_stored; i++) { @@ -815,7 +815,7 @@ tablespace_deleted: && page_nos[i] >= space->size); i--; next: - fil_space_release(space); + space->release(); continue; } @@ -842,7 +842,7 @@ next: case DB_ERROR: break; case DB_TABLESPACE_DELETED: - fil_space_release(space); + space->release(); goto tablespace_deleted; case DB_PAGE_CORRUPTED: case DB_DECRYPTION_FAILED: diff --git a/storage/innobase/data/data0data.cc b/storage/innobase/data/data0data.cc index fba06e78987..3e23cd6f662 100644 --- a/storage/innobase/data/data0data.cc +++ b/storage/innobase/data/data0data.cc @@ -40,6 +40,40 @@ to data_error. */ ut_d(byte data_error); #endif /* UNIV_DEBUG */ +/** Trim the tail of an index tuple before insert or update. +After instant ADD COLUMN, if the last fields of a clustered index tuple +match the default values that were explicitly specified or implied during +ADD COLUMN, there will be no need to store them. +NOTE: A page latch in the index must be held, so that the index +may not lose 'instantness' before the trimmed tuple has been +inserted or updated. +@param[in] index index possibly with instantly added columns */ +void dtuple_t::trim(const dict_index_t& index) +{ + ut_ad(n_fields >= index.n_core_fields); + ut_ad(n_fields <= index.n_fields); + ut_ad(index.is_instant()); + + ulint i = n_fields; + for (; i > index.n_core_fields; i--) { + const dfield_t* dfield = dtuple_get_nth_field(this, i - 1); + const dict_col_t* col = dict_index_get_nth_col(&index, i - 1); + ut_ad(col->is_instant()); + ulint len = dfield_get_len(dfield); + if (len != col->def_val.len) { + break; + } + + if (len != 0 && len != UNIV_SQL_NULL + && dfield->data != col->def_val.data + && memcmp(dfield->data, col->def_val.data, len)) { + break; + } + } + + n_fields = i; +} + /** Compare two data tuples. 
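The new dtuple_t::trim() above is the write-side half of instant ADD COLUMN: before an insert or update, any trailing clustered-index fields whose values equal the defaults recorded at ADD COLUMN time are dropped from the tuple, so such rows can keep the pre-ADD COLUMN physical format. The sketch below shows the same trailing-default trimming on generic buffers; Field, ColumnDefault and trim_trailing_defaults are made-up names, and the special handling of SQL NULL lengths is omitted for brevity.

#include <cstring>
#include <vector>

struct Field         { const void* data; size_t len; };   // cf. dfield_t
struct ColumnDefault { const void* data; size_t len; };   // cf. dict_col_t::def_val

// Drop trailing fields that are byte-for-byte equal to their column default.
// The first n_core fields are never trimmed, mirroring index.n_core_fields.
size_t trim_trailing_defaults(const std::vector<Field>& row,
                              const std::vector<ColumnDefault>& defaults,
                              size_t n_core)
{
    size_t n = row.size();
    while (n > n_core) {
        const Field&         f = row[n - 1];
        const ColumnDefault& d = defaults[n - 1];
        if (f.len != d.len)
            break;                               // different length: keep the field
        if (f.len != 0 && f.data != d.data
            && std::memcmp(f.data, d.data, f.len) != 0)
            break;                               // different bytes: keep the field
        --n;                                     // matches the default: trim it
    }
    return n;                                    // the new field count
}

The reverse direction happens when such a record is read back: fields beyond what was physically stored are filled in from the same per-column default metadata.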
@param[in] tuple1 first data tuple @param[in] tuple2 second data tuple @@ -195,7 +229,7 @@ dtuple_validate( const dtuple_t* tuple) /*!< in: tuple */ { ut_ad(tuple->magic_n == DATA_TUPLE_MAGIC_N); -#ifdef HAVE_valgrind_or_MSAN +#ifdef HAVE_valgrind const ulint n_fields = dtuple_get_n_fields(tuple); for (ulint i = 0; i < n_fields; i++) { @@ -206,7 +240,7 @@ dtuple_validate( dfield_get_len(field)); } } -#endif /* HAVE_valgrind_or_MSAN */ +#endif /* HAVE_valgrind */ ut_ad(dtuple_check_typed(tuple)); return(TRUE); @@ -288,7 +322,7 @@ dfield_print_also_hex( val = mach_read_from_1(data); if (!(prtype & DATA_UNSIGNED)) { - val &= ~0x80; + val &= ~0x80U; fprintf(stderr, "%ld", (long) val); } else { fprintf(stderr, "%lu", (ulong) val); @@ -299,7 +333,7 @@ dfield_print_also_hex( val = mach_read_from_2(data); if (!(prtype & DATA_UNSIGNED)) { - val &= ~0x8000; + val &= ~0x8000U; fprintf(stderr, "%ld", (long) val); } else { fprintf(stderr, "%lu", (ulong) val); @@ -310,7 +344,7 @@ dfield_print_also_hex( val = mach_read_from_3(data); if (!(prtype & DATA_UNSIGNED)) { - val &= ~0x800000; + val &= ~0x800000U; fprintf(stderr, "%ld", (long) val); } else { fprintf(stderr, "%lu", (ulong) val); @@ -704,7 +738,7 @@ void dtuple_convert_back_big_rec( /*========================*/ dict_index_t* index MY_ATTRIBUTE((unused)), /*!< in: index */ - dtuple_t* entry, /*!< in: entry whose data was put to vector */ + dtuple_t* entry, /*!< in/out: entry whose data was put to vector */ big_rec_t* vector) /*!< in, own: big rec vector; it is freed in this function */ { @@ -772,6 +806,7 @@ dfield_t::clone(mem_heap_t* heap) const dfield_t* obj = static_cast<dfield_t*>( mem_heap_alloc(heap, sizeof(dfield_t) + size)); + ut_ad(len != UNIV_SQL_DEFAULT); obj->ext = ext; obj->len = len; obj->type = type; diff --git a/storage/innobase/data/data0type.cc b/storage/innobase/data/data0type.cc index 953a59102c0..896d1240340 100644 --- a/storage/innobase/data/data0type.cc +++ b/storage/innobase/data/data0type.cc @@ -26,6 +26,12 @@ Created 1/16/1996 Heikki Tuuri #include "data0type.h" +/** The DB_TRX_ID,DB_ROLL_PTR values for "no history is available" */ +const byte reset_trx_id[DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN] = { + 0, 0, 0, 0, 0, 0, + 0x80, 0, 0, 0, 0, 0, 0 +}; + /* At the database startup we store the default-charset collation number of this MySQL installation to this global variable. If we have < 4.1.2 format column definitions, or records in the insert buffer, we use this @@ -53,7 +59,7 @@ dtype_get_at_most_n_mbchars( const char* str) /*!< in: the string whose prefix length is being determined */ { - ut_a(data_len != UNIV_SQL_NULL); + ut_a(len_is_stored(data_len)); ut_ad(!mbmaxlen || !(prefix_len % mbmaxlen)); if (mbminlen != mbmaxlen) { diff --git a/storage/innobase/dict/dict0boot.cc b/storage/innobase/dict/dict0boot.cc index 8e4732f8214..8df93b97936 100644 --- a/storage/innobase/dict/dict0boot.cc +++ b/storage/innobase/dict/dict0boot.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2016, 2019, MariaDB Corporation. +Copyright (c) 2016, 2020, MariaDB Corporation. 
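The new reset_trx_id constant in data0type.cc spells out the 13 system-column bytes that mean "no history is available": a DB_TRX_ID of 0 followed by a DB_ROLL_PTR whose only set bit is the most significant one, the insert flag, so there is no undo record to follow. A small illustration of how such a constant can be used; has_history and sys_cols are hypothetical names, not part of the patch.

#include <cstring>

enum { TRX_ID_LEN = 6, ROLL_PTR_LEN = 7 };       // DATA_TRX_ID_LEN, DATA_ROLL_PTR_LEN

// The same byte pattern as reset_trx_id: transaction id 0 and a roll pointer
// with only the "insert" flag set, i.e. no undo log entry to follow.
static const unsigned char no_history[TRX_ID_LEN + ROLL_PTR_LEN] =
        { 0, 0, 0, 0, 0, 0, 0x80, 0, 0, 0, 0, 0, 0 };

// Hypothetical helper: does this clustered-index record still carry history,
// judging only by its DB_TRX_ID,DB_ROLL_PTR bytes?
inline bool has_history(const unsigned char* sys_cols)
{
    return std::memcmp(sys_cols, no_history, sizeof no_history) != 0;
}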
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -176,10 +176,11 @@ dict_hdr_create( ulint root_page_no; ut_ad(mtr); + compile_time_assert(DICT_HDR_SPACE == 0); /* Create the dictionary header file block in a new, allocated file segment in the system tablespace */ - block = fseg_create(DICT_HDR_SPACE, + block = fseg_create(fil_system.sys_space, DICT_HDR + DICT_HDR_FSEG_HEADER, mtr); ut_a(DICT_HDR_PAGE_NO == block->page.id.page_no()); @@ -207,8 +208,8 @@ dict_hdr_create( system tables */ /*--------------------------*/ - root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE, DICT_HDR_SPACE, - univ_page_size, DICT_TABLES_ID, + root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE, + fil_system.sys_space, DICT_TABLES_ID, dict_ind_redundant, NULL, mtr); if (root_page_no == FIL_NULL) { @@ -218,8 +219,8 @@ dict_hdr_create( mlog_write_ulint(dict_header + DICT_HDR_TABLES, root_page_no, MLOG_4BYTES, mtr); /*--------------------------*/ - root_page_no = btr_create(DICT_UNIQUE, DICT_HDR_SPACE, - univ_page_size, DICT_TABLE_IDS_ID, + root_page_no = btr_create(DICT_UNIQUE, + fil_system.sys_space, DICT_TABLE_IDS_ID, dict_ind_redundant, NULL, mtr); if (root_page_no == FIL_NULL) { @@ -229,8 +230,8 @@ dict_hdr_create( mlog_write_ulint(dict_header + DICT_HDR_TABLE_IDS, root_page_no, MLOG_4BYTES, mtr); /*--------------------------*/ - root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE, DICT_HDR_SPACE, - univ_page_size, DICT_COLUMNS_ID, + root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE, + fil_system.sys_space, DICT_COLUMNS_ID, dict_ind_redundant, NULL, mtr); if (root_page_no == FIL_NULL) { @@ -240,8 +241,8 @@ dict_hdr_create( mlog_write_ulint(dict_header + DICT_HDR_COLUMNS, root_page_no, MLOG_4BYTES, mtr); /*--------------------------*/ - root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE, DICT_HDR_SPACE, - univ_page_size, DICT_INDEXES_ID, + root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE, + fil_system.sys_space, DICT_INDEXES_ID, dict_ind_redundant, NULL, mtr); if (root_page_no == FIL_NULL) { @@ -251,8 +252,8 @@ dict_hdr_create( mlog_write_ulint(dict_header + DICT_HDR_INDEXES, root_page_no, MLOG_4BYTES, mtr); /*--------------------------*/ - root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE, DICT_HDR_SPACE, - univ_page_size, DICT_FIELDS_ID, + root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE, + fil_system.sys_space, DICT_FIELDS_ID, dict_ind_redundant, NULL, mtr); if (root_page_no == FIL_NULL) { @@ -279,7 +280,6 @@ dict_boot(void) dict_hdr_t* dict_hdr; mem_heap_t* heap; mtr_t mtr; - dberr_t error; /* Be sure these constants do not ever change. To avoid bloat, only check the *NUM_FIELDS* in each table */ @@ -328,15 +328,16 @@ dict_boot(void) /* Insert into the dictionary cache the descriptions of the basic system tables */ /*-------------------------*/ - table = dict_mem_table_create("SYS_TABLES", DICT_HDR_SPACE, 8, 0, 0, 0); + table = dict_mem_table_create("SYS_TABLES", fil_system.sys_space, + 8, 0, 0, 0); dict_mem_table_add_col(table, heap, "NAME", DATA_BINARY, 0, MAX_FULL_NAME_LEN); dict_mem_table_add_col(table, heap, "ID", DATA_BINARY, 0, 8); /* ROW_FORMAT = (N_COLS >> 31) ? COMPACT : REDUNDANT */ dict_mem_table_add_col(table, heap, "N_COLS", DATA_INT, 0, 4); - /* The low order bit of TYPE is always set to 1. If the format - is UNIV_FORMAT_B or higher, this field matches table->flags. */ + /* The low order bit of TYPE is always set to 1. 
If ROW_FORMAT + is not REDUNDANT or COMPACT, this field matches table->flags. */ dict_mem_table_add_col(table, heap, "TYPE", DATA_INT, 0, 4); dict_mem_table_add_col(table, heap, "MIX_ID", DATA_BINARY, 0, 0); /* MIX_LEN may contain additional table flags when @@ -347,35 +348,35 @@ dict_boot(void) table->id = DICT_TABLES_ID; - dict_table_add_to_cache(table, FALSE, heap); + dict_table_add_system_columns(table, heap); + table->add_to_cache(); dict_sys->sys_tables = table; mem_heap_empty(heap); - index = dict_mem_index_create("SYS_TABLES", "CLUST_IND", - DICT_HDR_SPACE, + index = dict_mem_index_create(table, "CLUST_IND", DICT_UNIQUE | DICT_CLUSTERED, 1); dict_mem_index_add_field(index, "NAME", 0); index->id = DICT_TABLES_ID; - - error = dict_index_add_to_cache(table, index, - mach_read_from_4(dict_hdr - + DICT_HDR_TABLES)); + dberr_t error = dict_index_add_to_cache( + index, mach_read_from_4(dict_hdr + DICT_HDR_TABLES)); ut_a(error == DB_SUCCESS); + ut_ad(!table->is_instant()); + table->indexes.start->n_core_null_bytes = UT_BITS_IN_BYTES( + unsigned(table->indexes.start->n_nullable)); /*-------------------------*/ - index = dict_mem_index_create("SYS_TABLES", "ID_IND", - DICT_HDR_SPACE, DICT_UNIQUE, 1); + index = dict_mem_index_create(table, "ID_IND", DICT_UNIQUE, 1); dict_mem_index_add_field(index, "ID", 0); index->id = DICT_TABLE_IDS_ID; error = dict_index_add_to_cache( - table, index, mach_read_from_4(dict_hdr + DICT_HDR_TABLE_IDS)); + index, mach_read_from_4(dict_hdr + DICT_HDR_TABLE_IDS)); ut_a(error == DB_SUCCESS); /*-------------------------*/ - table = dict_mem_table_create("SYS_COLUMNS", DICT_HDR_SPACE, + table = dict_mem_table_create("SYS_COLUMNS", fil_system.sys_space, 7, 0, 0, 0); dict_mem_table_add_col(table, heap, "TABLE_ID", DATA_BINARY, 0, 8); @@ -388,25 +389,27 @@ dict_boot(void) table->id = DICT_COLUMNS_ID; - dict_table_add_to_cache(table, FALSE, heap); + dict_table_add_system_columns(table, heap); + table->add_to_cache(); dict_sys->sys_columns = table; mem_heap_empty(heap); - index = dict_mem_index_create("SYS_COLUMNS", "CLUST_IND", - DICT_HDR_SPACE, + index = dict_mem_index_create(table, "CLUST_IND", DICT_UNIQUE | DICT_CLUSTERED, 2); dict_mem_index_add_field(index, "TABLE_ID", 0); dict_mem_index_add_field(index, "POS", 0); index->id = DICT_COLUMNS_ID; - error = dict_index_add_to_cache(table, index, - mach_read_from_4(dict_hdr - + DICT_HDR_COLUMNS)); + error = dict_index_add_to_cache( + index, mach_read_from_4(dict_hdr + DICT_HDR_COLUMNS)); ut_a(error == DB_SUCCESS); + ut_ad(!table->is_instant()); + table->indexes.start->n_core_null_bytes = UT_BITS_IN_BYTES( + unsigned(table->indexes.start->n_nullable)); /*-------------------------*/ - table = dict_mem_table_create("SYS_INDEXES", DICT_HDR_SPACE, + table = dict_mem_table_create("SYS_INDEXES", fil_system.sys_space, DICT_NUM_COLS__SYS_INDEXES, 0, 0, 0); dict_mem_table_add_col(table, heap, "TABLE_ID", DATA_BINARY, 0, 8); @@ -414,31 +417,43 @@ dict_boot(void) dict_mem_table_add_col(table, heap, "NAME", DATA_BINARY, 0, 0); dict_mem_table_add_col(table, heap, "N_FIELDS", DATA_INT, 0, 4); dict_mem_table_add_col(table, heap, "TYPE", DATA_INT, 0, 4); + /* SYS_INDEXES.SPACE is redundant and not being read; + SYS_TABLES.SPACE is being used instead. 
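The repeated n_core_null_bytes = UT_BITS_IN_BYTES(unsigned(n_nullable)) assignments in dict_boot() size the NULL bitmap of each system index: the number of nullable core fields rounded up to whole bytes. A one-line restatement of that arithmetic; bits_in_bytes is a stand-in for the UT_BITS_IN_BYTES macro, not the macro itself.

// ceil(n_bits / 8): an index with no nullable core fields needs no NULL
// bitmap, 1..8 nullable fields need one byte, 9..16 need two bytes, etc.
constexpr unsigned bits_in_bytes(unsigned n_bits) { return (n_bits + 7) / 8; }

static_assert(bits_in_bytes(0) == 0, "");
static_assert(bits_in_bytes(8) == 1, "");
static_assert(bits_in_bytes(9) == 2, "");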
*/ dict_mem_table_add_col(table, heap, "SPACE", DATA_INT, 0, 4); dict_mem_table_add_col(table, heap, "PAGE_NO", DATA_INT, 0, 4); dict_mem_table_add_col(table, heap, "MERGE_THRESHOLD", DATA_INT, 0, 4); table->id = DICT_INDEXES_ID; - dict_table_add_to_cache(table, FALSE, heap); + dict_table_add_system_columns(table, heap); + /* The column SYS_INDEXES.MERGE_THRESHOLD was "instantly" + added in MySQL 5.7 and MariaDB 10.2.2. Assign it DEFAULT NULL. + Because of file format compatibility, we must treat SYS_INDEXES + as a special case, relaxing some debug assertions + for DICT_INDEXES_ID. */ + dict_table_get_nth_col(table, DICT_COL__SYS_INDEXES__MERGE_THRESHOLD) + ->def_val.len = UNIV_SQL_NULL; + table->add_to_cache(); dict_sys->sys_indexes = table; mem_heap_empty(heap); - index = dict_mem_index_create("SYS_INDEXES", "CLUST_IND", - DICT_HDR_SPACE, + index = dict_mem_index_create(table, "CLUST_IND", DICT_UNIQUE | DICT_CLUSTERED, 2); dict_mem_index_add_field(index, "TABLE_ID", 0); dict_mem_index_add_field(index, "ID", 0); index->id = DICT_INDEXES_ID; - error = dict_index_add_to_cache(table, index, - mach_read_from_4(dict_hdr - + DICT_HDR_INDEXES)); + error = dict_index_add_to_cache( + index, mach_read_from_4(dict_hdr + DICT_HDR_INDEXES)); ut_a(error == DB_SUCCESS); + ut_ad(!table->is_instant()); + table->indexes.start->n_core_null_bytes = UT_BITS_IN_BYTES( + unsigned(table->indexes.start->n_nullable)); /*-------------------------*/ - table = dict_mem_table_create("SYS_FIELDS", DICT_HDR_SPACE, 3, 0, 0, 0); + table = dict_mem_table_create("SYS_FIELDS", fil_system.sys_space, + 3, 0, 0, 0); dict_mem_table_add_col(table, heap, "INDEX_ID", DATA_BINARY, 0, 8); dict_mem_table_add_col(table, heap, "POS", DATA_INT, 0, 4); @@ -446,22 +461,24 @@ dict_boot(void) table->id = DICT_FIELDS_ID; - dict_table_add_to_cache(table, FALSE, heap); + dict_table_add_system_columns(table, heap); + table->add_to_cache(); dict_sys->sys_fields = table; mem_heap_free(heap); - index = dict_mem_index_create("SYS_FIELDS", "CLUST_IND", - DICT_HDR_SPACE, + index = dict_mem_index_create(table, "CLUST_IND", DICT_UNIQUE | DICT_CLUSTERED, 2); dict_mem_index_add_field(index, "INDEX_ID", 0); dict_mem_index_add_field(index, "POS", 0); index->id = DICT_FIELDS_ID; - error = dict_index_add_to_cache(table, index, - mach_read_from_4(dict_hdr - + DICT_HDR_FIELDS)); + error = dict_index_add_to_cache( + index, mach_read_from_4(dict_hdr + DICT_HDR_FIELDS)); ut_a(error == DB_SUCCESS); + ut_ad(!table->is_instant()); + table->indexes.start->n_core_null_bytes = UT_BITS_IN_BYTES( + unsigned(table->indexes.start->n_nullable)); mtr_commit(&mtr); diff --git a/storage/innobase/dict/dict0crea.cc b/storage/innobase/dict/dict0crea.cc index 2976581c19f..eba70aca6d1 100644 --- a/storage/innobase/dict/dict0crea.cc +++ b/storage/innobase/dict/dict0crea.cc @@ -38,6 +38,7 @@ Created 1/8/1996 Heikki Tuuri #include "row0mysql.h" #include "pars0pars.h" #include "trx0roll.h" +#include "trx0rseg.h" #include "trx0undo.h" #include "ut0vec.h" #include "dict0priv.h" @@ -64,7 +65,9 @@ dict_create_sys_tables_tuple( ulint type; ut_ad(table); + ut_ad(!table->space || table->space->id == table->space_id); ut_ad(heap); + ut_ad(table->n_cols >= DATA_N_SYS_COLS); sys_tables = dict_sys->sys_tables; @@ -98,12 +101,11 @@ dict_create_sys_tables_tuple( /* If there is any virtual column, encode it in N_COLS */ mach_write_to_4(ptr, dict_table_encode_n_col( - static_cast<ulint>(table->n_def), - static_cast<ulint>(table->n_v_def)) - | ((table->flags & DICT_TF_COMPACT) << 31)); + 
ulint(table->n_cols - DATA_N_SYS_COLS), + ulint(table->n_v_def)) + | (ulint(table->flags & DICT_TF_COMPACT) << 31)); dfield_set_data(dfield, ptr, 4); - /* 5: TYPE (table flags) -----------------------------*/ dfield = dtuple_get_nth_field( entry, DICT_COL__SYS_TABLES__TYPE); @@ -147,7 +149,7 @@ dict_create_sys_tables_tuple( entry, DICT_COL__SYS_TABLES__SPACE); ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); - mach_write_to_4(ptr, table->space); + mach_write_to_4(ptr, table->space_id); dfield_set_data(dfield, ptr, 4); /*----------------------------------*/ @@ -190,7 +192,7 @@ dict_create_sys_columns_tuple( v_col_no = column->ind; } else { column = dict_table_get_nth_col(table, i); - ut_ad(!dict_col_is_virtual(column)); + ut_ad(!column->is_virtual()); } sys_columns = dict_sys->sys_columns; @@ -350,15 +352,12 @@ dict_build_table_def_step( que_thr_t* thr, /*!< in: query thread */ tab_node_t* node) /*!< in: table create node */ { - dict_table_t* table; - - table = node->table; - ut_ad(!dict_table_is_temporary(table)); - - trx_t* trx = thr_get_trx(thr); - dict_table_assign_new_id(table, trx); - ut_ad(mutex_own(&dict_sys->mutex)); + dict_table_t* table = node->table; + ut_ad(!table->is_temporary()); + ut_ad(!table->space); + ut_ad(table->space_id == ULINT_UNDEFINED); + dict_table_assign_new_id(table, thr_get_trx(thr)); /* Always set this bit for all new created tables */ DICT_TF2_FLAG_SET(table, DICT_TF2_FTS_AUX_HEX_NAME); @@ -369,12 +368,12 @@ dict_build_table_def_step( if (DICT_TF2_FLAG_IS_SET(table, DICT_TF2_USE_FILE_PER_TABLE)) { /* This table will need a new tablespace. */ - ut_ad(dict_table_get_format(table) <= UNIV_FORMAT_MAX); ut_ad(DICT_TF_GET_ZIP_SSIZE(table->flags) == 0 - || dict_table_get_format(table) >= UNIV_FORMAT_B); + || dict_table_has_atomic_blobs(table)); + trx_t* trx = thr_get_trx(thr); ut_ad(trx->table_id); mtr_t mtr; - trx_undo_t* undo = trx->rsegs.m_redo.insert_undo; + trx_undo_t* undo = trx->rsegs.m_redo.undo; if (undo && !undo->table_id && trx_get_dict_operation(trx) == TRX_DICT_OP_TABLE) { /* This must be a TRUNCATE operation where @@ -383,41 +382,43 @@ dict_build_table_def_step( associated with the new empty table, so that we can remove it on recovery. */ mtr.start(); - trx_undo_mark_as_dict(trx, undo, &mtr); + undo->table_id = trx->table_id; + undo->dict_operation = TRUE; + page_t* page = trx_undo_page_get( + page_id_t(trx->rsegs.m_redo.rseg->space->id, + undo->hdr_page_no), + &mtr); + mlog_write_ulint(page + undo->hdr_offset + + TRX_UNDO_DICT_TRANS, + TRUE, MLOG_1BYTE, &mtr); + mlog_write_ull(page + undo->hdr_offset + + TRX_UNDO_TABLE_ID, + trx->table_id, &mtr); mtr.commit(); log_write_up_to(mtr.commit_lsn(), true); } - ulint space; /* Get a new tablespace ID */ - dict_hdr_get_new_id(NULL, NULL, &space, table, false); + ulint space_id; + dict_hdr_get_new_id(NULL, NULL, &space_id, table, false); DBUG_EXECUTE_IF( "ib_create_table_fail_out_of_space_ids", - space = ULINT_UNDEFINED; + space_id = ULINT_UNDEFINED; ); - if (space == ULINT_UNDEFINED) { + if (space_id == ULINT_UNDEFINED) { return DB_ERROR; } - table->space = unsigned(space); /* Determine the tablespace flags. 
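The SYS_TABLES.N_COLS value written by dict_create_sys_tables_tuple() is now derived from table->n_cols - DATA_N_SYS_COLS (the user columns) instead of n_def, while the packing itself stays the same: dict_table_encode_n_col() combines the non-virtual and virtual column counts into one 32-bit value, and the expression above then sets bit 31 from DICT_TF_COMPACT. The sketch below restates that packing and its inverse; the exact 16-bit split is an assumption about dict_table_encode_n_col(), and encode_n_cols/decode_n_cols are illustrative names only.

#include <cstdint>

// Assumed layout: bits 0-15 user columns, bits 16-30 virtual columns,
// bit 31 set when the row format is not REDUNDANT.
inline uint32_t encode_n_cols(uint32_t n_cols, uint32_t n_v_cols,
                              bool not_redundant)
{
    return n_cols | (n_v_cols << 16) | (uint32_t(not_redundant) << 31);
}

inline void decode_n_cols(uint32_t v, uint32_t& n_cols, uint32_t& n_v_cols,
                          bool& not_redundant)
{
    n_cols        = v & 0xffffU;
    n_v_cols      = (v >> 16) & 0x7fffU;
    not_redundant = (v >> 31) != 0;
}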
*/ bool has_data_dir = DICT_TF_HAS_DATA_DIR(table->flags); ulint fsp_flags = dict_tf_to_fsp_flags(table->flags); - char* filepath; - - if (has_data_dir) { - ut_ad(table->data_dir_path); - filepath = fil_make_filepath( - table->data_dir_path, - table->name.m_name, IBD, true); - - } else { - /* Make the tablespace file in the default dir - using the table name */ - filepath = fil_make_filepath( - NULL, table->name.m_name, IBD, false); - } + ut_ad(!has_data_dir || table->data_dir_path); + char* filepath = has_data_dir + ? fil_make_filepath(table->data_dir_path, + table->name.m_name, IBD, true) + : fil_make_filepath(NULL, + table->name.m_name, IBD, false); /* We create a new single-table tablespace for the table. We initially let it be 4 pages: @@ -427,49 +428,36 @@ dict_build_table_def_step( - page 3 will contain the root of the clustered index of the table we create here. */ - dberr_t err = fil_ibd_create( - space, table->name.m_name, filepath, fsp_flags, - FIL_IBD_FILE_INITIAL_SIZE, node->mode, node->key_id); + dberr_t err; + table->space = fil_ibd_create( + space_id, table->name.m_name, filepath, fsp_flags, + FIL_IBD_FILE_INITIAL_SIZE, + node->mode, node->key_id, &err); ut_free(filepath); - if (err != DB_SUCCESS) { + if (!table->space) { + ut_ad(err != DB_SUCCESS); return err; } + table->space_id = space_id; mtr.start(); mtr.set_named_space(table->space); - fsp_header_init(table->space, FIL_IBD_FILE_INITIAL_SIZE, &mtr); - mtr.commit(); } else { ut_ad(dict_tf_get_rec_format(table->flags) != REC_FORMAT_COMPRESSED); - ut_ad(table->space == srv_sys_space.space_id()); + table->space = fil_system.sys_space; + table->space_id = TRX_SYS_SPACE; } ins_node_set_new_row(node->tab_def, dict_create_sys_tables_tuple(table, node->heap)); - return DB_SUCCESS; } -/***************************************************************//** -Builds a column definition to insert. */ -static -void -dict_build_col_def_step( -/*====================*/ - tab_node_t* node) /*!< in: table create node */ -{ - dtuple_t* row; - - row = dict_create_sys_columns_tuple(node->table, node->col_no, - node->heap); - ins_node_set_new_row(node->col_def, row); -} - /** Builds a SYS_VIRTUAL row definition to insert. 
@param[in] node table create node */ static @@ -499,19 +487,19 @@ dict_create_sys_indexes_tuple( tuple is allocated */ { dict_table_t* sys_indexes; - dict_table_t* table; dtuple_t* entry; dfield_t* dfield; byte* ptr; ut_ad(mutex_own(&dict_sys->mutex)); ut_ad(index); + ut_ad(index->table->space || index->table->file_unreadable); + ut_ad(!index->table->space + || index->table->space->id == index->table->space_id); ut_ad(heap); sys_indexes = dict_sys->sys_indexes; - table = dict_table_get_low(index->table_name); - entry = dtuple_create( heap, DICT_NUM_COLS__SYS_INDEXES + DATA_N_SYS_COLS); @@ -522,7 +510,7 @@ dict_create_sys_indexes_tuple( entry, DICT_COL__SYS_INDEXES__TABLE_ID); ptr = static_cast<byte*>(mem_heap_alloc(heap, 8)); - mach_write_to_8(ptr, table->id); + mach_write_to_8(ptr, index->table->id); dfield_set_data(dfield, ptr, 8); @@ -576,7 +564,7 @@ dict_create_sys_indexes_tuple( entry, DICT_COL__SYS_INDEXES__SPACE); ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); - mach_write_to_4(ptr, index->space); + mach_write_to_4(ptr, index->table->space_id); dfield_set_data(dfield, ptr, 4); @@ -744,7 +732,8 @@ dict_build_index_def_step( index = node->index; - table = dict_table_get_low(index->table_name); + table = index->table = node->table = dict_table_open_on_name( + node->table_name, TRUE, FALSE, DICT_ERR_IGNORE_NONE); if (table == NULL) { return(DB_TABLE_NOT_FOUND); @@ -755,8 +744,6 @@ dict_build_index_def_step( trx->table_id = table->id; } - node->table = table; - ut_ad((UT_LIST_GET_LEN(table->indexes) > 0) || dict_index_is_clust(index)); @@ -765,7 +752,6 @@ dict_build_index_def_step( /* Inherit the space id from the table; we store all indexes of a table in the same tablespace */ - index->space = table->space; node->page_no = FIL_NULL; row = dict_create_sys_indexes_tuple(index, node->heap); node->ind_row = row; @@ -776,6 +762,7 @@ dict_build_index_def_step( index->trx_id = trx->id; ut_ad(table->def_trx_id <= trx->id); table->def_trx_id = trx->id; + dict_table_close(table, true, false); return(DB_SUCCESS); } @@ -802,11 +789,6 @@ dict_build_index_def( dict_hdr_get_new_id(NULL, &index->id, NULL, table, false); - /* Inherit the space id from the table; we store all indexes of a - table in the same tablespace */ - - index->space = table->space; - /* Note that the index was created by this transaction. 
*/ index->trx_id = trx->id; } @@ -859,14 +841,7 @@ dict_create_index_tree_step( the index and its root address is written to the index entry in sys_indexes */ - mtr_start(&mtr); - - const bool missing = !index->is_readable() - || dict_table_is_discarded(index->table); - - if (!missing) { - mtr.set_named_space(index->space); - } + mtr.start(); search_tuple = dict_create_search_tuple(node->ind_row, node->heap); @@ -879,12 +854,13 @@ dict_create_index_tree_step( dberr_t err = DB_SUCCESS; - if (missing) { + if (!index->is_readable()) { node->page_no = FIL_NULL; } else { + index->set_modified(mtr); + node->page_no = btr_create( - index->type, index->space, - dict_table_page_size(index->table), + index->type, index->table->space, index->id, index, NULL, &mtr); if (node->page_no == FIL_NULL) { @@ -905,7 +881,7 @@ dict_create_index_tree_step( mlog_write_ulint(data, node->page_no, MLOG_4BYTES, &mtr); } - mtr_commit(&mtr); + mtr.commit(); return(err); } @@ -921,7 +897,6 @@ dict_create_index_tree_in_mem( const trx_t* trx) /*!< in: InnoDB transaction handle */ { mtr_t mtr; - ulint page_no; ut_ad(mutex_own(&dict_sys->mutex)); ut_ad(!(index->type & DICT_FTS)); @@ -929,28 +904,18 @@ dict_create_index_tree_in_mem( mtr_start(&mtr); mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO); - dberr_t err = DB_SUCCESS; - /* Currently this function is being used by temp-tables only. Import/Discard of temp-table is blocked and so this assert. */ ut_ad(index->is_readable()); - ut_ad(!dict_table_is_discarded(index->table)); + ut_ad(!(index->table->flags2 & DICT_TF2_DISCARDED)); - page_no = btr_create( - index->type, index->space, - dict_table_page_size(index->table), - index->id, index, NULL, &mtr); + index->page = btr_create(index->type, index->table->space, + index->id, index, NULL, &mtr); + mtr_commit(&mtr); - index->page = page_no; index->trx_id = trx->id; - if (page_no == FIL_NULL) { - err = DB_OUT_OF_FILE_SPACE; - } - - mtr_commit(&mtr); - - return(err); + return index->page == FIL_NULL ? DB_OUT_OF_FILE_SPACE : DB_SUCCESS; } /** Drop the index tree associated with a row in SYS_INDEXES table. @@ -992,7 +957,6 @@ bool dict_drop_index_tree(rec_t* rec, btr_pcur_t* pcur, trx_t* trx, mtr_t* mtr) const uint32_t space_id = mach_read_from_4(ptr); ut_ad(space_id < SRV_TMP_SPACE_ID); if (space_id != TRX_SYS_SPACE - && srv_safe_truncate && trx_get_dict_operation(trx) == TRX_DICT_OP_TABLE) { /* We are about to delete the entire .ibd file; do not bother to free pages inside it. */ @@ -1029,31 +993,6 @@ bool dict_drop_index_tree(rec_t* rec, btr_pcur_t* pcur, trx_t* trx, mtr_t* mtr) } /*******************************************************************//** -Drops the index tree but don't update SYS_INDEXES table. */ -void -dict_drop_index_tree_in_mem( -/*========================*/ - const dict_index_t* index, /*!< in: index */ - ulint page_no) /*!< in: index page-no */ -{ - ut_ad(mutex_own(&dict_sys->mutex)); - ut_ad(dict_table_is_temporary(index->table)); - - ulint root_page_no = page_no; - ulint space = index->space; - bool found; - const page_size_t page_size(fil_space_get_page_size(space, - &found)); - - /* If tree has already been freed or it is a single table - tablespace and the .ibd file is missing do nothing, - else free the all the pages */ - if (root_page_no != FIL_NULL && found) { - btr_free(page_id_t(space, root_page_no), page_size); - } -} - -/*******************************************************************//** Recreate the index tree associated with a row in SYS_INDEXES table. 
@return new root page number, or FIL_NULL on failure */ ulint @@ -1070,28 +1009,22 @@ dict_recreate_index_tree( { ut_ad(mutex_own(&dict_sys->mutex)); ut_a(!dict_table_is_comp(dict_sys->sys_indexes)); + ut_ad(!table->space || table->space->id == table->space_id); ulint len; - rec_t* rec = btr_pcur_get_rec(pcur); + const rec_t* rec = btr_pcur_get_rec(pcur); const byte* ptr = rec_get_nth_field_old( rec, DICT_FLD__SYS_INDEXES__PAGE_NO, &len); ut_ad(len == 4); - ulint root_page_no = mtr_read_ulint(ptr, MLOG_4BYTES, mtr); - - ptr = rec_get_nth_field_old(rec, DICT_FLD__SYS_INDEXES__SPACE, &len); + ut_ad(table->space_id == mach_read_from_4( + rec_get_nth_field_old(rec, DICT_FLD__SYS_INDEXES__SPACE, + &len))); ut_ad(len == 4); - ut_a(table->space == mtr_read_ulint(ptr, MLOG_4BYTES, mtr)); - - ulint space = table->space; - bool found; - const page_size_t page_size(fil_space_get_page_size(space, - &found)); - - if (!found) { + if (!table->space) { /* It is a single table tablespae and the .ibd file is missing: do nothing. */ @@ -1117,7 +1050,7 @@ dict_recreate_index_tree( mtr_commit(mtr); mtr_start(mtr); - mtr->set_named_space(space); + mtr->set_named_space(table->space); btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, mtr); /* Find the index corresponding to this SYS_INDEXES record. */ @@ -1125,15 +1058,12 @@ dict_recreate_index_tree( index != NULL; index = UT_LIST_GET_NEXT(indexes, index)) { if (index->id == index_id) { - if (index->type & DICT_FTS) { - return(FIL_NULL); - } else { - root_page_no = btr_create( - type, space, page_size, index_id, - index, NULL, mtr); - index->page = (unsigned int) root_page_no; - return(root_page_no); - } + ulint root_page_no = (index->type & DICT_FTS) + ? FIL_NULL + : btr_create(type, table->space, + index_id, index, NULL, mtr); + index->page = unsigned(root_page_no); + return root_page_no; } } @@ -1143,73 +1073,6 @@ dict_recreate_index_tree( return(FIL_NULL); } -/*******************************************************************//** -Truncates the index tree but don't update SYSTEM TABLES. -@return DB_SUCCESS or error */ -dberr_t -dict_truncate_index_tree_in_mem( -/*============================*/ - dict_index_t* index) /*!< in/out: index */ -{ - mtr_t mtr; - bool truncate; - ulint space = index->space; - - ut_ad(mutex_own(&dict_sys->mutex)); - ut_ad(dict_table_is_temporary(index->table)); - - ulint type = index->type; - ulint root_page_no = index->page; - - if (root_page_no == FIL_NULL) { - - /* The tree has been freed. */ - ib::warn() << "Trying to TRUNCATE a missing index of table " - << index->table->name << "!"; - - truncate = false; - } else { - truncate = true; - } - - bool found; - const page_size_t page_size(fil_space_get_page_size(space, - &found)); - - if (!found) { - - /* It is a single table tablespace and the .ibd file is - missing: do nothing */ - - ib::warn() - << "Trying to TRUNCATE a missing .ibd file of table " - << index->table->name << "!"; - } - - /* If table to truncate resides in its on own tablespace that will - be re-created on truncate then we can ignore freeing of existing - tablespace objects. */ - - if (truncate) { - btr_free(page_id_t(space, root_page_no), page_size); - } - - mtr_start(&mtr); - mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO); - - root_page_no = btr_create( - type, space, page_size, index->id, index, NULL, &mtr); - - DBUG_EXECUTE_IF("ib_err_trunc_temp_recreate_index", - root_page_no = FIL_NULL;); - - index->page = root_page_no; - - mtr_commit(&mtr); - - return(index->page == FIL_NULL ? 
DB_ERROR : DB_SUCCESS); -} - /*********************************************************************//** Creates a table create graph. @return own: table create node */ @@ -1253,6 +1116,7 @@ tab_create_graph_create( /** Creates an index create graph. @param[in] index index to create, built as a memory data structure +@param[in] table table name @param[in,out] heap heap where created @param[in] add_v new virtual columns added in the same clause with add index @@ -1260,6 +1124,7 @@ tab_create_graph_create( ind_node_t* ind_create_graph_create( dict_index_t* index, + const char* table, mem_heap_t* heap, const dict_add_v_col_t* add_v) { @@ -1272,6 +1137,8 @@ ind_create_graph_create( node->index = index; + node->table_name = table; + node->add_v = add_v; node->state = INDEX_BUILD_INDEX_DEF; @@ -1334,12 +1201,19 @@ dict_create_table_step( if (node->state == TABLE_BUILD_COL_DEF) { - if (node->col_no < (static_cast<ulint>(node->table->n_def) - + static_cast<ulint>(node->table->n_v_def))) { + if (node->col_no + DATA_N_SYS_COLS + < (static_cast<ulint>(node->table->n_def) + + static_cast<ulint>(node->table->n_v_def))) { - dict_build_col_def_step(node); + ulint i = node->col_no++; + if (i + DATA_N_SYS_COLS >= node->table->n_def) { + i += DATA_N_SYS_COLS; + } - node->col_no++; + ins_node_set_new_row( + node->col_def, + dict_create_sys_columns_tuple(node->table, i, + node->heap)); thr->run_node = node->col_def; @@ -1397,7 +1271,8 @@ dict_create_table_step( if (node->state == TABLE_ADD_TO_CACHE) { DBUG_EXECUTE_IF("ib_ddl_crash_during_create", DBUG_SUICIDE();); - dict_table_add_to_cache(node->table, TRUE, node->heap); + node->table->can_be_evicted = true; + node->table->add_to_cache(); err = DB_SUCCESS; } @@ -1482,16 +1357,25 @@ dict_create_index_step( } if (node->state == INDEX_ADD_TO_CACHE) { - err = dict_index_add_to_cache( - node->table, node->index, FIL_NULL, node->add_v); + ut_ad(node->index->table == node->table); + err = dict_index_add_to_cache(node->index, FIL_NULL, + node->add_v); ut_ad((node->index == NULL) == (err != DB_SUCCESS)); - if (err != DB_SUCCESS) { - + if (!node->index) { goto function_exit; } + ut_ad(!node->index->is_instant()); + ut_ad(node->index->n_core_null_bytes + == ((dict_index_is_clust(node->index) + && node->table->supports_instant()) + ? 
dict_index_t::NO_CORE_NULL_BYTES + : UT_BITS_IN_BYTES( + unsigned(node->index->n_nullable)))); + node->index->n_core_null_bytes = UT_BITS_IN_BYTES( + unsigned(node->index->n_nullable)); node->state = INDEX_CREATE_INDEX_TREE; } @@ -1646,7 +1530,7 @@ dict_create_or_check_foreign_constraint_tables(void) return(DB_READ_ONLY); } - trx = trx_allocate_for_mysql(); + trx = trx_create(); trx_set_dict_operation(trx, TRX_DICT_OP_TABLE); @@ -1739,7 +1623,7 @@ dict_create_or_check_foreign_constraint_tables(void) row_mysql_unlock_data_dictionary(trx); - trx_free_for_mysql(trx); + trx->free(); srv_file_per_table = srv_file_per_table_backup; @@ -1785,7 +1669,7 @@ dict_create_or_check_sys_virtual() return(DB_READ_ONLY); } - trx = trx_allocate_for_mysql(); + trx = trx_create(); trx_set_dict_operation(trx, TRX_DICT_OP_TABLE); @@ -1840,7 +1724,7 @@ dict_create_or_check_sys_virtual() row_mysql_unlock_data_dictionary(trx); - trx_free_for_mysql(trx); + trx->free(); srv_file_per_table = srv_file_per_table_backup; @@ -2079,7 +1963,8 @@ dict_create_add_foreign_to_dictionary( foreign->referenced_table_name); pars_info_add_int4_literal(info, "n_cols", - foreign->n_fields + (foreign->type << 24)); + ulint(foreign->n_fields) + | (ulint(foreign->type) << 24)); DBUG_PRINT("dict_create_add_foreign_to_dictionary", ("'%s', '%s', '%s', %d", foreign->id, name, @@ -2311,7 +2196,7 @@ dict_create_or_check_sys_tablespace(void) return(DB_READ_ONLY); } - trx = trx_allocate_for_mysql(); + trx = trx_create(); trx_set_dict_operation(trx, TRX_DICT_OP_TABLE); @@ -2372,7 +2257,7 @@ dict_create_or_check_sys_tablespace(void) row_mysql_unlock_data_dictionary(trx); - trx_free_for_mysql(trx); + trx->free(); srv_file_per_table = srv_file_per_table_backup; diff --git a/storage/innobase/dict/dict0defrag_bg.cc b/storage/innobase/dict/dict0defrag_bg.cc index 73f55cc8667..7c6f5d75b5d 100644 --- a/storage/innobase/dict/dict0defrag_bg.cc +++ b/storage/innobase/dict/dict0defrag_bg.cc @@ -281,11 +281,11 @@ dict_stats_save_defrag_stats( mtr_t mtr; ulint n_leaf_pages; ulint n_leaf_reserved; - mtr_start(&mtr); - mtr_s_lock(dict_index_get_lock(index), &mtr); + mtr.start(); + mtr_s_lock_index(index, &mtr); n_leaf_reserved = btr_get_size_and_reserved(index, BTR_N_LEAF_PAGES, &n_leaf_pages, &mtr); - mtr_commit(&mtr); + mtr.commit(); if (n_leaf_reserved == ULINT_UNDEFINED) { // The index name is different during fast index creation, diff --git a/storage/innobase/dict/dict0dict.cc b/storage/innobase/dict/dict0dict.cc index d6330cb5906..1f05eb80e5d 100644 --- a/storage/innobase/dict/dict0dict.cc +++ b/storage/innobase/dict/dict0dict.cc @@ -2,7 +2,7 @@ Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. -Copyright (c) 2013, 2020, MariaDB Corporation. +Copyright (c) 2013, 2021, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -117,14 +117,12 @@ static bool innodb_index_stats_not_found_reported = false; /*******************************************************************//** Tries to find column names for the index and sets the col field of the index. 
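The SYS_FOREIGN hunk above packs the constraint's field count and its type flags into the single 4-byte "n_cols" value, and now widens both operands to ulint before shifting instead of relying on implicit int promotion. A standalone sketch of that packing scheme, with uint32_t standing in for ulint and made-up helper names:

#include <cassert>
#include <cstdint>

/* Pack a foreign-key field count (low 24 bits) and a type bitmask
   (high 8 bits) into a single 32-bit dictionary value.  Widening both
   operands before the shift keeps the arithmetic in unsigned 32-bit
   range instead of depending on int promotion. */
static uint32_t pack_n_cols(unsigned n_fields, unsigned type)
{
    return uint32_t(n_fields) | (uint32_t(type) << 24);
}

static unsigned unpack_n_fields(uint32_t v) { return v & 0xffffffU; }
static unsigned unpack_type(uint32_t v)     { return v >> 24; }

int main()
{
    uint32_t v = pack_n_cols(3, 0x30);
    assert(unpack_n_fields(v) == 3);
    assert(unpack_type(v) == 0x30);
    return 0;
}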
-@param[in] table table @param[in] index index @param[in] add_v new virtual columns added along with an add index call -@return TRUE if the column names were found */ +@return whether the column names were found */ static -ibool +bool dict_index_find_cols( - const dict_table_t* table, dict_index_t* index, const dict_add_v_col_t* add_v); /*******************************************************************//** @@ -135,7 +133,6 @@ static dict_index_t* dict_index_build_internal_clust( /*============================*/ - const dict_table_t* table, /*!< in: table */ dict_index_t* index); /*!< in: user representation of a clustered index */ /*******************************************************************//** @@ -146,7 +143,6 @@ static dict_index_t* dict_index_build_internal_non_clust( /*================================*/ - const dict_table_t* table, /*!< in: table */ dict_index_t* index); /*!< in: user representation of a non-clustered index */ /**********************************************************************//** @@ -156,7 +152,6 @@ static dict_index_t* dict_index_build_internal_fts( /*==========================*/ - dict_table_t* table, /*!< in: table */ dict_index_t* index); /*!< in: user representation of an FTS index */ /**********************************************************************//** @@ -248,7 +243,7 @@ dict_get_db_name_len( const char* s; s = strchr(name, '/'); ut_a(s); - return(s - name); + return ulint(s - name); } /** Reserve the dictionary system mutex. */ @@ -281,7 +276,7 @@ dict_table_try_drop_aborted( { trx_t* trx; - trx = trx_allocate_for_background(); + trx = trx_create(); trx->op_info = "try to drop any indexes after an aborted index creation"; row_mysql_lock_data_dictionary(trx); trx_set_dict_operation(trx, TRX_DICT_OP_INDEX); @@ -304,7 +299,7 @@ dict_table_try_drop_aborted( } row_mysql_unlock_data_dictionary(trx); - trx_free_for_background(trx); + trx->free(); } /**********************************************************************//** @@ -472,26 +467,28 @@ dict_table_has_column( return(col_max); } -/**********************************************************************//** -Returns a column's name. -@return column name. NOTE: not guaranteed to stay valid if table is -modified in any way (columns added, etc.). */ -const char* -dict_table_get_col_name( -/*====================*/ - const dict_table_t* table, /*!< in: table */ - ulint col_nr) /*!< in: column number */ +/** Retrieve the column name. 
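dict_get_db_name_len() above now returns ulint(s - name), spelling out the conversion of the signed pointer difference to the unsigned return type. A minimal sketch of the same helper, with a local ulint typedef as a stand-in for the InnoDB one:

#include <cassert>
#include <cstring>

typedef unsigned long ulint;   /* stand-in for the InnoDB typedef */

/* Length of the database part of a "database/table" name; the cast
   makes the signed-to-unsigned conversion of the pointer difference
   explicit. */
static ulint db_name_len(const char* name)
{
    const char* s = strchr(name, '/');
    assert(s != NULL);
    return ulint(s - name);
}

int main()
{
    assert(db_name_len("test/t1") == 4);
    return 0;
}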
+@param[in] table the table of this column */ +const char* dict_col_t::name(const dict_table_t& table) const { - ulint i; - const char* s; + ut_ad(table.magic_n == DICT_TABLE_MAGIC_N); - ut_ad(table); - ut_ad(col_nr < table->n_def); - ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + size_t col_nr; + const char *s; + + if (is_virtual()) { + col_nr = size_t(reinterpret_cast<const dict_v_col_t*>(this) + - table.v_cols); + ut_ad(col_nr < table.n_v_def); + s = table.v_col_names; + } else { + col_nr = size_t(this - table.cols); + ut_ad(col_nr < table.n_def); + s = table.col_names; + } - s = table->col_names; if (s) { - for (i = 0; i < col_nr; i++) { + for (size_t i = 0; i < col_nr; i++) { s += strlen(s) + 1; } } @@ -715,7 +712,7 @@ dict_index_get_nth_col_or_prefix_pos( @param[in] n column number @param[in] is_virtual whether it is a virtual col @return TRUE if contains the column or its prefix */ -ibool +bool dict_index_contains_col_or_prefix( const dict_index_t* index, ulint n, @@ -746,11 +743,11 @@ dict_index_contains_col_or_prefix( if (col == field->col) { - return(TRUE); + return(true); } } - return(FALSE); + return(false); } /********************************************************************//** @@ -923,7 +920,7 @@ dict_init(void) &dict_operation_lock, SYNC_DICT_OPERATION); if (!srv_read_only_mode) { - dict_foreign_err_file = os_file_create_tmpfile(NULL); + dict_foreign_err_file = os_file_create_tmpfile(); ut_a(dict_foreign_err_file); } @@ -1060,67 +1057,47 @@ dict_table_add_system_columns( DATA_ROW_ID | DATA_NOT_NULL, DATA_ROW_ID_LEN); -#if DATA_ROW_ID != 0 -#error "DATA_ROW_ID != 0" -#endif + compile_time_assert(DATA_ROW_ID == 0); dict_mem_table_add_col(table, heap, "DB_TRX_ID", DATA_SYS, DATA_TRX_ID | DATA_NOT_NULL, DATA_TRX_ID_LEN); -#if DATA_TRX_ID != 1 -#error "DATA_TRX_ID != 1" -#endif - + compile_time_assert(DATA_TRX_ID == 1); dict_mem_table_add_col(table, heap, "DB_ROLL_PTR", DATA_SYS, DATA_ROLL_PTR | DATA_NOT_NULL, DATA_ROLL_PTR_LEN); -#if DATA_ROLL_PTR != 2 -#error "DATA_ROLL_PTR != 2" -#endif + compile_time_assert(DATA_ROLL_PTR == 2); /* This check reminds that if a new system column is added to the program, it should be dealt with here */ -#if DATA_N_SYS_COLS != 3 -#error "DATA_N_SYS_COLS != 3" -#endif + compile_time_assert(DATA_N_SYS_COLS == 3); } -/**********************************************************************//** -Adds a table object to the dictionary cache. 
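dict_col_t::name() above recovers the column's position with pointer arithmetic against the table's column array and then walks the NUL-separated name buffer. A simplified, self-contained sketch of that lookup; the Column/Table types and the sample data are invented for illustration:

#include <cassert>
#include <cstddef>
#include <cstring>

struct Column { int dummy; };

struct Table {
    const Column* cols;      /* array of n_cols column descriptors */
    size_t        n_cols;
    const char*   col_names; /* "a\0bb\0ccc\0": one NUL-terminated name per column */
};

/* Return the name of *col, which must point into table.cols[]. */
static const char* col_name(const Table& table, const Column* col)
{
    size_t col_nr = size_t(col - table.cols); /* position from pointer arithmetic */
    assert(col_nr < table.n_cols);

    const char* s = table.col_names;
    for (size_t i = 0; i < col_nr; i++) {
        s += strlen(s) + 1;                   /* skip one packed name */
    }
    return s;
}

int main()
{
    Column cols[3] = {};
    Table t = { cols, 3, "a\0bb\0ccc\0" };
    assert(!strcmp(col_name(t, &cols[0]), "a"));
    assert(!strcmp(col_name(t, &cols[2]), "ccc"));
    return 0;
}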
*/ +/** Add the table definition to the data dictionary cache */ void -dict_table_add_to_cache( -/*====================*/ - dict_table_t* table, /*!< in: table */ - bool can_be_evicted, /*!< in: whether can be evicted */ - mem_heap_t* heap) /*!< in: temporary heap */ +dict_table_t::add_to_cache() { - ulint fold; - ulint id_fold; - ut_ad(dict_lru_validate()); ut_ad(mutex_own(&dict_sys->mutex)); - dict_table_add_system_columns(table, heap); - - mysql_mutex_init(0, &table->autoinc_mutex, NULL); - - table->cached = TRUE; + mysql_mutex_init(0, &autoinc_mutex, NULL); + cached = TRUE; - fold = ut_fold_string(table->name.m_name); - id_fold = ut_fold_ull(table->id); + ulint fold = ut_fold_string(name.m_name); + ulint id_fold = ut_fold_ull(id); /* Look for a table with the same name: error if such exists */ { dict_table_t* table2; HASH_SEARCH(name_hash, dict_sys->table_hash, fold, dict_table_t*, table2, ut_ad(table2->cached), - !strcmp(table2->name.m_name, table->name.m_name)); + !strcmp(table2->name.m_name, name.m_name)); ut_a(table2 == NULL); #ifdef UNIV_DEBUG /* Look for the same table pointer with a different name */ HASH_SEARCH_ALL(name_hash, dict_sys->table_hash, dict_table_t*, table2, ut_ad(table2->cached), - table2 == table); + table2 == this); ut_ad(table2 == NULL); #endif /* UNIV_DEBUG */ } @@ -1130,32 +1107,30 @@ dict_table_add_to_cache( dict_table_t* table2; HASH_SEARCH(id_hash, dict_sys->table_id_hash, id_fold, dict_table_t*, table2, ut_ad(table2->cached), - table2->id == table->id); + table2->id == id); ut_a(table2 == NULL); #ifdef UNIV_DEBUG /* Look for the same table pointer with a different id */ HASH_SEARCH_ALL(id_hash, dict_sys->table_id_hash, dict_table_t*, table2, ut_ad(table2->cached), - table2 == table); + table2 == this); ut_ad(table2 == NULL); #endif /* UNIV_DEBUG */ } /* Add table to hash table of tables */ HASH_INSERT(dict_table_t, name_hash, dict_sys->table_hash, fold, - table); + this); /* Add table to hash table of tables based on table id */ HASH_INSERT(dict_table_t, id_hash, dict_sys->table_id_hash, id_fold, - table); + this); - table->can_be_evicted = can_be_evicted; - - if (table->can_be_evicted) { - UT_LIST_ADD_FIRST(dict_sys->table_LRU, table); + if (can_be_evicted) { + UT_LIST_ADD_FIRST(dict_sys->table_LRU, this); } else { - UT_LIST_ADD_FIRST(dict_sys->table_non_LRU, table); + UT_LIST_ADD_FIRST(dict_sys->table_non_LRU, this); } ut_ad(dict_lru_validate()); @@ -1452,7 +1427,6 @@ dict_table_rename_in_cache( { dberr_t err; dict_foreign_t* foreign; - dict_index_t* index; ulint fold; char old_name[MAX_FULL_NAME_LEN + 1]; os_file_type_t ftype; @@ -1484,12 +1458,12 @@ dict_table_rename_in_cache( /* If the table is stored in a single-table tablespace, rename the .ibd file and rebuild the .isl file if needed. */ - if (dict_table_is_discarded(table)) { + if (!table->space) { bool exists; char* filepath; ut_ad(dict_table_is_file_per_table(table)); - ut_ad(!dict_table_is_temporary(table)); + ut_ad(!table->is_temporary()); /* Make sure the data_dir_path is set. */ dict_get_and_save_data_dir_path(table, true); @@ -1509,8 +1483,7 @@ dict_table_rename_in_cache( return(DB_OUT_OF_MEMORY); } - fil_delete_tablespace(table->space, - dict_table_is_discarded(table)); + fil_delete_tablespace(table->space_id, !table->space); /* Delete any temp file hanging around. 
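dict_table_t::add_to_cache() above registers the table under two independent keys, a name hash and an id hash, and then places it on either the evictable LRU list or the non-evictable list. A heavily simplified sketch of that double-indexed cache using standard containers; the types and names here are illustrative only:

#include <cassert>
#include <cstdint>
#include <list>
#include <string>
#include <unordered_map>

struct CachedTable {
    std::string name;
    uint64_t    id;
    bool        can_be_evicted;
};

struct DictCache {
    std::unordered_map<std::string, CachedTable*> by_name;
    std::unordered_map<uint64_t, CachedTable*>    by_id;
    std::list<CachedTable*> lru;      /* evictable tables, most recent first */
    std::list<CachedTable*> non_lru;  /* tables that must stay cached */

    void add(CachedTable* t)
    {
        /* A table must not already be registered under either key. */
        assert(by_name.count(t->name) == 0);
        assert(by_id.count(t->id) == 0);
        by_name.emplace(t->name, t);
        by_id.emplace(t->id, t);
        (t->can_be_evicted ? lru : non_lru).push_front(t);
    }
};

int main()
{
    DictCache cache;
    CachedTable t{"test/t1", 42, true};
    cache.add(&t);
    assert(cache.by_id.at(42) == &t);
    assert(cache.lru.front() == &t);
    return 0;
}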
*/ if (os_file_status(filepath, &exists, &ftype) @@ -1523,10 +1496,11 @@ dict_table_rename_in_cache( ut_free(filepath); } else if (dict_table_is_file_per_table(table)) { - char* new_path = NULL; - char* old_path = fil_space_get_first_path(table->space); + char* new_path; + const char* old_path = UT_LIST_GET_FIRST(table->space->chain) + ->name; - ut_ad(!dict_table_is_temporary(table)); + ut_ad(!table->is_temporary()); if (DICT_TF_HAS_DATA_DIR(table->flags)) { new_path = os_file_make_new_pathname( @@ -1536,7 +1510,6 @@ dict_table_rename_in_cache( if (err != DB_SUCCESS) { ut_free(new_path); - ut_free(old_path); return(DB_TABLESPACE_EXISTS); } } else { @@ -1545,32 +1518,19 @@ dict_table_rename_in_cache( } /* New filepath must not exist. */ - err = fil_rename_tablespace_check( - table->space, old_path, new_path, false, - replace_new_file); - if (err != DB_SUCCESS) { - ut_free(old_path); - ut_free(new_path); - return(err); - } - - fil_name_write_rename(table->space, old_path, new_path); - - bool success = fil_rename_tablespace( - table->space, old_path, new_name, new_path); - - ut_free(old_path); + err = table->space->rename(new_name, new_path, true, + replace_new_file); ut_free(new_path); /* If the tablespace is remote, a new .isl file was created If success, delete the old one. If not, delete the new one. */ if (DICT_TF_HAS_DATA_DIR(table->flags)) { RemoteDatafile::delete_link_file( - success ? old_name : new_name); + err == DB_SUCCESS ? old_name : new_name); } - if (!success) { - return(DB_ERROR); + if (err != DB_SUCCESS) { + return err; } } @@ -1593,14 +1553,6 @@ dict_table_rename_in_cache( HASH_INSERT(dict_table_t, name_hash, dict_sys->table_hash, fold, table); - /* Update the table_name field in indexes */ - for (index = dict_table_get_first_index(table); - index != NULL; - index = dict_table_get_next_index(index)) { - - index->table_name = table->name.m_name; - } - if (!rename_also_foreigns) { /* In ALTER TABLE we think of the rename table operation in the direction table -> temporary table (#sql...) @@ -1939,7 +1891,7 @@ dict_table_remove_from_cache_low( /* When evicting the table definition, drop the orphan indexes from the data dictionary and free the index pages. */ - trx_t* trx = trx_allocate_for_background(); + trx_t* trx = trx_create(); ut_ad(mutex_own(&dict_sys->mutex)); ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_X)); @@ -1951,7 +1903,7 @@ dict_table_remove_from_cache_low( row_merge_drop_indexes_dict(trx, table->id); trx_commit_for_mysql(trx); trx->dict_operation_lock_mode = 0; - trx_free_for_background(trx); + trx->free(); } /* Free virtual column template if any */ @@ -1997,19 +1949,13 @@ dict_col_name_is_reserved( /*======================*/ const char* name) /*!< in: column name */ { - /* This check reminds that if a new system column is added to - the program, it should be dealt with here. */ -#if DATA_N_SYS_COLS != 3 -#error "DATA_N_SYS_COLS != 3" -#endif - static const char* reserved_names[] = { "DB_ROW_ID", "DB_TRX_ID", "DB_ROLL_PTR" }; - ulint i; + compile_time_assert(UT_ARR_SIZE(reserved_names) == DATA_N_SYS_COLS); - for (i = 0; i < UT_ARR_SIZE(reserved_names); i++) { + for (ulint i = 0; i < UT_ARR_SIZE(reserved_names); i++) { if (innobase_strcasecmp(name, reserved_names[i]) == 0) { return(TRUE); @@ -2022,8 +1968,8 @@ dict_col_name_is_reserved( /** Clears the virtual column's index list before index is being freed. 
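dict_col_name_is_reserved() above now ties the reserved_names[] array to DATA_N_SYS_COLS with a compile-time assertion instead of a separate preprocessor #error. A sketch of the same pattern; strcasecmp and the local constant are stand-ins for the portability wrappers and dictionary constants used in the tree:

#include <cassert>
#include <cstddef>
#include <strings.h>   /* strcasecmp(); the real code uses a portable wrapper */

static const char* const reserved_names[] = {
    "DB_ROW_ID", "DB_TRX_ID", "DB_ROLL_PTR"
};

static const size_t N_SYS_COLS = 3;   /* illustrative; DATA_N_SYS_COLS in InnoDB */

/* The array and the constant must stay in sync when a system column is
   added; the assertion fails to compile otherwise. */
static_assert(sizeof(reserved_names) / sizeof(reserved_names[0]) == N_SYS_COLS,
              "reserved_names[] must list every system column");

static bool col_name_is_reserved(const char* name)
{
    for (size_t i = 0; i < N_SYS_COLS; i++) {
        if (strcasecmp(name, reserved_names[i]) == 0) {
            return true;
        }
    }
    return false;
}

int main()
{
    assert(col_name_is_reserved("db_trx_id"));
    assert(!col_name_is_reserved("c1"));
    return 0;
}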
@param[in] index Index being freed */ -void -dict_index_remove_from_v_col_list(dict_index_t* index) { +void dict_index_remove_from_v_col_list(dict_index_t* index) +{ /* Index is not completely formed */ if (!index->cached) { return; @@ -2059,7 +2005,6 @@ dict_index_remove_from_v_col_list(dict_index_t* index) { /** Adds an index to the dictionary cache, with possible indexing newly added column. -@param[in,out] table table on which the index is @param[in,out] index index; NOTE! The index memory object is freed in this function! @param[in] page_no root page number of the index @@ -2067,7 +2012,6 @@ added column. @return DB_SUCCESS, or DB_CORRUPTION */ dberr_t dict_index_add_to_cache( - dict_table_t* table, dict_index_t*& index, ulint page_no, const dict_add_v_col_t* add_v) @@ -2084,9 +2028,10 @@ dict_index_add_to_cache( ut_d(mem_heap_validate(index->heap)); ut_a(!dict_index_is_clust(index) - || UT_LIST_GET_LEN(table->indexes) == 0); + || UT_LIST_GET_LEN(index->table->indexes) == 0); + ut_ad(dict_index_is_clust(index) || !index->table->no_rollback()); - if (!dict_index_find_cols(table, index, add_v)) { + if (!dict_index_find_cols(index, add_v)) { dict_mem_index_free(index); index = NULL; @@ -2096,12 +2041,14 @@ dict_index_add_to_cache( /* Build the cache internal representation of the index, containing also the added system fields */ - if (index->type == DICT_FTS) { - new_index = dict_index_build_internal_fts(table, index); - } else if (dict_index_is_clust(index)) { - new_index = dict_index_build_internal_clust(table, index); + if (dict_index_is_clust(index)) { + new_index = dict_index_build_internal_clust(index); } else { - new_index = dict_index_build_internal_non_clust(table, index); + new_index = (index->type & DICT_FTS) + ? dict_index_build_internal_fts(index) + : dict_index_build_internal_non_clust(index); + new_index->n_core_null_bytes = UT_BITS_IN_BYTES( + unsigned(new_index->n_nullable)); } /* Set the n_fields value in new_index to the actual defined @@ -2173,9 +2120,7 @@ dict_index_add_to_cache( /* Add the new index as the last index for the table */ - UT_LIST_ADD_LAST(table->indexes, new_index); - new_index->table = table; - new_index->table_name = table->name.m_name; + UT_LIST_ADD_LAST(new_index->table->indexes, new_index); #ifdef BTR_CUR_ADAPT new_index->search_info = btr_search_info_create(new_index->heap); #endif /* BTR_CUR_ADAPT */ @@ -2184,6 +2129,8 @@ dict_index_add_to_cache( rw_lock_create(index_tree_rw_lock_key, &new_index->lock, SYNC_INDEX_TREE); + new_index->n_core_fields = new_index->n_fields; + dict_mem_index_free(index); index = new_index; return DB_SUCCESS; @@ -2271,18 +2218,17 @@ index. 
@param[in] table table @param[in,out] index index @param[in] add_v new virtual columns added along with an add index call -@return TRUE if the column names were found */ +@return whether the column names were found */ static -ibool +bool dict_index_find_cols( - const dict_table_t* table, dict_index_t* index, const dict_add_v_col_t* add_v) { std::vector<ulint, ut_allocator<ulint> > col_added; std::vector<ulint, ut_allocator<ulint> > v_col_added; - ut_ad(table != NULL && index != NULL); + const dict_table_t* table = index->table; ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); ut_ad(mutex_own(&dict_sys->mutex)); @@ -2380,7 +2326,7 @@ dict_index_add_col( dict_field_t* field; const char* col_name; - if (dict_col_is_virtual(col)) { + if (col->is_virtual()) { dict_v_col_t* v_col = reinterpret_cast<dict_v_col_t*>(col); /* When v_col->v_indexes==NULL, @@ -2390,11 +2336,8 @@ dict_index_add_col( if (v_col->v_indexes != NULL) { /* Register the index with the virtual column index list */ - struct dict_v_idx_t new_idx - = {index, index->n_def}; - - v_col->v_indexes->push_back(new_idx); - + v_col->v_indexes->push_back( + dict_v_idx_t(index, index->n_def)); } col_name = dict_table_get_v_col_name_mysql( @@ -2405,7 +2348,7 @@ dict_index_add_col( dict_mem_index_add_field(index, col_name, prefix_len); - field = dict_index_get_nth_field(index, index->n_def - 1); + field = dict_index_get_nth_field(index, unsigned(index->n_def) - 1); field->col = col; field->fixed_len = static_cast<unsigned int>( @@ -2423,12 +2366,11 @@ dict_index_add_col( if (field->fixed_len > DICT_MAX_FIXED_COL_LEN) { field->fixed_len = 0; } -#if DICT_MAX_FIXED_COL_LEN != 768 + /* The comparison limit above must be constant. If it were changed, the disk format of some fixed-length columns would change, which would be a disaster. 
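The index-construction hunks above count the nullable fields into index->n_nullable and then size the record's NULL bitmap with UT_BITS_IN_BYTES, i.e. the bit count rounded up to whole bytes. A tiny sketch of that computation; the helper name is local and the values are examples:

#include <cassert>

/* Number of bytes needed to store b bits, rounded up; this mirrors the
   intent of UT_BITS_IN_BYTES(b). */
static unsigned bits_in_bytes(unsigned b)
{
    return (b + 7) / 8;
}

int main()
{
    assert(bits_in_bytes(0) == 0);   /* no nullable columns: no NULL bitmap */
    assert(bits_in_bytes(1) == 1);
    assert(bits_in_bytes(8) == 1);
    assert(bits_in_bytes(9) == 2);   /* nine nullable columns need two bytes */
    return 0;
}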
*/ -# error "DICT_MAX_FIXED_COL_LEN != 768" -#endif + compile_time_assert(DICT_MAX_FIXED_COL_LEN == 768); if (!(col->prtype & DATA_NOT_NULL)) { index->n_nullable++; @@ -2442,8 +2384,7 @@ void dict_index_copy( /*============*/ dict_index_t* index1, /*!< in: index to copy to */ - dict_index_t* index2, /*!< in: index to copy from */ - const dict_table_t* table, /*!< in: table */ + const dict_index_t* index2, /*!< in: index to copy from */ ulint start, /*!< in: first position to copy */ ulint end) /*!< in: last position to copy */ { @@ -2456,7 +2397,7 @@ dict_index_copy( field = dict_index_get_nth_field(index2, i); - dict_index_add_col(index1, table, field->col, + dict_index_add_col(index1, index2->table, field->col, field->prefix_len); } } @@ -2553,28 +2494,26 @@ static dict_index_t* dict_index_build_internal_clust( /*============================*/ - const dict_table_t* table, /*!< in: table */ dict_index_t* index) /*!< in: user representation of a clustered index */ { + dict_table_t* table = index->table; dict_index_t* new_index; dict_field_t* field; ulint trx_id_pos; ulint i; ibool* indexed; - ut_ad(table && index); ut_ad(dict_index_is_clust(index)); ut_ad(!dict_index_is_ibuf(index)); ut_ad(mutex_own(&dict_sys->mutex)); - ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); /* Create a new index object with certainly enough fields */ - new_index = dict_mem_index_create(table->name.m_name, - index->name, table->space, + new_index = dict_mem_index_create(index->table, index->name, index->type, - index->n_fields + table->n_cols); + unsigned(index->n_fields + + table->n_cols)); /* Copy other relevant data from the old index struct to the new struct: it inherits the values */ @@ -2584,7 +2523,7 @@ dict_index_build_internal_clust( new_index->id = index->id; /* Copy the fields of index */ - dict_index_copy(new_index, index, table, 0, index->n_fields); + dict_index_copy(new_index, index, 0, index->n_fields); if (dict_index_is_unique(index)) { /* Only the fields defined so far are needed to identify @@ -2593,7 +2532,7 @@ dict_index_build_internal_clust( new_index->n_uniq = new_index->n_def; } else { /* Also the row id is needed to identify the entry */ - new_index->n_uniq = 1 + new_index->n_def; + new_index->n_uniq = 1 + unsigned(new_index->n_def); } new_index->trx_id_offset = 0; @@ -2602,15 +2541,9 @@ dict_index_build_internal_clust( trx_id_pos = new_index->n_def; -#if DATA_ROW_ID != 0 -# error "DATA_ROW_ID != 0" -#endif -#if DATA_TRX_ID != 1 -# error "DATA_TRX_ID != 1" -#endif -#if DATA_ROLL_PTR != 2 -# error "DATA_ROLL_PTR != 2" -#endif + compile_time_assert(DATA_ROW_ID == 0); + compile_time_assert(DATA_TRX_ID == 1); + compile_time_assert(DATA_ROLL_PTR == 2); if (!dict_index_is_unique(index)) { dict_index_add_col(new_index, table, @@ -2697,6 +2630,9 @@ dict_index_build_internal_clust( ut_ad(UT_LIST_GET_LEN(table->indexes) == 0); + new_index->n_core_null_bytes = table->supports_instant() + ? 
dict_index_t::NO_CORE_NULL_BYTES + : UT_BITS_IN_BYTES(unsigned(new_index->n_nullable)); new_index->cached = TRUE; return(new_index); @@ -2710,13 +2646,13 @@ static dict_index_t* dict_index_build_internal_non_clust( /*================================*/ - const dict_table_t* table, /*!< in: table */ dict_index_t* index) /*!< in: user representation of a non-clustered index */ { dict_field_t* field; dict_index_t* new_index; dict_index_t* clust_index; + dict_table_t* table = index->table; ulint i; ibool* indexed; @@ -2724,7 +2660,6 @@ dict_index_build_internal_non_clust( ut_ad(!dict_index_is_clust(index)); ut_ad(!dict_index_is_ibuf(index)); ut_ad(mutex_own(&dict_sys->mutex)); - ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); /* The clustered index should be the first in the list of indexes */ clust_index = UT_LIST_GET_FIRST(table->indexes); @@ -2735,8 +2670,8 @@ dict_index_build_internal_non_clust( /* Create a new index */ new_index = dict_mem_index_create( - table->name.m_name, index->name, index->space, index->type, - index->n_fields + 1 + clust_index->n_uniq); + index->table, index->name, index->type, + ulint(index->n_fields + 1 + clust_index->n_uniq)); /* Copy other relevant data from the old index struct to the new struct: it inherits the values */ @@ -2746,7 +2681,7 @@ dict_index_build_internal_non_clust( new_index->id = index->id; /* Copy fields from index to new_index */ - dict_index_copy(new_index, index, table, 0, index->n_fields); + dict_index_copy(new_index, index, 0, index->n_fields); /* Remember the table columns already contained in new_index */ indexed = static_cast<ibool*>( @@ -2757,7 +2692,7 @@ dict_index_build_internal_non_clust( field = dict_index_get_nth_field(new_index, i); - if (dict_col_is_virtual(field->col)) { + if (field->col->is_virtual()) { continue; } @@ -2813,20 +2748,16 @@ static dict_index_t* dict_index_build_internal_fts( /*==========================*/ - dict_table_t* table, /*!< in: table */ dict_index_t* index) /*!< in: user representation of an FTS index */ { dict_index_t* new_index; - ut_ad(table && index); ut_ad(index->type == DICT_FTS); ut_ad(mutex_own(&dict_sys->mutex)); - ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); /* Create a new index */ - new_index = dict_mem_index_create( - table->name.m_name, index->name, index->space, index->type, - index->n_fields); + new_index = dict_mem_index_create(index->table, index->name, + index->type, index->n_fields); /* Copy other relevant data from the old index struct to the new struct: it inherits the values */ @@ -2836,11 +2767,13 @@ dict_index_build_internal_fts( new_index->id = index->id; /* Copy fields from index to new_index */ - dict_index_copy(new_index, index, table, 0, index->n_fields); + dict_index_copy(new_index, index, 0, index->n_fields); new_index->n_uniq = 0; new_index->cached = TRUE; + dict_table_t* table = index->table; + if (table->fts->cache == NULL) { table->fts->cache = fts_cache_create(table); } @@ -3351,7 +3284,7 @@ dict_scan_id( ptr++; } - len = ptr - s; + len = ulint(ptr - s); } if (heap == NULL) { @@ -3372,7 +3305,7 @@ dict_scan_id( } } *d++ = 0; - len = d - str; + len = ulint(d - str); ut_ad(*s == quote); ut_ad(s + 1 == ptr); } else { @@ -3591,7 +3524,7 @@ dict_scan_table_name( for (s = scan_name; *s; s++) { if (*s == '.') { database_name = scan_name; - database_name_len = s - scan_name; + database_name_len = ulint(s - scan_name); scan_name = ++s; break;/* to do: multiple dots? 
*/ } @@ -3902,7 +3835,7 @@ dict_foreign_push_index_error( const char* col_name; field = dict_index_get_nth_field(err_index, err_col); - col_name = dict_col_is_virtual(field->col) + col_name = field->col->is_virtual() ? "(null)" : dict_table_get_col_name( table, dict_col_get_no(field->col)); @@ -4160,6 +4093,11 @@ loop: /**********************************************************/ /* The following call adds the foreign key constraints to the data dictionary system tables on disk */ + trx->op_info = "adding foreign keys"; + + trx_start_if_not_started_xa(trx, true); + + trx_set_dict_operation(trx, TRX_DICT_OP_TABLE); error = dict_create_add_foreigns_to_dictionary( local_fk_set, table, trx); @@ -4374,23 +4312,6 @@ col_loop1: return(DB_CANNOT_ADD_CONSTRAINT); } - /* Don't allow foreign keys on partitioned tables yet. */ - ptr1 = dict_scan_to(ptr, "PARTITION"); - if (ptr1) { - ptr1 = dict_accept(cs, ptr1, "PARTITION", &success); - if (success && my_isspace(cs, *ptr1)) { - ptr2 = dict_accept(cs, ptr1, "BY", &success); - if (success) { - my_error(ER_FOREIGN_KEY_ON_PARTITIONED,MYF(0)); - return(DB_CANNOT_ADD_CONSTRAINT); - } - } - } - if (dict_table_is_partition(table)) { - my_error(ER_FOREIGN_KEY_ON_PARTITIONED,MYF(0)); - return(DB_CANNOT_ADD_CONSTRAINT); - } - /* Let us create a constraint struct */ foreign = dict_mem_foreign_create(); @@ -5114,7 +5035,9 @@ dict_index_build_node_ptr( dtype_set(dfield_get_type(field), DATA_SYS_CHILD, DATA_NOT_NULL, 4); - rec_copy_prefix_to_dtuple(tuple, rec, index, !level, n_unique, heap); + rec_copy_prefix_to_dtuple(tuple, rec, index, + level ? 0 : index->n_core_fields, + n_unique, heap); dtuple_set_info_bits(tuple, dtuple_get_info_bits(tuple) | REC_STATUS_NODE_PTR); @@ -5123,46 +5046,6 @@ dict_index_build_node_ptr( return(tuple); } -/**********************************************************************//** -Copies an initial segment of a physical record, long enough to specify an -index entry uniquely. -@return pointer to the prefix record */ -rec_t* -dict_index_copy_rec_order_prefix( -/*=============================*/ - const dict_index_t* index, /*!< in: index */ - const rec_t* rec, /*!< in: record for which to - copy prefix */ - ulint* n_fields,/*!< out: number of fields copied */ - byte** buf, /*!< in/out: memory buffer for the - copied prefix, or NULL */ - ulint* buf_size)/*!< in/out: buffer size */ -{ - ulint n; - - UNIV_PREFETCH_R(rec); - - if (dict_index_is_ibuf(index)) { - ut_a(!dict_table_is_comp(index->table)); - n = rec_get_n_fields_old(rec); - } else { - if (page_rec_is_leaf(rec)) { - n = dict_index_get_n_unique_in_tree(index); - } else { - n = dict_index_get_n_unique_in_tree_nonleaf(index); - /* For internal node of R-tree, since we need to - compare the page no field, so, we need to copy this - field as well. */ - if (dict_index_is_spatial(index)) { - n++; - } - } - } - - *n_fields = n; - return(rec_copy_prefix_to_buf(rec, index, n, buf, buf_size)); -} - /** Convert a physical record into a search tuple. 
@param[in] rec index record (not necessarily in an index page) @param[in] index index @@ -5171,25 +5054,21 @@ dict_index_copy_rec_order_prefix( @param[in,out] heap memory heap for allocation @return own: data tuple */ dtuple_t* -dict_index_build_data_tuple_func( +dict_index_build_data_tuple( const rec_t* rec, const dict_index_t* index, -#ifdef UNIV_DEBUG bool leaf, -#endif /* UNIV_DEBUG */ ulint n_fields, mem_heap_t* heap) { - dtuple_t* tuple; - - ut_ad(dict_table_is_comp(index->table) - || n_fields <= rec_get_n_fields_old(rec)); + ut_ad(!index->is_clust()); - tuple = dtuple_create(heap, n_fields); + dtuple_t* tuple = dtuple_create(heap, n_fields); dict_index_copy_types(tuple, index, n_fields); - rec_copy_prefix_to_dtuple(tuple, rec, index, leaf, n_fields, heap); + rec_copy_prefix_to_dtuple(tuple, rec, index, + leaf ? n_fields : 0, n_fields, heap); ut_ad(dtuple_check_typed(tuple)); @@ -5435,18 +5314,17 @@ dict_print_info_on_foreign_keys( /** Given a space_id of a file-per-table tablespace, search the dict_sys->table_LRU list and return the dict_table_t* pointer for it. -@param space_id Tablespace ID +@param space tablespace @return table if found, NULL if not */ static dict_table_t* -dict_find_single_table_by_space( - ulint space_id) +dict_find_single_table_by_space(const fil_space_t* space) { dict_table_t* table; ulint num_item; ulint count = 0; - ut_ad(space_id > 0); + ut_ad(space->id > 0); if (dict_sys == NULL) { /* This could happen when it's in redo processing. */ @@ -5461,7 +5339,7 @@ dict_find_single_table_by_space( killing the server, so it worth to risk some consequences for the action. */ while (table && count < num_item) { - if (table->space == space_id) { + if (table->space == space) { if (dict_table_is_file_per_table(table)) { return(table); } @@ -5478,41 +5356,28 @@ dict_find_single_table_by_space( /**********************************************************************//** Flags a table with specified space_id corrupted in the data dictionary cache -@return TRUE if successful */ -ibool -dict_set_corrupted_by_space( -/*========================*/ - ulint space_id) /*!< in: space ID */ +@return true if successful */ +bool dict_set_corrupted_by_space(const fil_space_t* space) { dict_table_t* table; - table = dict_find_single_table_by_space(space_id); + table = dict_find_single_table_by_space(space); if (!table) { - return(FALSE); + return false; } /* mark the table->corrupted bit only, since the caller could be too deep in the stack for SYS_INDEXES update */ table->corrupted = true; table->file_unreadable = true; - - return(TRUE); + return true; } - -/** Flag a table with specified space_id encrypted in the data dictionary -cache -@param[in] space_id Tablespace id */ -UNIV_INTERN -void -dict_set_encrypted_by_space(ulint space_id) +/** Flag a table encrypted in the data dictionary cache. 
*/ +void dict_set_encrypted_by_space(const fil_space_t* space) { - dict_table_t* table; - - table = dict_find_single_table_by_space(space_id); - - if (table) { + if (dict_table_t* table = dict_find_single_table_by_space(space)) { table->file_unreadable = true; } } @@ -5590,7 +5455,7 @@ dict_set_corrupted( btr_cur_search_to_nth_level(sys_index, 0, tuple, PAGE_CUR_LE, BTR_MODIFY_LEAF, - &cursor, 0, __FILE__, __LINE__, &mtr, 0); + &cursor, 0, __FILE__, __LINE__, &mtr); if (cursor.low_match == dtuple_get_n_fields(tuple)) { /* UPDATE SYS_INDEXES SET TYPE=index->type @@ -5693,7 +5558,7 @@ dict_index_set_merge_threshold( btr_cur_search_to_nth_level(sys_index, 0, tuple, PAGE_CUR_GE, BTR_MODIFY_LEAF, - &cursor, 0, __FILE__, __LINE__, &mtr, 0); + &cursor, 0, __FILE__, __LINE__, &mtr); if (cursor.up_match == dtuple_get_n_fields(tuple) && rec_get_n_fields_old(btr_cur_get_rec(&cursor)) @@ -5765,15 +5630,13 @@ dict_ind_init() dict_table_t* table; /* create dummy table and index for REDUNDANT infimum and supremum */ - table = dict_mem_table_create("SYS_DUMMY1", DICT_HDR_SPACE, 1, 0, 0, 0); + table = dict_mem_table_create("SYS_DUMMY1", NULL, 1, 0, 0, 0); dict_mem_table_add_col(table, NULL, NULL, DATA_CHAR, DATA_ENGLISH | DATA_NOT_NULL, 8); - dict_ind_redundant = dict_mem_index_create("SYS_DUMMY1", "SYS_DUMMY1", - DICT_HDR_SPACE, 0, 1); + dict_ind_redundant = dict_mem_index_create(table, "SYS_DUMMY1", 0, 1); dict_index_add_col(dict_ind_redundant, table, dict_table_get_nth_col(table, 0), 0); - dict_ind_redundant->table = table; /* avoid ut_ad(index->cached) in dict_index_get_n_unique_in_tree */ dict_ind_redundant->cached = TRUE; } @@ -6013,8 +5876,7 @@ dict_table_schema_check( } } - if (!table->is_readable() && - fil_space_get(table->space) == NULL) { + if (!table->is_readable() && !table->space) { /* missing tablespace */ snprintf(errstr, errstr_sz, @@ -6025,14 +5887,13 @@ dict_table_schema_check( return(DB_TABLE_NOT_FOUND); } - if (ulint(table->n_def) - DATA_N_SYS_COLS != req_schema->n_cols) { + if (ulint(table->n_def - DATA_N_SYS_COLS) != req_schema->n_cols) { /* the table has a different number of columns than required */ snprintf(errstr, errstr_sz, - "%s has " ULINTPF " columns but should have " - ULINTPF ".", + "%s has %d columns but should have " ULINTPF ".", ut_format_name(req_schema->table_name, buf, sizeof buf), - ulint(table->n_def) - DATA_N_SYS_COLS, + table->n_def - DATA_N_SYS_COLS, req_schema->n_cols); return(DB_ERROR); @@ -6277,11 +6138,18 @@ void dict_close(void) /*============*/ { - ulint i; + if (dict_sys == NULL) { + /* This should only happen if a failure occurred + during redo log processing. */ + return; + } + + /* Acquire only because it's a pre-condition. */ + mutex_enter(&dict_sys->mutex); /* Free the hash elements. We don't remove them from the table because we are going to destroy the table anyway. */ - for (i = 0; i < hash_get_n_cells(dict_sys->table_hash); i++) { + for (ulint i = 0; i < hash_get_n_cells(dict_sys->table_id_hash); i++) { dict_table_t* table; table = static_cast<dict_table_t*>( @@ -6293,12 +6161,7 @@ dict_close(void) table = static_cast<dict_table_t*>( HASH_GET_NEXT(name_hash, prev_table)); ut_ad(prev_table->magic_n == DICT_TABLE_MAGIC_N); - /* Acquire only because it's a pre-condition. */ - mutex_enter(&dict_sys->mutex); - dict_table_remove_from_cache(prev_table); - - mutex_exit(&dict_sys->mutex); } } @@ -6308,12 +6171,18 @@ dict_close(void) therefore we don't delete the individual elements. 
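The reworked dict_close() above acquires the dictionary mutex once around the whole eviction loop instead of re-entering it for every table, and closes the foreign-key error file afterwards. A much simplified sketch of that lock-once shutdown pattern, using standard C++ types rather than the InnoDB hash tables:

#include <cassert>
#include <mutex>
#include <string>
#include <unordered_map>

/* Simplified shutdown of a table cache: lock once, drop every entry,
   then unlock, instead of re-locking around each removal. */
struct Cache {
    std::mutex mtx;
    std::unordered_map<std::string, int*> tables;

    void close()
    {
        std::lock_guard<std::mutex> lock(mtx);  /* one acquisition for the loop */
        for (auto& entry : tables) {
            delete entry.second;                /* drop each cached table */
        }
        tables.clear();
    }
};

int main()
{
    Cache c;
    c.tables.emplace("test/t1", new int(1));
    c.close();
    assert(c.tables.empty());
    return 0;
}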
*/ hash_table_free(dict_sys->table_id_hash); + mutex_exit(&dict_sys->mutex); mutex_free(&dict_sys->mutex); rw_lock_free(&dict_operation_lock); mutex_free(&dict_foreign_err_mutex); + if (dict_foreign_err_file) { + fclose(dict_foreign_err_file); + dict_foreign_err_file = NULL; + } + ut_free(dict_sys); dict_sys = NULL; @@ -6481,7 +6350,7 @@ dict_foreign_qualify_index( return(false); } - if (dict_col_is_virtual(field->col)) { + if (field->col->is_virtual()) { col_name = ""; for (ulint j = 0; j < table->n_v_def; j++) { col_name = dict_table_get_v_col_name(table, j); @@ -6561,7 +6430,7 @@ dict_index_zip_pad_update( /* Only do increment if it won't increase padding beyond max pad size. */ if (info->pad + ZIP_PAD_INCR - < (UNIV_PAGE_SIZE * zip_pad_max) / 100) { + < (srv_page_size * zip_pad_max) / 100) { /* Use atomics even though we have the mutex. This is to ensure that we are able to read info->pad atomically. */ @@ -6587,7 +6456,7 @@ dict_index_zip_pad_update( /* Use atomics even though we have the mutex. This is to ensure that we are able to read info->pad atomically. */ - my_atomic_addlint(&info->pad, -ZIP_PAD_INCR); + my_atomic_addlint(&info->pad, ulint(-ZIP_PAD_INCR)); info->n_rounds = 0; @@ -6651,17 +6520,17 @@ dict_index_zip_pad_optimal_page_size( if (!zip_failure_threshold_pct) { /* Disabled by user. */ - return(UNIV_PAGE_SIZE); + return(srv_page_size); } pad = my_atomic_loadlint(&index->zip_pad.pad); - ut_ad(pad < UNIV_PAGE_SIZE); - sz = UNIV_PAGE_SIZE - pad; + ut_ad(pad < srv_page_size); + sz = srv_page_size - pad; /* Min size allowed by user. */ ut_ad(zip_pad_max < 100); - min_sz = (UNIV_PAGE_SIZE * (100 - zip_pad_max)) / 100; + min_sz = (srv_page_size * (100 - zip_pad_max)) / 100; return(ut_max(sz, min_sz)); } @@ -6711,11 +6580,10 @@ dict_sys_get_size() size_t dict_table_t::get_overflow_field_local_len() const { - if (dict_table_get_format(this) < UNIV_FORMAT_B) { - /* up to MySQL 5.1: store a 768-byte prefix locally */ - return BTR_EXTERN_FIELD_REF_SIZE - + DICT_ANTELOPE_MAX_INDEX_COL_LEN; + if (dict_table_has_atomic_blobs(this)) { + /* new-format table: do not store any BLOB prefix locally */ + return BTR_EXTERN_FIELD_REF_SIZE; } - /* new-format table: do not store any BLOB prefix locally */ - return BTR_EXTERN_FIELD_REF_SIZE; + /* up to MySQL 5.1: store a 768-byte prefix locally */ + return BTR_EXTERN_FIELD_REF_SIZE + DICT_ANTELOPE_MAX_INDEX_COL_LEN; } diff --git a/storage/innobase/dict/dict0load.cc b/storage/innobase/dict/dict0load.cc index 22fc70a4e42..386b99bcaad 100644 --- a/storage/innobase/dict/dict0load.cc +++ b/storage/innobase/dict/dict0load.cc @@ -85,9 +85,9 @@ dict_load_table_one( /** Load a table definition from a SYS_TABLES record to dict_table_t. Do not load any columns or indexes. 
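dict_index_zip_pad_optimal_page_size() above now computes against the runtime srv_page_size rather than the compile-time UNIV_PAGE_SIZE. A standalone sketch of the padding arithmetic, with plain parameters in place of the server globals and the atomically read pad value:

#include <algorithm>
#include <cassert>

/* How much of the page new records may fill before compression is
   attempted, given the current adaptive padding and the maximum
   padding percentage allowed by configuration. */
static unsigned long optimal_page_size(unsigned long page_size,
                                       unsigned long pad,
                                       unsigned zip_pad_max_pct,
                                       bool padding_enabled)
{
    if (!padding_enabled) {
        return page_size;               /* feature disabled: use the full page */
    }
    assert(pad < page_size);
    unsigned long sz     = page_size - pad;
    unsigned long min_sz = page_size * (100 - zip_pad_max_pct) / 100;
    return std::max(sz, min_sz);        /* never pad beyond the configured maximum */
}

int main()
{
    /* 16KiB page, 1KiB of adaptive padding, at most 50% padding. */
    assert(optimal_page_size(16384, 1024, 50, true) == 15360);
    /* Padding capped: even with 10KiB of pad, at least half the page is used. */
    assert(optimal_page_size(16384, 10240, 50, true) == 8192);
    return 0;
}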
-@param[in] name Table name -@param[in] rec SYS_TABLES record -@param[out,own] table table, or NULL +@param[in] name Table name +@param[in] rec SYS_TABLES record +@param[out,own] table table, or NULL @return error message @retval NULL on success */ static const char* dict_load_table_low(const table_name_t& name, @@ -106,7 +106,6 @@ dict_load_index_low( byte* table_id, /*!< in/out: table id (8 bytes), an "in" value if allocate=TRUE and "out" when allocate=FALSE */ - const char* table_name, /*!< in: table name */ mem_heap_t* heap, /*!< in/out: temporary memory heap */ const rec_t* rec, /*!< in: SYS_INDEXES record */ ibool allocate, /*!< in: TRUE=allocate *index, @@ -138,7 +137,6 @@ dict_load_column_low( /** Load a virtual column "mapping" (to base columns) information from a SYS_VIRTUAL record @param[in,out] table table -@param[in,out] heap memory heap @param[in,out] column mapped base column's dict_column_t @param[in,out] table_id table id @param[in,out] pos virtual column position @@ -150,7 +148,6 @@ static const char* dict_load_virtual_low( dict_table_t* table, - mem_heap_t* heap, dict_col_t** column, table_id_t* table_id, ulint* pos, @@ -378,16 +375,12 @@ dict_process_sys_tables_rec_and_mtr_commit( mem_heap_t* heap, /*!< in/out: temporary memory heap */ const rec_t* rec, /*!< in: SYS_TABLES record */ dict_table_t** table, /*!< out: dict_table_t to fill */ - dict_table_info_t status, /*!< in: status bit controls - options such as whether we shall - look for dict_table_t from cache - first */ + bool cached, /*!< in: whether to load from cache */ mtr_t* mtr) /*!< in/out: mini-transaction, will be committed */ { ulint len; const char* field; - const char* err_msg = NULL; field = (const char*) rec_get_nth_field_old( rec, DICT_FLD__SYS_TABLES__NAME, &len); @@ -399,28 +392,17 @@ dict_process_sys_tables_rec_and_mtr_commit( /* Get the table name */ table_name_t table_name(mem_heap_strdupl(heap, field, len)); - /* If DICT_TABLE_LOAD_FROM_CACHE is set, first check - whether there is cached dict_table_t struct */ - if (status & DICT_TABLE_LOAD_FROM_CACHE) { - + if (cached) { /* Commit before load the table again */ mtr_commit(mtr); *table = dict_table_get_low(table_name.m_name); - - if (!(*table)) { - err_msg = "Table not found in cache"; - } + return *table ? 
NULL : "Table not found in cache"; } else { - err_msg = dict_load_table_low(table_name, rec, table); + const char* err = dict_load_table_low(table_name, rec, table); mtr_commit(mtr); + return err; } - - if (err_msg) { - return(err_msg); - } - - return(NULL); } /********************************************************************//** @@ -442,8 +424,7 @@ dict_process_sys_indexes_rec( buf = static_cast<byte*>(mem_heap_alloc(heap, 8)); /* Parse the record, and get "dict_index_t" struct filled */ - err_msg = dict_load_index_low(buf, NULL, - heap, rec, FALSE, &index); + err_msg = dict_load_index_low(buf, heap, rec, FALSE, &index); *table_id = mach_read_from_8(buf); @@ -476,7 +457,6 @@ dict_process_sys_columns_rec( /** This function parses a SYS_VIRTUAL record and extracts virtual column information -@param[in,out] heap heap memory @param[in] rec current SYS_COLUMNS rec @param[in,out] table_id table id @param[in,out] pos virtual column position @@ -484,7 +464,6 @@ information @return error message, or NULL on success */ const char* dict_process_sys_virtual_rec( - mem_heap_t* heap, const rec_t* rec, table_id_t* table_id, ulint* pos, @@ -493,7 +472,7 @@ dict_process_sys_virtual_rec( const char* err_msg; /* Parse the record, and get "dict_col_t" struct filled */ - err_msg = dict_load_virtual_low(NULL, heap, NULL, table_id, + err_msg = dict_load_virtual_low(NULL, NULL, table_id, pos, base_pos, rec); return(err_msg); @@ -903,7 +882,7 @@ dict_update_filepath( ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_X)); ut_ad(mutex_own(&dict_sys->mutex)); - trx = trx_allocate_for_background(); + trx = trx_create(); trx->op_info = "update filepath"; trx->dict_operation_lock_mode = RW_X_LATCH; trx_start_for_ddl(trx, TRX_DICT_OP_INDEX); @@ -923,7 +902,7 @@ dict_update_filepath( trx_commit_for_mysql(trx); trx->dict_operation_lock_mode = 0; - trx_free_for_background(trx); + trx->free(); if (UNIV_LIKELY(err == DB_SUCCESS)) { /* We just updated SYS_DATAFILES due to the contents in @@ -972,7 +951,7 @@ dict_replace_tablespace_and_filepath( ut_ad(mutex_own(&dict_sys->mutex)); ut_ad(filepath); - trx = trx_allocate_for_background(); + trx = trx_create(); trx->op_info = "insert tablespace and filepath"; trx->dict_operation_lock_mode = RW_X_LATCH; trx_start_for_ddl(trx, TRX_DICT_OP_INDEX); @@ -985,7 +964,7 @@ dict_replace_tablespace_and_filepath( trx_commit_for_mysql(trx); trx->dict_operation_lock_mode = 0; - trx_free_for_background(trx); + trx->free(); return(err); } @@ -1145,8 +1124,9 @@ dict_sys_tables_type_valid(ulint type, bool not_redundant) } if (!not_redundant) { - /* SYS_TABLES.TYPE must be 1 for ROW_FORMAT=REDUNDANT. */ - return(false); + /* SYS_TABLES.TYPE must be 1 or 1|DICT_TF_MASK_NO_ROLLBACK + for ROW_FORMAT=REDUNDANT. */ + return !(type & ~(1U | DICT_TF_MASK_NO_ROLLBACK)); } if (type >= 1U << DICT_TF_POS_UNUSED) { @@ -1154,11 +1134,6 @@ dict_sys_tables_type_valid(ulint type, bool not_redundant) return(false); } - /* ATOMIC_WRITES cannot be 3; it is the 10.3 NO_ROLLBACK flag. 
*/ - if (!(~type & DICT_TF_MASK_ATOMIC_WRITES)) { - return(false); - } - return(dict_tf_is_valid_not_redundant(type)); } @@ -1179,7 +1154,8 @@ dict_sys_tables_type_to_tf(ulint type, bool not_redundant) | DICT_TF_MASK_ATOMIC_BLOBS | DICT_TF_MASK_DATA_DIR | DICT_TF_MASK_PAGE_COMPRESSION - | DICT_TF_MASK_PAGE_COMPRESSION_LEVEL); + | DICT_TF_MASK_PAGE_COMPRESSION_LEVEL + | DICT_TF_MASK_NO_ROLLBACK); ut_ad(dict_tf_is_valid(flags)); return(flags); @@ -1231,7 +1207,8 @@ dict_sys_tables_rec_read( MariaDB 10.2.2 introduced the SHARED_SPACE flag from MySQL 5.7, shifting the flags PAGE_COMPRESSION, PAGE_COMPRESSION_LEVEL, - ATOMIC_WRITES by one bit. The SHARED_SPACE flag would always + ATOMIC_WRITES (repurposed to NO_ROLLBACK in 10.3.1) by one bit. + The SHARED_SPACE flag would always be written as 0 by MariaDB, because MariaDB does not support CREATE TABLESPACE or CREATE TABLE...TABLESPACE for InnoDB. @@ -1300,7 +1277,7 @@ dict_sys_tables_rec_read( /* The low order bit of SYS_TABLES.TYPE is always set to 1. But in dict_table_t::flags the low order bit is used to determine if the - row format is Redundant (0) or Compact (1) when the format is Antelope. + ROW_FORMAT=REDUNDANT (0) or anything else (1). Read the 4 byte N_COLS field and look at the high order bit. It should be set for COMPACT and later. It should not be set for REDUNDANT. */ @@ -1382,10 +1359,6 @@ static ulint dict_check_sys_tables() sys_datafiles = dict_table_get_low("SYS_DATAFILES"); ut_a(sys_datafiles != NULL); - const bool validate = recv_needed_recovery - && !srv_safe_truncate - && !srv_force_recovery; - for (rec = dict_startscan_system(&pcur, &mtr, SYS_TABLES); rec != NULL; mtr.commit(), mtr.start(), @@ -1422,8 +1395,7 @@ next: continue; } - if (srv_safe_truncate - && strstr(table_name.m_name, "/" TEMP_FILE_PREFIX "-")) { + if (strstr(table_name.m_name, "/" TEMP_FILE_PREFIX "-")) { /* This table will be dropped by row_mysql_drop_garbage_tables(). We do not care if the file exists. */ @@ -1446,19 +1418,19 @@ next: /* Now that we have the proper name for this tablespace, look to see if it is already in the tablespace cache. */ - if (fil_space_for_table_exists_in_mem( + if (const fil_space_t* space + = fil_space_for_table_exists_in_mem( space_id, table_name.m_name, flags)) { /* Recovery can open a datafile that does not match SYS_DATAFILES. If they don't match, update SYS_DATAFILES. */ char *dict_path = dict_get_first_path(space_id); - char *fil_path = fil_space_get_first_path(space_id); - if (dict_path && fil_path + const char *fil_path = space->chain.start->name; + if (dict_path && strcmp(dict_path, fil_path)) { dict_update_filepath(space_id, fil_path); } ut_free(dict_path); - ut_free(fil_path); ut_free(table_name.m_name); continue; } @@ -1471,15 +1443,12 @@ next: char* filepath = dict_get_first_path(space_id); /* Check that the .ibd file exists. 
*/ - dberr_t err = fil_ibd_open( - validate, - !srv_read_only_mode && srv_log_file_size != 0, - FIL_TYPE_TABLESPACE, - space_id, dict_tf_to_fsp_flags(flags), - table_name.m_name, - filepath); - - if (err != DB_SUCCESS) { + if (!fil_ibd_open( + false, + !srv_read_only_mode && srv_log_file_size != 0, + FIL_TYPE_TABLESPACE, + space_id, dict_tf_to_fsp_flags(flags), + table_name, filepath)) { ib::warn() << "Ignoring tablespace for " << table_name << " because it could not be opened."; @@ -1712,7 +1681,6 @@ static const char* dict_load_virtual_del = "delete-marked record in SYS_VIRTUAL" /** Load a virtual column "mapping" (to base columns) information from a SYS_VIRTUAL record @param[in,out] table table -@param[in,out] heap memory heap @param[in,out] column mapped base column's dict_column_t @param[in,out] table_id table id @param[in,out] pos virtual column position @@ -1724,7 +1692,6 @@ static const char* dict_load_virtual_low( dict_table_t* table, - mem_heap_t* heap, dict_col_t** column, table_id_t* table_id, ulint* pos, @@ -1984,7 +1951,7 @@ dict_load_virtual_one_col( ut_a(btr_pcur_is_on_user_rec(&pcur)); - err_msg = dict_load_virtual_low(table, heap, + err_msg = dict_load_virtual_low(table, &v_col->base_col[i - skipped], NULL, &pos, NULL, rec); @@ -2051,7 +2018,7 @@ dict_load_field_low( ulint len; unsigned pos_and_prefix_len; unsigned prefix_len; - ibool first_field; + bool first_field; ulint position; /* Either index or sys_field is supplied, not both */ @@ -2240,7 +2207,6 @@ dict_load_index_low( byte* table_id, /*!< in/out: table id (8 bytes), an "in" value if allocate=TRUE and "out" when allocate=FALSE */ - const char* table_name, /*!< in: table name */ mem_heap_t* heap, /*!< in/out: temporary memory heap */ const rec_t* rec, /*!< in: SYS_INDEXES record */ ibool allocate, /*!< in: TRUE=allocate *index, @@ -2255,7 +2221,6 @@ dict_load_index_low( index_id_t id; ulint n_fields; ulint type; - ulint space; unsigned merge_threshold; if (allocate) { @@ -2354,26 +2319,18 @@ err_len: } field = rec_get_nth_field_old( - rec, DICT_FLD__SYS_INDEXES__SPACE, &len); - if (len != 4) { - goto err_len; - } - space = mach_read_from_4(field); - - field = rec_get_nth_field_old( rec, DICT_FLD__SYS_INDEXES__PAGE_NO, &len); if (len != 4) { goto err_len; } if (allocate) { - *index = dict_mem_index_create(table_name, name_buf, - space, type, n_fields); + *index = dict_mem_index_create(NULL, name_buf, type, n_fields); } else { ut_a(*index); - dict_mem_fill_index_struct(*index, NULL, NULL, name_buf, - space, type, n_fields); + dict_mem_fill_index_struct(*index, NULL, name_buf, + type, n_fields); } (*index)->id = id; @@ -2480,8 +2437,7 @@ dict_load_indexes( } } - err_msg = dict_load_index_low( - buf, table->name.m_name, heap, rec, TRUE, &index); + err_msg = dict_load_index_low(buf, heap, rec, TRUE, &index); ut_ad((index == NULL && err_msg != NULL) || (index != NULL && err_msg == NULL)); @@ -2612,17 +2568,16 @@ corrupted: dict_mem_index_free(index); } else { dict_load_fields(index, heap); - - error = dict_index_add_to_cache( - table, index, index->page, FALSE); + index->table = table; /* The data dictionary tables should never contain invalid index definitions. If we ignored this error and simply did not load this index definition, the .frm file would disagree with the index definitions inside InnoDB. 
*/ - if (UNIV_UNLIKELY(error != DB_SUCCESS)) { - + if ((error = dict_index_add_to_cache(index, + index->page)) + != DB_SUCCESS) { goto func_exit; } } @@ -2654,9 +2609,9 @@ func_exit: /** Load a table definition from a SYS_TABLES record to dict_table_t. Do not load any columns or indexes. -@param[in] name Table name -@param[in] rec SYS_TABLES record -@param[out,own] table table, or NULL +@param[in] name Table name +@param[in] rec SYS_TABLES record +@param[out,own] table table, or NULL @return error message @retval NULL on success */ static const char* dict_load_table_low(const table_name_t& name, @@ -2684,7 +2639,8 @@ static const char* dict_load_table_low(const table_name_t& name, dict_table_decode_n_col(t_num, &n_cols, &n_v_col); *table = dict_mem_table_create( - name.m_name, space_id, n_cols + n_v_col, n_v_col, flags, flags2); + name.m_name, NULL, n_cols + n_v_col, n_v_col, flags, flags2); + (*table)->space_id = space_id; (*table)->id = table_id; (*table)->file_unreadable = !!(flags2 & DICT_TF2_DISCARDED); @@ -2702,7 +2658,7 @@ void dict_save_data_dir_path( /*====================*/ dict_table_t* table, /*!< in/out: table */ - char* filepath) /*!< in: filepath of tablespace */ + const char* filepath) /*!< in: filepath of tablespace */ { ut_ad(mutex_own(&dict_sys->mutex)); ut_a(DICT_TF_HAS_DATA_DIR(table->flags)); @@ -2737,25 +2693,17 @@ dict_get_and_save_data_dir_path( dict_table_t* table, bool dict_mutex_own) { - ut_ad(!dict_table_is_temporary(table)); - - if (!table->data_dir_path && table->space - && !dict_table_is_discarded(table)) { - char* path = fil_space_get_first_path(table->space); + ut_ad(!table->is_temporary()); + ut_ad(!table->space || table->space->id == table->space_id); + if (!table->data_dir_path && table->space_id && table->space) { if (!dict_mutex_own) { dict_mutex_enter_for_mysql(); } - if (path == NULL) { - path = dict_get_first_path(table->space); - } - - if (path != NULL) { - table->flags |= (1 << DICT_TF_POS_DATA_DIR); - dict_save_data_dir_path(table, path); - ut_free(path); - } + table->flags |= (1 << DICT_TF_POS_DATA_DIR); + dict_save_data_dir_path(table, + table->space->chain.start->name); if (table->data_dir_path == NULL) { /* Since we did not set the table data_dir_path, @@ -2818,10 +2766,13 @@ dict_load_tablespace( dict_table_t* table, dict_err_ignore_t ignore_err) { - ut_ad(!dict_table_is_temporary(table)); + ut_ad(!table->is_temporary()); + ut_ad(!table->space); + ut_ad(table->space_id < SRV_LOG_SPACE_FIRST_ID); + ut_ad(fil_system.sys_space); - /* The system tablespace is always available. */ - if (is_system_tablespace(table->space)) { + if (table->space_id == TRX_SYS_SPACE) { + table->space = fil_system.sys_space; return; } @@ -2832,11 +2783,10 @@ dict_load_tablespace( return; } - char* space_name = table->name.m_name; - /* The tablespace may already be open. */ - if (fil_space_for_table_exists_in_mem( - table->space, space_name, table->flags)) { + table->space = fil_space_for_table_exists_in_mem( + table->space_id, table->name.m_name, table->flags); + if (table->space) { return; } @@ -2849,12 +2799,12 @@ dict_load_tablespace( ib::error() << "Failed to find tablespace for table " << table->name << " in the cache. Attempting" " to load the tablespace with space id " - << table->space; + << table->space_id; } /* Use the remote filepath if needed. This parameter is optional in the call to fil_ibd_open(). If not supplied, it will be built - from the space_name. */ + from the table->name. 
*/ char* filepath = NULL; if (DICT_TF_HAS_DATA_DIR(table->flags)) { /* This will set table->data_dir_path from either @@ -2870,12 +2820,12 @@ dict_load_tablespace( /* Try to open the tablespace. We set the 2nd param (fix_dict) to false because we do not have an x-lock on dict_operation_lock */ - dberr_t err = fil_ibd_open( - true, false, FIL_TYPE_TABLESPACE, table->space, + table->space = fil_ibd_open( + true, false, FIL_TYPE_TABLESPACE, table->space_id, dict_tf_to_fsp_flags(table->flags), - space_name, filepath); + table->name, filepath); - if (err != DB_SUCCESS) { + if (!table->space) { /* We failed to find a sensible tablespace file */ table->file_unreadable = true; } @@ -2908,7 +2858,6 @@ dict_load_table_one( dict_names_t& fk_tables) { dberr_t err; - dict_table_t* table; dict_table_t* sys_tables; btr_pcur_t pcur; dict_index_t* sys_index; @@ -2974,6 +2923,7 @@ err_exit: goto err_exit; } + dict_table_t* table; if (const char* err_msg = dict_load_table_low(name, rec, &table)) { if (err_msg != dict_load_table_flags) { ib::error() << err_msg; @@ -2990,7 +2940,10 @@ err_exit: dict_load_virtual(table, heap); - dict_table_add_to_cache(table, TRUE, heap); + dict_table_add_system_columns(table, heap); + + table->can_be_evicted = true; + table->add_to_cache(); mem_heap_empty(heap); @@ -3028,6 +2981,43 @@ err_exit: } } + if (err == DB_SUCCESS && table->is_readable()) { + if (table->space && !fil_space_get_size(table->space_id)) { +corrupted: + table->corrupted = true; + table->file_unreadable = true; + err = DB_CORRUPTION; + } else { + const page_id_t page_id( + table->space->id, + dict_table_get_first_index(table)->page); + mtr.start(); + buf_block_t* block = buf_page_get( + page_id, + dict_table_page_size(table), + RW_S_LATCH, &mtr); + const bool corrupted = !block + || page_get_space_id(block->frame) + != page_id.space() + || page_get_page_no(block->frame) + != page_id.page_no() + || (mach_read_from_2(FIL_PAGE_TYPE + + block->frame) + != FIL_PAGE_INDEX + && mach_read_from_2(FIL_PAGE_TYPE + + block->frame) + != FIL_PAGE_TYPE_INSTANT); + mtr.commit(); + if (corrupted) { + goto corrupted; + } + + if (table->supports_instant()) { + err = btr_cur_instant_init(table); + } + } + } + /* Initialize table foreign_child value. Its value could be changed when dict_load_foreigns() is called below */ table->fk_max_recusive_level = 0; @@ -3054,35 +3044,6 @@ err_exit: } else { dict_mem_table_fill_foreign_vcol_set(table); table->fk_max_recusive_level = 0; - - if (table->space - && !fil_space_get_size(table->space)) { -corrupted: - table->corrupted = true; - table->file_unreadable = true; - } else { - const page_id_t page_id( - table->space, - dict_table_get_first_index(table) - ->page); - mtr.start(); - buf_block_t* block = buf_page_get( - page_id, - dict_table_page_size(table), - RW_S_LATCH, &mtr); - const bool corrupted = !block - || page_get_space_id(block->frame) - != page_id.space() - || page_get_page_no(block->frame) - != page_id.page_no() - || mach_read_from_2(FIL_PAGE_TYPE - + block->frame) - != FIL_PAGE_INDEX; - mtr.commit(); - if (corrupted) { - goto corrupted; - } - } } } else { dict_index_t* index; diff --git a/storage/innobase/dict/dict0mem.cc b/storage/innobase/dict/dict0mem.cc index 1667e1a0b20..265642ef210 100644 --- a/storage/innobase/dict/dict0mem.cc +++ b/storage/innobase/dict/dict0mem.cc @@ -2,7 +2,7 @@ Copyright (c) 1996, 2018, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. -Copyright (c) 2013, 2020, MariaDB Corporation. 
+Copyright (c) 2013, 2021, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -34,9 +34,9 @@ Created 1/8/1996 Heikki Tuuri #include "mach0data.h" #include "dict0dict.h" #include "fts0priv.h" -#include "ut0crc32.h" #include "lock0lock.h" #include "sync0sync.h" +#include "row0row.h" #include <iostream> #define DICT_HEAP_SIZE 100 /*!< initial memory heap size when @@ -50,6 +50,29 @@ static const char* innobase_system_databases[] = { NullS }; +/** Determine if a table belongs to innobase_system_databases[] +@param[in] name database_name/table_name +@return whether the database_name is in innobase_system_databases[] */ +static bool dict_mem_table_is_system(const char *name) +{ + /* table has the following format: database/table + and some system table are of the form SYS_* */ + if (!strchr(name, '/')) { + return true; + } + size_t table_len = strlen(name); + const char *system_db; + int i = 0; + while ((system_db = innobase_system_databases[i++]) + && (system_db != NullS)) { + size_t len = strlen(system_db); + if (table_len > len && !strncmp(name, system_db, len)) { + return true; + } + } + return false; +} + /** The start of the table basename suffix for partitioned tables */ const char table_name_t::part_suffix[4] #ifdef _WIN32 @@ -58,10 +81,6 @@ const char table_name_t::part_suffix[4] = "#P#"; #endif -/** An interger randomly initialized at startup used to make a temporary -table name as unuique as possible. */ -static ib_uint32_t dict_temp_file_num; - /** Display an identifier. @param[in,out] s output stream @param[in] id_name SQL identifier (other than table name) @@ -96,32 +115,18 @@ operator<<( return(s << ut_get_name(NULL, table_name.m_name)); } -/** @return whether a table belongs to a system database */ -static bool dict_mem_table_is_system(char *name) -{ - /* table has the following format: database/table - and some system table are of the form SYS_* */ - if (strchr(name, '/')) { - size_t table_len = strlen(name); - const char *system_db; - int i = 0; - while ((system_db = innobase_system_databases[i++]) - && (system_db != NullS)) { - size_t len = strlen(system_db); - if (table_len > len && !strncmp(name, system_db, len)) { - return true; - } - } - return false; - } else { - return true; - } -} - +/** Create a table memory object. +@param name table name +@param space tablespace +@param n_cols total number of columns (both virtual and non-virtual) +@param n_v_cols number of virtual columns +@param flags table flags +@param flags2 table flags2 +@return own: table object */ dict_table_t* dict_mem_table_create( const char* name, - ulint space, + fil_space_t* space, ulint n_cols, ulint n_v_cols, ulint flags, @@ -131,6 +136,10 @@ dict_mem_table_create( mem_heap_t* heap; ut_ad(name); + ut_ad(!space + || space->purpose == FIL_TYPE_TABLESPACE + || space->purpose == FIL_TYPE_TEMPORARY + || space->purpose == FIL_TYPE_IMPORT); ut_a(dict_tf2_is_valid(flags, flags2)); ut_a(!(flags2 & DICT_TF2_UNUSED_BIT_MASK)); @@ -154,10 +163,11 @@ dict_mem_table_create( table->flags2 = (unsigned int) flags2; table->name.m_name = mem_strdup(name); table->is_system_db = dict_mem_table_is_system(table->name.m_name); - table->space = (unsigned int) space; + table->space = space; + table->space_id = space ? 
space->id : ULINT_UNDEFINED; table->n_t_cols = unsigned(n_cols + DATA_N_SYS_COLS); table->n_v_cols = (unsigned int) (n_v_cols); - table->n_cols = table->n_t_cols - table->n_v_cols; + table->n_cols = unsigned(table->n_t_cols - table->n_v_cols); table->cols = static_cast<dict_col_t*>( mem_heap_alloc(heap, table->n_cols * sizeof(dict_col_t))); @@ -262,7 +272,7 @@ dict_add_col_name( s += strlen(s) + 1; } - old_len = s - col_names; + old_len = unsigned(s - col_names); } else { old_len = 0; } @@ -324,6 +334,16 @@ dict_mem_table_add_col( col = dict_table_get_nth_col(table, i); dict_mem_fill_column_struct(col, i, mtype, prtype, len); + + switch (prtype & DATA_VERSIONED) { + case DATA_VERS_START: + ut_ad(!table->vers_start); + table->vers_start = i; + break; + case DATA_VERS_END: + ut_ad(!table->vers_end); + table->vers_end = i; + } } /** Adds a virtual column definition to a table. @@ -381,7 +401,7 @@ dict_mem_table_add_v_col( i, name, heap); } - v_col = dict_table_get_nth_v_col(table, i); + v_col = &table->v_cols[i]; dict_mem_fill_column_struct(&v_col->m_col, pos, mtype, prtype, len); v_col->v_pos = i; @@ -410,7 +430,7 @@ dict_mem_table_add_s_col( dict_table_t* table, ulint num_base) { - ulint i = table->n_def - 1; + unsigned i = unsigned(table->n_def) - 1; dict_col_t* col = dict_table_get_nth_col(table, i); dict_s_col_t s_col; @@ -471,13 +491,13 @@ dict_mem_table_col_rename_low( /* We need to adjust all affected index->field pointers, as in dict_index_add_col(). First, copy table->col_names. */ - ulint prefix_len = s - t_col_names; + ulint prefix_len = ulint(s - t_col_names); for (; i < n_col; i++) { s += strlen(s) + 1; } - ulint full_len = s - t_col_names; + ulint full_len = ulint(s - t_col_names); char* col_names; if (to_len > from_len) { @@ -510,12 +530,12 @@ dict_mem_table_col_rename_low( /* if is_virtual and that in field->col does not match, continue */ if ((!is_virtual) != - (!dict_col_is_virtual(field->col))) { + (!field->col->is_virtual())) { continue; } ulint name_ofs - = field->name - t_col_names; + = ulint(field->name - t_col_names); if (name_ofs <= prefix_len) { field->name = col_names + name_ofs; } else { @@ -719,9 +739,11 @@ dict_mem_fill_column_struct( column->mtype = (unsigned int) mtype; column->prtype = (unsigned int) prtype; column->len = (unsigned int) col_len; - dtype_get_mblen(mtype, prtype, &mbminlen, &mbmaxlen); + dtype_get_mblen(mtype, prtype, &mbminlen, &mbmaxlen); column->mbminlen = mbminlen; column->mbmaxlen = mbmaxlen; + column->def_val.data = NULL; + column->def_val.len = UNIV_SQL_DEFAULT; } /**********************************************************************//** @@ -730,11 +752,8 @@ Creates an index memory object. dict_index_t* dict_mem_index_create( /*==================*/ - const char* table_name, /*!< in: table name */ + dict_table_t* table, /*!< in: table */ const char* index_name, /*!< in: index name */ - ulint space, /*!< in: space where the index tree is - placed, ignored if the index is of - the clustered type */ ulint type, /*!< in: DICT_UNIQUE, DICT_CLUSTERED, ... 
ORed */ ulint n_fields) /*!< in: number of fields */ @@ -742,15 +761,16 @@ dict_mem_index_create( dict_index_t* index; mem_heap_t* heap; - ut_ad(table_name && index_name); + ut_ad(!table || table->magic_n == DICT_TABLE_MAGIC_N); + ut_ad(index_name); heap = mem_heap_create(DICT_HEAP_SIZE); index = static_cast<dict_index_t*>( mem_heap_zalloc(heap, sizeof(*index))); + index->table = table; - dict_mem_fill_index_struct(index, heap, table_name, index_name, - space, type, n_fields); + dict_mem_fill_index_struct(index, heap, index_name, type, n_fields); mysql_mutex_init(0, &index->zip_pad.mutex, NULL); @@ -1051,7 +1071,7 @@ dict_mem_index_add_field( index->n_def++; - field = dict_index_get_nth_field(index, index->n_def - 1); + field = dict_index_get_nth_field(index, unsigned(index->n_def) - 1); field->name = name; field->prefix_len = (unsigned int) prefix_len; @@ -1105,46 +1125,15 @@ dict_mem_create_temporary_tablename( ut_ad(dbend); size_t dblen = size_t(dbend - dbtab) + 1; - if (srv_safe_truncate) { - /* InnoDB will drop all #sql tables at startup. - Therefore, the id alone should generate a unique - and previously non-existent name. */ - size = dblen + (sizeof(TEMP_FILE_PREFIX) + 3 + 20); - name = static_cast<char*>(mem_heap_alloc(heap, size)); - memcpy(name, dbtab, dblen); - snprintf(name + dblen, size - dblen, - TEMP_FILE_PREFIX_INNODB UINT64PF, id); - return name; - } - /* Increment a randomly initialized number for each temp file. */ - my_atomic_add32((int32*) &dict_temp_file_num, 1); - - size = dblen + (sizeof(TEMP_FILE_PREFIX) + 3 + 20 + 1 + 10); + size = dblen + (sizeof(TEMP_FILE_PREFIX) + 3 + 20); name = static_cast<char*>(mem_heap_alloc(heap, size)); memcpy(name, dbtab, dblen); snprintf(name + dblen, size - dblen, - TEMP_FILE_PREFIX_INNODB UINT64PF "-" UINT32PF, - id, dict_temp_file_num); + TEMP_FILE_PREFIX_INNODB UINT64PF, id); return(name); } -/** Initialize dict memory variables */ -void -dict_mem_init(void) -{ - /* Initialize a randomly distributed temporary file number */ - ib_uint32_t now = static_cast<ib_uint32_t>(time(NULL)); - - const byte* buf = reinterpret_cast<const byte*>(&now); - - dict_temp_file_num = ut_crc32(buf, sizeof(now)); - - DBUG_PRINT("dict_mem_init", - ("Starting Temporary file number is " UINT32PF, - dict_temp_file_num)); -} - /** Validate the search order in the foreign key set. @param[in] fk_set the foreign key set to be validated @return true if search order is fine in the set, false otherwise. */ @@ -1219,3 +1208,356 @@ bool dict_foreign_t::affects_fulltext() const return false; } + +/** Adjust clustered index metadata for instant ADD COLUMN. 
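With the non-safe-truncate branch gone, dict_mem_create_temporary_tablename() above builds the name from the database prefix plus TEMP_FILE_PREFIX_INNODB and the dictionary id alone, because all #sql tables are dropped at startup anyway. A rough sketch of the same naming scheme, assuming TEMP_FILE_PREFIX_INNODB expands to "#sql-ib" and returning std::string instead of allocating from a mem_heap_t:

#include <cinttypes>
#include <cstdint>
#include <cstdio>
#include <string>

/* Sketch only; the caller guarantees that dbtab has the "database/table"
   form, so a '/' is always present. */
std::string make_temporary_tablename(const std::string& dbtab, uint64_t id)
{
    /* keep the "database/" prefix, replace the table part */
    const std::string db = dbtab.substr(0, dbtab.find('/') + 1);
    char suffix[7 + 20 + 1];           /* "#sql-ib" + up to 20 digits + NUL */
    snprintf(suffix, sizeof suffix, "#sql-ib%" PRIu64, id);
    return db + suffix;
}

/* e.g. make_temporary_tablename("test/t1", 42) yields "test/#sql-ib42" */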
+@param[in] clustered index definition after instant ADD COLUMN */ +inline void dict_index_t::instant_add_field(const dict_index_t& instant) +{ + DBUG_ASSERT(is_primary()); + DBUG_ASSERT(instant.is_primary()); + DBUG_ASSERT(!instant.is_instant()); + DBUG_ASSERT(n_def == n_fields); + DBUG_ASSERT(instant.n_def == instant.n_fields); + + DBUG_ASSERT(type == instant.type); + DBUG_ASSERT(trx_id_offset == instant.trx_id_offset); + DBUG_ASSERT(n_user_defined_cols == instant.n_user_defined_cols); + DBUG_ASSERT(n_uniq == instant.n_uniq); + DBUG_ASSERT(instant.n_fields > n_fields); + DBUG_ASSERT(instant.n_def > n_def); + DBUG_ASSERT(instant.n_nullable >= n_nullable); + DBUG_ASSERT(instant.n_core_fields >= n_core_fields); + DBUG_ASSERT(instant.n_core_null_bytes >= n_core_null_bytes); + + n_fields = instant.n_fields; + n_def = instant.n_def; + n_nullable = instant.n_nullable; + fields = static_cast<dict_field_t*>( + mem_heap_dup(heap, instant.fields, n_fields * sizeof *fields)); + + ut_d(unsigned n_null = 0); + + for (unsigned i = 0; i < n_fields; i++) { + DBUG_ASSERT(fields[i].same(instant.fields[i])); + const dict_col_t* icol = instant.fields[i].col; + DBUG_ASSERT(!icol->is_virtual()); + dict_col_t* col = fields[i].col = &table->cols[ + icol - instant.table->cols]; + fields[i].name = col->name(*table); + ut_d(n_null += col->is_nullable()); + } + + ut_ad(n_null == n_nullable); +} + +/** Adjust metadata for instant ADD COLUMN. +@param[in] table table definition after instant ADD COLUMN */ +void dict_table_t::instant_add_column(const dict_table_t& table) +{ + DBUG_ASSERT(!table.cached); + DBUG_ASSERT(table.n_def == table.n_cols); + DBUG_ASSERT(table.n_t_def == table.n_t_cols); + DBUG_ASSERT(n_def == n_cols); + DBUG_ASSERT(n_t_def == n_t_cols); + DBUG_ASSERT(table.n_cols > n_cols); + ut_ad(mutex_own(&dict_sys->mutex)); + + const char* end = table.col_names; + for (unsigned i = table.n_cols; i--; ) end += strlen(end) + 1; + + col_names = static_cast<char*>( + mem_heap_dup(heap, table.col_names, + ulint(end - table.col_names))); + const dict_col_t* const old_cols = cols; + const dict_col_t* const old_cols_end = cols + n_cols; + cols = static_cast<dict_col_t*>(mem_heap_dup(heap, table.cols, + table.n_cols + * sizeof *cols)); + + /* Preserve the default values of previously instantly + added columns. */ + for (unsigned i = unsigned(n_cols) - DATA_N_SYS_COLS; i--; ) { + cols[i].def_val = old_cols[i].def_val; + } + + /* Copy the new default values to this->heap. 
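Several hunks above (instant_add_column() here, and dict_add_col_name()/dict_mem_table_col_rename_low() earlier) treat col_names as one buffer of NUL-terminated names laid end to end and measure it with an end += strlen(end) + 1 walk. A tiny self-contained version of that measurement, with names chosen only for illustration:

#include <cstddef>
#include <cstring>

/* Column names are stored as one block of NUL-separated strings,
   e.g. "id\0name\0". Measure the space occupied by n entries. */
size_t names_buffer_length(const char* names, unsigned n)
{
    const char* end = names;
    while (n--) {
        end += strlen(end) + 1;        /* skip one name and its NUL */
    }
    return size_t(end - names);
}

/* names_buffer_length("id\0name\0", 2) == 8 */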
*/ + for (unsigned i = n_cols; i < table.n_cols; i++) { + dict_col_t& c = cols[i - DATA_N_SYS_COLS]; + DBUG_ASSERT(c.is_instant()); + if (c.def_val.len == 0) { + c.def_val.data = field_ref_zero; + } else if (const void*& d = c.def_val.data) { + d = mem_heap_dup(heap, d, c.def_val.len); + } else { + DBUG_ASSERT(c.def_val.len == UNIV_SQL_NULL); + } + } + + const unsigned old_n_cols = n_cols; + const unsigned n_add = unsigned(table.n_cols - n_cols); + + n_t_def += n_add; + n_t_cols += n_add; + n_cols = table.n_cols; + n_def = n_cols; + + for (unsigned i = n_v_def; i--; ) { + const dict_v_col_t& v = v_cols[i]; + for (ulint n = v.num_base; n--; ) { + dict_col_t*& base = v.base_col[n]; + if (!base->is_virtual()) { + DBUG_ASSERT(base >= old_cols); + size_t n = size_t(base - old_cols); + DBUG_ASSERT(n + DATA_N_SYS_COLS < old_n_cols); + base = &cols[n]; + } + } + } + + dict_index_t* index = dict_table_get_first_index(this); + + index->instant_add_field(*dict_table_get_first_index(&table)); + + while ((index = dict_table_get_next_index(index)) != NULL) { + for (unsigned i = 0; i < index->n_fields; i++) { + dict_field_t& field = index->fields[i]; + if (field.col < old_cols + || field.col >= old_cols_end) { + DBUG_ASSERT(field.col->is_virtual()); + } else { + /* Secondary indexes may contain user + columns and DB_ROW_ID (if there is + GEN_CLUST_INDEX instead of PRIMARY KEY), + but not DB_TRX_ID,DB_ROLL_PTR. */ + DBUG_ASSERT(field.col >= old_cols); + size_t n = size_t(field.col - old_cols); + DBUG_ASSERT(n + DATA_N_SYS_COLS <= old_n_cols); + if (n + DATA_N_SYS_COLS >= old_n_cols) { + /* Replace DB_ROW_ID */ + n += n_add; + } + field.col = &cols[n]; + DBUG_ASSERT(!field.col->is_virtual()); + field.name = field.col->name(*this); + } + } + } +} + +/** Roll back instant_add_column(). 
+@param[in] old_n_cols original n_cols +@param[in] old_cols original cols +@param[in] old_col_names original col_names */ +void +dict_table_t::rollback_instant( + unsigned old_n_cols, + dict_col_t* old_cols, + const char* old_col_names) +{ + ut_ad(mutex_own(&dict_sys->mutex)); + dict_index_t* index = indexes.start; + /* index->is_instant() does not necessarily hold here, because + the table may have been emptied */ + DBUG_ASSERT(old_n_cols >= DATA_N_SYS_COLS); + DBUG_ASSERT(n_cols >= old_n_cols); + DBUG_ASSERT(n_cols == n_def); + DBUG_ASSERT(index->n_def == index->n_fields); + + const unsigned n_remove = n_cols - old_n_cols; + + for (unsigned i = index->n_fields - n_remove; i < index->n_fields; + i++) { + if (index->fields[i].col->is_nullable()) { + index->n_nullable--; + } + } + + index->n_fields -= n_remove; + index->n_def = index->n_fields; + if (index->n_core_fields > index->n_fields) { + index->n_core_fields = index->n_fields; + index->n_core_null_bytes + = UT_BITS_IN_BYTES(unsigned(index->n_nullable)); + } + + const dict_col_t* const new_cols = cols; + const dict_col_t* const new_cols_end = cols + n_cols; + + cols = old_cols; + col_names = old_col_names; + n_cols = old_n_cols; + n_def = old_n_cols; + n_t_def -= n_remove; + n_t_cols -= n_remove; + + for (unsigned i = n_v_def; i--; ) { + const dict_v_col_t& v = v_cols[i]; + for (ulint n = v.num_base; n--; ) { + dict_col_t*& base = v.base_col[n]; + if (!base->is_virtual()) { + base = &cols[base - new_cols]; + } + } + } + + do { + for (unsigned i = 0; i < index->n_fields; i++) { + dict_field_t& field = index->fields[i]; + if (field.col < new_cols + || field.col >= new_cols_end) { + DBUG_ASSERT(field.col->is_virtual()); + } else { + DBUG_ASSERT(field.col >= new_cols); + size_t n = size_t(field.col - new_cols); + DBUG_ASSERT(n <= n_cols); + if (n + DATA_N_SYS_COLS >= n_cols) { + n -= n_remove; + } + field.col = &cols[n]; + DBUG_ASSERT(!field.col->is_virtual()); + field.name = field.col->name(*this); + } + } + } while ((index = dict_table_get_next_index(index)) != NULL); +} + +/** Trim the instantly added columns when an insert into SYS_COLUMNS +is rolled back during ALTER TABLE or recovery. 
+@param[in] n number of surviving non-system columns */ +void dict_table_t::rollback_instant(unsigned n) +{ + ut_ad(mutex_own(&dict_sys->mutex)); + dict_index_t* index = indexes.start; + DBUG_ASSERT(index->is_instant()); + DBUG_ASSERT(index->n_def == index->n_fields); + DBUG_ASSERT(n_cols == n_def); + DBUG_ASSERT(n >= index->n_uniq); + DBUG_ASSERT(n_cols > n + DATA_N_SYS_COLS); + const unsigned n_remove = n_cols - n - DATA_N_SYS_COLS; + + char* names = const_cast<char*>(dict_table_get_col_name(this, n)); + const char* sys = names; + for (unsigned i = n_remove; i--; ) { + sys += strlen(sys) + 1; + } + static const char system[] = "DB_ROW_ID\0DB_TRX_ID\0DB_ROLL_PTR"; + DBUG_ASSERT(!memcmp(sys, system, sizeof system)); + for (unsigned i = index->n_fields - n_remove; i < index->n_fields; + i++) { + if (index->fields[i].col->is_nullable()) { + index->n_nullable--; + } + } + index->n_fields -= n_remove; + index->n_def = index->n_fields; + memmove(names, sys, sizeof system); + memmove(cols + n, cols + n_cols - DATA_N_SYS_COLS, + DATA_N_SYS_COLS * sizeof *cols); + n_cols -= n_remove; + n_def = n_cols; + n_t_cols -= n_remove; + n_t_def -= n_remove; + + for (unsigned i = DATA_N_SYS_COLS; i--; ) { + cols[n_cols - i].ind--; + } + + if (dict_index_is_auto_gen_clust(index)) { + DBUG_ASSERT(index->n_uniq == 1); + dict_field_t* field = index->fields; + field->name = sys; + field->col = dict_table_get_sys_col(this, DATA_ROW_ID); + field++; + field->name = sys + sizeof "DB_ROW_ID"; + field->col = dict_table_get_sys_col(this, DATA_TRX_ID); + field++; + field->name = sys + sizeof "DB_ROW_ID\0DB_TRX_ID"; + field->col = dict_table_get_sys_col(this, DATA_ROLL_PTR); + + /* Replace the DB_ROW_ID column in secondary indexes. */ + while ((index = dict_table_get_next_index(index)) != NULL) { + field = &index->fields[index->n_fields - 1]; + DBUG_ASSERT(field->col->mtype == DATA_SYS); + DBUG_ASSERT(field->col->prtype + == DATA_NOT_NULL + DATA_TRX_ID); + field->col--; + field->name = sys; + } + + return; + } + + dict_field_t* field = &index->fields[index->n_uniq]; + field->name = sys + sizeof "DB_ROW_ID"; + field->col = dict_table_get_sys_col(this, DATA_TRX_ID); + field++; + field->name = sys + sizeof "DB_ROW_ID\0DB_TRX_ID"; + field->col = dict_table_get_sys_col(this, DATA_ROLL_PTR); +} + + +/** Check if record in clustered index is historical row. +@param[in] rec clustered row +@param[in] offsets offsets +@return true if row is historical */ +bool +dict_index_t::vers_history_row( + const rec_t* rec, + const rec_offs* offsets) +{ + ut_ad(is_primary()); + + ulint len; + dict_col_t& col= table->cols[table->vers_end]; + ut_ad(col.vers_sys_end()); + ulint nfield = dict_col_get_clust_pos(&col, this); + const byte *data = rec_get_nth_field(rec, offsets, nfield, &len); + if (col.vers_native()) { + ut_ad(len == sizeof trx_id_max_bytes); + return 0 != memcmp(data, trx_id_max_bytes, len); + } + ut_ad(len == sizeof timestamp_max_bytes); + return 0 != memcmp(data, timestamp_max_bytes, len); +} + +/** Check if record in secondary index is historical row. 
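dict_index_t::vers_history_row() above classifies a clustered-index record as historical when its row_end column is not the maximum value (trx_id_max_bytes for the transaction-id format, timestamp_max_bytes otherwise). A simplified check for the transaction-id format, where the maximum is all 0xff bytes; the column width is left as a parameter because it depends on the row_end type:

#include <cstddef>

/* row_end holds the maximum value (all 0xff bytes) for the current version
   of a row in the transaction-id based format; anything else marks a
   history row. */
bool is_history_row(const unsigned char* row_end, size_t len)
{
    for (size_t i = 0; i < len; i++) {
        if (row_end[i] != 0xff) {
            return true;               /* version has ended: historical */
        }
    }
    return false;                      /* still the current version */
}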
+@param[in] rec record in a secondary index +@param[out] history_row true if row is historical +@return true on error */ +bool +dict_index_t::vers_history_row( + const rec_t* rec, + bool &history_row) +{ + ut_ad(!is_primary()); + + bool error = false; + mem_heap_t* heap = NULL; + dict_index_t* clust_index = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + + mtr_t mtr; + mtr.start(); + + rec_t* clust_rec = + row_get_clust_rec(BTR_SEARCH_LEAF, rec, this, &clust_index, &mtr); + if (clust_rec) { + offsets = rec_get_offsets(clust_rec, clust_index, offsets, + clust_index->n_core_fields, + ULINT_UNDEFINED, &heap); + + history_row = clust_index->vers_history_row(clust_rec, offsets); + } else { + ib::error() << "foreign constraints: secondary index is out of " + "sync"; + ut_ad(!"secondary index is out of sync"); + error = true; + } + mtr.commit(); + if (heap) { + mem_heap_free(heap); + } + return(error); +} diff --git a/storage/innobase/dict/dict0stats.cc b/storage/innobase/dict/dict0stats.cc index 1611703c2e9..e36e2184e2d 100644 --- a/storage/innobase/dict/dict0stats.cc +++ b/storage/innobase/dict/dict0stats.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 2009, 2019, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2015, 2020, MariaDB Corporation. +Copyright (c) 2015, 2021, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -293,7 +293,7 @@ dict_stats_exec_sql( } if (trx == NULL) { - trx = trx_allocate_for_background(); + trx = trx_create(); trx_started = true; if (srv_read_only_mode) { @@ -327,7 +327,7 @@ dict_stats_exec_sql( } if (trx_started) { - trx_free_for_background(trx); + trx->free(); } return(err); @@ -443,8 +443,6 @@ dict_stats_table_clone_create( idx->name = mem_heap_strdup(heap, index->name); - idx->table_name = t->name.m_name; - idx->table = t; idx->type = index->type; @@ -853,10 +851,8 @@ dict_stats_update_transient_for_index( mtr_t mtr; ulint size; - mtr_start(&mtr); - - mtr_s_lock(dict_index_get_lock(index), &mtr); - + mtr.start(); + mtr_s_lock_index(index, &mtr); size = btr_get_size(index, BTR_TOTAL_SIZE, &mtr); if (size != ULINT_UNDEFINED) { @@ -866,7 +862,7 @@ dict_stats_update_transient_for_index( index, BTR_N_LEAF_PAGES, &mtr); } - mtr_commit(&mtr); + mtr.commit(); switch (size) { case ULINT_UNDEFINED: @@ -927,7 +923,7 @@ dict_stats_update_transient( index = dict_table_get_first_index(table); - if (dict_table_is_discarded(table)) { + if (!table->space) { /* Nothing to do. */ dict_stats_empty_table(table, true); return; @@ -1047,10 +1043,10 @@ dict_stats_analyze_index_level( memset(n_diff, 0x0, n_uniq * sizeof(n_diff[0])); /* Allocate space for the offsets header (the allocation size at - offsets[0] and the REC_OFFS_HEADER_SIZE bytes), and n_fields + 1, + offsets[0] and the REC_OFFS_HEADER_SIZE bytes), and n_uniq + 1, so that this will never be less than the size calculated in rec_get_offsets_func(). 
*/ - i = (REC_OFFS_HEADER_SIZE + 1 + 1) + index->n_fields; + i = (REC_OFFS_HEADER_SIZE + 1 + 1) + n_uniq; heap = mem_heap_create((2 * sizeof *rec_offsets) * i); rec_offsets = static_cast<rec_offs*>( @@ -1086,16 +1082,24 @@ dict_stats_analyze_index_level( == page_rec_get_next_const(page_get_infimum_rec(page))); /* check that we are indeed on the desired level */ - ut_a(btr_page_get_level(page, mtr) == level); + ut_a(btr_page_get_level(page) == level); /* there should not be any pages on the left */ ut_a(!page_has_prev(page)); - /* check whether the first record on the leftmost page is marked - as such, if we are on a non-leaf level */ - ut_a((level == 0) - == !(REC_INFO_MIN_REC_FLAG & rec_get_info_bits( - btr_pcur_get_rec(&pcur), page_is_comp(page)))); + if (REC_INFO_MIN_REC_FLAG & rec_get_info_bits( + btr_pcur_get_rec(&pcur), page_is_comp(page))) { + ut_ad(btr_pcur_is_on_user_rec(&pcur)); + if (level == 0) { + /* Skip the metadata pseudo-record */ + ut_ad(index->is_instant()); + btr_pcur_move_to_next_user_rec(&pcur, mtr); + } + } else { + /* The first record on the leftmost page must be + marked as such on each level except the leaf level. */ + ut_a(level == 0); + } prev_rec = NULL; prev_rec_is_copied = false; @@ -1154,12 +1158,11 @@ dict_stats_analyze_index_level( prev_rec_offsets = rec_get_offsets( prev_rec, index, prev_rec_offsets, - true, + index->n_core_fields, n_uniq, &heap); prev_rec = rec_copy_prefix_to_buf( - prev_rec, index, - rec_offs_n_fields(prev_rec_offsets), + prev_rec, index, n_uniq, &prev_rec_buf, &prev_rec_buf_size); prev_rec_is_copied = true; @@ -1167,8 +1170,9 @@ dict_stats_analyze_index_level( continue; } - rec_offsets = rec_get_offsets( - rec, index, rec_offsets, !level, n_uniq, &heap); + rec_offsets = rec_get_offsets(rec, index, rec_offsets, + level ? 0 : index->n_core_fields, + n_uniq, &heap); (*total_recs)++; @@ -1176,7 +1180,8 @@ dict_stats_analyze_index_level( ulint matched_fields; prev_rec_offsets = rec_get_offsets( - prev_rec, index, prev_rec_offsets, !level, + prev_rec, index, prev_rec_offsets, + level ? 0 : index->n_core_fields, n_uniq, &heap); cmp_rec_rec(prev_rec, rec, @@ -1228,7 +1233,7 @@ dict_stats_analyze_index_level( btr_pcur_move_to_next_user_rec() will release the latch on the page that prev_rec is on */ prev_rec = rec_copy_prefix_to_buf( - rec, index, rec_offs_n_fields(rec_offsets), + rec, index, n_uniq, &prev_rec_buf, &prev_rec_buf_size); prev_rec_is_copied = true; @@ -1330,7 +1335,7 @@ be big enough) @param[in] index index of the page @param[in] page the page to scan @param[in] n_prefix look at the first n_prefix columns -@param[in] is_leaf whether this is the leaf page +@param[in] n_core 0, or index->n_core_fields for leaf @param[out] n_diff number of distinct records encountered @param[out] n_external_pages if this is non-NULL then it will be set to the number of externally stored pages which were encountered @@ -1345,7 +1350,7 @@ dict_stats_scan_page( const dict_index_t* index, const page_t* page, ulint n_prefix, - bool is_leaf, + ulint n_core, ib_uint64_t* n_diff, ib_uint64_t* n_external_pages) { @@ -1357,9 +1362,9 @@ dict_stats_scan_page( Because offsets1,offsets2 should be big enough, this memory heap should never be used. */ mem_heap_t* heap = NULL; - ut_ad(is_leaf == page_is_leaf(page)); + ut_ad(!!n_core == page_is_leaf(page)); const rec_t* (*get_next)(const rec_t*) - = !is_leaf || srv_stats_include_delete_marked + = !n_core || srv_stats_include_delete_marked ? 
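dict_stats_scan_page() and dict_stats_analyze_index_level() above count a record as a new value whenever it differs from its predecessor within the first n_prefix fields (cmp_rec_rec() reports how many leading fields match). The same counting idea over an already sorted in-memory table, with rows as plain vectors and n_prefix assumed to be no larger than the row length:

#include <cstddef>
#include <vector>

typedef std::vector<int> Row;          /* one record, already in index order */

/* Count records that differ from their predecessor in the first n_prefix
   fields; every such record starts a new distinct prefix value. */
size_t count_distinct_prefixes(const std::vector<Row>& sorted_rows,
                               size_t n_prefix)
{
    if (sorted_rows.empty()) {
        return 0;
    }
    size_t n_diff = 1;                 /* the first record always counts */
    for (size_t i = 1; i < sorted_rows.size(); i++) {
        for (size_t f = 0; f < n_prefix; f++) {
            if (sorted_rows[i - 1][f] != sorted_rows[i][f]) {
                n_diff++;
                break;
            }
        }
    }
    return n_diff;
}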
page_rec_get_next_const : page_rec_get_next_non_del_marked; @@ -1378,7 +1383,7 @@ dict_stats_scan_page( return(NULL); } - offsets_rec = rec_get_offsets(rec, index, offsets_rec, is_leaf, + offsets_rec = rec_get_offsets(rec, index, offsets_rec, n_core, ULINT_UNDEFINED, &heap); if (should_count_external_pages) { @@ -1395,7 +1400,7 @@ dict_stats_scan_page( ulint matched_fields; offsets_next_rec = rec_get_offsets(next_rec, index, - offsets_next_rec, is_leaf, + offsets_next_rec, n_core, ULINT_UNDEFINED, &heap); @@ -1409,7 +1414,7 @@ dict_stats_scan_page( (*n_diff)++; - if (!is_leaf) { + if (!n_core) { break; } } @@ -1495,13 +1500,13 @@ dict_stats_analyze_index_below_cur( rec = btr_cur_get_rec(cur); ut_ad(!page_rec_is_leaf(rec)); - offsets_rec = rec_get_offsets(rec, index, offsets1, false, + offsets_rec = rec_get_offsets(rec, index, offsets1, 0, ULINT_UNDEFINED, &heap); - page_id_t page_id(dict_index_get_space(index), + page_id_t page_id(index->table->space_id, btr_node_ptr_get_child_page_no( rec, offsets_rec)); - const page_size_t page_size(dict_table_page_size(index->table)); + const page_size_t page_size(index->table->space->flags); /* assume no external pages by default - in case we quit from this function without analyzing any leaf pages */ @@ -1529,7 +1534,7 @@ dict_stats_analyze_index_below_cur( /* search for the first non-boring record on the page */ offsets_rec = dict_stats_scan_page( &rec, offsets1, offsets2, index, page, n_prefix, - false, n_diff, NULL); + 0, n_diff, NULL); /* pages on level > 0 are not allowed to be empty */ ut_a(offsets_rec != NULL); @@ -1574,7 +1579,7 @@ dict_stats_analyze_index_below_cur( offsets_rec = dict_stats_scan_page( &rec, offsets1, offsets2, index, page, n_prefix, - true, n_diff, + index->n_core_fields, n_diff, n_external_pages); #if 0 @@ -1687,7 +1692,7 @@ dict_stats_analyze_index_for_n_prefix( ut_ad(first_rec == page_rec_get_next_const(page_get_infimum_rec(page))); /* check that we are indeed on the desired level */ - ut_a(btr_page_get_level(page, mtr) == n_diff_data->level); + ut_a(btr_page_get_level(page) == n_diff_data->level); /* there should not be any pages on the left */ ut_a(!page_has_prev(page)); @@ -1744,9 +1749,6 @@ dict_stats_analyze_index_for_n_prefix( ut_a(left <= right); ut_a(right <= last_idx_on_level); - /* we do not pass (left, right) because we do not want to ask - ut_rnd_interval() to work with too big numbers since - ib_uint64_t could be bigger than ulint */ const ulint rnd = ut_rnd_interval( static_cast<ulint>(right - left)); @@ -1942,10 +1944,8 @@ static index_stats_t dict_stats_analyze_index(dict_index_t* index) DEBUG_PRINTF(" %s(index=%s)\n", __func__, index->name()); - mtr_start(&mtr); - - mtr_s_lock(dict_index_get_lock(index), &mtr); - + mtr.start(); + mtr_s_lock_index(index, &mtr); size = btr_get_size(index, BTR_TOTAL_SIZE, &mtr); if (size != ULINT_UNDEFINED) { @@ -1954,7 +1954,7 @@ static index_stats_t dict_stats_analyze_index(dict_index_t* index) } /* Release the X locks on the root page taken by btr_get_size() */ - mtr_commit(&mtr); + mtr.commit(); switch (size) { case ULINT_UNDEFINED: @@ -1967,10 +1967,8 @@ static index_stats_t dict_stats_analyze_index(dict_index_t* index) result.n_leaf_pages = size; - mtr_start(&mtr); - - mtr_sx_lock(dict_index_get_lock(index), &mtr); - + mtr.start(); + mtr_sx_lock_index(index, &mtr); root_level = btr_height_get(index, &mtr); n_uniq = dict_index_get_n_unique(index); @@ -2006,7 +2004,7 @@ static index_stats_t dict_stats_analyze_index(dict_index_t* index) NULL /* boundaries not needed */, 
&mtr); - mtr_commit(&mtr); + mtr.commit(); mutex_enter(&dict_sys->mutex); for (ulint i = 0; i < n_uniq; i++) { @@ -2060,9 +2058,9 @@ static index_stats_t dict_stats_analyze_index(dict_index_t* index) /* Commit the mtr to release the tree S lock to allow other threads to do some work too. */ - mtr_commit(&mtr); - mtr_start(&mtr); - mtr_sx_lock(dict_index_get_lock(index), &mtr); + mtr.commit(); + mtr.start(); + mtr_sx_lock_index(index, &mtr); if (root_level != btr_height_get(index, &mtr)) { /* Just quit if the tree has changed beyond recognition here. The old stats from previous @@ -2200,7 +2198,7 @@ found_level: data, &mtr); } - mtr_commit(&mtr); + mtr.commit(); UT_DELETE_ARRAY(n_diff_boundaries); @@ -2352,7 +2350,7 @@ dict_stats_save_index_stat( char db_utf8[MAX_DB_UTF8_LEN]; char table_utf8[MAX_TABLE_UTF8_LEN]; - ut_ad(!trx || trx->internal || trx->in_mysql_trx_list); + ut_ad(!trx || trx->internal || trx->mysql_thd); ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_X)); ut_ad(mutex_own(&dict_sys->mutex)); @@ -2364,7 +2362,7 @@ dict_stats_save_index_stat( pars_info_add_str_literal(pinfo, "table_name", table_utf8); pars_info_add_str_literal(pinfo, "index_name", index->name); MEM_CHECK_DEFINED(&last_update, 4); - pars_info_add_int4_literal(pinfo, "last_update", (lint)last_update); + pars_info_add_int4_literal(pinfo, "last_update", uint32(last_update)); MEM_CHECK_DEFINED(stat_name, strlen(stat_name)); pars_info_add_str_literal(pinfo, "stat_name", stat_name); MEM_CHECK_DEFINED(&stat_value, 8); @@ -2432,10 +2430,9 @@ dict_stats_report_error(dict_table_t* table, bool defragment) { dberr_t err; - FilSpace space(table->space); const char* df = defragment ? " defragment" : ""; - if (!space()) { + if (!table->space) { ib::warn() << "Cannot save" << df << " statistics for table " << table->name << " because the .ibd file is missing. " @@ -2444,7 +2441,8 @@ dict_stats_report_error(dict_table_t* table, bool defragment) } else { ib::warn() << "Cannot save" << df << " statistics for table " << table->name - << " because file " << space()->chain.start->name + << " because file " + << table->space->chain.start->name << (table->corrupted ? " is corrupted." : " cannot be decrypted."); @@ -2494,7 +2492,7 @@ dict_stats_save( pars_info_add_str_literal(pinfo, "database_name", db_utf8); pars_info_add_str_literal(pinfo, "table_name", table_utf8); - pars_info_add_int4_literal(pinfo, "last_update", (lint)now); + pars_info_add_int4_literal(pinfo, "last_update", uint32(now)); pars_info_add_ull_literal(pinfo, "n_rows", table->stat_n_rows); pars_info_add_ull_literal(pinfo, "clustered_index_size", table->stat_clustered_index_size); @@ -2535,7 +2533,7 @@ dict_stats_save( return(ret); } - trx_t* trx = trx_allocate_for_background(); + trx_t* trx = trx_create(); trx_start_internal(trx); dict_index_t* index; @@ -2632,7 +2630,7 @@ dict_stats_save( trx_commit_for_mysql(trx); end: - trx_free_for_background(trx); + trx->free(); mutex_exit(&dict_sys->mutex); rw_lock_x_unlock(&dict_operation_lock); @@ -2940,7 +2938,7 @@ dict_stats_fetch_index_stats_step( /* extract 12 from "n_diff_pfx12..." into n_pfx note that stat_name does not have a terminating '\0' */ - n_pfx = (num_ptr[0] - '0') * 10 + (num_ptr[1] - '0'); + n_pfx = ulong(num_ptr[0] - '0') * 10 + ulong(num_ptr[1] - '0'); ulint n_uniq = index->n_uniq; @@ -3014,7 +3012,7 @@ dict_stats_fetch_from_ps( stats. 
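dict_stats_analyze_index() above periodically commits the mini-transaction to release the index tree latch, re-acquires it, and gives up if btr_height_get() no longer matches the root level recorded at the start. A sketch of that release-and-revalidate pattern with a std::shared_mutex and a hypothetical height field; it only illustrates the control flow, not the InnoDB latching rules:

#include <shared_mutex>

struct SampledTree {
    std::shared_mutex lock;            /* stands in for the index latch */
    unsigned          height;          /* stands in for btr_height_get() */
};

/* Sample in batches; between batches drop the latch so other threads can
   work, then re-latch and bail out if the tree height changed. */
bool analyze_in_batches(SampledTree& tree, unsigned n_batches)
{
    std::shared_lock<std::shared_mutex> s(tree.lock);
    const unsigned height_at_start = tree.height;
    for (unsigned i = 0; i < n_batches; i++) {
        /* ... sample some pages while latched ... */
        s.unlock();
        s.lock();
        if (tree.height != height_at_start) {
            return false;              /* tree changed; caller falls back */
        }
    }
    return true;
}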
*/ dict_stats_empty_table(table, true); - trx = trx_allocate_for_background(); + trx = trx_create(); /* Use 'read-uncommitted' so that the SELECTs we execute do not get blocked in case some user has locked the rows we @@ -3108,7 +3106,7 @@ dict_stats_fetch_from_ps( trx_commit_for_mysql(trx); - trx_free_for_background(trx); + trx->free(); if (!index_fetch_arg.stats_were_modified) { return(DB_STATS_DO_NOT_EXIST); diff --git a/storage/innobase/dict/dict0stats_bg.cc b/storage/innobase/dict/dict0stats_bg.cc index a4619a6069b..386c7864579 100644 --- a/storage/innobase/dict/dict0stats_bg.cc +++ b/storage/innobase/dict/dict0stats_bg.cc @@ -83,6 +83,8 @@ typedef recalc_pool_t::iterator /** Pool where we store information on which tables are to be processed by background statistics gathering. */ static recalc_pool_t recalc_pool; +/** Whether the global data structures have been initialized */ +static bool stats_initialised; /*****************************************************************//** Free the resources occupied by the recalc pool, called once during @@ -314,7 +316,6 @@ dict_stats_thread_init() dict_stats_event = os_event_create(0); dict_stats_shutdown_event = os_event_create(0); - ut_d(dict_stats_disabled_event = os_event_create(0)); /* The recalc_pool_mutex is acquired from: @@ -334,6 +335,7 @@ dict_stats_thread_init() mutex_create(LATCH_ID_RECALC_POOL, &recalc_pool_mutex); dict_defrag_pool_init(); + stats_initialised = true; } /*****************************************************************//** @@ -346,6 +348,12 @@ dict_stats_thread_deinit() ut_a(!srv_read_only_mode); ut_ad(!srv_dict_stats_thread_active); + if (!stats_initialised) { + return; + } + + stats_initialised = false; + dict_stats_recalc_pool_deinit(); dict_defrag_pool_deinit(); @@ -388,7 +396,7 @@ dict_stats_process_entry_from_recalc_pool() return; } - ut_ad(!dict_table_is_temporary(table)); + ut_ad(!table->is_temporary()); if (!fil_table_accessible(table)) { dict_table_close(table, TRUE, FALSE); @@ -433,16 +441,9 @@ dict_stats_process_entry_from_recalc_pool() #ifdef UNIV_DEBUG /** Disables dict stats thread. It's used by: SET GLOBAL innodb_dict_stats_disabled_debug = 1 (0). -@param[in] thd thread handle -@param[in] var pointer to system variable -@param[out] var_ptr where the formal string goes @param[in] save immediate result from check function */ -void -dict_stats_disabled_debug_update( - THD* thd, - struct st_mysql_sys_var* var, - void* var_ptr, - const void* save) +void dict_stats_disabled_debug_update(THD*, st_mysql_sys_var*, void*, + const void* save) { /* This method is protected by mutex, as every SET GLOBAL .. 
*/ ut_ad(dict_stats_disabled_event != NULL); diff --git a/storage/innobase/eval/eval0eval.cc b/storage/innobase/eval/eval0eval.cc index 6f709707f7f..97540d00198 100644 --- a/storage/innobase/eval/eval0eval.cc +++ b/storage/innobase/eval/eval0eval.cc @@ -441,7 +441,7 @@ eval_instr( /* We have already matched j characters */ if (j == len2) { - int_val = i + 1; + int_val = lint(i) + 1; goto match_found; } @@ -568,12 +568,11 @@ eval_func( { que_node_t* arg; ulint fclass; - ulint func; ut_ad(que_node_get_type(func_node) == QUE_NODE_FUNC); fclass = func_node->fclass; - func = func_node->func; + const int func = func_node->func; arg = func_node->args; diff --git a/storage/innobase/fil/fil0crypt.cc b/storage/innobase/fil/fil0crypt.cc index 68a8a9be261..f0f0351e20a 100644 --- a/storage/innobase/fil/fil0crypt.cc +++ b/storage/innobase/fil/fil0crypt.cc @@ -447,7 +447,6 @@ byte* fil_parse_write_crypt_data( byte* ptr, const byte* end_ptr, - const buf_block_t* block, dberr_t* err) { /* check that redo log entry is complete */ @@ -494,12 +493,12 @@ fil_parse_write_crypt_data( return NULL; } - mutex_enter(&fil_system->mutex); + mutex_enter(&fil_system.mutex); fil_space_t* space = fil_space_get_by_id(space_id); if (!space) { - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); return ptr + len; } @@ -520,7 +519,7 @@ fil_parse_write_crypt_data( space->crypt_data = crypt_data; } - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); if (crypt_data->should_encrypt() && !crypt_data->is_key_found()) { *err = DB_DECRYPTION_FAILED; @@ -581,7 +580,7 @@ fil_encrypt_buf( int rc = encryption_scheme_encrypt(src, srclen, dst, &dstlen, crypt_data, key_version, - space, offset, lsn); + (uint32)space, (uint32)offset, lsn); ut_a(rc == MY_AES_OK); ut_a(dstlen == srclen); @@ -646,7 +645,7 @@ fil_space_encrypt( return (src_frame); } - ut_ad(space->n_pending_ios > 0); + ut_ad(space->pending_io()); return fil_encrypt_buf(space->crypt_data, space->id, offset, lsn, src_frame, page_size_t(space->flags), dst_frame); @@ -759,7 +758,7 @@ fil_space_decrypt( const page_size_t page_size(space->flags); ut_ad(space->crypt_data != NULL && space->crypt_data->is_encrypted()); - ut_ad(space->n_pending_ios > 0); + ut_ad(space->pending_io()); bool encrypted = fil_space_decrypt(space->crypt_data, tmp_frame, page_size, src_frame, &err); @@ -908,12 +907,12 @@ fil_crypt_read_crypt_data(fil_space_t* space) mtr.start(); if (buf_block_t* block = buf_page_get(page_id_t(space->id, 0), page_size, RW_S_LATCH, &mtr)) { - mutex_enter(&fil_system->mutex); + mutex_enter(&fil_system.mutex); if (!space->crypt_data) { space->crypt_data = fil_space_read_crypt_data( page_size, block->frame); } - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); } mtr.commit(); } @@ -966,9 +965,9 @@ static bool fil_crypt_start_encrypting_space(fil_space_t* space) crypt_data->rotate_state.starting = true; crypt_data->rotate_state.active_threads = 1; - mutex_enter(&fil_system->mutex); + mutex_enter(&fil_system.mutex); space->crypt_data = crypt_data; - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); fil_crypt_start_converting = true; mutex_exit(&fil_crypt_threads_mutex); @@ -1103,7 +1102,7 @@ fil_crypt_space_needs_rotation( return false; } - ut_ad(space->n_pending_ops > 0); + ut_ad(space->referenced()); fil_space_crypt_t *crypt_data = space->crypt_data; @@ -1399,7 +1398,7 @@ inline fil_space_t *fil_system_t::keyrotate_next(fil_space_t *space, if (space) { - const bool released= !--space->n_pending_ops; + const bool released= 
!space->release(); if (space->is_in_rotation_list) { @@ -1421,12 +1420,15 @@ inline fil_space_t *fil_system_t::keyrotate_next(fil_space_t *space, } } - if (it == end) - return NULL; + while (it != end) + { + space= &*it; + if (space->acquire()) + return space; + while (++it != end && (!UT_LIST_GET_LEN(it->chain) || it->is_stopping())); + } - space= &*it; - space->n_pending_ops++; - return space; + return NULL; } /** Return the next tablespace. @@ -1439,23 +1441,23 @@ the encryption parameters were changed static fil_space_t *fil_space_next(fil_space_t *space, bool recheck, bool encrypt) { - mutex_enter(&fil_system->mutex); - ut_ad(!space || space->n_pending_ops); + mutex_enter(&fil_system.mutex); if (!srv_fil_crypt_rotate_key_age) - space= fil_system->keyrotate_next(space, recheck, encrypt); + space= fil_system.keyrotate_next(space, recheck, encrypt); else if (!space) { - space= UT_LIST_GET_FIRST(fil_system->space_list); + space= UT_LIST_GET_FIRST(fil_system.space_list); /* We can trust that space is not NULL because at least the system tablespace is always present and loaded first. */ - space->n_pending_ops++; + if (!space->acquire()) + goto next; } else { - ut_ad(space->n_pending_ops > 0); /* Move on to the next fil_space_t */ - space->n_pending_ops--; + space->release(); +next: space= UT_LIST_GET_NEXT(space_list, space); /* Skip abnormal tablespaces or those that are being created by @@ -1465,11 +1467,11 @@ static fil_space_t *fil_space_next(fil_space_t *space, bool recheck, space->is_stopping() || space->purpose != FIL_TYPE_TABLESPACE)) space= UT_LIST_GET_NEXT(space_list, space); - if (space) - space->n_pending_ops++; + if (space && !space->acquire()) + goto next; } - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); return space; } @@ -1486,7 +1488,7 @@ static bool fil_crypt_find_space_to_rotate( /* we need iops to start rotating */ while (!state->should_shutdown() && !fil_crypt_alloc_iops(state)) { if (state->space && state->space->is_stopping()) { - fil_space_release(state->space); + state->space->release(); state->space = NULL; } @@ -1496,7 +1498,7 @@ static bool fil_crypt_find_space_to_rotate( if (state->should_shutdown()) { if (state->space) { - fil_space_release(state->space); + state->space->release(); state->space = NULL; } return false; @@ -1505,7 +1507,7 @@ static bool fil_crypt_find_space_to_rotate( if (state->first) { state->first = false; if (state->space) { - fil_space_release(state->space); + state->space->release(); } state->space = NULL; } @@ -1534,7 +1536,7 @@ static bool fil_crypt_find_space_to_rotate( } if (state->space) { - fil_space_release(state->space); + state->space->release(); state->space = NULL; } @@ -1608,7 +1610,7 @@ fil_crypt_find_page_to_rotate( ulint batch = srv_alloc_time * state->allocated_iops; fil_space_t* space = state->space; - ut_ad(!space || space->n_pending_ops > 0); + ut_ad(!space || space->referenced()); /* If space is marked to be dropped stop rotation. */ if (!space || space->is_stopping()) { @@ -1666,7 +1668,7 @@ fil_crypt_get_page_throttle_func( fil_space_t* space = state->space; const page_size_t page_size = page_size_t(space->flags); const page_id_t page_id(space->id, offset); - ut_ad(space->n_pending_ops > 0); + ut_ad(space->referenced()); /* Before reading from tablespace we need to make sure that the tablespace is not about to be dropped. 
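Throughout fil0crypt.cc the patch replaces manual n_pending_ops adjustments under fil_system.mutex with space->acquire(), which can fail for a tablespace that is being stopped, space->release(), and space->referenced() assertions. A simplified model of that contract using atomics; the names and the exact failure semantics are illustrative, not the fil_space_t implementation:

#include <atomic>

struct SpaceRef {
    std::atomic<unsigned> references{0};
    std::atomic<bool>     stopping{false};

    /* Take a reference unless the tablespace is being dropped. */
    bool acquire()
    {
        references.fetch_add(1, std::memory_order_relaxed);
        if (stopping.load(std::memory_order_acquire)) {
            references.fetch_sub(1, std::memory_order_relaxed);
            return false;
        }
        return true;
    }

    /* Drop the reference taken by a successful acquire(). */
    void release()
    {
        references.fetch_sub(1, std::memory_order_release);
    }

    /* What the ut_ad(space->referenced()) assertions check. */
    bool referenced() const
    {
        return references.load(std::memory_order_relaxed) != 0;
    }
};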
*/ @@ -1748,11 +1750,11 @@ btr_scrub_get_block_and_allocation_status( buf_block_t *block = NULL; fil_space_t* space = state->space; - ut_ad(space->n_pending_ops > 0); + ut_ad(space->referenced()); mtr_start(&local_mtr); - *allocation_status = fseg_page_is_free(space, offset) ? + *allocation_status = fseg_page_is_free(space, (uint32_t)offset) ? BTR_SCRUB_PAGE_FREE : BTR_SCRUB_PAGE_ALLOCATED; @@ -1800,7 +1802,7 @@ fil_crypt_rotate_page( ulint sleeptime_ms = 0; fil_space_crypt_t *crypt_data = space->crypt_data; - ut_ad(space->n_pending_ops > 0); + ut_ad(space->referenced()); ut_ad(offset > 0); /* In fil_crypt_thread where key rotation is done we have @@ -1987,7 +1989,7 @@ fil_crypt_rotate_pages( ulint end = std::min(state->offset + state->batch, state->space->free_limit); - ut_ad(state->space->n_pending_ops > 0); + ut_ad(state->space->referenced()); for (; state->offset < end; state->offset++) { @@ -2026,7 +2028,7 @@ fil_crypt_flush_space( fil_space_t* space = state->space; fil_space_crypt_t *crypt_data = space->crypt_data; - ut_ad(space->n_pending_ops > 0); + ut_ad(space->referenced()); /* flush tablespace pages so that there are no pages left with old key */ lsn_t end_lsn = crypt_data->rotate_state.end_lsn; @@ -2081,18 +2083,13 @@ fil_crypt_flush_space( /*********************************************************************** Complete rotating a space -@param[in,out] key_state Key state @param[in,out] state Rotation state */ -static -void -fil_crypt_complete_rotate_space( - const key_state_t* key_state, - rotate_thread_t* state) +static void fil_crypt_complete_rotate_space(rotate_thread_t* state) { fil_space_crypt_t *crypt_data = state->space->crypt_data; ut_ad(crypt_data); - ut_ad(state->space->n_pending_ops > 0); + ut_ad(state->space->referenced()); /* Space might already be dropped */ if (!state->space->is_stopping()) { @@ -2245,9 +2242,8 @@ DECLARE_THREAD(fil_crypt_thread)(void*) /* If space is marked as stopping, release space and stop rotation. */ if (thr.space->is_stopping()) { - fil_crypt_complete_rotate_space( - &new_state, &thr); - fil_space_release(thr.space); + fil_crypt_complete_rotate_space(&thr); + thr.space->release(); thr.space = NULL; break; } @@ -2258,7 +2254,7 @@ DECLARE_THREAD(fil_crypt_thread)(void*) /* complete rotation */ if (thr.space) { - fil_crypt_complete_rotate_space(&new_state, &thr); + fil_crypt_complete_rotate_space(&thr); } /* force key state refresh */ @@ -2274,7 +2270,7 @@ DECLARE_THREAD(fil_crypt_thread)(void*) /* release current space if shutting down */ if (thr.space) { - fil_space_release(thr.space); + thr.space->release(); thr.space = NULL; } @@ -2339,38 +2335,34 @@ fil_crypt_set_thread_cnt( if innodb_encryption_rotate_key_age=0. */ static void fil_crypt_rotation_list_fill() { - ut_ad(mutex_own(&fil_system->mutex)); + ut_ad(mutex_own(&fil_system.mutex)); - for (fil_space_t* space = UT_LIST_GET_FIRST(fil_system->space_list); + for (fil_space_t* space = UT_LIST_GET_FIRST(fil_system.space_list); space != NULL; space = UT_LIST_GET_NEXT(space_list, space)) { if (space->purpose != FIL_TYPE_TABLESPACE || space->is_in_rotation_list - || space->is_stopping() - || UT_LIST_GET_LEN(space->chain) == 0) { + || UT_LIST_GET_LEN(space->chain) == 0 + || !space->acquire()) { continue; } /* Ensure that crypt_data has been initialized. */ if (!space->size) { - /* Protect the tablespace while we may - release fil_system->mutex. 
*/ - space->n_pending_ops++; ut_d(const fil_space_t* s=) - fil_system->read_page0(space->id); + fil_system.read_page0(space->id); ut_ad(!s || s == space); - space->n_pending_ops--; if (!space->size) { /* Page 0 was not loaded. Skip this tablespace. */ - continue; + goto next; } } /* Skip ENCRYPTION!=DEFAULT tablespaces. */ if (space->crypt_data && !space->crypt_data->is_default_encryption()) { - continue; + goto next; } if (srv_encrypt_tables) { @@ -2378,19 +2370,21 @@ static void fil_crypt_rotation_list_fill() innodb_encrypt_tables!=OFF */ if (space->crypt_data && space->crypt_data->min_key_version) { - continue; + goto next; } } else { /* Skip unencrypted tablespaces if innodb_encrypt_tables=OFF */ if (!space->crypt_data || !space->crypt_data->min_key_version) { - continue; + goto next; } } - fil_system->rotation_list.push_back(*space); + fil_system.rotation_list.push_back(*space); space->is_in_rotation_list = true; +next: + space->release(); } } @@ -2402,12 +2396,12 @@ void fil_crypt_set_rotate_key_age( uint val) { - mutex_enter(&fil_system->mutex); + mutex_enter(&fil_system.mutex); srv_fil_crypt_rotate_key_age = val; if (val == 0) { fil_crypt_rotation_list_fill(); } - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); os_event_set(fil_crypt_threads_event); } @@ -2431,7 +2425,7 @@ void fil_crypt_set_encrypt_tables( uint val) { - mutex_enter(&fil_system->mutex); + mutex_enter(&fil_system.mutex); srv_encrypt_tables = val; @@ -2439,7 +2433,7 @@ fil_crypt_set_encrypt_tables( fil_crypt_rotation_list_fill(); } - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); os_event_set(fil_crypt_threads_event); } @@ -2550,7 +2544,7 @@ fil_space_crypt_get_status( { memset(status, 0, sizeof(*status)); - ut_ad(space->n_pending_ops > 0); + ut_ad(space->referenced()); /* If there is no crypt data and we have not yet read page 0 for this tablespace, we need to read it before @@ -2615,7 +2609,7 @@ fil_space_get_scrub_status( { memset(status, 0, sizeof(*status)); - ut_ad(space->n_pending_ops > 0); + ut_ad(space->referenced()); fil_space_crypt_t* crypt_data = space->crypt_data; status->space = space->id; diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc index 39e7507373f..9b91fdd879f 100644 --- a/storage/innobase/fil/fil0fil.cc +++ b/storage/innobase/fil/fil0fil.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2021, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2014, 2020, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under @@ -28,7 +28,6 @@ Created 10/25/1995 Heikki Tuuri #include "fil0crypt.h" #include "btr0btr.h" -#include "btr0sea.h" #include "buf0buf.h" #include "dict0boot.h" #include "dict0dict.h" @@ -66,6 +65,35 @@ static bool fil_try_to_close_file_in_LRU(bool print_info); +/** Test if a tablespace file can be renamed to a new filepath by checking +if that the old filepath exists and the new filepath does not exist. +@param[in] old_path old filepath +@param[in] new_path new filepath +@param[in] is_discarded whether the tablespace is discarded +@param[in] replace_new whether to ignore the existence of new_path +@return innodb error code */ +static dberr_t +fil_rename_tablespace_check( + const char* old_path, + const char* new_path, + bool is_discarded, + bool replace_new = false); +/** Rename a single-table tablespace. 
+The tablespace must exist in the memory cache. +@param[in] id tablespace identifier +@param[in] old_path old file name +@param[in] new_name new table name in the +databasename/tablename format +@param[in] new_path_in new file name, +or NULL if it is located in the normal data directory +@return true if success */ +static bool +fil_rename_tablespace( + ulint id, + const char* old_path, + const char* new_name, + const char* new_path_in); + /* IMPLEMENTATION OF THE TABLESPACE MEMORY CACHE ============================================= @@ -127,7 +155,7 @@ current working directory ".", but in the MySQL Embedded Server Library it is an absolute path. */ const char* fil_path_to_mysql_datadir; -/** Common InnoDB file extentions */ +/** Common InnoDB file extensions */ const char* dot_ext[] = { "", ".ibd", ".isl", ".cfg" }; /** The number of fsyncs done to the log */ @@ -139,11 +167,11 @@ ulint fil_n_pending_log_flushes = 0; ulint fil_n_pending_tablespace_flushes = 0; /** The null file address */ -fil_addr_t fil_addr_null = {FIL_NULL, 0}; +const fil_addr_t fil_addr_null = {FIL_NULL, 0}; /** The tablespace memory cache. This variable is NULL before the module is initialized. */ -UNIV_INTERN fil_system_t* fil_system = NULL; +fil_system_t fil_system; /** At this age or older a space/page will be rotated */ UNIV_INTERN extern uint srv_fil_crypt_rotate_key_age; @@ -179,26 +207,26 @@ bool fil_validate_skip(void) /*===================*/ { - /** The fil_validate() call skip counter. Use a signed type - because of the race condition below. */ + /** The fil_validate() call skip counter. */ static int fil_validate_count = FIL_VALIDATE_SKIP; - /* There is a race condition below, but it does not matter, - because this call is only for heuristic purposes. We want to - reduce the call frequency of the costly fil_validate() check - in debug builds. */ - if (--fil_validate_count > 0) { + /* We want to reduce the call frequency of the costly fil_validate() + check in debug builds. */ + int count = my_atomic_add32_explicit(&fil_validate_count, -1, + MY_MEMORY_ORDER_RELAXED); + if (count > 0) { return(true); } - fil_validate_count = FIL_VALIDATE_SKIP; + my_atomic_store32_explicit(&fil_validate_count, FIL_VALIDATE_SKIP, + MY_MEMORY_ORDER_RELAXED); return(fil_validate()); } #endif /* UNIV_DEBUG */ /********************************************************************//** Determines if a file node belongs to the least-recently-used list. -@return true if the file belongs to fil_system->LRU mutex. */ +@return true if the file belongs to fil_system.LRU mutex. */ UNIV_INLINE bool fil_space_belongs_in_lru( @@ -232,7 +260,6 @@ bool fil_node_prepare_for_io( /*====================*/ fil_node_t* node, /*!< in: file node */ - fil_system_t* system, /*!< in: tablespace memory cache */ fil_space_t* space); /*!< in: space */ /** Update the data structures when an i/o operation finishes. @@ -307,9 +334,10 @@ fil_space_get_by_id( { fil_space_t* space; - ut_ad(mutex_own(&fil_system->mutex)); + ut_ad(fil_system.is_initialised()); + ut_ad(mutex_own(&fil_system.mutex)); - HASH_SEARCH(hash, fil_system->spaces, id, + HASH_SEARCH(hash, fil_system.spaces, id, fil_space_t*, space, ut_ad(space->magic_n == FIL_SPACE_MAGIC_N), space->id == id); @@ -318,33 +346,11 @@ fil_space_get_by_id( } /** Look up a tablespace. 
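fil_validate_skip() above now decrements its skip counter with a relaxed atomic operation instead of a knowingly racy plain variable, so roughly one caller per FIL_VALIDATE_SKIP period pays for the full fil_validate(). The same idiom as a standalone helper, with 17 standing in for the real period:

#include <atomic>

static std::atomic<int> skip_counter{17};   /* 17 stands in for FIL_VALIDATE_SKIP */

bool should_run_expensive_check()
{
    /* fetch_sub() returns the previous value, like my_atomic_add32(). */
    if (skip_counter.fetch_sub(1, std::memory_order_relaxed) > 0) {
        return false;                  /* skipped this time */
    }
    skip_counter.store(17, std::memory_order_relaxed);
    return true;
}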
-@param[in] name tablespace name -@return tablespace -@retval NULL if not found */ -fil_space_t* -fil_space_get_by_name(const char* name) -{ - fil_space_t* space; - ulint fold; - - ut_ad(mutex_own(&fil_system->mutex)); - - fold = ut_fold_string(name); - - HASH_SEARCH(name_hash, fil_system->name_hash, fold, - fil_space_t*, space, - ut_ad(space->magic_n == FIL_SPACE_MAGIC_N), - !strcmp(name, space->name)); - - return(space); -} - -/** Look up a tablespace. The caller should hold an InnoDB table lock or a MDL that prevents the tablespace from being dropped during the operation, or the caller should be in single-threaded crash recovery mode (no user connections that could drop tablespaces). -If this is not the case, fil_space_acquire() and fil_space_release() +If this is not the case, fil_space_acquire() and fil_space_t::release() should be used instead. @param[in] id tablespace ID @return tablespace, or NULL if not found */ @@ -352,65 +358,29 @@ fil_space_t* fil_space_get( ulint id) { - mutex_enter(&fil_system->mutex); + mutex_enter(&fil_system.mutex); fil_space_t* space = fil_space_get_by_id(id); - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); ut_ad(space == NULL || space->purpose != FIL_TYPE_LOG); return(space); } -/** Gets the type of a file space. -@param[in] id tablespace identifier -@return file type */ -fil_type_t -fil_space_get_type( - ulint id) +/** Note that the tablespace has been imported. +Initially, purpose=FIL_TYPE_IMPORT so that no redo log is +written while the space ID is being updated in each page. */ +void fil_space_t::set_imported() { - fil_space_t* space; - - ut_ad(fil_system); - - mutex_enter(&fil_system->mutex); - - space = fil_space_get_by_id(id); - - ut_a(space); - - mutex_exit(&fil_system->mutex); - - return(space->purpose); -} - -/** Note that a tablespace has been imported. -It is initially marked as FIL_TYPE_IMPORT so that no logging is -done during the import process when the space ID is stamped to each page. -Now we change it to FIL_SPACE_TABLESPACE to start redo and undo logging. -NOTE: temporary tablespaces are never imported. -@param[in] id tablespace identifier */ -void -fil_space_set_imported( - ulint id) -{ - ut_ad(fil_system != NULL); - - mutex_enter(&fil_system->mutex); - - fil_space_t* space = fil_space_get_by_id(id); - const fil_node_t* node = UT_LIST_GET_FIRST(space->chain); - - ut_ad(space->purpose == FIL_TYPE_IMPORT); - space->purpose = FIL_TYPE_TABLESPACE; - space->atomic_write_supported = node->atomic_write + ut_ad(purpose == FIL_TYPE_IMPORT); + const fil_node_t* node = UT_LIST_GET_FIRST(chain); + atomic_write_supported = node->atomic_write && srv_use_atomic_writes && my_test_if_atomic_write(node->handle, - int(page_size_t(space->flags) - .physical())); - mutex_exit(&fil_system->mutex); + int(page_size_t(flags).physical())); + purpose = FIL_TYPE_TABLESPACE; } /**********************************************************************//** -Checks if all the file nodes in a space are flushed. The caller must hold -the fil_system mutex. +Checks if all the file nodes in a space are flushed. 
@return true if all are flushed */ static bool @@ -418,7 +388,7 @@ fil_space_is_flushed( /*=================*/ fil_space_t* space) /*!< in: space */ { - ut_ad(mutex_own(&fil_system->mutex)); + ut_ad(mutex_own(&fil_system.mutex)); for (const fil_node_t* node = UT_LIST_GET_FIRST(space->chain); node != NULL; @@ -451,7 +421,7 @@ fil_node_t* fil_space_t::add(const char* name, pfs_os_file_t handle, fil_node_t* node; ut_ad(name != NULL); - ut_ad(fil_system != NULL); + ut_ad(fil_system.is_initialised()); node = reinterpret_cast<fil_node_t*>(ut_zalloc_nokey(sizeof(*node))); @@ -474,13 +444,13 @@ fil_node_t* fil_space_t::add(const char* name, pfs_os_file_t handle, node->atomic_write = atomic_write; - mutex_enter(&fil_system->mutex); + mutex_enter(&fil_system.mutex); this->size += size; UT_LIST_ADD_LAST(chain, node); if (node->is_open()) { - fil_system->n_open++; + fil_system.n_open++; } - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); return node; } @@ -490,7 +460,7 @@ fil_node_t* fil_space_t::add(const char* name, pfs_os_file_t handle, @return whether the page was found valid */ bool fil_node_t::read_page0(bool first) { - ut_ad(mutex_own(&fil_system->mutex)); + ut_ad(mutex_own(&fil_system.mutex)); ut_a(space->purpose != FIL_TYPE_LOG); const page_size_t page_size(space->flags); const ulint psize = page_size.physical(); @@ -591,7 +561,6 @@ bool fil_node_t::read_page0(bool first) } /** Open a file node of a tablespace. -The caller must own the fil_system mutex. @param[in,out] node File node @return false if the file can't be opened, otherwise true */ static bool fil_node_open_file(fil_node_t* node) @@ -600,7 +569,7 @@ static bool fil_node_open_file(fil_node_t* node) bool read_only_mode; fil_space_t* space = node->space; - ut_ad(mutex_own(&fil_system->mutex)); + ut_ad(mutex_own(&fil_system.mutex)); ut_a(node->n_pending == 0); ut_a(!node->is_open()); @@ -705,52 +674,45 @@ retry: ut_a(success); ut_a(node->is_open()); - fil_system->n_open++; + fil_system.n_open++; if (fil_space_belongs_in_lru(space)) { /* Put the node to the LRU list */ - UT_LIST_ADD_FIRST(fil_system->LRU, node); + UT_LIST_ADD_FIRST(fil_system.LRU, node); } return(true); } -/** Close a file node. -@param[in,out] node File node */ -static -void -fil_node_close_file( - fil_node_t* node) +/** Close the file handle. 
*/ +void fil_node_t::close() { bool ret; - ut_ad(mutex_own(&(fil_system->mutex))); - ut_a(node->is_open()); - ut_a(node->n_pending == 0); - ut_a(node->n_pending_flushes == 0); - ut_a(!node->being_extended); - ut_a(!node->needs_flush - || node->space->purpose == FIL_TYPE_TEMPORARY + ut_ad(mutex_own(&fil_system.mutex)); + ut_a(is_open()); + ut_a(n_pending == 0); + ut_a(n_pending_flushes == 0); + ut_a(!being_extended); + ut_a(!needs_flush + || space->purpose == FIL_TYPE_TEMPORARY || srv_fast_shutdown == 2 || !srv_was_started); - ret = os_file_close(node->handle); + ret = os_file_close(handle); ut_a(ret); - /* printf("Closing file %s\n", node->name); */ - - node->handle = OS_FILE_CLOSED; - ut_ad(!node->is_open()); - ut_a(fil_system->n_open > 0); - fil_system->n_open--; + /* printf("Closing file %s\n", name); */ - if (fil_space_belongs_in_lru(node->space)) { + handle = OS_FILE_CLOSED; + ut_ad(!is_open()); + ut_a(fil_system.n_open > 0); + fil_system.n_open--; - ut_a(UT_LIST_GET_LEN(fil_system->LRU) > 0); - - /* The node is in the LRU list, remove it */ - UT_LIST_REMOVE(fil_system->LRU, node); + if (fil_space_belongs_in_lru(space)) { + ut_a(UT_LIST_GET_LEN(fil_system.LRU) > 0); + UT_LIST_REMOVE(fil_system.LRU, this); } } @@ -771,14 +733,14 @@ fil_try_to_close_file_in_LRU( { fil_node_t* node; - ut_ad(mutex_own(&fil_system->mutex)); + ut_ad(mutex_own(&fil_system.mutex)); if (print_info) { ib::info() << "fil_sys open file LRU len " - << UT_LIST_GET_LEN(fil_system->LRU); + << UT_LIST_GET_LEN(fil_system.LRU); } - for (node = UT_LIST_GET_LAST(fil_system->LRU); + for (node = UT_LIST_GET_LAST(fil_system.LRU); node != NULL; node = UT_LIST_GET_PREV(LRU, node)) { @@ -786,7 +748,7 @@ fil_try_to_close_file_in_LRU( && node->n_pending_flushes == 0 && !node->being_extended) { - fil_node_close_file(node); + node->close(); return(true); } @@ -821,9 +783,8 @@ fil_try_to_close_file_in_LRU( @param[in] metadata whether to update file system metadata */ static void fil_flush_low(fil_space_t* space, bool metadata = false) { - ut_ad(mutex_own(&fil_system->mutex)); - ut_ad(space); - ut_ad(!space->stop_new_ops); + ut_ad(mutex_own(&fil_system.mutex)); + ut_ad(!space->is_stopping()); if (fil_buffering_disabled(space)) { @@ -861,6 +822,7 @@ static void fil_flush_low(fil_space_t* space, bool metadata = false) switch (space->purpose) { case FIL_TYPE_TEMPORARY: ut_ad(0); // we already checked for this + /* fall through */ case FIL_TYPE_TABLESPACE: case FIL_TYPE_IMPORT: fil_n_pending_tablespace_flushes++; @@ -881,11 +843,11 @@ static void fil_flush_low(fil_space_t* space, bool metadata = false) node->n_pending_flushes++; node->needs_flush = false; - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); os_file_flush(node->handle); - mutex_enter(&fil_system->mutex); + mutex_enter(&fil_system.mutex); node->n_pending_flushes--; #ifdef _WIN32 @@ -895,7 +857,7 @@ skip_flush: if (space->is_in_unflushed_spaces && fil_space_is_flushed(space)) { - fil_system->unflushed_spaces.remove(*space); + fil_system.unflushed_spaces.remove(*space); space->is_in_unflushed_spaces = false; } } @@ -932,7 +894,7 @@ fil_space_extend_must_retry( ulint size, bool* success) { - ut_ad(mutex_own(&fil_system->mutex)); + ut_ad(mutex_own(&fil_system.mutex)); ut_ad(UT_LIST_GET_LAST(space->chain) == node); ut_ad(size >= FIL_IBD_FILE_INITIAL_SIZE); @@ -948,23 +910,23 @@ fil_space_extend_must_retry( for it to finish. It'd have been better to use event driven mechanism but the entire module is peppered with polling stuff. 
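fil_try_to_close_file_in_LRU() above walks fil_system.LRU from the oldest node and closes the first one that has no pending I/O, no pending flushes, and no extension in progress, which is how the cache stays under innodb_open_files. The same scan over a plain std::list of hypothetical handles, oldest entries at the back:

#include <iterator>
#include <list>

struct OpenFile {
    int  pending_io      = 0;
    int  pending_flushes = 0;
    bool being_extended  = false;
    void close() { /* os_file_close() in the real code */ }
};

/* Walk from the oldest (back) entry and close the first idle file. */
bool try_to_close_one(std::list<OpenFile*>& lru)
{
    for (std::list<OpenFile*>::reverse_iterator it = lru.rbegin();
         it != lru.rend(); ++it) {
        OpenFile* f = *it;
        if (f->pending_io == 0 && f->pending_flushes == 0
            && !f->being_extended) {
            f->close();
            lru.erase(std::next(it).base());   /* unlink from the LRU */
            return true;
        }
    }
    return false;
}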
*/ - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); os_thread_sleep(100000); return(true); } node->being_extended = true; - if (!fil_node_prepare_for_io(node, fil_system, space)) { + if (!fil_node_prepare_for_io(node, space)) { /* The tablespace data file, such as .ibd file, is missing */ node->being_extended = false; return(false); } - /* At this point it is safe to release fil_system mutex. No + /* At this point it is safe to release fil_system.mutex. No other thread can rename, delete, close or extend the file because we have set the node->being_extended flag. */ - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); ut_ad(size >= space->size); @@ -980,11 +942,11 @@ fil_space_extend_must_retry( const page_size_t pageSize(space->flags); const ulint page_size = pageSize.physical(); - /* fil_read_first_page() expects UNIV_PAGE_SIZE bytes. - fil_node_open_file() expects at least 4 * UNIV_PAGE_SIZE bytes.*/ + /* fil_read_first_page() expects srv_page_size bytes. + fil_node_open_file() expects at least 4 * srv_page_size bytes.*/ os_offset_t new_size = std::max( os_offset_t(size - file_start_page_no) * page_size, - os_offset_t(FIL_IBD_FILE_INITIAL_SIZE * UNIV_PAGE_SIZE)); + os_offset_t(FIL_IBD_FILE_INITIAL_SIZE << srv_page_size_shift)); *success = os_file_set_size(node->name, node->handle, new_size, FSP_FLAGS_HAS_PAGE_COMPRESSION(space->flags)); @@ -1003,7 +965,7 @@ fil_space_extend_must_retry( last_page_no = ulint(fsize / page_size) + file_start_page_no; } - mutex_enter(&fil_system->mutex); + mutex_enter(&fil_system.mutex); ut_a(node->being_extended); node->being_extended = false; @@ -1013,7 +975,7 @@ fil_space_extend_must_retry( space->size += file_size - node->size; node->size = file_size; const ulint pages_in_MiB = node->size - & ~((1 << (20 - UNIV_PAGE_SIZE_SHIFT)) - 1); + & ~ulint((1U << (20U - srv_page_size_shift)) - 1); fil_node_complete_io(node,IORequestRead); @@ -1041,7 +1003,7 @@ fil_space_extend_must_retry( } -/** Reserves the fil_system mutex and tries to make sure we can open at least one +/** Reserves the fil_system.mutex and tries to make sure we can open at least one file while holding it. This should be called before calling fil_node_prepare_for_io(), because that function may need to open a file. @param[in] space_id tablespace id @@ -1051,7 +1013,7 @@ bool fil_mutex_enter_and_prepare_for_io(ulint space_id) { for (ulint count = 0;;) { - mutex_enter(&fil_system->mutex); + mutex_enter(&fil_system.mutex); if (space_id >= SRV_LOG_SPACE_FIRST_ID) { /* We keep log files always open. 
*/ @@ -1086,19 +1048,19 @@ fil_mutex_enter_and_prepare_for_io(ulint space_id) situation in the function which called this function */ } else { - while (fil_system->n_open >= fil_system->max_n_open) { + while (fil_system.n_open >= srv_max_n_open_files) { /* Too many files are open */ if (fil_try_to_close_file_in_LRU(count > 1)) { /* No problem */ } else if (count >= 2) { ib::warn() << "innodb_open_files=" - << fil_system->max_n_open + << srv_max_n_open_files << " is exceeded (" - << fil_system->n_open + << fil_system.n_open << ") files stay open)"; break; } else { - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); os_aio_simulated_wake_handler_threads(); os_thread_sleep(20000); /* Flush tablespaces so that we can @@ -1106,13 +1068,14 @@ fil_mutex_enter_and_prepare_for_io(ulint space_id) fil_flush_file_spaces(FIL_TYPE_TABLESPACE); count++; - mutex_enter(&fil_system->mutex); + mutex_enter(&fil_system.mutex); continue; } } } - if (ulint size = ulint(UNIV_UNLIKELY(space->recv_size))) { + ulint size = space->recv_size; + if (UNIV_UNLIKELY(size != 0)) { ut_ad(node); bool success; if (fil_space_extend_must_retry(space, node, size, @@ -1120,7 +1083,7 @@ fil_mutex_enter_and_prepare_for_io(ulint space_id) continue; } - ut_ad(mutex_own(&fil_system->mutex)); + ut_ad(mutex_own(&fil_system.mutex)); /* Crash recovery requires the file extension to succeed. */ ut_a(success); @@ -1135,7 +1098,7 @@ fil_mutex_enter_and_prepare_for_io(ulint space_id) this tablespace). Also, fil_space_set_recv_size() may have been invoked - again during the file extension while fil_system->mutex + again during the file extension while fil_system.mutex was not being held by us. Only if space->recv_size matches what we read @@ -1176,7 +1139,7 @@ fil_space_extend( space, UT_LIST_GET_LAST(space->chain), size, &success)); - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); return(success); } @@ -1189,13 +1152,13 @@ fil_node_close_to_free( fil_node_t* node, fil_space_t* space) { - ut_ad(mutex_own(&fil_system->mutex)); + ut_ad(mutex_own(&fil_system.mutex)); ut_a(node->magic_n == FIL_NODE_MAGIC_N); ut_a(node->n_pending == 0); ut_a(!node->being_extended); if (node->is_open()) { - /* We fool the assertion in fil_node_close_file() to think + /* We fool the assertion in fil_node_t::close() to think there are no unflushed modifications in the file */ node->needs_flush = false; @@ -1208,11 +1171,11 @@ fil_node_close_to_free( } else if (space->is_in_unflushed_spaces && fil_space_is_flushed(space)) { - fil_system->unflushed_spaces.remove(*space); + fil_system.unflushed_spaces.remove(*space); space->is_in_unflushed_spaces = false; } - fil_node_close_file(node); + node->close(); } } @@ -1225,32 +1188,23 @@ void fil_space_detach( fil_space_t* space) { - ut_ad(mutex_own(&fil_system->mutex)); - - HASH_DELETE(fil_space_t, hash, fil_system->spaces, space->id, space); - - fil_space_t* fnamespace = fil_space_get_by_name(space->name); - - ut_a(space == fnamespace); + ut_ad(mutex_own(&fil_system.mutex)); - HASH_DELETE(fil_space_t, name_hash, fil_system->name_hash, - ut_fold_string(space->name), space); + HASH_DELETE(fil_space_t, hash, fil_system.spaces, space->id, space); if (space->is_in_unflushed_spaces) { ut_ad(!fil_buffering_disabled(space)); - - fil_system->unflushed_spaces.remove(*space); + fil_system.unflushed_spaces.remove(*space); space->is_in_unflushed_spaces = false; } if (space->is_in_rotation_list) { - - fil_system->rotation_list.remove(*space); + fil_system.rotation_list.remove(*space); 
space->is_in_rotation_list = false; } - UT_LIST_REMOVE(fil_system->space_list, space); + UT_LIST_REMOVE(fil_system.space_list, space); ut_a(space->magic_n == FIL_SPACE_MAGIC_N); ut_a(space->n_pending_flushes == 0); @@ -1261,6 +1215,12 @@ fil_space_detach( fil_node_close_to_free(fil_node, space); } + + if (space == fil_system.sys_space) { + fil_system.sys_space = NULL; + } else if (space == fil_system.temp_space) { + fil_system.temp_space = NULL; + } } /** Free a tablespace object on which fil_space_detach() was invoked. @@ -1271,14 +1231,14 @@ void fil_space_free_low( fil_space_t* space) { - /* The tablespace must not be in fil_system->named_spaces. */ + /* The tablespace must not be in fil_system.named_spaces. */ ut_ad(srv_fast_shutdown == 2 || !srv_was_started || space->max_lsn == 0); - /* Wait for fil_space_release_for_io(); after + /* Wait for fil_space_t::release_for_io(); after fil_space_detach(), the tablespace cannot be found, so fil_space_acquire_for_io() would return NULL */ - while (space->n_pending_ios) { + while (space->pending_io()) { os_thread_sleep(100); } @@ -1313,14 +1273,14 @@ fil_space_free( { ut_ad(id != TRX_SYS_SPACE); - mutex_enter(&fil_system->mutex); + mutex_enter(&fil_system.mutex); fil_space_t* space = fil_space_get_by_id(id); if (space != NULL) { fil_space_detach(space); } - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); if (space != NULL) { if (x_latched) { @@ -1335,7 +1295,7 @@ fil_space_free( if (space->max_lsn != 0) { ut_d(space->max_lsn = 0); - UT_LIST_REMOVE(fil_system->named_spaces, space); + UT_LIST_REMOVE(fil_system.named_spaces, space); } if (!recv_recovery_is_on()) { @@ -1369,26 +1329,14 @@ fil_space_create( { fil_space_t* space; - ut_ad(fil_system); + ut_ad(fil_system.is_initialised()); ut_ad(fsp_flags_is_valid(flags & ~FSP_FLAGS_MEM_MASK, id)); ut_ad(purpose == FIL_TYPE_LOG || srv_page_size == UNIV_PAGE_SIZE_ORIG || flags != 0); DBUG_EXECUTE_IF("fil_space_create_failure", return(NULL);); - mutex_enter(&fil_system->mutex); - - /* Look for a matching tablespace. 
*/ - space = fil_space_get_by_name(name); - - if (space != NULL) { - mutex_exit(&fil_system->mutex); - - ib::warn() << "Tablespace '" << name << "' exists in the" - " cache with id " << space->id << " != " << id; - - return(NULL); - } + mutex_enter(&fil_system.mutex); space = fil_space_get_by_id(id); @@ -1397,7 +1345,7 @@ fil_space_create( << "' with id " << id << " to the tablespace memory cache, but tablespace '" << space->name << "' already exists in the cache!"; - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); return(NULL); } @@ -1410,17 +1358,16 @@ fil_space_create( if ((purpose == FIL_TYPE_TABLESPACE || purpose == FIL_TYPE_IMPORT) && !recv_recovery_is_on() - && id > fil_system->max_assigned_id) { - - if (!fil_system->space_id_reuse_warned) { - fil_system->space_id_reuse_warned = true; + && id > fil_system.max_assigned_id) { + if (!fil_system.space_id_reuse_warned) { + fil_system.space_id_reuse_warned = true; ib::warn() << "Allocated tablespace ID " << id << " for " << name << ", old maximum was " - << fil_system->max_assigned_id; + << fil_system.max_assigned_id; } - fil_system->max_assigned_id = id; + fil_system.max_assigned_id = id; } space->purpose = purpose; @@ -1443,7 +1390,6 @@ fil_space_create( rw_lock_create(fil_space_latch_key, &space->latch, SYNC_FSP); if (space->purpose == FIL_TYPE_TEMPORARY) { - ut_d(space->latch.set_temp_fsp()); /* SysTablespace::open_or_create() would pass size!=0 to fil_space_t::add(), so first_time_open would not hold in fil_node_open_file(), and we @@ -1453,16 +1399,13 @@ fil_space_create( space->atomic_write_supported = true; } - HASH_INSERT(fil_space_t, hash, fil_system->spaces, id, space); + HASH_INSERT(fil_space_t, hash, fil_system.spaces, id, space); - HASH_INSERT(fil_space_t, name_hash, fil_system->name_hash, - ut_fold_string(name), space); + UT_LIST_ADD_LAST(fil_system.space_list, space); - UT_LIST_ADD_LAST(fil_system->space_list, space); + if (id < SRV_LOG_SPACE_FIRST_ID && id > fil_system.max_assigned_id) { - if (id < SRV_LOG_SPACE_FIRST_ID && id > fil_system->max_assigned_id) { - - fil_system->max_assigned_id = id; + fil_system.max_assigned_id = id; } /* Inform key rotation that there could be something @@ -1473,14 +1416,12 @@ fil_space_create( srv_encrypt_tables)) { /* Key rotation is not enabled, need to inform background encryption threads. */ - fil_system->rotation_list.push_back(*space); + fil_system.rotation_list.push_back(*space); space->is_in_rotation_list = true; - mutex_exit(&fil_system->mutex); - mutex_enter(&fil_crypt_threads_mutex); + mutex_exit(&fil_system.mutex); os_event_set(fil_crypt_threads_event); - mutex_exit(&fil_crypt_threads_mutex); } else { - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); } return(space); @@ -1499,12 +1440,12 @@ fil_assign_new_space_id( ulint id; bool success; - mutex_enter(&fil_system->mutex); + mutex_enter(&fil_system.mutex); id = *space_id; - if (id < fil_system->max_assigned_id) { - id = fil_system->max_assigned_id; + if (id < fil_system.max_assigned_id) { + id = fil_system.max_assigned_id; } id++; @@ -1521,7 +1462,7 @@ fil_assign_new_space_id( success = (id < SRV_LOG_SPACE_FIRST_ID); if (success) { - *space_id = fil_system->max_assigned_id = id; + *space_id = fil_system.max_assigned_id = id; } else { ib::warn() << "You have run out of single-table tablespace" " id's! 
Current counter is " << id @@ -1531,7 +1472,7 @@ fil_assign_new_space_id( *space_id = ULINT_UNDEFINED; } - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); return(success); } @@ -1568,7 +1509,7 @@ fil_space_t* fil_system_t::read_page0(ulint id) the file yet; the following calls will open it and update the size fields */ - if (!fil_node_prepare_for_io(node, fil_system, space)) { + if (!fil_node_prepare_for_io(node, space)) { /* The single-table tablespace can't be opened, because the ibd file is missing. */ return(NULL); @@ -1581,7 +1522,7 @@ fil_space_t* fil_system_t::read_page0(ulint id) /*******************************************************************//** Returns a pointer to the fil_space_t that is in the memory cache -associated with a space id. The caller must lock fil_system->mutex. +associated with a space id. The caller must lock fil_system.mutex. @return file_space_t pointer, NULL if space not found */ UNIV_INLINE fil_space_t* @@ -1600,52 +1541,12 @@ fil_space_get_space( case FIL_TYPE_TEMPORARY: case FIL_TYPE_TABLESPACE: case FIL_TYPE_IMPORT: - space = fil_system->read_page0(id); + space = fil_system.read_page0(id); } return(space); } -/** Returns the path from the first fil_node_t found with this space ID. -The caller is responsible for freeing the memory allocated here for the -value returned. -@param[in] id Tablespace ID -@return own: A copy of fil_node_t::path, NULL if space ID is zero -or not found. */ -char* -fil_space_get_first_path( - ulint id) -{ - fil_space_t* space; - fil_node_t* node; - char* path; - - ut_ad(fil_system); - ut_a(id); - - if (!fil_mutex_enter_and_prepare_for_io(id)) { -fail_exit: - mutex_exit(&fil_system->mutex); - return(NULL); - } - - space = fil_space_get_space(id); - - if (space == NULL) { - goto fail_exit; - } - - ut_ad(mutex_own(&fil_system->mutex)); - - node = UT_LIST_GET_FIRST(space->chain); - - path = mem_strdup(node->name); - - mutex_exit(&fil_system->mutex); - - return(path); -} - /** Set the recovered size of a tablespace in pages. @param id tablespace ID @param size recovered size in pages */ @@ -1653,7 +1554,7 @@ UNIV_INTERN void fil_space_set_recv_size(ulint id, ulint size) { - mutex_enter(&fil_system->mutex); + mutex_enter(&fil_system.mutex); ut_ad(size); ut_ad(id < SRV_LOG_SPACE_FIRST_ID); @@ -1661,7 +1562,7 @@ fil_space_set_recv_size(ulint id, ulint size) space->recv_size = size; } - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); } /*******************************************************************//** @@ -1676,14 +1577,14 @@ fil_space_get_size( fil_space_t* space; ulint size; - ut_ad(fil_system); - mutex_enter(&fil_system->mutex); + ut_ad(fil_system.is_initialised()); + mutex_enter(&fil_system.mutex); space = fil_space_get_space(id); size = space ? space->size : 0; - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); return(size); } @@ -1700,83 +1601,72 @@ fil_space_get_flags( fil_space_t* space; ulint flags; - ut_ad(fil_system); + ut_ad(fil_system.is_initialised()); - mutex_enter(&fil_system->mutex); + mutex_enter(&fil_system.mutex); space = fil_space_get_space(id); if (space == NULL) { - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); return(ULINT_UNDEFINED); } flags = space->flags; - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); return(flags); } -/** Open each fil_node_t of a named fil_space_t if not already open. 
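The name-based fil_space_open()/fil_space_close() pair removed here is replaced by the fil_space_t::open() and fil_space_t::close() members defined just below, which per their comments are only invoked on fil_system.temp_space (or during mariabackup operations). A minimal caller sketch, not part of the patch, assuming only those declarations:

	if (!fil_system.temp_space->open()) {	// opens every fil_node_t in the chain
		// handle the failure to open the temporary tablespace files
	}
	// ... use the temporary tablespace ...
	fil_system.temp_space->close();		// closes any node that is still open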
-@param[in] name Tablespace name -@return true if all nodes are open */ -bool -fil_space_open( - const char* name) +/** Open each file. Only invoked on fil_system.temp_space. +@return whether all files were opened */ +bool fil_space_t::open() { - ut_ad(fil_system != NULL); + ut_ad(fil_system.is_initialised()); - mutex_enter(&fil_system->mutex); + mutex_enter(&fil_system.mutex); + ut_ad(this == fil_system.temp_space + || srv_operation == SRV_OPERATION_BACKUP + || srv_operation == SRV_OPERATION_RESTORE + || srv_operation == SRV_OPERATION_RESTORE_DELTA); - fil_space_t* space = fil_space_get_by_name(name); - fil_node_t* node; - - for (node = UT_LIST_GET_FIRST(space->chain); + for (fil_node_t* node = UT_LIST_GET_FIRST(chain); node != NULL; node = UT_LIST_GET_NEXT(chain, node)) { - - if (!node->is_open() - && !fil_node_open_file(node)) { - mutex_exit(&fil_system->mutex); - return(false); + if (!node->is_open() && !fil_node_open_file(node)) { + mutex_exit(&fil_system.mutex); + return false; } } - mutex_exit(&fil_system->mutex); - - return(true); + mutex_exit(&fil_system.mutex); + return true; } -/** Close each fil_node_t of a named fil_space_t if open. -@param[in] name Tablespace name */ -void -fil_space_close( - const char* name) +/** Close each file. Only invoked on fil_system.temp_space. */ +void fil_space_t::close() { - if (fil_system == NULL) { + if (!fil_system.is_initialised()) { return; } - mutex_enter(&fil_system->mutex); - - fil_space_t* space = fil_space_get_by_name(name); - if (space == NULL) { - mutex_exit(&fil_system->mutex); - return; - } + mutex_enter(&fil_system.mutex); + ut_ad(this == fil_system.temp_space + || srv_operation == SRV_OPERATION_BACKUP + || srv_operation == SRV_OPERATION_RESTORE + || srv_operation == SRV_OPERATION_RESTORE_DELTA); - for (fil_node_t* node = UT_LIST_GET_FIRST(space->chain); + for (fil_node_t* node = UT_LIST_GET_FIRST(chain); node != NULL; node = UT_LIST_GET_NEXT(chain, node)) { - if (node->is_open()) { - fil_node_close_file(node); + node->close(); } } - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); } /** Returns the page size of the space and whether it is compressed or not. @@ -1801,35 +1691,48 @@ fil_space_get_page_size( return(page_size_t(flags)); } -/****************************************************************//** -Initializes the tablespace memory cache. 
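fil_init(hash_size, max_n_open) becomes fil_system_t::create(hash_size) below; the max_n_open member disappears and callers now consult srv_max_n_open_files directly. A minimal sketch of the startup/shutdown pairing implied by this hunk; the call sites are hypothetical, the real ones are outside this file and not shown in this diff:

	fil_system.create(hash_size);	// was: fil_init(hash_size, max_n_open)
	ut_ad(fil_system.is_initialised());
	// ... normal server operation ...
	fil_system.close();		// frees the spaces hash and mutex, calls fil_space_crypt_cleanup()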
*/ -void -fil_init( -/*=====*/ - ulint hash_size, /*!< in: hash table size */ - ulint max_n_open) /*!< in: max number of open files */ +void fil_system_t::create(ulint hash_size) { - ut_a(fil_system == NULL); - - ut_a(hash_size > 0); - ut_a(max_n_open > 0); + ut_ad(this == &fil_system); + ut_ad(!is_initialised()); + ut_ad(!(srv_page_size % FSP_EXTENT_SIZE)); + ut_ad(srv_page_size); + ut_ad(!spaces); - fil_system = new fil_system_t(); + m_initialised = true; - mutex_create(LATCH_ID_FIL_SYSTEM, &fil_system->mutex); + compile_time_assert(!(UNIV_PAGE_SIZE_MAX % FSP_EXTENT_SIZE_MAX)); + compile_time_assert(!(UNIV_PAGE_SIZE_MIN % FSP_EXTENT_SIZE_MIN)); - fil_system->spaces = hash_create(hash_size); - fil_system->name_hash = hash_create(hash_size); + ut_ad(hash_size > 0); - UT_LIST_INIT(fil_system->LRU, &fil_node_t::LRU); - UT_LIST_INIT(fil_system->space_list, &fil_space_t::space_list); - UT_LIST_INIT(fil_system->named_spaces, &fil_space_t::named_spaces); + mutex_create(LATCH_ID_FIL_SYSTEM, &mutex); - fil_system->max_n_open = max_n_open; + spaces = hash_create(hash_size); fil_space_crypt_init(); } +void fil_system_t::close() +{ + ut_ad(this == &fil_system); + ut_a(!UT_LIST_GET_LEN(LRU)); + ut_a(unflushed_spaces.empty()); + ut_a(!UT_LIST_GET_LEN(space_list)); + ut_ad(!sys_space); + ut_ad(!temp_space); + + if (is_initialised()) { + m_initialised = false; + hash_table_free(spaces); + spaces = NULL; + mutex_free(&mutex); + fil_space_crypt_cleanup(); + } + + ut_ad(!spaces); +} + /*******************************************************************//** Opens all log files and system tablespace data files. They stay open until the database server shutdown. This should be called at a server startup after the @@ -1842,9 +1745,9 @@ fil_open_log_and_system_tablespace_files(void) { fil_space_t* space; - mutex_enter(&fil_system->mutex); + mutex_enter(&fil_system.mutex); - for (space = UT_LIST_GET_FIRST(fil_system->space_list); + for (space = UT_LIST_GET_FIRST(fil_system.space_list); space != NULL; space = UT_LIST_GET_NEXT(space_list, space)) { @@ -1870,7 +1773,7 @@ fil_open_log_and_system_tablespace_files(void) } } - if (fil_system->max_n_open < 10 + fil_system->n_open) { + if (srv_max_n_open_files < 10 + fil_system.n_open) { ib::warn() << "You must raise the value of" " innodb_open_files in my.cnf!" @@ -1882,15 +1785,15 @@ fil_open_log_and_system_tablespace_files(void) " some .ibd files if the" " file-per-table storage model is used." " Current open files " - << fil_system->n_open + << fil_system.n_open << ", max allowed open files " - << fil_system->max_n_open + << srv_max_n_open_files << "."; } } } - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); } /*******************************************************************//** @@ -1903,13 +1806,14 @@ fil_close_all_files(void) fil_space_t* space; /* At shutdown, we should not have any files in this list. 
*/ + ut_ad(fil_system.is_initialised()); ut_ad(srv_fast_shutdown == 2 || !srv_was_started - || UT_LIST_GET_LEN(fil_system->named_spaces) == 0); + || UT_LIST_GET_LEN(fil_system.named_spaces) == 0); - mutex_enter(&fil_system->mutex); + mutex_enter(&fil_system.mutex); - for (space = UT_LIST_GET_FIRST(fil_system->space_list); + for (space = UT_LIST_GET_FIRST(fil_system.space_list); space != NULL; ) { fil_node_t* node; fil_space_t* prev_space = space; @@ -1919,7 +1823,7 @@ fil_close_all_files(void) node = UT_LIST_GET_NEXT(chain, node)) { if (node->is_open()) { - fil_node_close_file(node); + node->close(); } } @@ -1928,11 +1832,11 @@ fil_close_all_files(void) fil_space_free_low(prev_space); } - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); ut_ad(srv_fast_shutdown == 2 || !srv_was_started - || UT_LIST_GET_LEN(fil_system->named_spaces) == 0); + || UT_LIST_GET_LEN(fil_system.named_spaces) == 0); } /*******************************************************************//** @@ -1945,9 +1849,9 @@ fil_close_log_files( { fil_space_t* space; - mutex_enter(&fil_system->mutex); + mutex_enter(&fil_system.mutex); - space = UT_LIST_GET_FIRST(fil_system->space_list); + space = UT_LIST_GET_FIRST(fil_system.space_list); while (space != NULL) { fil_node_t* node; @@ -1958,7 +1862,7 @@ fil_close_log_files( continue; } - /* Log files are not in the fil_system->named_spaces list. */ + /* Log files are not in the fil_system.named_spaces list. */ ut_ad(space->max_lsn == 0); for (node = UT_LIST_GET_FIRST(space->chain); @@ -1966,7 +1870,7 @@ fil_close_log_files( node = UT_LIST_GET_NEXT(chain, node)) { if (node->is_open()) { - fil_node_close_file(node); + node->close(); } } @@ -1978,7 +1882,11 @@ fil_close_log_files( } } - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); + + if (free) { + log_sys.log.close(); + } } /*******************************************************************//** @@ -1993,14 +1901,14 @@ fil_set_max_space_id_if_bigger( ib::fatal() << "Max tablespace id is too high, " << max_id; } - mutex_enter(&fil_system->mutex); + mutex_enter(&fil_system.mutex); - if (fil_system->max_assigned_id < max_id) { + if (fil_system.max_assigned_id < max_id) { - fil_system->max_assigned_id = max_id; + fil_system.max_assigned_id = max_id; } - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); } /** Write the flushed LSN to the page header of the first page in the @@ -2015,18 +1923,18 @@ fil_write_flushed_lsn( byte* buf; dberr_t err = DB_TABLESPACE_NOT_FOUND; - buf1 = static_cast<byte*>(ut_malloc_nokey(2 * UNIV_PAGE_SIZE)); - buf = static_cast<byte*>(ut_align(buf1, UNIV_PAGE_SIZE)); + buf1 = static_cast<byte*>(ut_malloc_nokey(2U << srv_page_size_shift)); + buf = static_cast<byte*>(ut_align(buf1, srv_page_size)); const page_id_t page_id(TRX_SYS_SPACE, 0); - err = fil_read(page_id, univ_page_size, 0, univ_page_size.physical(), + err = fil_read(page_id, univ_page_size, 0, srv_page_size, buf); if (err == DB_SUCCESS) { mach_write_to_8(buf + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, lsn); err = fil_write(page_id, univ_page_size, 0, - univ_page_size.physical(), buf); + srv_page_size, buf); fil_flush_file_spaces(FIL_TYPE_TABLESPACE); } @@ -2047,7 +1955,7 @@ fil_space_acquire_low(ulint id, bool silent) { fil_space_t* space; - mutex_enter(&fil_system->mutex); + mutex_enter(&fil_system.mutex); space = fil_space_get_by_id(id); @@ -2056,29 +1964,15 @@ fil_space_acquire_low(ulint id, bool silent) ib::warn() << "Trying to access missing" " tablespace " << id; } - } else if (space->is_stopping()) 
{ + } else if (!space->acquire()) { space = NULL; - } else { - space->n_pending_ops++; } - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); return(space); } -/** Release a tablespace acquired with fil_space_acquire(). -@param[in,out] space tablespace to release */ -void -fil_space_release(fil_space_t* space) -{ - mutex_enter(&fil_system->mutex); - ut_ad(space->magic_n == FIL_SPACE_MAGIC_N); - ut_ad(space->n_pending_ops > 0); - space->n_pending_ops--; - mutex_exit(&fil_system->mutex); -} - /** Acquire a tablespace for reading or writing a block, when it could be dropped concurrently. @param[in] id tablespace ID @@ -2087,31 +1981,19 @@ when it could be dropped concurrently. fil_space_t* fil_space_acquire_for_io(ulint id) { - mutex_enter(&fil_system->mutex); + mutex_enter(&fil_system.mutex); fil_space_t* space = fil_space_get_by_id(id); if (space) { - space->n_pending_ios++; + space->acquire_for_io(); } - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); return(space); } -/** Release a tablespace acquired with fil_space_acquire_for_io(). -@param[in,out] space tablespace to release */ -void -fil_space_release_for_io(fil_space_t* space) -{ - mutex_enter(&fil_system->mutex); - ut_ad(space->magic_n == FIL_SPACE_MAGIC_N); - ut_ad(space->n_pending_ios > 0); - space->n_pending_ios--; - mutex_exit(&fil_system->mutex); -} - /********************************************************//** Creates the database directory for a table if it does not exist yet. */ void @@ -2127,12 +2009,13 @@ fil_create_directory_for_tablename( len = strlen(fil_path_to_mysql_datadir); namend = strchr(name, '/'); ut_a(namend); - path = static_cast<char*>(ut_malloc_nokey(len + (namend - name) + 2)); + path = static_cast<char*>( + ut_malloc_nokey(len + ulint(namend - name) + 2)); memcpy(path, fil_path_to_mysql_datadir, len); path[len] = '/'; - memcpy(path + len + 1, name, namend - name); - path[len + (namend - name) + 1] = 0; + memcpy(path + len + 1, name, ulint(namend - name)); + path[len + ulint(namend - name) + 1] = 0; os_normalize_path(path); @@ -2250,7 +2133,7 @@ fil_name_write_rename_low( @param[in] space_id tablespace id @param[in] old_name tablespace file name @param[in] new_name tablespace file name after renaming */ -void +static void fil_name_write_rename( ulint space_id, const char* old_name, @@ -2295,242 +2178,6 @@ fil_name_write( fil_name_write(space->id, first_page_no, file->name, mtr); } -/********************************************************//** -Recreates table indexes by applying -TRUNCATE log record during recovery. -@return DB_SUCCESS or error code */ -dberr_t -fil_recreate_table( -/*===============*/ - ulint space_id, /*!< in: space id */ - ulint format_flags, /*!< in: page format */ - ulint flags, /*!< in: tablespace flags */ - const char* name, /*!< in: table name */ - truncate_t& truncate) /*!< in: The information of - TRUNCATE log record */ -{ - dberr_t err = DB_SUCCESS; - bool found; - const page_size_t page_size(fil_space_get_page_size(space_id, - &found)); - - if (!found) { - ib::info() << "Missing .ibd file for table '" << name - << "' with tablespace " << space_id; - return(DB_ERROR); - } - - ut_ad(!truncate_t::s_fix_up_active); - truncate_t::s_fix_up_active = true; - - /* Step-1: Scan for active indexes from REDO logs and drop - all the indexes using low level function that take root_page_no - and space-id. */ - truncate.drop_indexes(space_id); - - /* Step-2: Scan for active indexes and re-create them. 
*/ - err = truncate.create_indexes( - name, space_id, page_size, flags, format_flags); - if (err != DB_SUCCESS) { - ib::info() << "Failed to create indexes for the table '" - << name << "' with tablespace " << space_id - << " while fixing up truncate action"; - return(err); - } - - truncate_t::s_fix_up_active = false; - - return(err); -} - -/********************************************************//** -Recreates the tablespace and table indexes by applying -TRUNCATE log record during recovery. -@return DB_SUCCESS or error code */ -dberr_t -fil_recreate_tablespace( -/*====================*/ - ulint space_id, /*!< in: space id */ - ulint format_flags, /*!< in: page format */ - ulint flags, /*!< in: tablespace flags */ - const char* name, /*!< in: table name */ - truncate_t& truncate, /*!< in: The information of - TRUNCATE log record */ - lsn_t recv_lsn) /*!< in: the end LSN of - the log record */ -{ - dberr_t err = DB_SUCCESS; - mtr_t mtr; - - ut_ad(!truncate_t::s_fix_up_active); - truncate_t::s_fix_up_active = true; - - /* Step-1: Invalidate buffer pool pages belonging to the tablespace - to re-create. */ - buf_LRU_flush_or_remove_pages(space_id, NULL); - - /* Remove all insert buffer entries for the tablespace */ - ibuf_delete_for_discarded_space(space_id); - - /* Step-2: truncate tablespace (reset the size back to original or - default size) of tablespace. */ - err = truncate.truncate( - space_id, truncate.get_dir_path(), name, flags, true); - - if (err != DB_SUCCESS) { - - ib::info() << "Cannot access .ibd file for table '" - << name << "' with tablespace " << space_id - << " while truncating"; - return(DB_ERROR); - } - - bool found; - const page_size_t& page_size = - fil_space_get_page_size(space_id, &found); - - if (!found) { - ib::info() << "Missing .ibd file for table '" << name - << "' with tablespace " << space_id; - return(DB_ERROR); - } - - /* Step-3: Initialize Header. */ - if (page_size.is_compressed()) { - byte* buf; - page_t* page; - - buf = static_cast<byte*>(ut_zalloc_nokey(3 * UNIV_PAGE_SIZE)); - - /* Align the memory for file i/o */ - page = static_cast<byte*>(ut_align(buf, UNIV_PAGE_SIZE)); - - flags |= FSP_FLAGS_PAGE_SSIZE(); - - fsp_header_init_fields(page, space_id, flags); - - mach_write_to_4( - page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, space_id); - - page_zip_des_t page_zip; - page_zip_set_size(&page_zip, page_size.physical()); - page_zip.data = page + UNIV_PAGE_SIZE; - -#ifdef UNIV_DEBUG - page_zip.m_start = -#endif /* UNIV_DEBUG */ - page_zip.m_end = page_zip.m_nonempty = page_zip.n_blobs = 0; - buf_flush_init_for_writing(NULL, page, &page_zip, 0); - - err = fil_write(page_id_t(space_id, 0), page_size, 0, - page_size.physical(), page_zip.data); - - ut_free(buf); - - if (err != DB_SUCCESS) { - ib::info() << "Failed to clean header of the" - " table '" << name << "' with tablespace " - << space_id; - return(err); - } - } - - mtr_start(&mtr); - /* Don't log the operation while fixing up table truncate operation - as crash at this level can still be sustained with recovery restarting - from last checkpoint. */ - mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO); - - /* Initialize the first extent descriptor page and - the second bitmap page for the new tablespace. */ - fsp_header_init(space_id, FIL_IBD_FILE_INITIAL_SIZE, &mtr); - mtr_commit(&mtr); - - /* Step-4: Re-Create Indexes to newly re-created tablespace. - This operation will restore tablespace back to what it was - when it was created during CREATE TABLE. 
*/ - err = truncate.create_indexes( - name, space_id, page_size, flags, format_flags); - if (err != DB_SUCCESS) { - return(err); - } - - /* Step-5: Write new created pages into ibd file handle and - flush it to disk for the tablespace, in case i/o-handler thread - deletes the bitmap page from buffer. */ - mtr_start(&mtr); - - mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO); - - mutex_enter(&fil_system->mutex); - - fil_space_t* space = fil_space_get_by_id(space_id); - - mutex_exit(&fil_system->mutex); - - fil_node_t* node = UT_LIST_GET_FIRST(space->chain); - - for (ulint page_no = 0; page_no < node->size; ++page_no) { - - const page_id_t cur_page_id(space_id, page_no); - - buf_block_t* block = buf_page_get(cur_page_id, page_size, - RW_X_LATCH, &mtr); - - byte* page = buf_block_get_frame(block); - - if (!FSP_FLAGS_GET_ZIP_SSIZE(flags)) { - ut_ad(!page_size.is_compressed()); - - buf_flush_init_for_writing( - block, page, NULL, recv_lsn); - - err = fil_write(cur_page_id, page_size, 0, - page_size.physical(), page); - } else { - ut_ad(page_size.is_compressed()); - - /* We don't want to rewrite empty pages. */ - - if (fil_page_get_type(page) != 0) { - page_zip_des_t* page_zip = - buf_block_get_page_zip(block); - - buf_flush_init_for_writing( - block, page, page_zip, recv_lsn); - - err = fil_write(cur_page_id, page_size, 0, - page_size.physical(), - page_zip->data); - } else { -#ifdef UNIV_DEBUG - const byte* data = block->page.zip.data; - - /* Make sure that the page is really empty */ - for (ulint i = 0; - i < page_size.physical(); - ++i) { - - ut_a(data[i] == 0); - } -#endif /* UNIV_DEBUG */ - } - } - - if (err != DB_SUCCESS) { - ib::info() << "Cannot write page " << page_no - << " into a .ibd file for table '" - << name << "' with tablespace " << space_id; - } - } - - mtr_commit(&mtr); - - truncate_t::s_fix_up_active = false; - - return(err); -} - /** Replay a file rename operation if possible. @param[in] space_id tablespace identifier @param[in] first_page_no first page number in the file @@ -2573,9 +2220,9 @@ fil_op_replay_rename( ut_a(namend != NULL); char* dir = static_cast<char*>( - ut_malloc_nokey(namend - new_name + 1)); + ut_malloc_nokey(ulint(namend - new_name) + 1)); - memcpy(dir, new_name, namend - new_name); + memcpy(dir, new_name, ulint(namend - new_name)); dir[namend - new_name] = '\0'; bool success = os_file_create_directory(dir, false); @@ -2584,14 +2231,14 @@ fil_op_replay_rename( ulint dirlen = 0; if (const char* dirend = strrchr(dir, OS_PATH_SEPARATOR)) { - dirlen = dirend - dir + 1; + dirlen = ulint(dirend - dir) + 1; } ut_free(dir); /* New path must not exist. */ dberr_t err = fil_rename_tablespace_check( - space_id, name, new_name, false); + name, new_name, false); if (err != DB_SUCCESS) { ib::error() << " Cannot replay file rename." 
" Remove either file and try again."; @@ -2603,7 +2250,7 @@ fil_op_replay_rename( strlen(new_name + dirlen) - 4 /* remove ".ibd" */); - ut_ad(new_table[namend - new_name - dirlen] + ut_ad(new_table[ulint(namend - new_name) - dirlen] == OS_PATH_SEPARATOR); #if OS_PATH_SEPARATOR != '/' new_table[namend - new_name - dirlen] = '/'; @@ -2633,15 +2280,16 @@ static ulint fil_check_pending_ops(const fil_space_t* space, ulint count) { - ut_ad(mutex_own(&fil_system->mutex)); + ut_ad(mutex_own(&fil_system.mutex)); - if (space == NULL) { + if (!space) { return 0; } - if (ulint n_pending_ops = space->n_pending_ops) { + if (uint32_t n_pending_ops = space->referenced()) { - if (count > 5000) { + /* Give a warning every 10 second, starting after 1 second */ + if ((count % 500) == 50) { ib::warn() << "Trying to close/delete/truncate" " tablespace '" << space->name << "' but there are " << n_pending_ops @@ -2666,8 +2314,8 @@ fil_check_pending_io( fil_node_t** node, /*!< out: Node in space list */ ulint count) /*!< in: number of attempts so far */ { - ut_ad(mutex_own(&fil_system->mutex)); - ut_a(space->n_pending_ops == 0); + ut_ad(mutex_own(&fil_system.mutex)); + ut_ad(!space->referenced()); switch (operation) { case FIL_OPERATION_DELETE: @@ -2723,19 +2371,17 @@ fil_check_pending_operations( *space = 0; - mutex_enter(&fil_system->mutex); + mutex_enter(&fil_system.mutex); fil_space_t* sp = fil_space_get_by_id(id); if (sp) { - sp->stop_new_ops = true; - if (sp->crypt_data) { - sp->n_pending_ops++; - mutex_exit(&fil_system->mutex); + if (sp->crypt_data && sp->acquire()) { + mutex_exit(&fil_system.mutex); fil_space_crypt_close_tablespace(sp); - mutex_enter(&fil_system->mutex); - ut_ad(sp->n_pending_ops > 0); - sp->n_pending_ops--; + mutex_enter(&fil_system.mutex); + sp->release(); } + sp->set_stopping(true); } /* Check for pending operations. */ @@ -2745,13 +2391,13 @@ fil_check_pending_operations( count = fil_check_pending_ops(sp, count); - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); if (count > 0) { os_thread_sleep(20000); } - mutex_enter(&fil_system->mutex); + mutex_enter(&fil_system.mutex); } while (count > 0); /* Check for pending IO. */ @@ -2760,7 +2406,7 @@ fil_check_pending_operations( sp = fil_space_get_by_id(id); if (sp == NULL) { - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); return(DB_TABLESPACE_NOT_FOUND); } @@ -2772,14 +2418,14 @@ fil_check_pending_operations( *path = mem_strdup(node->name); } - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); if (count == 0) { break; } os_thread_sleep(20000); - mutex_enter(&fil_system->mutex); + mutex_enter(&fil_system.mutex); } ut_ad(sp); @@ -2824,7 +2470,7 @@ fil_close_tablespace( fil_flush() from being applied to this tablespace. */ { - FlushObserver observer(id, trx, NULL); + FlushObserver observer(space, trx, NULL); buf_LRU_flush_or_remove_pages(id, &observer); } @@ -2866,14 +2512,11 @@ bool fil_table_accessible(const dict_table_t* table) return(false); } - if (fil_space_t* space = fil_space_acquire(table->space)) { - bool accessible = !space->is_stopping(); - fil_space_release(space); - ut_ad(accessible || dict_table_is_file_per_table(table)); - return(accessible); - } else { - return(false); - } + mutex_enter(&fil_system.mutex); + bool accessible = table->space && !table->space->is_stopping(); + mutex_exit(&fil_system.mutex); + ut_ad(accessible || dict_table_is_file_per_table(table)); + return accessible; } /** Delete a tablespace and associated .ibd file. 
@@ -2954,25 +2597,25 @@ dberr_t fil_delete_tablespace(ulint id, bool if_exists) RemoteDatafile::delete_link_file(space->name); } - mutex_enter(&fil_system->mutex); + mutex_enter(&fil_system.mutex); /* Double check the sanity of pending ops after reacquiring the fil_system::mutex. */ if (const fil_space_t* s = fil_space_get_by_id(id)) { ut_a(s == space); - ut_a(space->n_pending_ops == 0); + ut_a(!space->referenced()); ut_a(UT_LIST_GET_LEN(space->chain) == 1); fil_node_t* node = UT_LIST_GET_FIRST(space->chain); ut_a(node->n_pending == 0); fil_space_detach(space); - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); log_mutex_enter(); if (space->max_lsn != 0) { ut_d(space->max_lsn = 0); - UT_LIST_REMOVE(fil_system->named_spaces, space); + UT_LIST_REMOVE(fil_system.named_spaces, space); } log_mutex_exit(); @@ -2988,7 +2631,7 @@ dberr_t fil_delete_tablespace(ulint id, bool if_exists) err = DB_IO_ERROR; } } else { - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); err = DB_TABLESPACE_NOT_FOUND; } @@ -3026,235 +2669,6 @@ void fil_truncate_log(fil_space_t* space, ulint size, mtr_t* mtr) NULL, space->flags & ~FSP_FLAGS_MEM_MASK, mtr); } -/** Truncate the tablespace to needed size. -@param[in] space_id id of tablespace to truncate -@param[in] size_in_pages truncate size. -@return true if truncate was successful. */ -bool -fil_truncate_tablespace( - ulint space_id, - ulint size_in_pages) -{ - /* Step-1: Prepare tablespace for truncate. This involves - stopping all the new operations + IO on that tablespace - and ensuring that related pages are flushed to disk. */ - if (fil_prepare_for_truncate(space_id) != DB_SUCCESS) { - return(false); - } - - /* Step-2: Invalidate buffer pool pages belonging to the tablespace - to re-create. Remove all insert buffer entries for the tablespace */ - buf_LRU_flush_or_remove_pages(space_id, NULL); - - /* Step-3: Truncate the tablespace and accordingly update - the fil_space_t handler that is used to access this tablespace. */ - mutex_enter(&fil_system->mutex); - fil_space_t* space = fil_space_get_by_id(space_id); - - /* The following code must change when InnoDB supports - multiple datafiles per tablespace. */ - ut_a(UT_LIST_GET_LEN(space->chain) == 1); - - fil_node_t* node = UT_LIST_GET_FIRST(space->chain); - - ut_ad(node->is_open()); - - space->size = node->size = size_in_pages; - - bool success = os_file_truncate(node->name, node->handle, 0); - if (success) { - - os_offset_t size = os_offset_t(size_in_pages) * UNIV_PAGE_SIZE; - - success = os_file_set_size( - node->name, node->handle, size, - FSP_FLAGS_HAS_PAGE_COMPRESSION(space->flags)); - - if (success) { - space->stop_new_ops = false; - space->is_being_truncated = false; - } - } - - mutex_exit(&fil_system->mutex); - - return(success); -} - -/*******************************************************************//** -Prepare for truncating a single-table tablespace. 
-1) Check pending operations on a tablespace; -2) Remove all insert buffer entries for the tablespace; -@return DB_SUCCESS or error */ -dberr_t -fil_prepare_for_truncate( -/*=====================*/ - ulint id) /*!< in: space id */ -{ - char* path = 0; - fil_space_t* space = 0; - - ut_a(!is_system_tablespace(id)); - - dberr_t err = fil_check_pending_operations( - id, FIL_OPERATION_TRUNCATE, &space, &path); - - ut_free(path); - - if (err == DB_TABLESPACE_NOT_FOUND) { - ib::error() << "Cannot truncate tablespace " << id - << " because it is not found in the tablespace" - " memory cache."; - } - - return(err); -} - -/** Reinitialize the original tablespace header with the same space id -for single tablespace -@param[in] table table belongs to tablespace -@param[in] size size in blocks -@param[in] trx Transaction covering truncate */ -void -fil_reinit_space_header_for_table( - dict_table_t* table, - ulint size, - trx_t* trx) -{ - ulint id = table->space; - - ut_a(!is_system_tablespace(id)); - - /* Invalidate in the buffer pool all pages belonging - to the tablespace. The buffer pool scan may take long - time to complete, therefore we release dict_sys->mutex - and the dict operation lock during the scan and aquire - it again after the buffer pool scan.*/ - - /* Release the lock on the indexes too. So that - they won't violate the latch ordering. */ - dict_table_x_unlock_indexes(table); - row_mysql_unlock_data_dictionary(trx); - - DEBUG_SYNC_C("buffer_pool_scan"); - buf_LRU_flush_or_remove_pages(id, NULL); - - row_mysql_lock_data_dictionary(trx); - - dict_table_x_lock_indexes(table); - - /* Remove all insert buffer entries for the tablespace */ - ibuf_delete_for_discarded_space(id); - - mutex_enter(&fil_system->mutex); - - fil_space_t* space = fil_space_get_by_id(id); - - /* The following code must change when InnoDB supports - multiple datafiles per tablespace. */ - ut_a(UT_LIST_GET_LEN(space->chain) == 1); - - fil_node_t* node = UT_LIST_GET_FIRST(space->chain); - - space->size = node->size = size; - - mutex_exit(&fil_system->mutex); - - mtr_t mtr; - - mtr_start(&mtr); - mtr.set_named_space(id); - - fsp_header_init(id, size, &mtr); - - mtr_commit(&mtr); -} - -#ifdef UNIV_DEBUG -/** Increase redo skipped count for a tablespace. -@param[in] id space id */ -void -fil_space_inc_redo_skipped_count( - ulint id) -{ - fil_space_t* space; - - mutex_enter(&fil_system->mutex); - - space = fil_space_get_by_id(id); - - ut_a(space != NULL); - - space->redo_skipped_count++; - - mutex_exit(&fil_system->mutex); -} - -/** Decrease redo skipped count for a tablespace. -@param[in] id space id */ -void -fil_space_dec_redo_skipped_count( - ulint id) -{ - fil_space_t* space; - - mutex_enter(&fil_system->mutex); - - space = fil_space_get_by_id(id); - - ut_a(space != NULL); - ut_a(space->redo_skipped_count > 0); - - space->redo_skipped_count--; - - mutex_exit(&fil_system->mutex); -} -#endif /* UNIV_DEBUG */ - -/*******************************************************************//** -Discards a single-table tablespace. The tablespace must be cached in the -memory cache. Discarding is like deleting a tablespace, but - - 1. We do not drop the table from the data dictionary; - - 2. We remove all insert buffer entries for the tablespace immediately; - in DROP TABLE they are only removed gradually in the background; - - 3. Free all the pages in use by the tablespace. 
-@return DB_SUCCESS or error */ -dberr_t -fil_discard_tablespace( -/*===================*/ - ulint id) /*!< in: space id */ -{ - dberr_t err; - - switch (err = fil_delete_tablespace(id)) { - case DB_SUCCESS: - break; - - case DB_IO_ERROR: - ib::warn() << "While deleting tablespace " << id - << " in DISCARD TABLESPACE. File rename/delete" - " failed: " << err; - break; - - case DB_TABLESPACE_NOT_FOUND: - ib::warn() << "Cannot delete tablespace " << id - << " in DISCARD TABLESPACE: " << err; - break; - - default: - ut_error; - } - - /* Remove all insert buffer entries for the tablespace */ - - ibuf_delete_for_discarded_space(id); - - return(err); -} - /*******************************************************************//** Allocates and builds a file name from a path, a table or tablespace name and a suffix. The string must be freed by caller with ut_free(). @@ -3361,15 +2775,13 @@ fil_make_filepath( /** Test if a tablespace file can be renamed to a new filepath by checking if that the old filepath exists and the new filepath does not exist. -@param[in] space_id tablespace id @param[in] old_path old filepath @param[in] new_path new filepath @param[in] is_discarded whether the tablespace is discarded @param[in] replace_new whether to ignore the existence of new_path @return innodb error code */ -dberr_t +static dberr_t fil_rename_tablespace_check( - ulint space_id, const char* old_path, const char* new_path, bool is_discarded, @@ -3383,8 +2795,7 @@ fil_rename_tablespace_check( && !exists) { ib::error() << "Cannot rename '" << old_path << "' to '" << new_path - << "' for space ID " << space_id - << " because the source file" + << "' because the source file" << " does not exist."; return(DB_TABLESPACE_NOT_FOUND); } @@ -3397,8 +2808,7 @@ fil_rename_tablespace_check( if (!replace_new) { ib::error() << "Cannot rename '" << old_path << "' to '" << new_path - << "' for space ID " << space_id - << " because the target file exists." + << "' because the target file exists." " Remove the target file and try again."; return(DB_TABLESPACE_EXISTS); } @@ -3410,8 +2820,8 @@ fil_rename_tablespace_check( a possibly existing tablespace that is associated with the new tablespace file. */ retry: - mutex_enter(&fil_system->mutex); - for (fil_space_t* space = UT_LIST_GET_FIRST(fil_system->space_list); + mutex_enter(&fil_system.mutex); + for (fil_space_t* space = UT_LIST_GET_FIRST(fil_system.space_list); space; space = UT_LIST_GET_NEXT(space_list, space)) { ulint id = space->id; if (id && id < SRV_LOG_SPACE_FIRST_ID @@ -3420,7 +2830,7 @@ retry: UT_LIST_GET_FIRST(space->chain)->name)) { ib::info() << "TRUNCATE rollback: " << id << "," << new_path; - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); dberr_t err = fil_delete_tablespace(id); if (err != DB_SUCCESS) { return err; @@ -3428,12 +2838,31 @@ retry: goto retry; } } - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); fil_delete_file(new_path); return(DB_SUCCESS); } +dberr_t fil_space_t::rename(const char* name, const char* path, bool log, + bool replace) +{ + ut_ad(UT_LIST_GET_LEN(chain) == 1); + ut_ad(!is_system_tablespace(id)); + + if (log) { + dberr_t err = fil_rename_tablespace_check( + chain.start->name, path, false, replace); + if (err != DB_SUCCESS) { + return(err); + } + fil_name_write_rename(id, chain.start->name, path); + } + + return fil_rename_tablespace(id, chain.start->name, name, path) + ? DB_SUCCESS : DB_ERROR; +} + /** Rename a single-table tablespace. The tablespace must exist in the memory cache. 
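The new fil_space_t::rename() defined above combines fil_rename_tablespace_check(), the rename record written by fil_name_write_rename() and the now-static fil_rename_tablespace() behind one call. A caller sketch using only the signature introduced here; new_name and new_path are placeholders, and DB_SUCCESS is not zero in InnoDB, so the result is compared explicitly:

	dberr_t	err = space->rename(new_name, new_path,
				    true,	// log: write the rename record to the redo log first
				    false);	// replace: do not overwrite an existing target file
	if (err != DB_SUCCESS) {
		// DB_TABLESPACE_NOT_FOUND, DB_TABLESPACE_EXISTS or DB_ERROR
	}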
@param[in] id tablespace identifier @@ -3443,7 +2872,7 @@ databasename/tablename format @param[in] new_path_in new file name, or NULL if it is located in the normal data directory @return true if success */ -bool +static bool fil_rename_tablespace( ulint id, const char* old_path, @@ -3456,7 +2885,7 @@ fil_rename_tablespace( ut_ad(strchr(new_name, '/') != NULL); - mutex_enter(&fil_system->mutex); + mutex_enter(&fil_system.mutex); space = fil_space_get_by_id(id); @@ -3465,30 +2894,17 @@ fil_rename_tablespace( << " in the tablespace memory cache, though the file '" << old_path << "' in a rename operation should have that id."; -func_exit: - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); return(false); } - if (space != fil_space_get_by_name(space->name)) { - ib::error() << "Cannot find " << space->name - << " in tablespace memory cache"; - goto func_exit; - } - - if (fil_space_get_by_name(new_name)) { - ib::error() << new_name - << " is already in tablespace memory cache"; - goto func_exit; - } - /* The following code must change when InnoDB supports multiple datafiles per tablespace. */ ut_a(UT_LIST_GET_LEN(space->chain) == 1); node = UT_LIST_GET_FIRST(space->chain); - space->n_pending_ops++; + ut_a(space->acquire()); - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); char* new_file_name = new_path_in == NULL ? fil_make_filepath(NULL, new_name, IBD, false) @@ -3496,8 +2912,6 @@ func_exit: char* old_file_name = node->name; char* new_space_name = mem_strdup(new_name); char* old_space_name = space->name; - ulint old_fold = ut_fold_string(old_space_name); - ulint new_fold = ut_fold_string(new_space_name); ut_ad(strchr(old_file_name, OS_PATH_SEPARATOR) != NULL); ut_ad(strchr(new_file_name, OS_PATH_SEPARATOR) != NULL); @@ -3506,15 +2920,11 @@ func_exit: log_mutex_enter(); } - /* log_sys->mutex is above fil_system->mutex in the latching order */ + /* log_sys.mutex is above fil_system.mutex in the latching order */ ut_ad(log_mutex_own()); - mutex_enter(&fil_system->mutex); - ut_ad(space->n_pending_ops); - space->n_pending_ops--; + mutex_enter(&fil_system.mutex); + space->release(); ut_ad(space->name == old_space_name); - /* We already checked these. */ - ut_ad(space == fil_space_get_by_name(old_space_name)); - ut_ad(!fil_space_get_by_name(new_space_name)); ut_ad(node->name == old_file_name); bool success; DBUG_EXECUTE_IF("fil_rename_tablespace_failure_2", @@ -3538,11 +2948,7 @@ skip_second_rename: ut_ad(space->name == old_space_name); if (success) { - HASH_DELETE(fil_space_t, name_hash, fil_system->name_hash, - old_fold, space); space->name = new_space_name; - HASH_INSERT(fil_space_t, name_hash, fil_system->name_hash, - new_fold, space); } else { /* Because nothing was renamed, we must free the new names, not the old ones. */ @@ -3550,7 +2956,7 @@ skip_second_rename: old_space_name = new_space_name; } - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); ut_free(old_file_name); ut_free(old_space_name); @@ -3563,12 +2969,14 @@ skip_second_rename: @param[in] name Tablespace name in dbname/tablename format. @param[in] path Path and filename of the datafile to create. 
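fil_ibd_create() now returns the created tablespace and reports failures through the dberr_t* out parameter added to the doc comment and signature just below, instead of returning a dberr_t. A minimal caller-side sketch, not part of the patch; name, path, flags, mode and key_id stand for the caller's values from the parameter list:

	dberr_t		err;
	fil_space_t*	space = fil_ibd_create(
		space_id, name, path, flags, FIL_IBD_FILE_INITIAL_SIZE,
		mode, key_id, &err);
	if (space == NULL) {
		// err holds DB_TABLESPACE_EXISTS, DB_OUT_OF_FILE_SPACE, DB_ERROR, ...
	}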
@param[in] flags Tablespace flags -@param[in] size Initial size of the tablespace file in - pages, must be >= FIL_IBD_FILE_INITIAL_SIZE +@param[in] size Initial size of the tablespace file in pages, +must be >= FIL_IBD_FILE_INITIAL_SIZE @param[in] mode MariaDB encryption mode @param[in] key_id MariaDB encryption key_id -@return DB_SUCCESS or error code */ -dberr_t +@param[out] err DB_SUCCESS or error code +@return the created tablespace +@retval NULL on error */ +fil_space_t* fil_ibd_create( ulint space_id, const char* name, @@ -3576,10 +2984,10 @@ fil_ibd_create( ulint flags, ulint size, fil_encryption_t mode, - uint32_t key_id) + uint32_t key_id, + dberr_t* err) { pfs_os_file_t file; - dberr_t err; byte* buf2; byte* page; bool success; @@ -3595,9 +3003,9 @@ fil_ibd_create( /* Create the subdirectories in the path, if they are not there already. */ - err = os_file_create_subdirs_if_needed(path); - if (err != DB_SUCCESS) { - return(err); + *err = os_file_create_subdirs_if_needed(path); + if (*err != DB_SUCCESS) { + return NULL; } ulint type; @@ -3621,26 +3029,24 @@ fil_ibd_create( if (!success) { /* The following call will print an error message */ - ulint error = os_file_get_last_error(true); - - ib::error() << "Cannot create file '" << path << "'"; - - if (error == OS_FILE_ALREADY_EXISTS) { + switch (os_file_get_last_error(true)) { + case OS_FILE_ALREADY_EXISTS: ib::info() << "The file '" << path << "'" " already exists though the" " corresponding table did not exist" " in the InnoDB data dictionary." " You can resolve the problem by removing" " the file."; - - return(DB_TABLESPACE_EXISTS); - } - - if (error == OS_FILE_DISK_FULL) { - return(DB_OUT_OF_FILE_SPACE); + *err = DB_TABLESPACE_EXISTS; + break; + case OS_FILE_DISK_FULL: + *err = DB_OUT_OF_FILE_SPACE; + break; + default: + *err = DB_ERROR; } - - return(DB_ERROR); + ib::error() << "Cannot create file '" << path << "'"; + return NULL; } const bool is_compressed = FSP_FLAGS_HAS_PAGE_COMPRESSION(flags); @@ -3651,14 +3057,14 @@ fil_ibd_create( } #endif - success = os_file_set_size( + if (!os_file_set_size( path, file, - os_offset_t(size) << UNIV_PAGE_SIZE_SHIFT, is_compressed); - - if (!success) { + os_offset_t(size) << srv_page_size_shift, is_compressed)) { + *err = DB_OUT_OF_FILE_SPACE; +err_exit: os_file_close(file); os_file_delete(innodb_data_file_key, path); - return(DB_OUT_OF_FILE_SPACE); + return NULL; } bool punch_hole = os_is_sparse_file_supported(file); @@ -3674,11 +3080,11 @@ fil_ibd_create( with zeros from the call of os_file_set_size(), until a buffer pool flush would write to it. 
*/ - buf2 = static_cast<byte*>(ut_malloc_nokey(3 * UNIV_PAGE_SIZE)); + buf2 = static_cast<byte*>(ut_malloc_nokey(3U << srv_page_size_shift)); /* Align the memory for file i/o if we might have O_DIRECT set */ - page = static_cast<byte*>(ut_align(buf2, UNIV_PAGE_SIZE)); + page = static_cast<byte*>(ut_align(buf2, srv_page_size)); - memset(page, '\0', UNIV_PAGE_SIZE); + memset(page, '\0', srv_page_size); flags |= FSP_FLAGS_PAGE_SSIZE(); fsp_header_init_fields(page, space_id, flags); @@ -3704,12 +3110,12 @@ fil_ibd_create( buf_flush_init_for_writing(NULL, page, NULL, 0); - err = os_file_write( + *err = os_file_write( request, path, file, page, 0, page_size.physical()); } else { page_zip_des_t page_zip; page_zip_set_size(&page_zip, page_size.physical()); - page_zip.data = page + UNIV_PAGE_SIZE; + page_zip.data = page + srv_page_size; #ifdef UNIV_DEBUG page_zip.m_start = #endif /* UNIV_DEBUG */ @@ -3718,59 +3124,45 @@ fil_ibd_create( buf_flush_init_for_writing(NULL, page, &page_zip, 0); - err = os_file_write( + *err = os_file_write( request, path, file, page_zip.data, 0, page_size.physical()); } ut_free(buf2); - if (err != DB_SUCCESS) { - + if (*err != DB_SUCCESS) { ib::error() << "Could not write the first page to" << " tablespace '" << path << "'"; - - os_file_close(file); - os_file_delete(innodb_data_file_key, path); - - return(DB_ERROR); + goto err_exit; } - success = os_file_flush(file); - - if (!success) { + if (!os_file_flush(file)) { ib::error() << "File flush of tablespace '" << path << "' failed"; - os_file_close(file); - os_file_delete(innodb_data_file_key, path); - return(DB_ERROR); + *err = DB_ERROR; + goto err_exit; } if (has_data_dir) { /* Make the ISL file if the IBD file is not in the default location. */ - err = RemoteDatafile::create_link_file(name, path); - if (err != DB_SUCCESS) { - os_file_close(file); - os_file_delete(innodb_data_file_key, path); - return(err); + *err = RemoteDatafile::create_link_file(name, path); + if (*err != DB_SUCCESS) { + goto err_exit; } } space = fil_space_create(name, space_id, flags, FIL_TYPE_TABLESPACE, crypt_data, mode); if (!space) { - if (crypt_data) { - free(crypt_data); - } - - err = DB_ERROR; + free(crypt_data); + *err = DB_ERROR; } else { - mtr_t mtr; fil_node_t* file = space->add(path, OS_FILE_CLOSED, size, false, true); - + mtr_t mtr; mtr.start(); fil_op_write_log( MLOG_FILE_CREATE2, space_id, 0, file->name, @@ -3781,12 +3173,12 @@ fil_ibd_create( file->block_size = block_size; space->punch_hole = punch_hole; - err = DB_SUCCESS; + *err = DB_SUCCESS; } os_file_close(file); - if (err != DB_SUCCESS) { + if (*err != DB_SUCCESS) { if (has_data_dir) { RemoteDatafile::delete_link_file(name); } @@ -3794,7 +3186,7 @@ fil_ibd_create( os_file_delete(innodb_data_file_key, path); } - return(err); + return space; } /** Try to open a single-table tablespace and optionally check that the @@ -3825,18 +3217,44 @@ statement to update the dictionary tables if they are incorrect. 
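fil_ibd_open() gets the same treatment: it now returns fil_space_t* (NULL on failure), takes the table name as a table_name_t reference and reports the reason through an optional dberr_t* out parameter, as the signature change just below shows. A minimal caller sketch, not part of the patch; the dict_table_t pointer is a placeholder:

	dberr_t		err;
	fil_space_t*	space = fil_ibd_open(
		true, false, FIL_TYPE_TABLESPACE, id, flags,
		table->name, NULL, &err);	// validate, but do not fix the dictionary
	if (space == NULL) {
		// err is DB_TABLESPACE_EXISTS, DB_CORRUPTION or DB_ERROR
	}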
@param[in] space_name tablespace name of the datafile If file-per-table, it is the table name in the databasename/tablename format @param[in] path_in expected filepath, usually read from dictionary -@return DB_SUCCESS or error code */ -dberr_t +@param[out] err DB_SUCCESS or error code +@return tablespace +@retval NULL if the tablespace could not be opened */ +fil_space_t* fil_ibd_open( - bool validate, - bool fix_dict, - fil_type_t purpose, - ulint id, - ulint flags, - const char* space_name, - const char* path_in) + bool validate, + bool fix_dict, + fil_type_t purpose, + ulint id, + ulint flags, + const table_name_t& tablename, + const char* path_in, + dberr_t* err) { - dberr_t err = DB_SUCCESS; + mutex_enter(&fil_system.mutex); + if (fil_space_t* space = fil_space_get_by_id(id)) { + if (strcmp(space->name, tablename.m_name)) { + table_name_t space_name; + space_name.m_name = space->name; + ib::error() + << "Trying to open table " << tablename + << " with id " << id + << ", conflicting with " << space_name; + space = NULL; + if (err) *err = DB_TABLESPACE_EXISTS; + } else if (err) *err = DB_SUCCESS; + + mutex_exit(&fil_system.mutex); + + if (space && validate && !srv_read_only_mode) { + fsp_flags_try_adjust(space, + flags & ~FSP_FLAGS_MEM_MASK); + } + + return space; + } + mutex_exit(&fil_system.mutex); + bool dict_filepath_same_as_default = false; bool link_file_found = false; bool link_file_is_bad = false; @@ -3856,19 +3274,21 @@ fil_ibd_open( /* Table flags can be ULINT_UNDEFINED if dict_tf_to_fsp_flags_failure is set. */ if (flags == ULINT_UNDEFINED) { - return(DB_CORRUPTION); +corrupted: + if (err) *err = DB_CORRUPTION; + return NULL; } ut_ad(fsp_flags_is_valid(flags & ~FSP_FLAGS_MEM_MASK, id)); - df_default.init(space_name, flags); - df_dict.init(space_name, flags); - df_remote.init(space_name, flags); + df_default.init(tablename.m_name, flags); + df_dict.init(tablename.m_name, flags); + df_remote.init(tablename.m_name, flags); /* Discover the correct file by looking in three possible locations while avoiding unecessary effort. */ /* We will always look for an ibd in the default location. */ - df_default.make_filepath(NULL, space_name, IBD); + df_default.make_filepath(NULL, tablename.m_name, IBD); /* Look for a filepath embedded in an ISL where the default file would be. */ @@ -3952,8 +3372,8 @@ fil_ibd_open( if (valid_tablespaces_found == 0) { os_file_get_last_error(true); ib::error() << "Could not find a valid tablespace file for `" - << space_name << "`. " << TROUBLESHOOT_DATADICT_MSG; - return(DB_CORRUPTION); + << tablename << "`. " << TROUBLESHOOT_DATADICT_MSG; + goto corrupted; } if (!validate) { goto skip_validate; @@ -3962,7 +3382,7 @@ fil_ibd_open( /* Do not open any tablespaces if more than one tablespace with the correct space ID and flags were found. */ if (tablespaces_found > 1) { - ib::error() << "A tablespace for `" << space_name + ib::error() << "A tablespace for `" << tablename << "` has been found in multiple places;"; if (df_default.is_open()) { @@ -3993,7 +3413,7 @@ fil_ibd_open( any bad tablespaces. */ if (valid_tablespaces_found > 1 || srv_force_recovery > 0) { ib::error() << "Will not open tablespace `" - << space_name << "`"; + << tablename << "`"; /* If the file is not open it cannot be valid. 
*/ ut_ad(df_default.is_open() || !df_default.is_valid()); @@ -4005,10 +3425,11 @@ fil_ibd_open( if (df_default.is_open() != df_default.is_valid() || df_dict.is_open() != df_dict.is_valid() || df_remote.is_open() != df_remote.is_valid()) { - return(DB_CORRUPTION); + goto corrupted; } error: - return(DB_ERROR); + if (err) *err = DB_ERROR; + return NULL; } /* There is only one valid tablespace found and we did @@ -4062,7 +3483,8 @@ error: ut_ad(!dict_filepath_same_as_default); dict_update_filepath(id, df_default.filepath()); if (link_file_is_bad) { - RemoteDatafile::delete_link_file(space_name); + RemoteDatafile::delete_link_file( + tablename.m_name); } } else if (!link_file_found || link_file_is_bad) { @@ -4070,9 +3492,9 @@ error: /* Fix the link file if we got our filepath from the dictionary but a link file did not exist or it did not point to a valid file. */ - RemoteDatafile::delete_link_file(space_name); + RemoteDatafile::delete_link_file(tablename.m_name); RemoteDatafile::create_link_file( - space_name, df_dict.filepath()); + tablename.m_name, df_dict.filepath()); } } else if (df_remote.is_open()) { @@ -4083,7 +3505,8 @@ error: /* SYS_DATAFILES record for this space ID was not found. */ dict_replace_tablespace_and_filepath( - id, space_name, df_remote.filepath(), flags); + id, tablename.m_name, + df_remote.filepath(), flags); } } else if (df_default.is_open()) { @@ -4098,46 +3521,44 @@ error: || (path_in == NULL && DICT_TF_HAS_DATA_DIR(flags)) || df_remote.filepath() != NULL) { dict_replace_tablespace_and_filepath( - id, space_name, df_default.filepath(), flags); + id, tablename.m_name, df_default.filepath(), + flags); } } skip_validate: - if (err == DB_SUCCESS) { - const byte* first_page = - df_default.is_open() ? df_default.get_first_page() : - df_dict.is_open() ? df_dict.get_first_page() : - df_remote.get_first_page(); - - fil_space_crypt_t* crypt_data = first_page - ? fil_space_read_crypt_data(page_size_t(flags), - first_page) - : NULL; - - fil_space_t* space = fil_space_create( - space_name, id, flags, purpose, crypt_data); - if (!space) { - goto error; - } + const byte* first_page = + df_default.is_open() ? df_default.get_first_page() : + df_dict.is_open() ? df_dict.get_first_page() : + df_remote.get_first_page(); - /* We do not measure the size of the file, that is why - we pass the 0 below */ + fil_space_crypt_t* crypt_data = first_page + ? fil_space_read_crypt_data(page_size_t(flags), first_page) + : NULL; + + fil_space_t* space = fil_space_create( + tablename.m_name, id, flags, purpose, crypt_data); + if (!space) { + goto error; + } - space->add( - df_remote.is_open() ? df_remote.filepath() : - df_dict.is_open() ? df_dict.filepath() : - df_default.filepath(), OS_FILE_CLOSED, 0, false, true); + /* We do not measure the size of the file, that is why + we pass the 0 below */ - if (err == DB_SUCCESS && validate - && purpose != FIL_TYPE_IMPORT && !srv_read_only_mode) { - df_remote.close(); - df_dict.close(); - df_default.close(); - fsp_flags_try_adjust(id, flags & ~FSP_FLAGS_MEM_MASK); - } + space->add( + df_remote.is_open() ? df_remote.filepath() : + df_dict.is_open() ? 
df_dict.filepath() : + df_default.filepath(), OS_FILE_CLOSED, 0, false, true); + + if (validate && purpose != FIL_TYPE_IMPORT && !srv_read_only_mode) { + df_remote.close(); + df_dict.close(); + df_default.close(); + fsp_flags_try_adjust(space, flags & ~FSP_FLAGS_MEM_MASK); } - return(err); + if (err) *err = DB_SUCCESS; + return space; } /** Looks for a pre-existing fil_space_t with the given tablespace ID @@ -4158,7 +3579,7 @@ fil_space_read_name_and_filepath( *name = NULL; *filepath = NULL; - mutex_enter(&fil_system->mutex); + mutex_enter(&fil_system.mutex); fil_space_t* space = fil_space_get_by_id(space_id); @@ -4171,7 +3592,7 @@ fil_space_read_name_and_filepath( success = true; } - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); return(success); } @@ -4200,7 +3621,7 @@ fil_path_to_space_name( while (const char* t = static_cast<const char*>( memchr(tablename, OS_PATH_SEPARATOR, - end - tablename))) { + ulint(end - tablename)))) { dbname = tablename; tablename = t + 1; } @@ -4212,7 +3633,7 @@ fil_path_to_space_name( ut_ad(end - tablename > 4); ut_ad(memcmp(end - 4, DOT_IBD, 4) == 0); - char* name = mem_strdupl(dbname, end - dbname - 4); + char* name = mem_strdupl(dbname, ulint(end - dbname) - 4); ut_ad(name[tablename - dbname - 1] == OS_PATH_SEPARATOR); #if OS_PATH_SEPARATOR != '/' @@ -4352,9 +3773,9 @@ fil_ibd_load( { /* If the a space is already in the file system cache with this space ID, then there is nothing to do. */ - mutex_enter(&fil_system->mutex); + mutex_enter(&fil_system.mutex); space = fil_space_get_by_id(space_id); - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); if (space != NULL) { /* Compare the filename we are trying to open with the @@ -4420,7 +3841,8 @@ fil_ibd_load( /* Every .ibd file is created >= 4 pages in size. Smaller files cannot be OK. */ - minimum_size = FIL_IBD_FILE_INITIAL_SIZE * UNIV_PAGE_SIZE; + minimum_size = os_offset_t(FIL_IBD_FILE_INITIAL_SIZE) + << srv_page_size_shift; if (size == static_cast<os_offset_t>(-1)) { /* The following call prints an error message */ @@ -4518,33 +3940,35 @@ fil_file_readdir_next_file( /** Try to adjust FSP_SPACE_FLAGS if they differ from the expectations. (Typically when upgrading from MariaDB 10.1.0..10.1.20.) -@param[in] space_id tablespace ID +@param[in,out] space tablespace @param[in] flags desired tablespace flags */ -UNIV_INTERN -void -fsp_flags_try_adjust(ulint space_id, ulint flags) +void fsp_flags_try_adjust(fil_space_t* space, ulint flags) { ut_ad(!srv_read_only_mode); - ut_ad(fsp_flags_is_valid(flags, space_id)); - if (!fil_space_get_size(space_id)) { + ut_ad(fsp_flags_is_valid(flags, space->id)); + if (!space->size && (space->purpose != FIL_TYPE_TABLESPACE + || !fil_space_get_size(space->id))) { return; } + /* This code is executed during server startup while no + connections are allowed. We do not need to protect against + DROP TABLE by fil_space_acquire(). */ mtr_t mtr; mtr.start(); if (buf_block_t* b = buf_page_get( - page_id_t(space_id, 0), page_size_t(flags), + page_id_t(space->id, 0), page_size_t(flags), RW_X_LATCH, &mtr)) { ulint f = fsp_header_get_flags(b->frame); /* Suppress the message if only the DATA_DIR flag to differs. 
*/ if ((f ^ flags) & ~(1U << FSP_FLAGS_POS_RESERVED)) { ib::warn() - << "adjusting FSP_SPACE_FLAGS of tablespace " - << space_id - << " from " << ib::hex(f) + << "adjusting FSP_SPACE_FLAGS of file '" + << UT_LIST_GET_FIRST(space->chain)->name + << "' from " << ib::hex(f) << " to " << ib::hex(flags); } if (f != flags) { - mtr.set_named_space(space_id); + mtr.set_named_space(space); mlog_write_ulint(FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + b->frame, flags, MLOG_4BYTES, &mtr); @@ -4559,147 +3983,47 @@ startup, there may be many tablespaces which are not yet in the memory cache. @param[in] id Tablespace ID @param[in] name Tablespace name used in fil_space_create(). @param[in] table_flags table flags -@return true if a matching tablespace exists in the memory cache */ -bool +@return the tablespace +@retval NULL if no matching tablespace exists in the memory cache */ +fil_space_t* fil_space_for_table_exists_in_mem( ulint id, const char* name, ulint table_flags) { - fil_space_t* space; - const ulint expected_flags = dict_tf_to_fsp_flags(table_flags); - mutex_enter(&fil_system->mutex); - - /* Look if there is a space with the same id */ - - space = fil_space_get_by_id(id); - - /* Look if there is a space with the same name; the name is the - directory path from the datadir to the file */ + mutex_enter(&fil_system.mutex); + if (fil_space_t* space = fil_space_get_by_id(id)) { + if ((space->flags ^ expected_flags) & ~FSP_FLAGS_MEM_MASK) { + goto func_exit; + } - const bool valid = space - && !((space->flags ^ expected_flags) & ~FSP_FLAGS_MEM_MASK) - && space == fil_space_get_by_name(name); + if (strcmp(space->name, name)) { + ib::error() << "Table " << name + << " in InnoDB data dictionary" + " has tablespace id " << id + << ", but the tablespace" + " with that id has name " << space->name << "." + " Have you deleted or moved .ibd files?"; + ib::info() << TROUBLESHOOT_DATADICT_MSG; + goto func_exit; + } - if (valid) { /* Adjust the flags that are in FSP_FLAGS_MEM_MASK. FSP_SPACE_FLAGS will not be written back here. */ space->flags = expected_flags; + mutex_exit(&fil_system.mutex); + if (!srv_read_only_mode) { + fsp_flags_try_adjust(space, expected_flags + & ~FSP_FLAGS_MEM_MASK); + } + return space; } - mutex_exit(&fil_system->mutex); - - if (valid && !srv_read_only_mode) { - fsp_flags_try_adjust(id, expected_flags & ~FSP_FLAGS_MEM_MASK); - } - - return(valid); -} - -/** Return the space ID based on the tablespace name. -The tablespace must be found in the tablespace memory cache. -This call is made from external to this module, so the mutex is not owned. -@param[in] tablespace Tablespace name -@return space ID if tablespace found, ULINT_UNDEFINED if space not. */ -ulint -fil_space_get_id_by_name( - const char* tablespace) -{ - mutex_enter(&fil_system->mutex); - - /* Search for a space with the same name. */ - fil_space_t* space = fil_space_get_by_name(tablespace); - ulint id = (space == NULL) ? ULINT_UNDEFINED : space->id; - - mutex_exit(&fil_system->mutex); - - return(id); -} - -/*========== RESERVE FREE EXTENTS (for a B-tree split, for example) ===*/ - -/*******************************************************************//** -Tries to reserve free extents in a file space. 
-@return true if succeed */ -bool -fil_space_reserve_free_extents( -/*===========================*/ - ulint id, /*!< in: space id */ - ulint n_free_now, /*!< in: number of free extents now */ - ulint n_to_reserve) /*!< in: how many one wants to reserve */ -{ - fil_space_t* space; - bool success; - - ut_ad(fil_system); - - mutex_enter(&fil_system->mutex); - - space = fil_space_get_by_id(id); - - ut_a(space); - - if (space->n_reserved_extents + n_to_reserve > n_free_now) { - success = false; - } else { - space->n_reserved_extents += n_to_reserve; - success = true; - } - - mutex_exit(&fil_system->mutex); - - return(success); -} - -/*******************************************************************//** -Releases free extents in a file space. */ -void -fil_space_release_free_extents( -/*===========================*/ - ulint id, /*!< in: space id */ - ulint n_reserved) /*!< in: how many one reserved */ -{ - fil_space_t* space; - - ut_ad(fil_system); - - mutex_enter(&fil_system->mutex); - - space = fil_space_get_by_id(id); - - ut_a(space); - ut_a(space->n_reserved_extents >= n_reserved); - - space->n_reserved_extents -= n_reserved; - - mutex_exit(&fil_system->mutex); -} - -/*******************************************************************//** -Gets the number of reserved extents. If the database is silent, this number -should be zero. */ -ulint -fil_space_get_n_reserved_extents( -/*=============================*/ - ulint id) /*!< in: space id */ -{ - fil_space_t* space; - ulint n; - - ut_ad(fil_system); - mutex_enter(&fil_system->mutex); - - space = fil_space_get_by_id(id); - - ut_a(space); - - n = space->n_reserved_extents; - - mutex_exit(&fil_system->mutex); - - return(n); +func_exit: + mutex_exit(&fil_system.mutex); + return NULL; } /*============================ FILE I/O ================================*/ @@ -4717,15 +4041,14 @@ bool fil_node_prepare_for_io( /*====================*/ fil_node_t* node, /*!< in: file node */ - fil_system_t* system, /*!< in: tablespace memory cache */ fil_space_t* space) /*!< in: space */ { - ut_ad(node && system && space); - ut_ad(mutex_own(&(system->mutex))); + ut_ad(node && space); + ut_ad(mutex_own(&fil_system.mutex)); - if (system->n_open > system->max_n_open + 5) { - ib::warn() << "Open files " << system->n_open - << " exceeds the limit " << system->max_n_open; + if (fil_system.n_open > srv_max_n_open_files + 5) { + ib::warn() << "Open files " << fil_system.n_open + << " exceeds the limit " << srv_max_n_open_files; } if (!node->is_open()) { @@ -4739,10 +4062,8 @@ fil_node_prepare_for_io( if (node->n_pending == 0 && fil_space_belongs_in_lru(space)) { /* The node is in the LRU list, remove it */ - - ut_a(UT_LIST_GET_LEN(system->LRU) > 0); - - UT_LIST_REMOVE(system->LRU, node); + ut_a(UT_LIST_GET_LEN(fil_system.LRU) > 0); + UT_LIST_REMOVE(fil_system.LRU, node); } node->n_pending++; @@ -4757,7 +4078,7 @@ static void fil_node_complete_io(fil_node_t* node, const IORequest& type) { - ut_ad(mutex_own(&fil_system->mutex)); + ut_ad(mutex_own(&fil_system.mutex)); ut_a(node->n_pending > 0); --node->n_pending; @@ -4767,7 +4088,7 @@ fil_node_complete_io(fil_node_t* node, const IORequest& type) if (type.is_write()) { ut_ad(!srv_read_only_mode - || fsp_is_system_temporary(node->space->id)); + || node->space->purpose == FIL_TYPE_TEMPORARY); if (fil_buffering_disabled(node->space)) { @@ -4781,10 +4102,9 @@ fil_node_complete_io(fil_node_t* node, const IORequest& type) node->needs_flush = true; if (!node->space->is_in_unflushed_spaces) { - - 
fil_system->unflushed_spaces.push_front( - *node->space); node->space->is_in_unflushed_spaces = true; + fil_system.unflushed_spaces.push_front( + *node->space); } } } @@ -4792,7 +4112,7 @@ fil_node_complete_io(fil_node_t* node, const IORequest& type) if (node->n_pending == 0 && fil_space_belongs_in_lru(node->space)) { /* The node must be put back to the LRU list */ - UT_LIST_ADD_FIRST(fil_system->LRU, node); + UT_LIST_ADD_FIRST(fil_system.LRU, node); } } @@ -4856,15 +4176,13 @@ fil_io( ut_ad(req_type.validate()); ut_ad(len > 0); - ut_ad(byte_offset < UNIV_PAGE_SIZE); + ut_ad(byte_offset < srv_page_size); ut_ad(!page_size.is_compressed() || byte_offset == 0); - ut_ad(UNIV_PAGE_SIZE == (ulong)(1 << UNIV_PAGE_SIZE_SHIFT)); -#if (1 << UNIV_PAGE_SIZE_SHIFT_MAX) != UNIV_PAGE_SIZE_MAX -# error "(1 << UNIV_PAGE_SIZE_SHIFT_MAX) != UNIV_PAGE_SIZE_MAX" -#endif -#if (1 << UNIV_PAGE_SIZE_SHIFT_MIN) != UNIV_PAGE_SIZE_MIN -# error "(1 << UNIV_PAGE_SIZE_SHIFT_MIN) != UNIV_PAGE_SIZE_MIN" -#endif + ut_ad(srv_page_size == 1UL << srv_page_size_shift); + compile_time_assert((1U << UNIV_PAGE_SIZE_SHIFT_MAX) + == UNIV_PAGE_SIZE_MAX); + compile_time_assert((1U << UNIV_PAGE_SIZE_SHIFT_MIN) + == UNIV_PAGE_SIZE_MIN); ut_ad(fil_validate_skip()); /* ibuf bitmap pages must be read in the sync AIO mode: */ @@ -4922,10 +4240,10 @@ fil_io( if (space == NULL || (req_type.is_read() && !sync - && space->stop_new_ops + && space->is_stopping() && !space->is_being_truncated)) { - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); if (!req_type.ignore_missing() && !ignore_missing_space) { ib::error() @@ -4949,7 +4267,7 @@ fil_io( if (node == NULL) { if (req_type.ignore_missing()) { - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); return(DB_ERROR); } @@ -4973,14 +4291,13 @@ fil_io( if (space->id != TRX_SYS_SPACE && UT_LIST_GET_LEN(space->chain) == 1 && (srv_is_tablespace_truncated(space->id) - || space->is_being_truncated || srv_was_tablespace_truncated(space)) && req_type.is_read()) { /* Handle page which is outside the truncated tablespace bounds when recovering from a crash happened during a truncation */ - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); return(DB_TABLESPACE_TRUNCATED); } @@ -4991,10 +4308,10 @@ fil_io( } /* Open file if closed */ - if (!fil_node_prepare_for_io(node, fil_system, space)) { + if (!fil_node_prepare_for_io(node, space)) { if (fil_type_is_data(space->purpose) && fil_is_user_tablespace_id(space->id)) { - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); if (!req_type.ignore_missing()) { ib::error() @@ -5030,7 +4347,7 @@ fil_io( should return with DB_ERROR and let caller decide what to do. 
*/ fil_node_complete_io(node, req_type); - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); return(DB_ERROR); } @@ -5040,18 +4357,18 @@ fil_io( } /* Now we have made the changes in the data structures of fil_system */ - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); /* Calculate the low 32 bits and the high 32 bits of the file offset */ if (!page_size.is_compressed()) { offset = ((os_offset_t) cur_page_no - << UNIV_PAGE_SIZE_SHIFT) + byte_offset; + << srv_page_size_shift) + byte_offset; ut_a(node->size - cur_page_no - >= ((byte_offset + len + (UNIV_PAGE_SIZE - 1)) - / UNIV_PAGE_SIZE)); + >= ((byte_offset + len + (srv_page_size - 1)) + >> srv_page_size_shift)); } else { ulint size_shift; @@ -5105,11 +4422,11 @@ fil_io( /* The i/o operation is already completed when we return from os_aio: */ - mutex_enter(&fil_system->mutex); + mutex_enter(&fil_system.mutex); fil_node_complete_io(node, req_type); - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); ut_ad(fil_validate_skip()); } @@ -5145,14 +4462,14 @@ fil_aio_wait( srv_set_io_thread_op_info(segment, "complete io for fil node"); - mutex_enter(&fil_system->mutex); + mutex_enter(&fil_system.mutex); fil_node_complete_io(node, type); const fil_type_t purpose = node->space->purpose; const ulint space_id= node->space->id; const bool dblwr = node->space->use_doublewrite(); - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); ut_ad(fil_validate_skip()); @@ -5165,7 +4482,26 @@ fil_aio_wait( switch (purpose) { case FIL_TYPE_LOG: srv_set_io_thread_op_info(segment, "complete io for log"); - log_io_complete(static_cast<log_group_t*>(message)); + /* We use synchronous writing of the logs + and can only end up here when writing a log checkpoint! */ + ut_a(ptrdiff_t(message) == 1); + /* It was a checkpoint write */ + switch (srv_flush_t(srv_file_flush_method)) { + case SRV_O_DSYNC: + case SRV_NOSYNC: + break; + case SRV_FSYNC: + case SRV_LITTLESYNC: + case SRV_O_DIRECT: + case SRV_O_DIRECT_NO_FSYNC: +#ifdef _WIN32 + case SRV_ALL_O_DIRECT_FSYNC: +#endif + fil_flush(SRV_LOG_SPACE_FIRST_ID); + } + + DBUG_PRINT("ib_log", ("checkpoint info written")); + log_sys.complete_checkpoint(); return; case FIL_TYPE_TABLESPACE: case FIL_TYPE_TEMPORARY: @@ -5198,7 +4534,7 @@ fil_aio_wait( << ": " << err; } - fil_space_release_for_io(space); + space->release_for_io(); } return; } @@ -5215,7 +4551,7 @@ fil_flush( ulint space_id) /*!< in: file space id (this can be a group of log files or a tablespace of the database) */ { - mutex_enter(&fil_system->mutex); + mutex_enter(&fil_system.mutex); if (fil_space_t* space = fil_space_get_by_id(space_id)) { if (space->purpose != FIL_TYPE_TEMPORARY @@ -5224,7 +4560,7 @@ fil_flush( } } - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); } /** Flush a tablespace. 
@@ -5232,16 +4568,16 @@ fil_flush( void fil_flush(fil_space_t* space) { - ut_ad(space->n_pending_ios > 0); + ut_ad(space->pending_io()); ut_ad(space->purpose == FIL_TYPE_TABLESPACE || space->purpose == FIL_TYPE_IMPORT); if (!space->is_stopping()) { - mutex_enter(&fil_system->mutex); + mutex_enter(&fil_system.mutex); if (!space->is_stopping()) { fil_flush_low(space); } - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); } } @@ -5257,12 +4593,12 @@ fil_flush_file_spaces( ut_ad(purpose == FIL_TYPE_TABLESPACE || purpose == FIL_TYPE_LOG); - mutex_enter(&fil_system->mutex); + mutex_enter(&fil_system.mutex); - n_space_ids = fil_system->unflushed_spaces.size(); + n_space_ids = fil_system.unflushed_spaces.size(); if (n_space_ids == 0) { - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); return; } @@ -5272,8 +4608,8 @@ fil_flush_file_spaces( n_space_ids = 0; for (sized_ilist<fil_space_t, unflushed_spaces_tag_t>::iterator it - = fil_system->unflushed_spaces.begin(), - end = fil_system->unflushed_spaces.end(); + = fil_system.unflushed_spaces.begin(), + end = fil_system.unflushed_spaces.end(); it != end; ++it) { if (it->purpose == purpose && !it->is_stopping()) { @@ -5281,7 +4617,7 @@ fil_flush_file_spaces( } } - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); /* Flush the spaces. It will not hurt to call fil_flush() on a non-existing space id. */ @@ -5317,10 +4653,24 @@ struct Check { @return number of open file nodes */ static ulint validate(const fil_space_t* space) { - ut_ad(mutex_own(&fil_system->mutex)); + ut_ad(mutex_own(&fil_system.mutex)); Check check; ut_list_validate(space->chain, check); ut_a(space->size == check.size); + + switch (space->id) { + case TRX_SYS_SPACE: + ut_ad(fil_system.sys_space == NULL + || fil_system.sys_space == space); + break; + case SRV_TMP_SPACE_ID: + ut_ad(fil_system.temp_space == NULL + || fil_system.temp_space == space); + break; + default: + break; + } + return(check.n_open); } }; @@ -5335,19 +4685,19 @@ fil_validate(void) fil_node_t* fil_node; ulint n_open = 0; - mutex_enter(&fil_system->mutex); + mutex_enter(&fil_system.mutex); - for (fil_space_t *space = UT_LIST_GET_FIRST(fil_system->space_list); + for (fil_space_t *space = UT_LIST_GET_FIRST(fil_system.space_list); space != NULL; space = UT_LIST_GET_NEXT(space_list, space)) { n_open += Check::validate(space); } - ut_a(fil_system->n_open == n_open); + ut_a(fil_system.n_open == n_open); - ut_list_validate(fil_system->LRU); + ut_list_validate(fil_system.LRU); - for (fil_node = UT_LIST_GET_FIRST(fil_system->LRU); + for (fil_node = UT_LIST_GET_FIRST(fil_system.LRU); fil_node != 0; fil_node = UT_LIST_GET_NEXT(LRU, fil_node)) { @@ -5357,7 +4707,7 @@ fil_validate(void) ut_a(fil_space_belongs_in_lru(fil_node->space)); } - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); return(true); } @@ -5408,30 +4758,6 @@ fil_page_set_type( mach_write_to_2(page + FIL_PAGE_TYPE, type); } -/****************************************************************//** -Closes the tablespace memory cache. 
*/ -void -fil_close(void) -/*===========*/ -{ - if (fil_system) { - hash_table_free(fil_system->spaces); - - hash_table_free(fil_system->name_hash); - - ut_a(UT_LIST_GET_LEN(fil_system->LRU) == 0); - ut_a(fil_system->unflushed_spaces.size() == 0); - ut_a(UT_LIST_GET_LEN(fil_system->space_list) == 0); - - mutex_free(&fil_system->mutex); - - delete fil_system; - fil_system = NULL; - - fil_space_crypt_cleanup(); - } -} - /********************************************************************//** Delete the tablespace file and any related files like .cfg. This should not be called for temporary tables. @@ -5468,88 +4794,65 @@ fil_mtr_rename_log( const char* tmp_name, mtr_t* mtr) { - dberr_t err; - - bool old_is_file_per_table = - !is_system_tablespace(old_table->space); - - bool new_is_file_per_table = - !is_system_tablespace(new_table->space); + ut_ad(old_table->space != fil_system.temp_space); + ut_ad(new_table->space != fil_system.temp_space); + ut_ad(old_table->space->id == old_table->space_id); + ut_ad(new_table->space->id == new_table->space_id); /* If neither table is file-per-table, there will be no renaming of files. */ - if (!old_is_file_per_table && !new_is_file_per_table) { + if (!old_table->space_id && !new_table->space_id) { return(DB_SUCCESS); } - const char* old_dir = DICT_TF_HAS_DATA_DIR(old_table->flags) - ? old_table->data_dir_path - : NULL; - - char* old_path = fil_make_filepath( - old_dir, old_table->name.m_name, IBD, (old_dir != NULL)); - if (old_path == NULL) { - return(DB_OUT_OF_MEMORY); - } + const bool has_data_dir = DICT_TF_HAS_DATA_DIR(old_table->flags); - if (old_is_file_per_table) { + if (old_table->space_id) { char* tmp_path = fil_make_filepath( - old_dir, tmp_name, IBD, (old_dir != NULL)); + has_data_dir ? old_table->data_dir_path : NULL, + tmp_name, IBD, has_data_dir); if (tmp_path == NULL) { - ut_free(old_path); return(DB_OUT_OF_MEMORY); } + const char* old_path = old_table->space->chain.start->name; /* Temp filepath must not exist. */ - err = fil_rename_tablespace_check( - old_table->space, old_path, tmp_path, - dict_table_is_discarded(old_table)); + dberr_t err = fil_rename_tablespace_check( + old_path, tmp_path, !old_table->space); if (err != DB_SUCCESS) { - ut_free(old_path); ut_free(tmp_path); return(err); } fil_name_write_rename_low( - old_table->space, 0, old_path, tmp_path, mtr); + old_table->space_id, 0, old_path, tmp_path, mtr); ut_free(tmp_path); } - if (new_is_file_per_table) { - const char* new_dir = DICT_TF_HAS_DATA_DIR(new_table->flags) - ? new_table->data_dir_path - : NULL; - char* new_path = fil_make_filepath( - new_dir, new_table->name.m_name, - IBD, (new_dir != NULL)); - if (new_path == NULL) { - ut_free(old_path); - return(DB_OUT_OF_MEMORY); - } + if (new_table->space_id) { + const char* new_path = new_table->space->chain.start->name; + char* old_path = fil_make_filepath( + has_data_dir ? old_table->data_dir_path : NULL, + old_table->name.m_name, IBD, has_data_dir); /* Destination filepath must not exist unless this ALTER TABLE starts and ends with a file_per-table tablespace. 
*/ - if (!old_is_file_per_table) { - err = fil_rename_tablespace_check( - new_table->space, new_path, old_path, - dict_table_is_discarded(new_table)); + if (!old_table->space_id) { + dberr_t err = fil_rename_tablespace_check( + new_path, old_path, !new_table->space); if (err != DB_SUCCESS) { ut_free(old_path); - ut_free(new_path); return(err); } } fil_name_write_rename_low( - new_table->space, 0, new_path, old_path, mtr); - - ut_free(new_path); + new_table->space_id, 0, new_path, old_path, mtr); + ut_free(old_path); } - ut_free(old_path); - - return(DB_SUCCESS); + return DB_SUCCESS; } #ifdef UNIV_DEBUG @@ -5560,7 +4863,7 @@ void fil_space_validate_for_mtr_commit( const fil_space_t* space) { - ut_ad(!mutex_own(&fil_system->mutex)); + ut_ad(!mutex_own(&fil_system.mutex)); ut_ad(space != NULL); ut_ad(space->purpose == FIL_TYPE_TABLESPACE); ut_ad(!is_predefined_tablespace(space->id)); @@ -5575,11 +4878,11 @@ fil_space_validate_for_mtr_commit( to quiesce. This is not a problem, because ibuf_merge_or_delete_for_page() would call fil_space_acquire() before mtr_start() and - fil_space_release() after mtr_commit(). This is why + fil_space_t::release() after mtr_commit(). This is why n_pending_ops should not be zero if stop_new_ops is set. */ - ut_ad(!space->stop_new_ops + ut_ad(!space->is_stopping() || space->is_being_truncated /* fil_truncate_prepare() */ - || space->n_pending_ops > 0); + || space->referenced()); } #endif /* UNIV_DEBUG */ @@ -5605,12 +4908,12 @@ fil_names_dirty( { ut_ad(log_mutex_own()); ut_ad(recv_recovery_is_on()); - ut_ad(log_sys->lsn != 0); + ut_ad(log_sys.lsn != 0); ut_ad(space->max_lsn == 0); ut_d(fil_space_validate_for_mtr_commit(space)); - UT_LIST_ADD_LAST(fil_system->named_spaces, space); - space->max_lsn = log_sys->lsn; + UT_LIST_ADD_LAST(fil_system.named_spaces, space); + space->max_lsn = log_sys.lsn; } /** Write MLOG_FILE_NAME records when a non-predefined persistent @@ -5625,9 +4928,9 @@ fil_names_dirty_and_write( { ut_ad(log_mutex_own()); ut_d(fil_space_validate_for_mtr_commit(space)); - ut_ad(space->max_lsn == log_sys->lsn); + ut_ad(space->max_lsn == log_sys.lsn); - UT_LIST_ADD_LAST(fil_system->named_spaces, space); + UT_LIST_ADD_LAST(fil_system.named_spaces, space); fil_names_write(space, mtr); DBUG_EXECUTE_IF("fil_names_write_bogus", @@ -5662,14 +4965,14 @@ fil_names_clear( ut_ad(log_mutex_own()); - if (log_sys->append_on_checkpoint) { - mtr_write_log(log_sys->append_on_checkpoint); + if (log_sys.append_on_checkpoint) { + mtr_write_log(log_sys.append_on_checkpoint); do_write = true; } mtr.start(); - for (fil_space_t* space = UT_LIST_GET_FIRST(fil_system->named_spaces); + for (fil_space_t* space = UT_LIST_GET_FIRST(fil_system.named_spaces); space != NULL; ) { fil_space_t* next = UT_LIST_GET_NEXT(named_spaces, space); @@ -5681,7 +4984,7 @@ fil_names_clear( modified any more, subsequent checkpoints will avoid calling fil_names_write() on it. */ space->max_lsn = 0; - UT_LIST_REMOVE(fil_system->named_spaces, space); + UT_LIST_REMOVE(fil_system.named_spaces, space); } /* max_lsn is the last LSN where fil_names_dirty_and_write() @@ -5752,7 +5055,7 @@ truncate_t::truncate( return(DB_OUT_OF_MEMORY); } - mutex_enter(&fil_system->mutex); + mutex_enter(&fil_system.mutex); fil_space_t* space = fil_space_get_by_id(space_id); @@ -5775,8 +5078,8 @@ truncate_t::truncate( node->handle = os_file_create_simple_no_error_handling( innodb_data_file_key, path, OS_FILE_OPEN, OS_FILE_READ_WRITE, - fsp_is_system_temporary(space_id) - ? 
false : srv_read_only_mode, &ret); + space->purpose != FIL_TYPE_TEMPORARY + && srv_read_only_mode, &ret); if (!ret) { ib::error() << "Failed to open tablespace file " @@ -5795,7 +5098,7 @@ truncate_t::truncate( : space->size; const bool success = os_file_truncate( - path, node->handle, trunc_size * UNIV_PAGE_SIZE); + path, node->handle, trunc_size << srv_page_size_shift); if (!success) { ib::error() << "Cannot truncate file " << path @@ -5803,8 +5106,7 @@ truncate_t::truncate( err = DB_ERROR; } - space->stop_new_ops = false; - space->is_being_truncated = false; + space->set_stopping(false); /* If we opened the file in this function, close it. */ if (!already_open) { @@ -5821,7 +5123,7 @@ truncate_t::truncate( } } - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); ut_free(path); @@ -5875,17 +5177,6 @@ test_make_filepath() #endif /* UNIV_ENABLE_UNIT_TEST_MAKE_FILEPATH */ /* @} */ -/** Release the reserved free extents. -@param[in] n_reserved number of reserved extents */ -void -fil_space_t::release_free_extents(ulint n_reserved) -{ - ut_ad(rw_lock_own(&latch, RW_LOCK_X)); - - ut_a(n_reserved_extents >= n_reserved); - n_reserved_extents -= n_reserved; -} - /** Determine the block size of the data file. @param[in] space tablespace @param[in] offset page number @@ -5916,26 +5207,6 @@ fil_space_get_block_size(const fil_space_t* space, unsigned offset) return block_size; } -/*******************************************************************//** -Returns the table space by a given id, NULL if not found. */ -fil_space_t* -fil_space_found_by_id( -/*==================*/ - ulint id) /*!< in: space id */ -{ - fil_space_t* space = NULL; - mutex_enter(&fil_system->mutex); - space = fil_space_get_by_id(id); - - /* Not found if space is being deleted */ - if (space && space->stop_new_ops) { - space = NULL; - } - - mutex_exit(&fil_system->mutex); - return space; -} - /** Get should we punch hole to tablespace. @param[in] node File node diff --git a/storage/innobase/fil/fil0pagecompress.cc b/storage/innobase/fil/fil0pagecompress.cc index 4f7fa89c662..7d034a194c9 100644 --- a/storage/innobase/fil/fil0pagecompress.cc +++ b/storage/innobase/fil/fil0pagecompress.cc @@ -105,7 +105,7 @@ ulint fil_page_compress(const byte* buf, byte* out_buf, ulint level, /* If no compression level was provided to this table, use system default level */ if (comp_level == 0) { - comp_level = page_zip_level; + comp_level = int(page_zip_level); } ulint write_size = srv_page_size - header_len; @@ -264,12 +264,6 @@ success: srv_stats.page_compression_saved.add(srv_page_size - write_size); srv_stats.pages_page_compressed.inc(); - /* If we do not persistently trim rest of page, we need to write it - all */ - if (!srv_use_trim) { - memset(out_buf + write_size, 0, srv_page_size - write_size); - } - return write_size; } diff --git a/storage/innobase/fsp/fsp0file.cc b/storage/innobase/fsp/fsp0file.cc index b367ed37c2b..90ddca77549 100644 --- a/storage/innobase/fsp/fsp0file.cc +++ b/storage/innobase/fsp/fsp0file.cc @@ -297,7 +297,7 @@ Datafile::read_first_page(bool read_only_mode) /* Align the memory for a possible read from a raw device */ m_first_page = static_cast<byte*>( - ut_align(m_first_page_buf, UNIV_PAGE_SIZE)); + ut_align(m_first_page_buf, srv_page_size)); IORequest request; dberr_t err = DB_ERROR; @@ -524,7 +524,7 @@ err_exit: /* Check if the whole page is blank. 
*/ if (!m_space_id && !m_flags) { const byte* b = m_first_page; - ulint nonzero_bytes = UNIV_PAGE_SIZE; + ulint nonzero_bytes = srv_page_size; while (*b == '\0' && --nonzero_bytes != 0) { @@ -545,13 +545,13 @@ err_exit: const page_size_t page_size(m_flags); - if (univ_page_size.logical() != page_size.logical()) { - /* Page size must be univ_page_size. */ + if (srv_page_size != page_size.logical()) { + /* Logical size must be innodb_page_size. */ ib::error() << "Data file '" << m_filepath << "' uses page size " << page_size.logical() << ", but the innodb_page_size" " start-up parameter is " - << univ_page_size.logical(); + << srv_page_size; free_first_page(); return(DB_ERROR); } @@ -677,8 +677,8 @@ Datafile::find_space_id() bool noncompressed_ok = false; /* For noncompressed pages, the page size must be - equal to univ_page_size.physical(). */ - if (page_size == univ_page_size.physical()) { + equal to srv_page_size. */ + if (page_size == srv_page_size) { noncompressed_ok = !buf_page_is_corrupted( false, page, univ_page_size, NULL); } @@ -692,11 +692,11 @@ Datafile::find_space_id() assume the page is compressed if univ_page_size. logical() is equal to or less than 16k and the page_size we are checking is equal to or less than - univ_page_size.logical(). */ - if (univ_page_size.logical() <= UNIV_PAGE_SIZE_DEF - && page_size <= univ_page_size.logical()) { + srv_page_size. */ + if (srv_page_size <= UNIV_PAGE_SIZE_DEF + && page_size <= srv_page_size) { const page_size_t compr_page_size( - page_size, univ_page_size.logical(), + page_size, srv_page_size, true); compressed_ok = !buf_page_is_corrupted( @@ -820,7 +820,10 @@ open that file, and read the contents into m_filepath. dberr_t RemoteDatafile::open_link_file() { - set_link_filepath(NULL); + if (m_link_filepath == NULL) { + m_link_filepath = fil_make_filepath(NULL, name(), ISL, false); + } + m_filepath = read_link_file(m_link_filepath); return(m_filepath == NULL ? DB_CANNOT_OPEN_FILE : DB_SUCCESS); @@ -886,18 +889,6 @@ RemoteDatafile::shutdown() } } -/** Set the link filepath. Use default datadir, the base name of -the path provided without its suffix, plus DOT_ISL. -@param[in] path filepath which contains a basename to use. - If NULL, use m_name as the basename. */ -void -RemoteDatafile::set_link_filepath(const char* path) -{ - if (m_link_filepath == NULL) { - m_link_filepath = fil_make_filepath(NULL, name(), ISL, false); - } -} - /** Creates a new InnoDB Symbolic Link (ISL) file. It is always created under the 'datadir' of MySQL. The datadir is the directory of a running mysqld program. We can refer to it by simply using the path ".". @@ -1032,7 +1023,7 @@ char* RemoteDatafile::read_link_file( const char* link_filepath) { - FILE* file = fopen(link_filepath, "r+b"); + FILE* file = fopen(link_filepath, "r+b" STR_O_CLOEXEC); if (file == NULL) { return(NULL); } diff --git a/storage/innobase/fsp/fsp0fsp.cc b/storage/innobase/fsp/fsp0fsp.cc index 58aa18ac323..155aab0011d 100644 --- a/storage/innobase/fsp/fsp0fsp.cc +++ b/storage/innobase/fsp/fsp0fsp.cc @@ -171,20 +171,6 @@ fsp_get_space_header( return(header); } -#ifdef UNIV_DEBUG -/** Skip some of the sanity checks that are time consuming even in debug mode -and can affect frequent verification runs that are done to ensure stability of -the product. -@return true if check should be skipped for given space. 
*/ -bool -fsp_skip_sanity_check( - ulint space_id) -{ - return(srv_skip_temp_table_checks_debug - && fsp_is_system_temporary(space_id)); -} -#endif /* UNIV_DEBUG */ - /**********************************************************************//** Gets a descriptor bit of a page. @return TRUE if free */ @@ -599,7 +585,7 @@ void fsp_apply_init_file_page(buf_block_t* block) { page_t* page = buf_block_get_frame(block); - memset(page, 0, UNIV_PAGE_SIZE); + memset(page, 0, srv_page_size); mach_write_to_4(page + FIL_PAGE_OFFSET, block->page.id.page_no()); mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, @@ -631,7 +617,7 @@ void fil_space_t::modify_check(const mtr_t& mtr) const case MTR_LOG_NO_REDO: ut_ad(purpose == FIL_TYPE_TEMPORARY || purpose == FIL_TYPE_IMPORT - || redo_skipped_count + || my_atomic_loadlint(&redo_skipped_count) || is_being_truncated || srv_is_tablespace_truncated(id)); return; @@ -648,26 +634,6 @@ void fil_space_t::modify_check(const mtr_t& mtr) const #endif /**********************************************************************//** -Initializes the fsp system. */ -void -fsp_init(void) -/*==========*/ -{ - /* FSP_EXTENT_SIZE must be a multiple of page & zip size */ - ut_a(0 == (UNIV_PAGE_SIZE % FSP_EXTENT_SIZE)); - ut_a(UNIV_PAGE_SIZE); - -#if UNIV_PAGE_SIZE_MAX % FSP_EXTENT_SIZE_MAX -# error "UNIV_PAGE_SIZE_MAX % FSP_EXTENT_SIZE_MAX != 0" -#endif -#if UNIV_ZIP_SIZE_MIN % FSP_EXTENT_SIZE_MIN -# error "UNIV_ZIP_SIZE_MIN % FSP_EXTENT_SIZE_MIN != 0" -#endif - - /* Does nothing at the moment */ -} - -/**********************************************************************//** Writes the space id and flags to a tablespace header. The flags contain row type, physical/compressed page size, and logical/uncompressed page size of the tablespace. */ @@ -688,24 +654,16 @@ fsp_header_init_fields( } /** Initialize a tablespace header. 
-@param[in] space_id space id -@param[in] size current size in blocks -@param[in,out] mtr mini-transaction */ -void -fsp_header_init(ulint space_id, ulint size, mtr_t* mtr) +@param[in,out] space tablespace +@param[in] size current size in blocks +@param[in,out] mtr mini-transaction */ +void fsp_header_init(fil_space_t* space, ulint size, mtr_t* mtr) { - fsp_header_t* header; - buf_block_t* block; - page_t* page; - - ut_ad(mtr); - - fil_space_t* space = mtr_x_lock_space(space_id, mtr); - - const page_id_t page_id(space_id, 0); + const page_id_t page_id(space->id, 0); const page_size_t page_size(space->flags); - block = buf_page_create(page_id, page_size, mtr); + mtr_x_lock_space(space, mtr); + buf_block_t* block = buf_page_create(page_id, page_size, mtr); buf_block_dbg_add_level(block, SYNC_FSP_PAGE); space->size_in_header = size; @@ -715,40 +673,41 @@ fsp_header_init(ulint space_id, ulint size, mtr_t* mtr) /* The prior contents of the file page should be ignored */ fsp_init_file_page(space, block, mtr); - page = buf_block_get_frame(block); - mlog_write_ulint(page + FIL_PAGE_TYPE, FIL_PAGE_TYPE_FSP_HDR, + mlog_write_ulint(block->frame + FIL_PAGE_TYPE, FIL_PAGE_TYPE_FSP_HDR, MLOG_2BYTES, mtr); - header = FSP_HEADER_OFFSET + page; - - mlog_write_ulint(header + FSP_SPACE_ID, space_id, MLOG_4BYTES, mtr); - mlog_write_ulint(header + FSP_NOT_USED, 0, MLOG_4BYTES, mtr); - - mlog_write_ulint(header + FSP_SIZE, size, MLOG_4BYTES, mtr); - mlog_write_ulint(header + FSP_FREE_LIMIT, 0, MLOG_4BYTES, mtr); - mlog_write_ulint(header + FSP_SPACE_FLAGS, + mlog_write_ulint(FSP_HEADER_OFFSET + FSP_SPACE_ID + block->frame, + space->id, MLOG_4BYTES, mtr); + mlog_write_ulint(FSP_HEADER_OFFSET + FSP_NOT_USED + block->frame, 0, + MLOG_4BYTES, mtr); + mlog_write_ulint(FSP_HEADER_OFFSET + FSP_SIZE + block->frame, size, + MLOG_4BYTES, mtr); + mlog_write_ulint(FSP_HEADER_OFFSET + FSP_FREE_LIMIT + block->frame, 0, + MLOG_4BYTES, mtr); + mlog_write_ulint(FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + block->frame, space->flags & ~FSP_FLAGS_MEM_MASK, MLOG_4BYTES, mtr); - mlog_write_ulint(header + FSP_FRAG_N_USED, 0, MLOG_4BYTES, mtr); + mlog_write_ulint(FSP_HEADER_OFFSET + FSP_FRAG_N_USED + block->frame, 0, + MLOG_4BYTES, mtr); - flst_init(header + FSP_FREE, mtr); - flst_init(header + FSP_FREE_FRAG, mtr); - flst_init(header + FSP_FULL_FRAG, mtr); - flst_init(header + FSP_SEG_INODES_FULL, mtr); - flst_init(header + FSP_SEG_INODES_FREE, mtr); + flst_init(FSP_HEADER_OFFSET + FSP_FREE + block->frame, mtr); + flst_init(FSP_HEADER_OFFSET + FSP_FREE_FRAG + block->frame, mtr); + flst_init(FSP_HEADER_OFFSET + FSP_FULL_FRAG + block->frame, mtr); + flst_init(FSP_HEADER_OFFSET + FSP_SEG_INODES_FULL + block->frame, mtr); + flst_init(FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE + block->frame, mtr); - mlog_write_ull(header + FSP_SEG_ID, 1, mtr); + mlog_write_ull(FSP_HEADER_OFFSET + FSP_SEG_ID + block->frame, 1, mtr); - fsp_fill_free_list(!is_system_tablespace(space_id), - space, header, mtr); + fsp_fill_free_list(!is_system_tablespace(space->id), + space, FSP_HEADER_OFFSET + block->frame, mtr); /* Write encryption metadata to page 0 if tablespace is encrypted or encryption is disabled by table option. 
*/ if (space->crypt_data && (space->crypt_data->should_encrypt() || space->crypt_data->not_encrypted())) { - space->crypt_data->write_page0(space, page, mtr); + space->crypt_data->write_page0(space, block->frame, mtr); } } @@ -779,63 +738,6 @@ fsp_header_get_space_id( return(id); } -/**********************************************************************//** -Increases the space size field of a space. */ -void -fsp_header_inc_size( -/*================*/ - ulint space_id, /*!< in: space id */ - ulint size_inc, /*!< in: size increment in pages */ - mtr_t* mtr) /*!< in/out: mini-transaction */ -{ - fsp_header_t* header; - ulint size; - - ut_ad(mtr); - - fil_space_t* space = mtr_x_lock_space(space_id, mtr); - ut_d(space->modify_check(*mtr)); - - header = fsp_get_space_header( - space, page_size_t(space->flags), mtr); - - size = mach_read_from_4(header + FSP_SIZE); - ut_ad(size == space->size_in_header); - - size += size_inc; - - mlog_write_ulint(header + FSP_SIZE, size, MLOG_4BYTES, mtr); - space->size_in_header = size; -} - -/**********************************************************************//** -Gets the size of the system tablespace from the tablespace header. If -we do not have an auto-extending data file, this should be equal to -the size of the data files. If there is an auto-extending data file, -this can be smaller. -@return size in pages */ -ulint -fsp_header_get_tablespace_size(void) -/*================================*/ -{ - fsp_header_t* header; - ulint size; - mtr_t mtr; - - mtr_start(&mtr); - - fil_space_t* space = mtr_x_lock_space(TRX_SYS_SPACE, &mtr); - - header = fsp_get_space_header(space, univ_page_size, &mtr); - - size = mach_read_from_4(header + FSP_SIZE); - ut_ad(space->size_in_header == size); - - mtr_commit(&mtr); - - return(size); -} - /** Try to extend a single-table tablespace so that a page would fit in the data file. @param[in,out] space tablespace @@ -1415,7 +1317,7 @@ fsp_alloc_free_page( /** Frees a single page of a space. The page is marked as free and clean. @param[in,out] space tablespace -@param[in] page_id page id +@param[in] offset page number @param[in] page_size page size @param[in,out] mtr mini-transaction */ static @@ -1963,32 +1865,18 @@ fseg_get_n_frag_pages( return(count); } -/** Creates a new segment. -@param[in] space_id space_id -@param[in] byte_offset byte offset of the created segment - header on the page -@param[in] has_done_reservation TRUE if the caller has already - done the reservation for the pages - with fsp_reserve_free_externts - (at least 2 extents: one for - the inode and the other for the - segment) then there is no need to do - the check for this individual - operation -@param[in,out] mtr mini-transaction -@param[in] block block where the segment header is - placed. If it is null then new page - will be allocated and it will belong - to the created segment -@return the block where the segment header is placed, x-latched, NULL -if could not create segment because of lack of space */ +/** Create a new segment. 
+@param space tablespace +@param byte_offset byte offset of the created segment header +@param mtr mini-transaction +@param has_done_reservation whether fsp_reserve_free_extents() was invoked +@param block block where segment header is placed, + or NULL to allocate an additional page for that +@return the block where the segment header is placed, x-latched +@retval NULL if could not create segment because of lack of space */ buf_block_t* -fseg_create_general( - ulint space_id, - ulint byte_offset, - ibool has_done_reservation, - mtr_t* mtr, - buf_block_t* block) +fseg_create(fil_space_t *space, ulint byte_offset, mtr_t *mtr, + bool has_done_reservation, buf_block_t *block) { fsp_header_t* space_header; fseg_inode_t* inode; @@ -1997,13 +1885,14 @@ fseg_create_general( ulint n_reserved; ulint i; - DBUG_ENTER("fseg_create_general"); + DBUG_ENTER("fseg_create"); ut_ad(mtr); + ut_ad(byte_offset >= FIL_PAGE_DATA); ut_ad(byte_offset + FSEG_HEADER_SIZE - <= UNIV_PAGE_SIZE - FIL_PAGE_DATA_END); + <= srv_page_size - FIL_PAGE_DATA_END); - fil_space_t* space = mtr_x_lock_space(space_id, mtr); + mtr_x_lock_space(space, mtr); const page_size_t page_size(space->flags); ut_d(space->modify_check(*mtr)); @@ -2019,7 +1908,7 @@ fseg_create_general( } if (!has_done_reservation - && !fsp_reserve_free_extents(&n_reserved, space_id, 2, + && !fsp_reserve_free_extents(&n_reserved, space, 2, FSP_NORMAL, mtr)) { DBUG_RETURN(NULL); } @@ -2086,38 +1975,16 @@ fseg_create_general( page_get_page_no(page_align(inode)), MLOG_4BYTES, mtr); - mlog_write_ulint(header + FSEG_HDR_SPACE, space_id, MLOG_4BYTES, mtr); + mlog_write_ulint(header + FSEG_HDR_SPACE, space->id, MLOG_4BYTES, mtr); funct_exit: if (!has_done_reservation) { - - fil_space_release_free_extents(space_id, n_reserved); + space->release_free_extents(n_reserved); } DBUG_RETURN(block); } -/** Creates a new segment. -@param[in] space space id -@param[in] byte_offset byte offset of the created segment header - on the page -@param[in,out] mtr mini-transaction -@param[in,out] block block where segment header is placed; - If it is null then new page will be - allocated and it will belong to - the created segment -@return the block where the segment header is placed, x-latched, NULL -if could not create segment because of lack of space */ -buf_block_t* -fseg_create( - ulint space, - ulint byte_offset, - mtr_t* mtr, - buf_block_t* block) -{ - return(fseg_create_general(space, byte_offset, FALSE, mtr, block)); -} - /**********************************************************************//** Calculates the number of pages reserved by a segment, and how many pages are currently used. @@ -2602,7 +2469,7 @@ fseg_alloc_free_page_general( fil_block_check_type(*iblock, FIL_PAGE_INODE, mtr); if (!has_done_reservation - && !fsp_reserve_free_extents(&n_reserved, space_id, 2, + && !fsp_reserve_free_extents(&n_reserved, space, 2, FSP_NORMAL, mtr)) { return(NULL); } @@ -2620,7 +2487,7 @@ fseg_alloc_free_page_general( ut_ad(!has_done_reservation || block != NULL); if (!has_done_reservation) { - fil_space_release_free_extents(space_id, n_reserved); + space->release_free_extents(n_reserved); } return(block); @@ -2669,7 +2536,7 @@ fsp_reserve_free_pages( use several pages from the tablespace should call this function beforehand and reserve enough free extents so that they certainly will be able to do their operation, like a B-tree page split, fully. Reservations -must be released with function fil_space_release_free_extents! 
+must be released with function fil_space_t::release_free_extents()! The alloc_type below has the following meaning: FSP_NORMAL means an operation which will probably result in more space usage, like an @@ -2695,7 +2562,7 @@ free pages available. return true and the tablespace size is < FSP_EXTENT_SIZE pages, then this can be 0, otherwise it is n_ext -@param[in] space_id tablespace identifier +@param[in,out] space tablespace @param[in] n_ext number of extents to reserve @param[in] alloc_type page reservation type (FSP_BLOB, etc) @param[in,out] mtr the mini transaction @@ -2706,7 +2573,7 @@ free pages available. bool fsp_reserve_free_extents( ulint* n_reserved, - ulint space_id, + fil_space_t* space, ulint n_ext, fsp_reserve_t alloc_type, mtr_t* mtr, @@ -2724,7 +2591,7 @@ fsp_reserve_free_extents( ut_ad(mtr); *n_reserved = n_ext; - fil_space_t* space = mtr_x_lock_space(space_id, mtr); + mtr_x_lock_space(space, mtr); const page_size_t page_size(space->flags); space_header = fsp_get_space_header(space, page_size, mtr); @@ -2796,7 +2663,7 @@ try_again: ut_error; } - if (fil_space_reserve_free_extents(space_id, n_free, n_ext)) { + if (space->reserve_free_extents(n_free, n_ext)) { return(true); } try_to_extend: @@ -2808,69 +2675,6 @@ try_to_extend: return(false); } -/** Calculate how many KiB of new data we will be able to insert to the -tablespace without running out of space. -@param[in] space_id tablespace ID -@return available space in KiB -@retval UINTMAX_MAX if unknown */ -uintmax_t -fsp_get_available_space_in_free_extents( - ulint space_id) -{ - FilSpace space(space_id); - if (space() == NULL) { - return(UINTMAX_MAX); - } - - return(fsp_get_available_space_in_free_extents(space)); -} - -/** Calculate how many KiB of new data we will be able to insert to the -tablespace without running out of space. Start with a space object that has -been acquired by the caller who holds it for the calculation, -@param[in] space tablespace object from fil_space_acquire() -@return available space in KiB */ -uintmax_t -fsp_get_available_space_in_free_extents( - const fil_space_t* space) -{ - ut_ad(space->n_pending_ops > 0); - - ulint size_in_header = space->size_in_header; - if (size_in_header < FSP_EXTENT_SIZE) { - return(0); /* TODO: count free frag pages and - return a value based on that */ - } - - /* Below we play safe when counting free extents above the free limit: - some of them will contain extent descriptor pages, and therefore - will not be free extents */ - ut_ad(size_in_header >= space->free_limit); - ulint n_free_up = - (size_in_header - space->free_limit) / FSP_EXTENT_SIZE; - - page_size_t page_size(space->flags); - if (n_free_up > 0) { - n_free_up--; - n_free_up -= n_free_up / (page_size.physical() - / FSP_EXTENT_SIZE); - } - - /* We reserve 1 extent + 0.5 % of the space size to undo logs - and 1 extent + 0.5 % to cleaning operations; NOTE: this source - code is duplicated in the function above! */ - - ulint reserve = 2 + ((size_in_header / FSP_EXTENT_SIZE) * 2) / 200; - ulint n_free = space->free_len + n_free_up; - - if (reserve > n_free) { - return(0); - } - - return(static_cast<uintmax_t>(n_free - reserve) - * FSP_EXTENT_SIZE * (page_size.physical() / 1024)); -} - /********************************************************************//** Marks a page used. The page must reside within the extents of the given segment. */ @@ -3039,31 +2843,34 @@ fseg_free_page_low( } } -/**********************************************************************//** -Frees a single page of a segment. 
*/ +/** Free a page in a file segment. +@param[in,out] seg_header file segment header +@param[in,out] space tablespace +@param[in] offset page number +@param[in,out] mtr mini-transaction */ void fseg_free_page( - fseg_header_t* seg_header, /*!< in: segment header */ - ulint space_id,/*!< in: space id */ - ulint page, /*!< in: page offset */ - mtr_t* mtr) /*!< in/out: mini-transaction */ + fseg_header_t* seg_header, + fil_space_t* space, + ulint offset, + mtr_t* mtr) { DBUG_ENTER("fseg_free_page"); fseg_inode_t* seg_inode; buf_block_t* iblock; - fil_space_t* space = mtr_x_lock_space(space_id, mtr); + mtr_x_lock_space(space, mtr); const page_size_t page_size(space->flags); - DBUG_LOG("fseg_free_page", "space_id: " << space_id - << ", page_no: " << page); + DBUG_LOG("fseg_free_page", "space_id: " << space->id + << ", page_no: " << offset); - seg_inode = fseg_inode_get(seg_header, space_id, page_size, mtr, + seg_inode = fseg_inode_get(seg_header, space->id, page_size, mtr, &iblock); fil_block_check_type(*iblock, FIL_PAGE_INODE, mtr); - fseg_free_page_low(seg_inode, space, page, page_size, mtr); + fseg_free_page_low(seg_inode, space, offset, page_size, mtr); - ut_d(buf_page_set_file_page_was_freed(page_id_t(space_id, page))); + ut_d(buf_page_set_file_page_was_freed(page_id_t(space->id, offset))); DBUG_VOID_RETURN; } @@ -3081,7 +2888,7 @@ fseg_page_is_free(fil_space_t* space, unsigned page) page_no_t dpage = xdes_calc_descriptor_page(page_size, page); mtr.start(); - mtr_s_lock(&space->latch, &mtr); + mtr_s_lock_space(space, &mtr); if (page >= space->free_limit || page >= space->size_in_header) { is_free = true; diff --git a/storage/innobase/fsp/fsp0sysspace.cc b/storage/innobase/fsp/fsp0sysspace.cc index dc5a27e2f2c..451187a35d9 100644 --- a/storage/innobase/fsp/fsp0sysspace.cc +++ b/storage/innobase/fsp/fsp0sysspace.cc @@ -47,14 +47,6 @@ SysTablespace srv_tmp_space; at a time. We have to make this public because it is a config variable. */ ulong sys_tablespace_auto_extend_increment; -#ifdef UNIV_DEBUG -/** Control if extra debug checks need to be done for temporary tablespace. -Default = true that is disable such checks. -This variable is not exposed to end-user but still kept as variable for -developer to enable it during debug. */ -bool srv_skip_temp_table_checks_debug = true; -#endif /* UNIV_DEBUG */ - /** Convert a numeric string that optionally ends in G or M or K, to a number containing megabytes. @param[in] str String with a quantity in bytes @@ -358,7 +350,7 @@ SysTablespace::check_size( So we need to round the size downward to a megabyte.*/ const ulint rounded_size_pages = static_cast<ulint>( - size >> UNIV_PAGE_SIZE_SHIFT); + size >> srv_page_size_shift); /* If last file */ if (&file == &m_files.back() && m_auto_extend_last_file) { @@ -402,16 +394,16 @@ SysTablespace::set_size( /* We created the data file and now write it full of zeros */ ib::info() << "Setting file '" << file.filepath() << "' size to " - << (file.m_size >> (20 - UNIV_PAGE_SIZE_SHIFT)) << " MB." + << (file.m_size >> (20U - srv_page_size_shift)) << " MB." 
" Physically writing the file full; Please wait ..."; bool success = os_file_set_size( file.m_filepath, file.m_handle, - static_cast<os_offset_t>(file.m_size) << UNIV_PAGE_SIZE_SHIFT); + static_cast<os_offset_t>(file.m_size) << srv_page_size_shift); if (success) { ib::info() << "File '" << file.filepath() << "' size is now " - << (file.m_size >> (20 - UNIV_PAGE_SIZE_SHIFT)) + << (file.m_size >> (20U - srv_page_size_shift)) << " MB."; } else { ib::error() << "Could not set the file size of '" @@ -771,11 +763,10 @@ SysTablespace::check_file_spec( } if (!m_auto_extend_last_file - && get_sum_of_sizes() < min_expected_size / UNIV_PAGE_SIZE) { - + && get_sum_of_sizes() + < (min_expected_size >> srv_page_size_shift)) { ib::error() << "Tablespace size must be at least " - << min_expected_size / (1024 * 1024) << " MB"; - + << (min_expected_size >> 20) << " MB"; return(DB_ERROR); } @@ -909,15 +900,30 @@ SysTablespace::open_or_create( it->close(); it->m_exists = true; - if (it == begin) { - /* First data file. */ + if (it != begin) { + } else if (is_temp) { + ut_ad(!fil_system.temp_space); + ut_ad(space_id() == SRV_TMP_SPACE_ID); + space = fil_space_create( + name(), SRV_TMP_SPACE_ID, flags(), + FIL_TYPE_TEMPORARY, NULL); - /* Create the tablespace entry for the multi-file - tablespace in the tablespace manager. */ + mutex_enter(&fil_system.mutex); + fil_system.temp_space = space; + mutex_exit(&fil_system.mutex); + if (!space) { + return DB_ERROR; + } + } else { + ut_ad(!fil_system.sys_space); + ut_ad(space_id() == TRX_SYS_SPACE); space = fil_space_create( - name(), space_id(), flags(), is_temp - ? FIL_TYPE_TEMPORARY : FIL_TYPE_TABLESPACE, - NULL); + name(), TRX_SYS_SPACE, flags(), + FIL_TYPE_TABLESPACE, NULL); + + mutex_enter(&fil_system.mutex); + fil_system.sys_space = space; + mutex_exit(&fil_system.mutex); if (!space) { return DB_ERROR; } @@ -940,16 +946,16 @@ SysTablespace::open_or_create( /** Normalize the file size, convert from megabytes to number of pages. */ void -SysTablespace::normalize() +SysTablespace::normalize_size() { files_t::iterator end = m_files.end(); for (files_t::iterator it = m_files.begin(); it != end; ++it) { - it->m_size *= (1024 * 1024) / UNIV_PAGE_SIZE; + it->m_size <<= (20U - srv_page_size_shift); } - m_last_file_size_max *= (1024 * 1024) / UNIV_PAGE_SIZE; + m_last_file_size_max <<= (20U - srv_page_size_shift); } diff --git a/storage/innobase/fts/fts0config.cc b/storage/innobase/fts/fts0config.cc index ed4340d818b..9f8e07285bf 100644 --- a/storage/innobase/fts/fts0config.cc +++ b/storage/innobase/fts/fts0config.cc @@ -417,7 +417,7 @@ fts_config_set_ulint( ut_a(FTS_MAX_INT_LEN < FTS_MAX_CONFIG_VALUE_LEN); - value.f_len = snprintf( + value.f_len = (ulint) snprintf( (char*) value.f_str, FTS_MAX_INT_LEN, ULINTPF, int_value); error = fts_config_set_value(trx, fts_table, name, &value); diff --git a/storage/innobase/fts/fts0fts.cc b/storage/innobase/fts/fts0fts.cc index a74b5083128..ae60a6a9875 100644 --- a/storage/innobase/fts/fts0fts.cc +++ b/storage/innobase/fts/fts0fts.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 2011, 2018, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2016, 2020, MariaDB Corporation. +Copyright (c) 2016, 2021, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -469,7 +469,7 @@ cleanup: return ret; } - trx_t* trx = trx_allocate_for_background(); + trx_t* trx = trx_create(); trx->op_info = "Load user stopword table into FTS cache"; if (!stopword_info->cached_stopword) { @@ -489,7 +489,6 @@ cleanup: stopword_info); que_t* graph = fts_parse_sql_no_dict_lock( - NULL, info, "DECLARE FUNCTION my_func;\n" "DECLARE CURSOR c IS" @@ -532,7 +531,7 @@ cleanup: } que_graph_free(graph); - trx_free_for_background(trx); + trx->free(); ret = true; goto cleanup; } @@ -1729,9 +1728,9 @@ fts_create_in_mem_aux_table( ulint n_cols) { dict_table_t* new_table = dict_mem_table_create( - aux_table_name, table->space, n_cols, 0, table->flags, - table->space == TRX_SYS_SPACE - ? 0 : table->space == SRV_TMP_SPACE_ID + aux_table_name, NULL, n_cols, 0, table->flags, + table->space_id == TRX_SYS_SPACE + ? 0 : table->space_id == SRV_TMP_SPACE_ID ? DICT_TF2_TEMPORARY : DICT_TF2_USE_FILE_PER_TABLE); if (DICT_TF_HAS_DATA_DIR(table->flags)) { @@ -1748,7 +1747,7 @@ fts_create_in_mem_aux_table( @param[in] table Table that has FTS Index @param[in] fts_table_name FTS AUX table name @param[in] fts_suffix FTS AUX table suffix -@param[in] heap heap +@param[in,out] heap temporary memory heap @return table object if created, else NULL */ static dict_table_t* @@ -1785,14 +1784,15 @@ fts_create_one_common_table( FTS_CONFIG_TABLE_VALUE_COL_LEN); } + dict_table_add_system_columns(new_table, heap); error = row_create_table_for_mysql(new_table, trx, FIL_ENCRYPTION_DEFAULT, FIL_DEFAULT_ENCRYPTION_KEY); if (error == DB_SUCCESS) { dict_index_t* index = dict_mem_index_create( - fts_table_name, "FTS_COMMON_TABLE_IND", - new_table->space, DICT_UNIQUE|DICT_CLUSTERED, 1); + new_table, "FTS_COMMON_TABLE_IND", + DICT_UNIQUE|DICT_CLUSTERED, 1); if (!is_config) { dict_mem_index_add_field(index, "doc_id", 0); @@ -1837,16 +1837,14 @@ CREATE TABLE $FTS_PREFIX_BEING_DELETED_CACHE CREATE TABLE $FTS_PREFIX_CONFIG (key CHAR(50), value CHAR(200), UNIQUE CLUSTERED INDEX on key) @param[in,out] trx transaction -@param[in] table table with FTS index -@param[in] name table name normalized +@param[in,out] table table with FTS index @param[in] skip_doc_id_index Skip index on doc id @return DB_SUCCESS if succeed */ dberr_t fts_create_common_tables( - trx_t* trx, - const dict_table_t* table, - const char* name, - bool skip_doc_id_index) + trx_t* trx, + dict_table_t* table, + bool skip_doc_id_index) { dberr_t error; que_t* graph; @@ -1881,13 +1879,15 @@ fts_create_common_tables( dict_table_t* common_table = fts_create_one_common_table( trx, table, full_name[i], fts_table.suffix, heap); - if (common_table == NULL) { + if (common_table == NULL) { error = DB_ERROR; goto func_exit; } else { common_tables.push_back(common_table); } + mem_heap_empty(heap); + DBUG_EXECUTE_IF("ib_fts_aux_table_error", /* Return error after creating FTS_AUX_CONFIG table. 
*/ if (i == 4) { @@ -1906,7 +1906,7 @@ fts_create_common_tables( pars_info_bind_id(info, true, "config_table", fts_name); graph = fts_parse_sql_no_dict_lock( - &fts_table, info, fts_config_table_insert_values_sql); + info, fts_config_table_insert_values_sql); error = fts_eval_sql(trx, graph); @@ -1917,9 +1917,8 @@ fts_create_common_tables( goto func_exit; } - index = dict_mem_index_create( - name, FTS_DOC_ID_INDEX_NAME, table->space, - DICT_UNIQUE, 1); + index = dict_mem_index_create(table, FTS_DOC_ID_INDEX_NAME, + DICT_UNIQUE, 1); dict_mem_index_add_field(index, FTS_DOC_ID_COL_NAME, 0); op = trx_get_dict_operation(trx); @@ -1947,7 +1946,7 @@ func_exit: @param[in,out] trx transaction @param[in] index the index instance @param[in] fts_table fts_table structure -@param[in,out] heap memory heap +@param[in,out] heap temporary memory heap @see row_merge_create_fts_sort_index() @return DB_SUCCESS or error code */ static @@ -1980,7 +1979,7 @@ fts_create_one_index_table( ? DATA_VARCHAR : DATA_VARMYSQL, field->col->prtype, FTS_MAX_WORD_LEN_IN_CHAR - * field->col->mbmaxlen); + * unsigned(field->col->mbmaxlen)); dict_mem_table_add_col(new_table, heap, "first_doc_id", DATA_INT, DATA_NOT_NULL | DATA_UNSIGNED, @@ -2004,12 +2003,13 @@ fts_create_one_index_table( (DATA_MTYPE_MAX << 16) | DATA_UNSIGNED | DATA_NOT_NULL, FTS_INDEX_ILIST_LEN); + dict_table_add_system_columns(new_table, heap); error = row_create_table_for_mysql(new_table, trx, FIL_ENCRYPTION_DEFAULT, FIL_DEFAULT_ENCRYPTION_KEY); if (error == DB_SUCCESS) { dict_index_t* index = dict_mem_index_create( - table_name, "FTS_INDEX_TABLE_IND", new_table->space, + new_table, "FTS_INDEX_TABLE_IND", DICT_UNIQUE|DICT_CLUSTERED, 2); dict_mem_index_add_field(index, "word", 0); dict_mem_index_add_field(index, "first_doc_id", 0); @@ -2034,18 +2034,24 @@ fts_create_one_index_table( return(new_table); } -/** Create auxiliary index tables for an FTS index. -@param[in,out] trx transaction -@param[in] index the index instance -@param[in] table_name table name -@param[in] table_id the table id +/** Creates the column specific ancillary tables needed for supporting an +FTS index on the given table. row_mysql_lock_data_dictionary must have +been called before this. + +All FTS AUX Index tables have the following schema. +CREAT TABLE $FTS_PREFIX_INDEX_[1-6]( + word VARCHAR(FTS_MAX_WORD_LEN), + first_doc_id INT NOT NULL, + last_doc_id UNSIGNED NOT NULL, + doc_count UNSIGNED INT NOT NULL, + ilist VARBINARY NOT NULL, + UNIQUE CLUSTERED INDEX ON (word, first_doc_id)) +@param[in,out] trx dictionary transaction +@param[in] index fulltext index +@param[in] id table id @return DB_SUCCESS or error code */ dberr_t -fts_create_index_tables_low( - trx_t* trx, - const dict_index_t* index, - const char* table_name, - table_id_t table_id) +fts_create_index_tables(trx_t* trx, const dict_index_t* index, table_id_t id) { ulint i; fts_table_t fts_table; @@ -2054,7 +2060,7 @@ fts_create_index_tables_low( fts_table.type = FTS_INDEX_TABLE; fts_table.index_id = index->id; - fts_table.table_id = table_id; + fts_table.table_id = id; fts_table.table = index->table; /* aux_idx_tables vector is used for dropping FTS AUX INDEX @@ -2080,6 +2086,8 @@ fts_create_index_tables_low( aux_idx_tables.push_back(new_table); } + mem_heap_empty(heap); + DBUG_EXECUTE_IF("ib_fts_index_table_error", /* Return error after creating FTS_INDEX_5 aux table. 
*/ @@ -2105,42 +2113,6 @@ fts_create_index_tables_low( return(error); } -/** Creates the column specific ancillary tables needed for supporting an -FTS index on the given table. row_mysql_lock_data_dictionary must have -been called before this. - -All FTS AUX Index tables have the following schema. -CREAT TABLE $FTS_PREFIX_INDEX_[1-6]( - word VARCHAR(FTS_MAX_WORD_LEN), - first_doc_id INT NOT NULL, - last_doc_id UNSIGNED NOT NULL, - doc_count UNSIGNED INT NOT NULL, - ilist VARBINARY NOT NULL, - UNIQUE CLUSTERED INDEX ON (word, first_doc_id)) -@param[in,out] trx transaction -@param[in] index index instance -@return DB_SUCCESS or error code */ -dberr_t -fts_create_index_tables( - trx_t* trx, - const dict_index_t* index) -{ - dberr_t err; - dict_table_t* table; - - table = dict_table_get_low(index->table_name); - ut_a(table != NULL); - - err = fts_create_index_tables_low( - trx, index, table->name.m_name, table->id); - - if (err == DB_SUCCESS) { - trx_commit(trx); - } - - return(err); -} - /******************************************************************//** Calculate the new state of a row given the existing state and a new event. @return new state of row */ @@ -2282,7 +2254,7 @@ fts_trx_create( savep != NULL; savep = UT_LIST_GET_NEXT(trx_savepoints, savep)) { - fts_savepoint_take(trx, ftt, savep->name); + fts_savepoint_take(ftt, savep->name); } return(ftt); @@ -2640,7 +2612,7 @@ retry: fts_table.type = FTS_COMMON_TABLE; fts_table.table = table; - trx = trx_allocate_for_background(); + trx = trx_create(); if (srv_read_only_mode) { trx_start_internal_read_only(trx); } else { @@ -2730,7 +2702,7 @@ func_exit: } } - trx_free_for_background(trx); + trx->free(); return(error); } @@ -2767,7 +2739,7 @@ fts_update_sync_doc_id( fts_table.table = table; if (!trx) { - trx = trx_allocate_for_background(); + trx = trx_create(); trx_start_internal(trx); trx->op_info = "setting last FTS document id"; @@ -2776,7 +2748,7 @@ fts_update_sync_doc_id( info = pars_info_create(); - id_len = snprintf( + id_len = (ulint) snprintf( (char*) id, sizeof(id), FTS_DOC_ID_FORMAT, doc_id + 1); pars_info_bind_varchar_literal(info, "doc_id", id, id_len); @@ -2806,7 +2778,7 @@ fts_update_sync_doc_id( fts_sql_rollback(trx); } - trx_free_for_background(trx); + trx->free(); } return(error); @@ -2994,7 +2966,7 @@ fts_commit_table( ib_rbt_t* rows; dberr_t error = DB_SUCCESS; fts_cache_t* cache = ftt->table->fts->cache; - trx_t* trx = trx_allocate_for_background(); + trx_t* trx = trx_create(); trx_start_internal(trx); @@ -3036,7 +3008,7 @@ fts_commit_table( fts_sql_commit(trx); - trx_free_for_background(trx); + trx->free(); return(error); } @@ -3227,6 +3199,8 @@ fts_fetch_doc_from_rec( parser = get_doc->index_cache->index->parser; clust_rec = btr_pcur_get_rec(pcur); + ut_ad(!page_rec_is_comp(clust_rec) + || rec_get_status(clust_rec) == REC_STATUS_ORDINARY); num_field = dict_index_get_n_fields(index); @@ -3510,7 +3484,7 @@ fts_add_doc_by_id( dict_index_copy_types(clust_ref, clust_index, n_fields); row_build_row_ref_in_tuple( - clust_ref, rec, fts_id_index, NULL, NULL); + clust_ref, rec, fts_id_index, NULL); btr_pcur_open_with_no_init( clust_index, clust_ref, PAGE_CUR_LE, @@ -3521,7 +3495,8 @@ fts_add_doc_by_id( } - offsets = rec_get_offsets(clust_rec, clust_index, NULL, true, + offsets = rec_get_offsets(clust_rec, clust_index, NULL, + clust_index->n_core_fields, ULINT_UNDEFINED, &heap); for (ulint i = 0; i < num_idx; ++i) { @@ -3659,6 +3634,8 @@ fts_get_max_doc_id( return(0); } + ut_ad(!index->is_instant()); + dfield = 
dict_index_get_nth_field(index, 0); #if 0 /* This can fail when renaming a column to FTS_DOC_ID_COL_NAME. */ @@ -3693,8 +3670,10 @@ fts_get_max_doc_id( goto func_exit; } + ut_ad(!rec_is_metadata(rec, index)); offsets = rec_get_offsets( - rec, index, offsets, true, ULINT_UNDEFINED, &heap); + rec, index, offsets, index->n_core_fields, + ULINT_UNDEFINED, &heap); data = rec_get_nth_field(rec, offsets, 0, &len); @@ -3730,7 +3709,7 @@ fts_doc_fetch_by_doc_id( const char* select_str; doc_id_t write_doc_id; dict_index_t* index; - trx_t* trx = trx_allocate_for_background(); + trx_t* trx = trx_create(); que_t* graph; trx->op_info = "fetching indexed FTS document"; @@ -3751,7 +3730,7 @@ fts_doc_fetch_by_doc_id( pars_info_bind_function(info, "my_func", callback, arg); select_str = fts_get_select_columns_str(index, info, info->heap); - pars_info_bind_id(info, TRUE, "table_name", index->table_name); + pars_info_bind_id(info, TRUE, "table_name", index->table->name.m_name); if (!get_doc || !get_doc->get_document_graph) { if (option == FTS_FETCH_DOC_BY_ID_EQUAL) { @@ -3821,7 +3800,7 @@ fts_doc_fetch_by_doc_id( error = fts_eval_sql(trx, graph); fts_sql_commit(trx); - trx_free_for_background(trx); + trx->free(); if (!get_doc) { fts_que_graph_free(graph); @@ -4076,7 +4055,7 @@ fts_sync_begin( sync->start_time = time(NULL); - sync->trx = trx_allocate_for_background(); + sync->trx = trx_create(); trx_start_internal(sync->trx); if (UNIV_UNLIKELY(fts_enable_diag_print)) { @@ -4218,7 +4197,7 @@ fts_sync_commit( /* Avoid assertion in trx_t::free(). */ trx->dict_operation_lock_mode = 0; - trx_free_for_background(trx); + trx->free(); return(error); } @@ -4272,7 +4251,7 @@ fts_sync_rollback( /* Avoid assertion in trx_t::free(). */ trx->dict_operation_lock_mode = 0; - trx_free_for_background(trx); + trx->free(); } /** Run SYNC on the table, i.e., write out data from the cache to the @@ -4417,7 +4396,7 @@ dberr_t fts_sync_table(dict_table_t* table, bool wait) ut_ad(table->fts); - if (!dict_table_is_discarded(table) && table->fts->cache + if (table->space && table->fts->cache && !dict_table_is_corrupted(table)) { err = fts_sync(table->fts->cache->sync, !wait, wait); } @@ -4654,7 +4633,7 @@ fts_tokenize_add_word_for_parser( MYSQL_FTPARSER_PARAM* param, /* in: parser paramter */ const char* word, /* in: token word */ int word_len, /* in: word len */ - MYSQL_FTPARSER_BOOLEAN_INFO* boolean_info) /* in: word boolean info */ + MYSQL_FTPARSER_BOOLEAN_INFO*) { fts_string_t str; fts_tokenize_param_t* fts_param; @@ -4666,9 +4645,9 @@ fts_tokenize_add_word_for_parser( ut_ad(result_doc != NULL); str.f_str = (byte*)(word); - str.f_len = word_len; + str.f_len = ulint(word_len); str.f_n_char = fts_get_token_size( - const_cast<CHARSET_INFO*>(param->cs), word, word_len); + const_cast<CHARSET_INFO*>(param->cs), word, str.f_len); /* JAN: TODO: MySQL 5.7 FTS ut_ad(boolean_info->position >= 0); @@ -4930,7 +4909,7 @@ fts_get_rows_count( ulint count = 0; char table_name[MAX_FULL_NAME_LEN]; - trx = trx_allocate_for_background(); + trx = trx_create(); trx->op_info = "fetching FT table rows count"; info = pars_info_create(); @@ -4985,7 +4964,7 @@ fts_get_rows_count( fts_que_graph_free(graph); - trx_free_for_background(trx); + trx->free(); return(count); } @@ -5002,7 +4981,7 @@ fts_update_max_cache_size( trx_t* trx; fts_table_t fts_table; - trx = trx_allocate_for_background(); + trx = trx_create(); FTS_INIT_FTS_TABLE(&fts_table, "CONFIG", FTS_COMMON_TABLE, sync->table); @@ -5011,7 +4990,7 @@ fts_update_max_cache_size( fts_sql_commit(trx); - 
trx_free_for_background(trx); + trx->free(); } #endif /* FTS_CACHE_SIZE_DEBUG */ @@ -5189,7 +5168,8 @@ fts_get_doc_id_from_rec( rec_offs_init(offsets_); offsets = rec_get_offsets( - rec, index, offsets, true, ULINT_UNDEFINED, &my_heap); + rec, index, offsets, index->n_core_fields, + ULINT_UNDEFINED, &my_heap); col_no = dict_col_get_index_pos( &table->cols[table->fts->doc_col], index); @@ -5462,7 +5442,6 @@ Take a FTS savepoint. */ void fts_savepoint_take( /*===============*/ - trx_t* trx, /*!< in: transaction */ fts_trx_t* fts_trx, /*!< in: fts transaction */ const char* name) /*!< in: savepoint name */ { @@ -5740,7 +5719,7 @@ fts_savepoint_rollback( ut_a(ib_vector_size(savepoints) > 0); /* Restore the savepoint. */ - fts_savepoint_take(trx, trx->fts_trx, name); + fts_savepoint_take(trx->fts_trx, name); } } @@ -5835,21 +5814,21 @@ fts parent table id and index id. index id */ static void fil_get_fts_spaces(fts_space_set_t& fts_space_set) { - mutex_enter(&fil_system->mutex); + mutex_enter(&fil_system.mutex); - for (fil_space_t *space = UT_LIST_GET_FIRST(fil_system->space_list); - space != NULL; - space = UT_LIST_GET_NEXT(space_list, space)) + for (fil_space_t *space= UT_LIST_GET_FIRST(fil_system.space_list); + space; + space= UT_LIST_GET_NEXT(space_list, space)) { - index_id_t index_id = 0; - table_id_t table_id = 0; + index_id_t index_id= 0; + table_id_t table_id= 0; if (space->purpose == FIL_TYPE_TABLESPACE && fts_check_aux_table(space->name, &table_id, &index_id)) fts_space_set.insert(std::make_pair(table_id, index_id)); } - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); } /** Check whether the parent table id and index id of fts auxilary @@ -5861,7 +5840,7 @@ static void fts_check_orphaned_tables(fts_space_set_t& fts_space_set) { btr_pcur_t pcur; mtr_t mtr; - trx_t* trx = trx_allocate_for_background(); + trx_t* trx = trx_create(); trx->op_info = "checking fts orphaned tables"; row_mysql_lock_data_dictionary(trx); @@ -5913,7 +5892,7 @@ static void fts_check_orphaned_tables(fts_space_set_t& fts_space_set) btr_pcur_close(&pcur); mtr.commit(); row_mysql_unlock_data_dictionary(trx); - trx_free_for_background(trx); + trx->free(); } /** Drop all fts auxilary table for the respective fts_id @@ -5957,7 +5936,7 @@ void fts_drop_orphaned_tables() if (fts_space_set.empty()) return; - trx_t* trx= trx_allocate_for_background(); + trx_t* trx= trx_create(); trx->op_info= "Drop orphaned aux FTS tables"; row_mysql_lock_data_dictionary(trx); @@ -5982,7 +5961,7 @@ void fts_drop_orphaned_tables() trx_commit_for_mysql(trx); row_mysql_unlock_data_dictionary(trx); trx->dict_operation_lock_mode= 0; - trx_free_for_background(trx); + trx->free(); } /**********************************************************************//** @@ -6076,7 +6055,7 @@ fts_load_stopword( } if (!trx) { - trx = trx_allocate_for_background(); + trx = trx_create(); if (srv_read_only_mode) { trx_start_internal_read_only(trx); } else { @@ -6155,7 +6134,7 @@ cleanup: fts_sql_rollback(trx); } - trx_free_for_background(trx); + trx->free(); } if (!cache->stopword_info.cached_stopword) { diff --git a/storage/innobase/fts/fts0opt.cc b/storage/innobase/fts/fts0opt.cc index 3cabb64b851..abb46eb7f4b 100644 --- a/storage/innobase/fts/fts0opt.cc +++ b/storage/innobase/fts/fts0opt.cc @@ -615,9 +615,9 @@ fts_zip_read_word( ptr[len] = 0; zip->zp->next_out = ptr; - zip->zp->avail_out = len; + zip->zp->avail_out = uInt(len); - word->f_len = len; + word->f_len = ulint(len); len = 0; } break; @@ -709,7 +709,7 @@ fts_fetch_index_words( case 
Z_OK: if (zip->zp->avail_in == 0) { zip->zp->next_in = static_cast<byte*>(data); - zip->zp->avail_in = len; + zip->zp->avail_in = uInt(len); ut_a(len <= FTS_MAX_WORD_LEN); len = 0; } @@ -968,7 +968,7 @@ fts_table_fetch_doc_ids( ut_a(fts_table->type == FTS_COMMON_TABLE); if (!trx) { - trx = trx_allocate_for_background(); + trx = trx_create(); alloc_bk_trx = TRUE; } @@ -1008,7 +1008,7 @@ fts_table_fetch_doc_ids( } if (alloc_bk_trx) { - trx_free_for_background(trx); + trx->free(); } return(error); @@ -1138,7 +1138,7 @@ fts_optimize_encode_node( ++src; /* Number of encoded pos bytes to copy. */ - pos_enc_len = src - enc->src_ilist_ptr; + pos_enc_len = ulint(src - enc->src_ilist_ptr); /* Total number of bytes required for copy. */ enc_len += pos_enc_len; @@ -1210,7 +1210,7 @@ fts_optimize_node( enc->src_ilist_ptr = src_node->ilist; } - copied = enc->src_ilist_ptr - src_node->ilist; + copied = ulint(enc->src_ilist_ptr - src_node->ilist); /* While there is data in the source node and space to copy into in the destination node. */ @@ -1231,7 +1231,7 @@ test_again: doc_id_t* update; update = (doc_id_t*) ib_vector_get( - del_vec, *del_pos); + del_vec, ulint(*del_pos)); del_doc_id = *update; } @@ -1275,7 +1275,7 @@ test_again: } /* Bytes copied so for from source. */ - copied = enc->src_ilist_ptr - src_node->ilist; + copied = ulint(enc->src_ilist_ptr - src_node->ilist); } if (copied >= src_node->ilist_size) { @@ -1376,7 +1376,7 @@ fts_optimize_word( ut_a(enc.src_ilist_ptr != NULL); /* Determine the numer of bytes copied to dst_node. */ - copied = enc.src_ilist_ptr - src_node->ilist; + copied = ulint(enc.src_ilist_ptr - src_node->ilist); /* Can't copy more than whats in the vlc array. */ ut_a(copied <= src_node->ilist_size); @@ -1587,7 +1587,7 @@ fts_optimize_create( optim->table = table; - optim->trx = trx_allocate_for_background(); + optim->trx = trx_create(); trx_start_internal(optim->trx); optim->fts_common_table.table_id = table->id; @@ -1710,7 +1710,8 @@ fts_optimize_free( mem_heap_t* heap = static_cast<mem_heap_t*>(optim->self_heap->arg); trx_commit_for_mysql(optim->trx); - trx_free_for_background(optim->trx); + optim->trx->free(); + optim->trx = NULL; fts_doc_ids_free(optim->to_delete); fts_optimize_graph_free(&optim->graph); @@ -2781,7 +2782,7 @@ Optimize all FTS tables. @return Dummy return */ static os_thread_ret_t -fts_optimize_thread( +DECLARE_THREAD(fts_optimize_thread)( /*================*/ void* arg) /*!< in: work queue*/ { diff --git a/storage/innobase/fts/fts0plugin.cc b/storage/innobase/fts/fts0plugin.cc index 9a37ec52516..de99d1709ad 100644 --- a/storage/innobase/fts/fts0plugin.cc +++ b/storage/innobase/fts/fts0plugin.cc @@ -32,26 +32,12 @@ Created 2013/06/04 Shaohua Wang /******************************************************************//** FTS default parser init @return 0 */ -static -int -fts_default_parser_init( -/*====================*/ - MYSQL_FTPARSER_PARAM *param) /*!< in: plugin parser param */ -{ - return(0); -} +static int fts_default_parser_init(MYSQL_FTPARSER_PARAM*) { return 0; } /******************************************************************//** FTS default parser deinit @return 0 */ -static -int -fts_default_parser_deinit( -/*======================*/ - MYSQL_FTPARSER_PARAM *param) /*!< in: plugin parser param */ -{ - return(0); -} +static int fts_default_parser_deinit(MYSQL_FTPARSER_PARAM*) { return 0; } /******************************************************************//** FTS default parser parse from ft_static.c in MYISAM. 
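Several of the fts0opt.cc hunks above only add explicit narrowing casts, uInt(len) for zlib's avail_in/avail_out and ulint(ptr - base) for pointer differences, so that signed and size conversions no longer happen implicitly. A standalone sketch of the pointer-difference case, using std::size_t in place of ulint (illustrative only, not InnoDB code):

#include <cstddef>
#include <iostream>

int main()
{
    unsigned char ilist[64] = {0};
    const unsigned char* ptr = ilist + 17;  // e.g. a decode position

    // ptr - ilist has the signed type std::ptrdiff_t; assigning it to an
    // unsigned length without a cast draws -Wsign-conversion (or /W4)
    // warnings, hence casts such as ulint(enc->src_ilist_ptr - src_node->ilist).
    std::ptrdiff_t diff = ptr - ilist;
    std::size_t copied = static_cast<std::size_t>(diff);

    std::cout << "decoded " << copied << " bytes so far\n";
}

The cast is safe in these call sites because the pointer never moves before the start of the buffer, so the difference is non-negative.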
@@ -134,7 +120,7 @@ fts_query_add_word_for_parser( case FT_TOKEN_WORD: term_node = fts_ast_create_node_term_for_parser( - state, word, word_len); + state, word, ulint(word_len)); if (info->trunc) { fts_ast_term_set_wildcard(term_node); @@ -251,7 +237,7 @@ fts_parse_query_internal( int ret = param->mysql_add_word( param, reinterpret_cast<char*>(w.pos), - w.len, &info); + int(w.len), &info); if (ret) { return(ret); } diff --git a/storage/innobase/fts/fts0que.cc b/storage/innobase/fts/fts0que.cc index df2b330fe4b..b3ef795a463 100644 --- a/storage/innobase/fts/fts0que.cc +++ b/storage/innobase/fts/fts0que.cc @@ -1749,7 +1749,7 @@ fts_query_match_phrase_add_word_for_parser( MYSQL_FTPARSER_PARAM* param, /*!< in: parser param */ const char* word, /*!< in: token */ int word_len, /*!< in: token length */ - MYSQL_FTPARSER_BOOLEAN_INFO* info) /*!< in: token info */ + MYSQL_FTPARSER_BOOLEAN_INFO*) { fts_phrase_param_t* phrase_param; fts_phrase_t* phrase; @@ -1771,8 +1771,8 @@ fts_query_match_phrase_add_word_for_parser( } match.f_str = (uchar *)(word); - match.f_len = word_len; - match.f_n_char = fts_get_token_size(phrase->charset, word, word_len); + match.f_len = ulint(word_len); + match.f_n_char= fts_get_token_size(phrase->charset, word, match.f_len); if (match.f_len > 0) { /* Get next token to match. */ @@ -1904,7 +1904,7 @@ fts_query_match_phrase( &phrase_param, phrase->parser, ptr, - (end - ptr))) { + ulint(end - ptr))) { break; } } else { @@ -3305,7 +3305,7 @@ fts_query_filter_doc_ids( ++ptr; /* Bytes decoded so far */ - decoded = ptr - (byte*) data; + decoded = ulint(ptr - (byte*) data); /* We simply collect the matching documents and the positions here and match later. */ @@ -3921,7 +3921,7 @@ fts_query_can_optimize( } /** FTS Query entry point. -@param[in] trx transaction +@param[in,out] trx transaction @param[in] index fts index to search @param[in] flags FTS search mode @param[in] query_str FTS query @@ -3943,7 +3943,7 @@ fts_query( ulint lc_query_str_len; ulint result_len; bool boolean_mode; - trx_t* query_trx; + trx_t* query_trx; /* FIXME: use provided trx */ CHARSET_INFO* charset; ulint start_time_ms; bool will_be_ignored = false; @@ -3952,7 +3952,7 @@ fts_query( *result = NULL; memset(&query, 0x0, sizeof(query)); - query_trx = trx_allocate_for_background(); + query_trx = trx_create(); query_trx->op_info = "FTS query"; start_time_ms = ut_time_ms(); @@ -4119,7 +4119,7 @@ fts_query( << diff_time / 1000 << " secs: " << diff_time % 1000 << " millisec: row(s) " << ((*result)->rankings_by_id - ? rbt_size((*result)->rankings_by_id) + ? lint(rbt_size((*result)->rankings_by_id)) : -1); /* Log memory consumption & result size */ @@ -4134,7 +4134,7 @@ fts_query( func_exit: fts_query_free(&query); - trx_free_for_background(query_trx); + query_trx->free(); return(error); } diff --git a/storage/innobase/fts/fts0sql.cc b/storage/innobase/fts/fts0sql.cc index e61f2118e70..f799a46d088 100644 --- a/storage/innobase/fts/fts0sql.cc +++ b/storage/innobase/fts/fts0sql.cc @@ -176,7 +176,6 @@ Parse an SQL string. 
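The fts0plugin.cc and fts0que.cc parser callbacks above keep their full signatures (they are installed as plugin hooks) but drop the names of parameters they never read, e.g. the trailing MYSQL_FTPARSER_BOOLEAN_INFO*. A small standalone illustration of that idiom; the callback type here is a made-up stand-in, not the MySQL plugin API:

#include <iostream>

// Hypothetical callback type whose signature is fixed by the caller.
using add_word_fn = int (*)(void* param, const char* word, int word_len);

// Leaving the unused first parameter unnamed documents the intent and
// silences -Wunused-parameter without changing the function type.
static int add_word(void*, const char* word, int word_len)
{
    std::cout.write(word, word_len) << '\n';
    return 0;
}

int main()
{
    add_word_fn fn = add_word;
    return fn(nullptr, "innodb", 6);
}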
que_t* fts_parse_sql_no_dict_lock( /*=======================*/ - fts_table_t* fts_table, /*!< in: FTS aux table info */ pars_info_t* info, /*!< in: info struct, or NULL */ const char* sql) /*!< in: SQL string to evaluate */ { diff --git a/storage/innobase/fut/fut0lst.cc b/storage/innobase/fut/fut0lst.cc index 9359e5bf478..203820037e1 100644 --- a/storage/innobase/fut/fut0lst.cc +++ b/storage/innobase/fut/fut0lst.cc @@ -40,7 +40,6 @@ flst_add_to_empty( { ulint space; fil_addr_t node_addr; - ulint len; ut_ad(mtr && base && node); ut_ad(base != node); @@ -50,8 +49,7 @@ flst_add_to_empty( ut_ad(mtr_memo_contains_page_flagged(mtr, node, MTR_MEMO_PAGE_X_FIX | MTR_MEMO_PAGE_SX_FIX)); - len = flst_get_len(base); - ut_a(len == 0); + ut_a(!flst_get_len(base)); buf_ptr_get_fsp_addr(node, &space, &node_addr); @@ -64,7 +62,7 @@ flst_add_to_empty( flst_write_addr(node + FLST_NEXT, fil_addr_null, mtr); /* Update len of base node */ - mlog_write_ulint(base + FLST_LEN, len + 1, MLOG_4BYTES, mtr); + mlog_write_ulint(base + FLST_LEN, 1, MLOG_4BYTES, mtr); } /********************************************************************//** diff --git a/storage/innobase/gis/gis0geo.cc b/storage/innobase/gis/gis0geo.cc index 3a727185632..dad40d19da7 100644 --- a/storage/innobase/gis/gis0geo.cc +++ b/storage/innobase/gis/gis0geo.cc @@ -31,6 +31,7 @@ Created 2013/03/27 Allen Lai and Jimmy Yang #include "mach0data.h" #include <spatial.h> +#include <cmath> /* These definitions are for comparing 2 mbrs. */ @@ -73,7 +74,6 @@ rtree_add_point_to_mbr( where point is stored */ const uchar* end, /*!< in: end of wkb. */ uint n_dims, /*!< in: dimensions. */ - uchar byte_order, /*!< in: byte order. */ double* mbr) /*!< in/out: mbr, which must be of length n_dims * 2. */ { @@ -113,11 +113,10 @@ rtree_get_point_mbr( where point is stored. */ const uchar* end, /*!< in: end of wkb. */ uint n_dims, /*!< in: dimensions. */ - uchar byte_order, /*!< in: byte order. */ double* mbr) /*!< in/out: mbr, must be of length n_dims * 2. */ { - return rtree_add_point_to_mbr(wkb, end, n_dims, byte_order, mbr); + return rtree_add_point_to_mbr(wkb, end, n_dims, mbr); } @@ -132,7 +131,6 @@ rtree_get_linestring_mbr( where point is stored. */ const uchar* end, /*!< in: end of wkb. */ uint n_dims, /*!< in: dimensions. */ - uchar byte_order, /*!< in: byte order. */ double* mbr) /*!< in/out: mbr, must be of length n_dims * 2. */ { @@ -143,8 +141,7 @@ rtree_get_linestring_mbr( for (; n_points > 0; --n_points) { /* Add next point to mbr */ - if (rtree_add_point_to_mbr(wkb, end, n_dims, - byte_order, mbr)) { + if (rtree_add_point_to_mbr(wkb, end, n_dims, mbr)) { return(-1); } } @@ -163,7 +160,6 @@ rtree_get_polygon_mbr( where point is stored. */ const uchar* end, /*!< in: end of wkb. */ uint n_dims, /*!< in: dimensions. */ - uchar byte_order, /*!< in: byte order. */ double* mbr) /*!< in/out: mbr, must be of length n_dims * 2. */ { @@ -179,8 +175,7 @@ rtree_get_polygon_mbr( for (; n_points > 0; --n_points) { /* Add next point to mbr */ - if (rtree_add_point_to_mbr(wkb, end, n_dims, - byte_order, mbr)) { + if (rtree_add_point_to_mbr(wkb, end, n_dims, mbr)) { return(-1); } } @@ -206,11 +201,10 @@ rtree_get_geometry_mbr( by itself. 
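The gis0geo.cc changes drop the unused byte_order argument from the MBR helpers; the underlying computation is simply widening a bounding rectangle point by point. A minimal standalone sketch of that accumulation, using the same {xmin, xmax, ymin, ymax} layout as the 2-D code (illustrative, not the WKB-parsing functions themselves):

#include <algorithm>
#include <iostream>
#include <limits>
#include <vector>

struct Point { double x, y; };

// mbr = {xmin, xmax, ymin, ymax}, widened for every point added.
static void mbr_init(double* mbr)
{
    mbr[0] = mbr[2] = std::numeric_limits<double>::max();
    mbr[1] = mbr[3] = std::numeric_limits<double>::lowest();
}

static void mbr_add_point(double* mbr, const Point& p)
{
    mbr[0] = std::min(mbr[0], p.x);
    mbr[1] = std::max(mbr[1], p.x);
    mbr[2] = std::min(mbr[2], p.y);
    mbr[3] = std::max(mbr[3], p.y);
}

int main()
{
    const std::vector<Point> linestring = {{1, 4}, {3, -2}, {-5, 0}};

    double mbr[4];
    mbr_init(mbr);
    for (const Point& p : linestring) {
        mbr_add_point(mbr, p);
    }

    std::cout << "x: [" << mbr[0] << ", " << mbr[1] << "], "
              << "y: [" << mbr[2] << ", " << mbr[3] << "]\n";  // x: [-5, 3], y: [-2, 4]
}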
*/ { int res; - uchar byte_order = 2; uint wkb_type = 0; uint n_items; - byte_order = *(*wkb); + /* byte_order = *(*wkb); */ ++(*wkb); wkb_type = uint4korr((*wkb)); @@ -218,24 +212,22 @@ rtree_get_geometry_mbr( switch ((enum wkbType) wkb_type) { case wkbPoint: - res = rtree_get_point_mbr(wkb, end, n_dims, byte_order, mbr); + res = rtree_get_point_mbr(wkb, end, n_dims, mbr); break; case wkbLineString: - res = rtree_get_linestring_mbr(wkb, end, n_dims, - byte_order, mbr); + res = rtree_get_linestring_mbr(wkb, end, n_dims, mbr); break; case wkbPolygon: - res = rtree_get_polygon_mbr(wkb, end, n_dims, byte_order, mbr); + res = rtree_get_polygon_mbr(wkb, end, n_dims, mbr); break; case wkbMultiPoint: n_items = uint4korr((*wkb)); (*wkb) += 4; for (; n_items > 0; --n_items) { - byte_order = *(*wkb); + /* byte_order = *(*wkb); */ ++(*wkb); (*wkb) += 4; - if (rtree_get_point_mbr(wkb, end, n_dims, - byte_order, mbr)) { + if (rtree_get_point_mbr(wkb, end, n_dims, mbr)) { return(-1); } } @@ -245,11 +237,10 @@ rtree_get_geometry_mbr( n_items = uint4korr((*wkb)); (*wkb) += 4; for (; n_items > 0; --n_items) { - byte_order = *(*wkb); + /* byte_order = *(*wkb); */ ++(*wkb); (*wkb) += 4; - if (rtree_get_linestring_mbr(wkb, end, n_dims, - byte_order, mbr)) { + if (rtree_get_linestring_mbr(wkb, end, n_dims, mbr)) { return(-1); } } @@ -259,11 +250,10 @@ rtree_get_geometry_mbr( n_items = uint4korr((*wkb)); (*wkb) += 4; for (; n_items > 0; --n_items) { - byte_order = *(*wkb); + /* byte_order = *(*wkb); */ ++(*wkb); (*wkb) += 4; - if (rtree_get_polygon_mbr(wkb, end, n_dims, - byte_order, mbr)) { + if (rtree_get_polygon_mbr(wkb, end, n_dims, mbr)) { return(-1); } } @@ -403,7 +393,7 @@ copy_coords( /*========*/ double* dst, /*!< in/out: destination. */ const double* src, /*!< in: source. */ - int n_dim) /*!< in: dimensions. */ + int) { memcpy(dst, src, DATA_MBR_LEN); } @@ -604,7 +594,7 @@ rtree_key_cmp( /*==========*/ page_cur_mode_t mode, /*!< in: compare method. */ const uchar* b, /*!< in: first key. */ - int b_len, /*!< in: first key len. */ + int, const uchar* a, /*!< in: second key. */ int a_len) /*!< in: second key len. */ { diff --git a/storage/innobase/gis/gis0rtree.cc b/storage/innobase/gis/gis0rtree.cc index e3d5a09f736..3ac2fbc0093 100644 --- a/storage/innobase/gis/gis0rtree.cc +++ b/storage/innobase/gis/gis0rtree.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2019, 2020, MariaDB Corporation. +Copyright (c) 2018, 2021, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -38,6 +38,7 @@ Created 2013/03/27 Allen Lai and Jimmy Yang #include "trx0undo.h" #include "srv0mon.h" #include "gis0geo.h" +#include <cmath> /*************************************************************//** Initial split nodes info for R-tree split. @@ -71,7 +72,7 @@ rtr_page_split_initialize_nodes( page = buf_block_get_frame(block); n_uniq = dict_index_get_n_unique_in_tree(cursor->index); - n_recs = page_get_n_recs(page) + 1; + n_recs = ulint(page_get_n_recs(page)) + 1; /*We reserve 2 MBRs memory space for temp result of split algrithm. 
And plus the new mbr that need to insert, we @@ -86,8 +87,9 @@ rtr_page_split_initialize_nodes( stop = task + n_recs; rec = page_rec_get_next(page_get_infimum_rec(page)); - ut_d(const bool is_leaf = page_is_leaf(page)); - *offsets = rec_get_offsets(rec, cursor->index, *offsets, is_leaf, + const ulint n_core = page_is_leaf(page) + ? cursor->index->n_core_fields : 0; + *offsets = rec_get_offsets(rec, cursor->index, *offsets, n_core, n_uniq, &heap); source_cur = rec_get_nth_field(rec, *offsets, 0, &len); @@ -100,7 +102,7 @@ rtr_page_split_initialize_nodes( rec = page_rec_get_next(rec); *offsets = rec_get_offsets(rec, cursor->index, *offsets, - is_leaf, n_uniq, &heap); + n_core, n_uniq, &heap); source_cur = rec_get_nth_field(rec, *offsets, 0, &len); } @@ -133,10 +135,8 @@ rtr_index_build_node_ptr( pointer */ ulint page_no,/*!< in: page number to put in node pointer */ - mem_heap_t* heap, /*!< in: memory heap where pointer + mem_heap_t* heap) /*!< in: memory heap where pointer created */ - ulint level) /*!< in: level of rec in tree: - 0 means leaf level */ { dtuple_t* tuple; dfield_t* field; @@ -291,7 +291,6 @@ rtr_update_mbr_field( ulint up_match = 0; ulint low_match = 0; ulint child; - ulint level; ulint rec_info; page_zip_des_t* page_zip; bool ins_suc = true; @@ -310,7 +309,8 @@ rtr_update_mbr_field( page_zip = buf_block_get_page_zip(block); child = btr_node_ptr_get_child_page_no(rec, offsets); - level = btr_page_get_level(buf_block_get_frame(block), mtr); + const ulint n_core = page_is_leaf(block->frame) + ? index->n_core_fields : 0; if (new_rec) { child_rec = new_rec; @@ -319,14 +319,14 @@ rtr_update_mbr_field( } dtuple_t* node_ptr = rtr_index_build_node_ptr( - index, mbr, child_rec, child, heap, level); + index, mbr, child_rec, child, heap); /* We need to remember the child page no of cursor2, since page could be reorganized or insert a new rec before it. 
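A recurring change in these R-tree hunks is that rec_get_offsets() now takes a count of core fields instead of an is_leaf flag: 0 stands for node-pointer pages, index->n_core_fields for leaf pages, presumably so the same code path can describe records written before columns were added instantly. A trivial standalone sketch of that bool-to-count generalization (the names and the value 5 are assumptions for illustration):

#include <cstddef>
#include <iostream>

// 0 means "node-pointer page"; a non-zero value is the number of core
// fields a leaf-page record is guaranteed to carry.
static void describe(std::size_t n_core)
{
    if (n_core == 0) {
        std::cout << "node pointer: key fields + child page number\n";
    } else {
        std::cout << "leaf record: at least " << n_core << " core fields\n";
    }
}

int main()
{
    const bool        is_leaf       = true;  // hypothetical page state
    const std::size_t n_core_fields = 5;     // hypothetical index->n_core_fields

    // The pattern used throughout the patch:
    //   const ulint n_core = page_is_leaf(page) ? index->n_core_fields : 0;
    describe(is_leaf ? n_core_fields : 0);
    describe(0);
}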
*/ if (cursor2) { rec_t* del_rec = btr_cur_get_rec(cursor2); offsets2 = rec_get_offsets(btr_cur_get_rec(cursor2), - index, NULL, false, + index, NULL, 0, ULINT_UNDEFINED, &heap); del_page_no = btr_node_ptr_get_child_page_no(del_rec, offsets2); cur2_pos = page_rec_get_n_recs_before(btr_cur_get_rec(cursor2)); @@ -391,7 +391,7 @@ rtr_update_mbr_field( = page_rec_get_nth(page, cur2_pos); } offsets2 = rec_get_offsets(btr_cur_get_rec(cursor2), - index, NULL, false, + index, NULL, 0, ULINT_UNDEFINED, &heap); ut_ad(del_page_no == btr_node_ptr_get_child_page_no( cursor2->page_cur.rec, @@ -429,7 +429,7 @@ rtr_update_mbr_field( ut_ad(old_rec != insert_rec); page_cur_position(old_rec, block, &page_cur); - offsets2 = rec_get_offsets(old_rec, index, NULL, !level, + offsets2 = rec_get_offsets(old_rec, index, NULL, n_core, ULINT_UNDEFINED, &heap); page_cur_delete_rec(&page_cur, index, offsets2, mtr); @@ -459,7 +459,7 @@ update_mbr: cur2_rec = cursor2->page_cur.rec; offsets2 = rec_get_offsets(cur2_rec, index, NULL, - !level, + n_core, ULINT_UNDEFINED, &heap); cur2_rec_info = rec_get_info_bits(cur2_rec, @@ -519,7 +519,7 @@ update_mbr: if (ins_suc) { btr_cur_position(index, insert_rec, block, cursor); offsets = rec_get_offsets(insert_rec, - index, offsets, !level, + index, offsets, n_core, ULINT_UNDEFINED, &heap); } @@ -534,7 +534,7 @@ update_mbr: cur2_rec = btr_cur_get_rec(cursor2); offsets2 = rec_get_offsets(cur2_rec, index, NULL, - !level, + n_core, ULINT_UNDEFINED, &heap); /* If the cursor2 position is on a wrong rec, we @@ -548,7 +548,7 @@ update_mbr: while (!page_rec_is_supremum(cur2_rec)) { offsets2 = rec_get_offsets(cur2_rec, index, NULL, - !level, + n_core, ULINT_UNDEFINED, &heap); cur2_pno = btr_node_ptr_get_child_page_no( @@ -628,7 +628,6 @@ rtr_adjust_upper_level( buf_block_t* new_block, /*!< in/out: the new half page */ rtr_mbr_t* mbr, /*!< in: MBR on the old page */ rtr_mbr_t* new_mbr, /*!< in: MBR on the new page */ - ulint direction, /*!< in: FSP_UP or FSP_DOWN */ mtr_t* mtr) /*!< in: mtr */ { page_t* page; @@ -644,10 +643,8 @@ rtr_adjust_upper_level( ulint level; dtuple_t* node_ptr_upper; page_cur_t* page_cursor; - rtr_mbr_t parent_mbr; lock_prdt_t prdt; lock_prdt_t new_prdt; - lock_prdt_t parent_prdt; dberr_t err; big_rec_t* dummy_big_rec; rec_t* rec; @@ -659,9 +656,8 @@ rtr_adjust_upper_level( cursor.thr = sea_cur->thr; /* Get the level of the split pages */ - level = btr_page_get_level(buf_block_get_frame(block), mtr); - ut_ad(level - == btr_page_get_level(buf_block_get_frame(new_block), mtr)); + level = btr_page_get_level(buf_block_get_frame(block)); + ut_ad(level == btr_page_get_level(buf_block_get_frame(new_block))); page = buf_block_get_frame(block); page_no = block->page.id.page_no(); @@ -678,8 +674,6 @@ rtr_adjust_upper_level( page_cursor = btr_cur_get_page_cur(&cursor); - rtr_get_mbr_from_rec(page_cursor->rec, offsets, &parent_mbr); - rtr_update_mbr_field(&cursor, offsets, NULL, page, mbr, NULL, mtr); /* Already updated parent MBR, reset in our path */ @@ -695,7 +689,7 @@ rtr_adjust_upper_level( node_ptr_upper = rtr_index_build_node_ptr( index, new_mbr, page_rec_get_next(page_get_infimum_rec(new_page)), - new_page_no, heap, level); + new_page_no, heap); ulint up_match = 0; ulint low_match = 0; @@ -741,11 +735,9 @@ rtr_adjust_upper_level( prdt.op = 0; new_prdt.data = static_cast<void*>(new_mbr); new_prdt.op = 0; - parent_prdt.data = static_cast<void*>(&parent_mbr); - parent_prdt.op = 0; lock_prdt_update_parent(block, new_block, &prdt, &new_prdt, - &parent_prdt, 
dict_index_get_space(index), + index->table->space_id, page_cursor->block->page.id.page_no()); mem_heap_free(heap); @@ -844,7 +836,8 @@ rtr_split_page_move_rec_list( rec_move = static_cast<rtr_rec_move_t*>(mem_heap_alloc( heap, sizeof (*rec_move) * max_to_move)); - const bool is_leaf = page_is_leaf(page); + const ulint n_core = page_is_leaf(page) + ? index->n_core_fields : 0; /* Insert the recs in group 2 to new page. */ for (cur_split_node = node_array; @@ -854,10 +847,10 @@ rtr_split_page_move_rec_list( block, cur_split_node->key); offsets = rec_get_offsets(cur_split_node->key, - index, offsets, is_leaf, + index, offsets, n_core, ULINT_UNDEFINED, &heap); - ut_ad(!is_leaf || cur_split_node->key != first_rec); + ut_ad(!n_core || cur_split_node->key != first_rec); rec = page_cur_insert_rec_low( page_cur_get_rec(&new_page_cursor), @@ -892,7 +885,7 @@ rtr_split_page_move_rec_list( same temp-table in parallel. max_trx_id is ignored for temp tables because it not required for MVCC. */ - if (is_leaf && !dict_table_is_temporary(index->table)) { + if (n_core && !index->table->is_temporary()) { page_update_max_trx_id(new_block, NULL, page_get_max_trx_id(page), mtr); @@ -945,7 +938,7 @@ rtr_split_page_move_rec_list( block, &page_cursor); offsets = rec_get_offsets( page_cur_get_rec(&page_cursor), index, - offsets, is_leaf, ULINT_UNDEFINED, + offsets, n_core, ULINT_UNDEFINED, &heap); page_cur_delete_rec(&page_cursor, index, offsets, mtr); @@ -980,7 +973,6 @@ rtr_page_split_and_insert( page_t* page; page_t* new_page; ulint page_no; - byte direction; ulint hint_page_no; buf_block_t* new_block; page_zip_des_t* page_zip; @@ -1026,7 +1018,7 @@ func_start: block = btr_cur_get_block(cursor); page = buf_block_get_frame(block); page_zip = buf_block_get_page_zip(block); - page_level = btr_page_get_level(page, mtr); + page_level = btr_page_get_level(page); current_ssn = page_get_ssn_id(page); ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); @@ -1044,7 +1036,7 @@ func_start: *heap, cursor, offsets, tuple, &buf_pos); /* Divide all mbrs to two groups. */ - n_recs = page_get_n_recs(page) + 1; + n_recs = ulint(page_get_n_recs(page)) + 1; end_split_node = rtr_split_node_array + n_recs; @@ -1070,9 +1062,8 @@ func_start: static_cast<uchar*>(first_rec)); /* Allocate a new page to the index */ - direction = FSP_UP; hint_page_no = page_no + 1; - new_block = btr_page_alloc(cursor->index, hint_page_no, direction, + new_block = btr_page_alloc(cursor->index, hint_page_no, FSP_UP, page_level, mtr, mtr); new_page_zip = buf_block_get_page_zip(new_block); btr_page_create(new_block, new_page_zip, cursor->index, @@ -1146,6 +1137,9 @@ func_start: /* Update the lock table */ lock_rtr_move_rec_list(new_block, block, rec_move, moved); + const ulint n_core = page_level + ? 0 : cursor->index->n_core_fields; + /* Delete recs in first group from the new page. 
*/ for (cur_split_node = rtr_split_node_array; cur_split_node < end_split_node - 1; ++cur_split_node) { @@ -1164,7 +1158,7 @@ func_start: *offsets = rec_get_offsets( page_cur_get_rec(page_cursor), - cursor->index, *offsets, !page_level, + cursor->index, *offsets, n_core, ULINT_UNDEFINED, heap); page_cur_delete_rec(page_cursor, @@ -1181,7 +1175,7 @@ func_start: block, page_cursor); *offsets = rec_get_offsets( page_cur_get_rec(page_cursor), - cursor->index, *offsets, !page_level, + cursor->index, *offsets, n_core, ULINT_UNDEFINED, heap); page_cur_delete_rec(page_cursor, cursor->index, *offsets, mtr); @@ -1247,12 +1241,12 @@ after_insert: /* Check any predicate locks need to be moved/copied to the new page */ - lock_prdt_update_split(block, new_block, &prdt, &new_prdt, - dict_index_get_space(cursor->index), page_no); + lock_prdt_update_split(new_block, &prdt, &new_prdt, + cursor->index->table->space_id, page_no); /* Adjust the upper level. */ rtr_adjust_upper_level(cursor, flags, block, new_block, - &mbr, &new_mbr, direction, mtr); + &mbr, &new_mbr, mtr); /* Save the new ssn to the root page, since we need to reinit the first ssn value from it after restart server. */ @@ -1276,7 +1270,7 @@ after_insert: if (!rec) { /* We play safe and reset the free bits for new_page */ if (!dict_index_is_clust(cursor->index) - && !dict_table_is_temporary(cursor->index->table)) { + && !cursor->index->table->is_temporary()) { ibuf_reset_free_bits(new_block); ibuf_reset_free_bits(block); } @@ -1313,7 +1307,6 @@ dberr_t rtr_ins_enlarge_mbr( /*================*/ btr_cur_t* btr_cur, /*!< in: btr cursor */ - que_thr_t* thr, /*!< in: query thread */ mtr_t* mtr) /*!< in: mtr */ { dberr_t err = DB_SUCCESS; @@ -1411,7 +1404,8 @@ rtr_page_copy_rec_list_end_no_locks( rec_offs offsets_2[REC_OFFS_NORMAL_SIZE]; rec_offs* offsets2 = offsets_2; ulint moved = 0; - bool is_leaf = page_is_leaf(new_page); + const ulint n_core = page_is_leaf(new_page) + ? index->n_core_fields : 0; rec_offs_init(offsets_1); rec_offs_init(offsets_2); @@ -1424,7 +1418,7 @@ rtr_page_copy_rec_list_end_no_locks( btr_assert_not_corrupted(new_block, index); ut_a(page_is_comp(new_page) == page_rec_is_comp(rec)); - ut_a(mach_read_from_2(new_page + UNIV_PAGE_SIZE - 10) == (ulint) + ut_a(mach_read_from_2(new_page + srv_page_size - 10) == (ulint) (page_is_comp(new_page) ? PAGE_NEW_INFIMUM : PAGE_OLD_INFIMUM)); cur_rec = page_rec_get_next( @@ -1440,14 +1434,14 @@ rtr_page_copy_rec_list_end_no_locks( cur_rec = page_rec_get_next(cur_rec); } - offsets1 = rec_get_offsets(cur1_rec, index, offsets1, is_leaf, + offsets1 = rec_get_offsets(cur1_rec, index, offsets1, n_core, ULINT_UNDEFINED, &heap); while (!page_rec_is_supremum(cur_rec)) { ulint cur_matched_fields = 0; int cmp; offsets2 = rec_get_offsets(cur_rec, index, offsets2, - is_leaf, + n_core, ULINT_UNDEFINED, &heap); cmp = cmp_rec_rec(cur1_rec, cur_rec, offsets1, offsets2, index, false, @@ -1459,7 +1453,7 @@ rtr_page_copy_rec_list_end_no_locks( /* Skip small recs. 
*/ page_cur_move_to_next(&page_cur); cur_rec = page_cur_get_rec(&page_cur); - } else if (is_leaf) { + } else if (n_core) { if (rec_get_deleted_flag(cur1_rec, dict_table_is_comp(index->table))) { goto next; @@ -1482,7 +1476,7 @@ rtr_page_copy_rec_list_end_no_locks( cur_rec = page_cur_get_rec(&page_cur); - offsets1 = rec_get_offsets(cur1_rec, index, offsets1, is_leaf, + offsets1 = rec_get_offsets(cur1_rec, index, offsets1, n_core, ULINT_UNDEFINED, &heap); ins_rec = page_cur_insert_rec_low(cur_rec, index, @@ -1538,7 +1532,8 @@ rtr_page_copy_rec_list_start_no_locks( rec_offs* offsets2 = offsets_2; page_cur_t page_cur; ulint moved = 0; - bool is_leaf = page_is_leaf(buf_block_get_frame(block)); + const ulint n_core = page_is_leaf(buf_block_get_frame(block)) + ? index->n_core_fields : 0; rec_offs_init(offsets_1); rec_offs_init(offsets_2); @@ -1558,14 +1553,14 @@ rtr_page_copy_rec_list_start_no_locks( cur_rec = page_rec_get_next(cur_rec); } - offsets1 = rec_get_offsets(cur1_rec, index, offsets1, is_leaf, + offsets1 = rec_get_offsets(cur1_rec, index, offsets1, n_core, ULINT_UNDEFINED, &heap); while (!page_rec_is_supremum(cur_rec)) { ulint cur_matched_fields = 0; offsets2 = rec_get_offsets(cur_rec, index, offsets2, - is_leaf, + n_core, ULINT_UNDEFINED, &heap); int cmp = cmp_rec_rec(cur1_rec, cur_rec, offsets1, offsets2, index, false, @@ -1578,7 +1573,7 @@ rtr_page_copy_rec_list_start_no_locks( /* Skip small recs. */ page_cur_move_to_next(&page_cur); cur_rec = page_cur_get_rec(&page_cur); - } else if (is_leaf) { + } else if (n_core) { if (rec_get_deleted_flag( cur1_rec, dict_table_is_comp(index->table))) { @@ -1602,7 +1597,7 @@ rtr_page_copy_rec_list_start_no_locks( cur_rec = page_cur_get_rec(&page_cur); - offsets1 = rec_get_offsets(cur1_rec, index, offsets1, is_leaf, + offsets1 = rec_get_offsets(cur1_rec, index, offsets1, n_core, ULINT_UNDEFINED, &heap); ins_rec = page_cur_insert_rec_low(cur_rec, index, @@ -1644,10 +1639,7 @@ rtr_merge_mbr_changed( btr_cur_t* cursor2, /*!< in: the other cursor */ rec_offs* offsets, /*!< in: rec offsets */ rec_offs* offsets2, /*!< in: rec offsets */ - rtr_mbr_t* new_mbr, /*!< out: MBR to update */ - buf_block_t* merge_block, /*!< in: page to merge */ - buf_block_t* block, /*!< in: page be merged */ - dict_index_t* index) /*!< in: index */ + rtr_mbr_t* new_mbr) /*!< out: MBR to update */ { double* mbr; double mbr1[SPDIMS * 2]; @@ -1692,9 +1684,6 @@ rtr_merge_and_update_mbr( rec_offs* offsets, /*!< in: rec offsets */ rec_offs* offsets2, /*!< in: rec offsets */ page_t* child_page, /*!< in: the page. */ - buf_block_t* merge_block, /*!< in: page to merge */ - buf_block_t* block, /*!< in: page be merged */ - dict_index_t* index, /*!< in: index */ mtr_t* mtr) /*!< in: mtr */ { dberr_t err = DB_SUCCESS; @@ -1704,8 +1693,7 @@ rtr_merge_and_update_mbr( ut_ad(dict_index_is_spatial(cursor->index)); changed = rtr_merge_mbr_changed(cursor, cursor2, offsets, offsets2, - &new_mbr, merge_block, - block, index); + &new_mbr); /* Update the mbr field of the rec. And will delete the record pointed by cursor2 */ @@ -1715,7 +1703,7 @@ rtr_merge_and_update_mbr( err = DB_ERROR; } } else { - rtr_node_ptr_delete(cursor2->index, cursor2, block, mtr); + rtr_node_ptr_delete(cursor2, mtr); } return(err); @@ -1726,10 +1714,8 @@ Deletes on the upper level the node pointer to a page. 
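rtr_merge_mbr_changed() and rtr_merge_and_update_mbr() above lose their now-unused block/index arguments; what they compute is whether merging two child MBRs forces the parent MBR to grow. A standalone 2-D sketch of that check, using the same {xmin, xmax, ymin, ymax} convention (not the InnoDB implementation):

#include <algorithm>
#include <iostream>

struct Mbr { double xmin, xmax, ymin, ymax; };

// Merge b into a and report whether a had to grow, in the spirit of
// rtr_merge_mbr_changed(): only a changed parent MBR needs the
// node-pointer record's MBR field to be rewritten.
static bool merge_mbr_changed(Mbr& a, const Mbr& b)
{
    Mbr merged = {std::min(a.xmin, b.xmin), std::max(a.xmax, b.xmax),
                  std::min(a.ymin, b.ymin), std::max(a.ymax, b.ymax)};
    bool changed = merged.xmin != a.xmin || merged.xmax != a.xmax
        || merged.ymin != a.ymin || merged.ymax != a.ymax;
    a = merged;
    return changed;
}

int main()
{
    Mbr parent  = {0, 10, 0, 10};
    Mbr inside  = {2, 3, 4, 5};
    Mbr outside = {-1, 4, 2, 12};

    std::cout << merge_mbr_changed(parent, inside) << '\n';   // 0: no growth
    std::cout << merge_mbr_changed(parent, outside) << '\n';  // 1: parent grew
}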
*/ void rtr_node_ptr_delete( /*================*/ - dict_index_t* index, /*!< in: index tree */ btr_cur_t* cursor, /*!< in: search cursor, contains information about parent nodes in search */ - buf_block_t* block, /*!< in: page whose node pointer is deleted */ mtr_t* mtr) /*!< in: mtr */ { ibool compressed; @@ -1765,7 +1751,7 @@ rtr_check_same_block( while (!page_rec_is_supremum(rec)) { offsets = rec_get_offsets( - rec, index, NULL, false, ULINT_UNDEFINED, &heap); + rec, index, NULL, 0, ULINT_UNDEFINED, &heap); if (btr_node_ptr_get_child_page_no(rec, offsets) == page_no) { btr_cur_position(index, rec, parentb, cursor); @@ -1819,12 +1805,14 @@ rtr_rec_cal_increase( @param[in] tuple range tuple containing mbr, may also be empty tuple @param[in] mode search mode @return estimated number of rows */ -int64_t +ha_rows rtr_estimate_n_rows_in_range( dict_index_t* index, const dtuple_t* tuple, page_cur_mode_t mode) { + ut_ad(dict_index_is_spatial(index)); + /* Check tuple & mode */ if (tuple->n_fields == 0) { return(HA_POS_ERROR); @@ -1846,64 +1834,47 @@ rtr_estimate_n_rows_in_range( ); /* Read mbr from tuple. */ - const dfield_t* dtuple_field; - ulint dtuple_f_len MY_ATTRIBUTE((unused)); rtr_mbr_t range_mbr; double range_area; - dtuple_field = dtuple_get_nth_field(tuple, 0); - dtuple_f_len = dfield_get_len(dtuple_field); - const byte* range_mbr_ptr = static_cast<const byte*>( + const dfield_t* dtuple_field = dtuple_get_nth_field(tuple, 0); + ut_ad(dfield_get_len(dtuple_field) >= DATA_MBR_LEN); + const byte* range_mbr_ptr = reinterpret_cast<const byte*>( dfield_get_data(dtuple_field)); - ut_ad(dtuple_f_len >= DATA_MBR_LEN); rtr_read_mbr(range_mbr_ptr, &range_mbr); range_area = (range_mbr.xmax - range_mbr.xmin) * (range_mbr.ymax - range_mbr.ymin); /* Get index root page. */ - page_size_t page_size(dict_table_page_size(index->table)); - page_id_t page_id(dict_index_get_space(index), - dict_index_get_page(index)); mtr_t mtr; - buf_block_t* block; - page_t* page; - ulint n_recs; - mtr_start(&mtr); - mtr.set_named_space(dict_index_get_space(index)); - mtr_s_lock(dict_index_get_lock(index), &mtr); + mtr.start(); + index->set_modified(mtr); + mtr_s_lock_index(index, &mtr); - block = btr_block_get(page_id, page_size, RW_S_LATCH, index, &mtr); - page = buf_block_get_frame(block); - n_recs = page_header_get_field(page, PAGE_N_RECS); + buf_block_t* block = btr_block_get( + page_id_t(index->table->space_id, index->page), + page_size_t(index->table->space->flags), + RW_S_LATCH, index, &mtr); + const page_t* page = buf_block_get_frame(block); + const unsigned n_recs = page_header_get_field(page, PAGE_N_RECS); if (n_recs == 0) { - mtr_commit(&mtr); + mtr.commit(); return(HA_POS_ERROR); } - rec_t* rec; - byte* field; - ulint len; - rec_offs* offsets = NULL; - mem_heap_t* heap; - - heap = mem_heap_create(512); - rec = page_rec_get_next(page_get_infimum_rec(page)); - offsets = rec_get_offsets(rec, index, offsets, page_rec_is_leaf(rec), - ULINT_UNDEFINED, &heap); - /* Scan records in root page and calculate area. 
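rtr_estimate_n_rows_in_range() walks only the root page and turns overlap ratios into a row estimate; the reworked loop below reads each record's MBR directly instead of going through rec_get_offsets(). A standalone sketch of the PAGE_CUR_INTERSECT arithmetic with made-up rectangles and a made-up row count, not real data:

#include <algorithm>
#include <iostream>
#include <vector>

// Rectangles as {xmin, xmax, ymin, ymax}.
struct Mbr { double xmin, xmax, ymin, ymax; };

static double area(const Mbr& m)
{
    return (m.xmax - m.xmin) * (m.ymax - m.ymin);
}

static double overlap_area(const Mbr& a, const Mbr& b)
{
    double dx = std::min(a.xmax, b.xmax) - std::max(a.xmin, b.xmin);
    double dy = std::min(a.ymax, b.ymax) - std::max(a.ymin, b.ymin);
    return (dx > 0 && dy > 0) ? dx * dy : 0;
}

int main()
{
    // Hypothetical root-page entries and a search range.
    const std::vector<Mbr> root_entries = {{0, 4, 0, 4}, {3, 9, 2, 8}, {10, 12, 10, 12}};
    const Mbr range = {2, 6, 1, 5};
    const double table_rows = 9000;  // assumed dict_table_get_n_rows() analogue

    // Each root entry contributes the fraction of its own area that
    // overlaps the search range.
    double acc = 0;
    for (const Mbr& m : root_entries) {
        acc += overlap_area(range, m) / area(m);
    }

    // Scale by rows per root entry, mirroring the patched function's final
    // "area /= n_recs; return ha_rows(dict_table_get_n_rows(index->table) * area);".
    double estimate = table_rows * (acc / double(root_entries.size()));
    std::cout << "estimated rows in range: " << estimate << '\n';
}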
*/ double area = 0; - while (!page_rec_is_supremum(rec)) { + for (const rec_t* rec = page_rec_get_next( + page_get_infimum_rec(block->frame)); + !page_rec_is_supremum(rec); + rec = page_rec_get_next_const(rec)) { rtr_mbr_t mbr; double rec_area; - field = rec_get_nth_field(rec, offsets, 0, &len); - ut_ad(len == DATA_MBR_LEN); - - rtr_read_mbr(field, &mbr); + rtr_read_mbr(rec, &mbr); rec_area = (mbr.xmax - mbr.xmin) * (mbr.ymax - mbr.ymin); @@ -1920,8 +1891,8 @@ rtr_estimate_n_rows_in_range( case PAGE_CUR_WITHIN: case PAGE_CUR_MBR_EQUAL: if (rtree_key_cmp( - PAGE_CUR_WITHIN, range_mbr_ptr, - DATA_MBR_LEN, field, DATA_MBR_LEN) + PAGE_CUR_WITHIN, range_mbr_ptr, + DATA_MBR_LEN, rec, DATA_MBR_LEN) == 0) { area += 1; } @@ -1935,22 +1906,23 @@ rtr_estimate_n_rows_in_range( switch (mode) { case PAGE_CUR_CONTAIN: case PAGE_CUR_INTERSECT: - area += rtree_area_overlapping(range_mbr_ptr, - field, DATA_MBR_LEN) / rec_area; + area += rtree_area_overlapping( + range_mbr_ptr, rec, DATA_MBR_LEN) + / rec_area; break; case PAGE_CUR_DISJOINT: area += 1; - area -= rtree_area_overlapping(range_mbr_ptr, - field, DATA_MBR_LEN) / rec_area; + area -= rtree_area_overlapping( + range_mbr_ptr, rec, DATA_MBR_LEN) + / rec_area; break; case PAGE_CUR_WITHIN: case PAGE_CUR_MBR_EQUAL: - if (rtree_key_cmp( - PAGE_CUR_WITHIN, range_mbr_ptr, - DATA_MBR_LEN, field, DATA_MBR_LEN) - == 0) { + if (!rtree_key_cmp( + PAGE_CUR_WITHIN, range_mbr_ptr, + DATA_MBR_LEN, rec, DATA_MBR_LEN)) { area += range_area / rec_area; } @@ -1959,17 +1931,14 @@ rtr_estimate_n_rows_in_range( ut_error; } } - - rec = page_rec_get_next(rec); } - mtr_commit(&mtr); - mem_heap_free(heap); + mtr.commit(); if (!std::isfinite(area)) { return(HA_POS_ERROR); } - return(static_cast<int64_t>(dict_table_get_n_rows(index->table) - * area / n_recs)); + area /= n_recs; + return ha_rows(dict_table_get_n_rows(index->table) * area); } diff --git a/storage/innobase/gis/gis0sea.cc b/storage/innobase/gis/gis0sea.cc index 5ea3f328d5c..e5ba43faa0b 100644 --- a/storage/innobase/gis/gis0sea.cc +++ b/storage/innobase/gis/gis0sea.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 2016, 2018, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2020, MariaDB Corporation. +Copyright (c) 2017, 2021, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -96,7 +96,6 @@ rtr_pcur_getnext_from_path( { dict_index_t* index = btr_cur->index; bool found = false; - ulint space = dict_index_get_space(index); page_cur_t* page_cursor; ulint level = 0; node_visit_t next_rec; @@ -138,7 +137,7 @@ rtr_pcur_getnext_from_path( if (!index_locked) { ut_ad(latch_mode & BTR_SEARCH_LEAF || latch_mode & BTR_MODIFY_LEAF); - mtr_s_lock(dict_index_get_lock(index), mtr); + mtr_s_lock_index(index, mtr); } else { ut_ad(mtr_memo_contains_flagged(mtr, &index->lock, MTR_MEMO_SX_LOCK @@ -146,7 +145,7 @@ rtr_pcur_getnext_from_path( | MTR_MEMO_X_LOCK)); } - const page_size_t& page_size = dict_table_page_size(index->table); + const page_size_t page_size(index->table->space->flags); /* Pop each node/page to be searched from "path" structure and do a search on it. 
Please note, any pages that are in @@ -266,11 +265,11 @@ rtr_pcur_getnext_from_path( btr_cur->page_cur.block))); #endif /* UNIV_RTR_DEBUG */ - page_id_t page_id(space, next_rec.page_no); dberr_t err = DB_SUCCESS; block = buf_page_get_gen( - page_id, page_size, + page_id_t(index->table->space_id, + next_rec.page_no), page_size, rw_latch, NULL, BUF_GET, __FILE__, __LINE__, mtr, &err); if (block == NULL) { @@ -299,11 +298,12 @@ rtr_pcur_getnext_from_path( && mode != PAGE_CUR_RTREE_LOCATE) { ut_ad(rtr_info->thr); lock_place_prdt_page_lock( - space, next_page_no, index, + index->table->space_id, + next_page_no, index, rtr_info->thr); } new_split = true; -#if UNIV_GIS_DEBUG +#if defined(UNIV_GIS_DEBUG) fprintf(stderr, "GIS_DIAG: Splitted page found: %d, %ld\n", static_cast<int>(need_parent), next_page_no); @@ -405,8 +405,7 @@ rtr_pcur_getnext_from_path( } lock_prdt_lock(block, &prdt, index, LOCK_S, - LOCK_PREDICATE, btr_cur->rtr_info->thr, - mtr); + LOCK_PREDICATE, btr_cur->rtr_info->thr); if (rw_latch == RW_NO_LATCH) { rw_lock_s_unlock(&(block->lock)); @@ -420,11 +419,11 @@ rtr_pcur_getnext_from_path( if (my_latch_mode == BTR_MODIFY_TREE && level == 0) { ut_ad(rw_latch == RW_NO_LATCH); - page_id_t my_page_id( - space, block->page.id.page_no()); btr_cur_latch_leaves( - block, my_page_id, + block, + page_id_t(index->table->space_id, + block->page.id.page_no()), page_size, BTR_MODIFY_TREE, btr_cur, mtr); } @@ -531,8 +530,7 @@ rtr_compare_cursor_rec( rec = btr_cur_get_rec(cursor); - offsets = rec_get_offsets( - rec, index, NULL, false, ULINT_UNDEFINED, heap); + offsets = rec_get_offsets(rec, index, NULL, 0, ULINT_UNDEFINED, heap); return(btr_node_ptr_get_child_page_no(rec, offsets) == page_no); } @@ -590,7 +588,7 @@ rtr_pcur_open_low( } btr_cur_search_to_nth_level(index, level, tuple, mode, latch_mode, - btr_cursor, 0, file, line, mtr, 0); + btr_cursor, 0, file, line, mtr); cursor->pos_state = BTR_PCUR_IS_POSITIONED; cursor->trx_if_known = NULL; @@ -759,7 +757,7 @@ static void rtr_get_father_node( btr_cur_search_to_nth_level( index, level, tuple, PAGE_CUR_RTREE_LOCATE, BTR_CONT_MODIFY_TREE, btr_cur, 0, - __FILE__, __LINE__, mtr, 0); + __FILE__, __LINE__, mtr); } else { /* btr_validate */ @@ -769,7 +767,7 @@ static void rtr_get_father_node( btr_cur_search_to_nth_level( index, level, tuple, PAGE_CUR_RTREE_LOCATE, BTR_CONT_MODIFY_TREE, btr_cur, 0, - __FILE__, __LINE__, mtr, 0); + __FILE__, __LINE__, mtr); rec = btr_cur_get_rec(btr_cur); n_fields = dtuple_get_n_fields_cmp(tuple); @@ -832,17 +830,18 @@ rtr_page_get_father_node_ptr( ut_ad(dict_index_get_page(index) != page_no); - level = btr_page_get_level(btr_cur_get_page(cursor), mtr); + level = btr_page_get_level(btr_cur_get_page(cursor)); user_rec = btr_cur_get_rec(cursor); ut_a(page_rec_is_user_rec(user_rec)); - offsets = rec_get_offsets(user_rec, index, offsets, !level, + offsets = rec_get_offsets(user_rec, index, offsets, + level ? 
0 : index->n_fields, ULINT_UNDEFINED, &heap); rtr_get_mbr_from_rec(user_rec, offsets, &mbr); tuple = rtr_index_build_node_ptr( - index, &mbr, user_rec, page_no, heap, level); + index, &mbr, user_rec, page_no, heap); if (sea_cur && !sea_cur->rtr_info) { sea_cur = NULL; @@ -854,7 +853,7 @@ rtr_page_get_father_node_ptr( node_ptr = btr_cur_get_rec(cursor); ut_ad(!page_rec_is_comp(node_ptr) || rec_get_status(node_ptr) == REC_STATUS_NODE_PTR); - offsets = rec_get_offsets(node_ptr, index, offsets, false, + offsets = rec_get_offsets(node_ptr, index, offsets, 0, ULINT_UNDEFINED, &heap); ulint child_page = btr_node_ptr_get_child_page_no(node_ptr, offsets); @@ -872,13 +871,14 @@ rtr_page_get_father_node_ptr( print_rec = page_rec_get_next( page_get_infimum_rec(page_align(user_rec))); offsets = rec_get_offsets(print_rec, index, offsets, - page_rec_is_leaf(user_rec), + page_rec_is_leaf(user_rec) + ? index->n_fields : 0, ULINT_UNDEFINED, &heap); error << "; child "; rec_print(error.m_oss, print_rec, rec_get_info_bits(print_rec, rec_offs_comp(offsets)), offsets); - offsets = rec_get_offsets(node_ptr, index, offsets, false, + offsets = rec_get_offsets(node_ptr, index, offsets, 0, ULINT_UNDEFINED, &heap); error << "; parent "; rec_print(error.m_oss, print_rec, @@ -1252,8 +1252,8 @@ rtr_check_discard_page( mutex_exit(&index->rtr_track->rtr_active_mutex); lock_mutex_enter(); - lock_prdt_page_free_from_discard(block, lock_sys->prdt_hash); - lock_prdt_page_free_from_discard(block, lock_sys->prdt_page_hash); + lock_prdt_page_free_from_discard(block, lock_sys.prdt_hash); + lock_prdt_page_free_from_discard(block, lock_sys.prdt_page_hash); lock_mutex_exit(); } @@ -1324,10 +1324,12 @@ rtr_cur_restore_position( heap = mem_heap_create(256); offsets1 = rec_get_offsets( - r_cursor->old_rec, index, NULL, !level, + r_cursor->old_rec, index, NULL, + level ? 0 : r_cursor->old_n_fields, r_cursor->old_n_fields, &heap); offsets2 = rec_get_offsets( - rec, index, NULL, !level, + rec, index, NULL, + level ? 0 : r_cursor->old_n_fields, r_cursor->old_n_fields, &heap); comp = rec_offs_comp(offsets1); @@ -1357,9 +1359,8 @@ rtr_cur_restore_position( const page_t* page; page_cur_t* page_cursor; node_visit_t* node = rtr_get_parent_node(btr_cur, level, false); - ulint space = dict_index_get_space(index); node_seq_t path_ssn = node->seq_no; - page_size_t page_size = dict_table_page_size(index->table); + const page_size_t page_size(index->table->space->flags); ulint page_no = node->page_no; @@ -1372,11 +1373,11 @@ rtr_cur_restore_position( ut_ad(r_cursor == node->cursor); search_again: - page_id_t page_id(space, page_no); dberr_t err = DB_SUCCESS; block = buf_page_get_gen( - page_id, page_size, RW_X_LATCH, NULL, + page_id_t(index->table->space_id, page_no), + page_size, RW_X_LATCH, NULL, BUF_GET, __FILE__, __LINE__, mtr, &err); ut_ad(block); @@ -1396,12 +1397,12 @@ search_again: rec = btr_pcur_get_rec(r_cursor); - offsets1 = rec_get_offsets( - r_cursor->old_rec, index, NULL, !level, - r_cursor->old_n_fields, &heap); - offsets2 = rec_get_offsets( - rec, index, NULL, !level, - r_cursor->old_n_fields, &heap); + offsets1 = rec_get_offsets(r_cursor->old_rec, index, NULL, + level ? 0 : r_cursor->old_n_fields, + r_cursor->old_n_fields, &heap); + offsets2 = rec_get_offsets(rec, index, NULL, + level ? 
0 : r_cursor->old_n_fields, + r_cursor->old_n_fields, &heap); comp = rec_offs_comp(offsets1); @@ -1466,7 +1467,7 @@ rtr_leaf_push_match_rec( data_len = rec_offs_data_size(offsets) + rec_offs_extra_size(offsets); match_rec->used += data_len; - ut_ad(match_rec->used < UNIV_PAGE_SIZE); + ut_ad(match_rec->used < srv_page_size); } /**************************************************************//** @@ -1690,8 +1691,8 @@ rtr_cur_search_with_match( page = buf_block_get_frame(block); - const ulint level = btr_page_get_level(page, mtr); - const bool is_leaf = !level; + const ulint level = btr_page_get_level(page); + const ulint n_core = level ? 0 : index->n_fields; if (mode == PAGE_CUR_RTREE_LOCATE) { ut_ad(level != 0); @@ -1713,7 +1714,7 @@ rtr_cur_search_with_match( ulint new_rec_size = rec_get_converted_size(index, tuple, 0); - offsets = rec_get_offsets(rec, index, offsets, is_leaf, + offsets = rec_get_offsets(rec, index, offsets, n_core, dtuple_get_n_fields_cmp(tuple), &heap); @@ -1734,10 +1735,10 @@ rtr_cur_search_with_match( } while (!page_rec_is_supremum(rec)) { - offsets = rec_get_offsets(rec, index, offsets, is_leaf, + offsets = rec_get_offsets(rec, index, offsets, n_core, dtuple_get_n_fields_cmp(tuple), &heap); - if (!is_leaf) { + if (!n_core) { switch (mode) { case PAGE_CUR_CONTAIN: case PAGE_CUR_INTERSECT: @@ -1818,7 +1819,7 @@ rtr_cur_search_with_match( to rtr_info->path for non-leaf nodes, or rtr_info->matches for leaf nodes */ if (rtr_info && mode != PAGE_CUR_RTREE_INSERT) { - if (!is_leaf) { + if (!n_core) { ulint page_no; node_seq_t new_seq; bool is_loc; @@ -1829,7 +1830,7 @@ rtr_cur_search_with_match( == PAGE_CUR_RTREE_GET_FATHER); offsets = rec_get_offsets( - rec, index, offsets, false, + rec, index, offsets, 0, ULINT_UNDEFINED, &heap); page_no = btr_node_ptr_get_child_page_no( @@ -1878,7 +1879,8 @@ rtr_cur_search_with_match( /* Collect matched records on page */ offsets = rec_get_offsets( - rec, index, offsets, true, + rec, index, offsets, + index->n_fields, ULINT_UNDEFINED, &heap); rtr_leaf_push_match_rec( rec, rtr_info, offsets, @@ -1901,7 +1903,7 @@ rtr_cur_search_with_match( /* All records on page are searched */ if (page_rec_is_supremum(rec)) { - if (!is_leaf) { + if (!n_core) { if (!found) { /* No match case, if it is for insertion, then we select the record that result in @@ -1911,7 +1913,7 @@ rtr_cur_search_with_match( ut_ad(least_inc < DBL_MAX); offsets = rec_get_offsets( best_rec, index, offsets, - false, ULINT_UNDEFINED, &heap); + 0, ULINT_UNDEFINED, &heap); child_no = btr_node_ptr_get_child_page_no( best_rec, offsets); @@ -1963,11 +1965,11 @@ rtr_cur_search_with_match( /* Verify the record to be positioned is the same as the last record in matched_rec vector */ offsets2 = rec_get_offsets(test_rec.r_rec, index, - offsets2, true, + offsets2, index->n_fields, ULINT_UNDEFINED, &heap); offsets = rec_get_offsets(last_match_rec, index, - offsets, true, + offsets, index->n_fields, ULINT_UNDEFINED, &heap); ut_ad(cmp_rec_rec(test_rec.r_rec, last_match_rec, @@ -1984,9 +1986,8 @@ rtr_cur_search_with_match( ulint child_no; ut_ad(!last_match_rec && rec); - offsets = rec_get_offsets( - rec, index, offsets, false, - ULINT_UNDEFINED, &heap); + offsets = rec_get_offsets(rec, index, offsets, 0, + ULINT_UNDEFINED, &heap); child_no = btr_node_ptr_get_child_page_no(rec, offsets); @@ -1994,7 +1995,7 @@ rtr_cur_search_with_match( index, rtr_info->parent_path, level, child_no, block, rec, 0); - } else if (rtr_info && found && !is_leaf) { + } else if (rtr_info && found && !n_core) { rec = 
last_match_rec; } @@ -2004,11 +2005,11 @@ rtr_cur_search_with_match( #ifdef UNIV_DEBUG /* Verify that we are positioned at the same child page as pushed in the path stack */ - if (!is_leaf && (!page_rec_is_supremum(rec) || found) + if (!n_core && (!page_rec_is_supremum(rec) || found) && mode != PAGE_CUR_RTREE_INSERT) { ulint page_no; - offsets = rec_get_offsets(rec, index, offsets, false, + offsets = rec_get_offsets(rec, index, offsets, 0, ULINT_UNDEFINED, &heap); page_no = btr_node_ptr_get_child_page_no(rec, offsets); diff --git a/storage/innobase/ha/ha0ha.cc b/storage/innobase/ha/ha0ha.cc index cf9a454ad8d..f7e3c53495b 100644 --- a/storage/innobase/ha/ha0ha.cc +++ b/storage/innobase/ha/ha0ha.cc @@ -60,7 +60,8 @@ ib_create( if (n_sync_obj == 0) { table->heap = mem_heap_create_typed( - ut_min(static_cast<ulint>(4096), + std::min<ulong>( + 4096, MEM_MAX_ALLOC_IN_BUF / 2 - MEM_BLOCK_HEADER_SIZE - MEM_SPACE_NEEDED(0)), type); @@ -84,7 +85,8 @@ ib_create( for (ulint i = 0; i < n_sync_obj; i++) { table->heaps[i] = mem_heap_create_typed( - ut_min(static_cast<ulint>(4096), + std::min<ulong>( + 4096, MEM_MAX_ALLOC_IN_BUF / 2 - MEM_BLOCK_HEADER_SIZE - MEM_SPACE_NEEDED(0)), type); @@ -126,7 +128,8 @@ ib_recreate( for (ulint i = 0; i < new_table->n_sync_obj; i++) { new_table->heaps[i] = mem_heap_create_typed( - ut_min(static_cast<ulint>(4096), + std::min<ulong>( + 4096, MEM_MAX_ALLOC_IN_BUF / 2 - MEM_BLOCK_HEADER_SIZE - MEM_SPACE_NEEDED(0)), MEM_HEAP_FOR_PAGE_HASH); @@ -192,7 +195,7 @@ ha_clear( #ifdef BTR_CUR_HASH_ADAPT # if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG /** Maximum number of records in a page */ -static const lint MAX_N_POINTERS +static const ulint MAX_N_POINTERS = UNIV_PAGE_SIZE_MAX / REC_N_NEW_EXTRA_BYTES; # endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ @@ -242,8 +245,8 @@ ha_insert_for_fold_func( buf_block_t* prev_block = prev_node->block; ut_a(prev_block->frame == page_align(prev_node->data)); - ut_a(my_atomic_addlint( - &prev_block->n_pointers, -1) + ut_a(my_atomic_addlint(&prev_block->n_pointers, + ulint(-1)) < MAX_N_POINTERS); ut_a(my_atomic_addlint(&block->n_pointers, 1) < MAX_N_POINTERS); @@ -339,7 +342,7 @@ ha_delete_hash_node( #if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG if (table->adaptive) { ut_a(del_node->block->frame = page_align(del_node->data)); - ut_a(my_atomic_addlint(&del_node->block->n_pointers, -1) + ut_a(my_atomic_addlint(&del_node->block->n_pointers, ulint(-1)) < MAX_N_POINTERS); } #endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ @@ -382,7 +385,8 @@ ha_search_and_update_if_found_func( if (node) { #if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG if (table->adaptive) { - ut_a(my_atomic_addlint(&node->block->n_pointers, -1) + ut_a(my_atomic_addlint(&node->block->n_pointers, + ulint(-1)) < MAX_N_POINTERS); ut_a(my_atomic_addlint(&new_block->n_pointers, 1) < MAX_N_POINTERS); diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index b837aec396b..862ed7e41a4 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -109,10 +109,9 @@ this program; if not, write to the Free Software Foundation, Inc., #include "trx0purge.h" #endif /* UNIV_DEBUG */ #include "trx0roll.h" -#include "trx0sys.h" +#include "trx0rseg.h" #include "trx0trx.h" #include "fil0pagecompress.h" -#include "trx0xa.h" #include "ut0mem.h" #include "row0ext.h" @@ -131,12 +130,6 @@ TABLE *open_purge_table(THD *thd, const char *db, size_t dblen, const char *tb, size_t tblen); void close_thread_tables(THD* thd); -/** Check if user has used 
xtradb extended system variable that -is not currently supported by innodb or marked as deprecated. */ -static -void -innodb_check_deprecated(void); - #ifdef MYSQL_DYNAMIC_PLUGIN #define tc_size 400 #define tdc_size 400 @@ -154,19 +147,15 @@ innodb_check_deprecated(void); #ifdef WITH_WSREP #include "dict0priv.h" -#include "ut0byte.h" #include <mysql/service_md5.h> #include "wsrep_sst.h" -extern MYSQL_PLUGIN_IMPORT MYSQL_BIN_LOG mysql_bin_log; - static inline wsrep_ws_handle_t* wsrep_ws_handle(THD* thd, const trx_t* trx) { return wsrep_ws_handle_for_trx(wsrep_thd_ws_handle(thd), (wsrep_trx_id_t)trx->id); } -extern TC_LOG* tc_log; extern void wsrep_cleanup_transaction(THD *thd); static void wsrep_abort_transaction(handlerton*, THD *, THD *, my_bool); static void wsrep_fake_trx_id(handlerton* hton, THD *thd); @@ -190,60 +179,44 @@ static const long AUTOINC_OLD_STYLE_LOCKING = 0; static const long AUTOINC_NEW_STYLE_LOCKING = 1; static const long AUTOINC_NO_LOCKING = 2; -static long innobase_log_buffer_size; -static long innobase_open_files=0; +static ulong innobase_open_files; static long innobase_autoinc_lock_mode; -static ulong innobase_commit_concurrency = 0; -static ulong innobase_read_io_threads; -static ulong innobase_write_io_threads; +static ulong innobase_commit_concurrency; -static long long innobase_buffer_pool_size; +static ulonglong innobase_buffer_pool_size; /** Percentage of the buffer pool to reserve for 'old' blocks. Connected to buf_LRU_old_ratio. */ static uint innobase_old_blocks_pct; +static char* innobase_data_file_path; +static char* innobase_temp_data_file_path; + /* The default values for the following char* start-up parameters -are determined in innobase_init below: */ +are determined in innodb_init_params(). */ static char* innobase_data_home_dir; -static char* innobase_data_file_path; -static char* innobase_temp_data_file_path; -static char* innobase_file_format_name; -static char* innobase_change_buffering; static char* innobase_enable_monitor_counter; static char* innobase_disable_monitor_counter; static char* innobase_reset_monitor_counter; static char* innobase_reset_all_monitor_counter; -/* The highest file format being used in the database. The value can be -set by user, however, it will be adjusted to the newer file format if -a table of such format is created/opened. */ -char* innobase_file_format_max; +static ulong innodb_flush_method; -/** Default value of innodb_file_format */ -static const char* innodb_file_format_default = "Barracuda"; -/** Default value of innodb_file_format_max */ -static const char* innodb_file_format_max_default = "Antelope"; - -static char* innobase_file_flush_method; +/** Deprecated; no effect other than issuing a deprecation warning. */ +static char* innodb_file_format; +/** Deprecated; no effect other than issuing a deprecation warning. 
*/ +static char* innodb_large_prefix; /* This variable can be set in the server configure file, specifying stopword table to be used */ static char* innobase_server_stopword_table; -/* Below we have boolean-valued start-up parameters, and their default -values */ - -static my_bool innobase_file_format_check; -static my_bool innobase_use_fallocate; -static my_bool innobase_use_doublewrite; static my_bool innobase_use_checksums; static my_bool innobase_locks_unsafe_for_binlog; static my_bool innobase_rollback_on_timeout; static my_bool innobase_create_status_file; my_bool innobase_stats_on_metadata; -static my_bool innobase_large_prefix; static my_bool innodb_optimize_fulltext_only; static char* innodb_version_str = (char*) INNODB_VERSION_STR; @@ -257,9 +230,13 @@ extern my_bool srv_background_scrub_data_compressed; extern uint srv_background_scrub_data_interval; extern uint srv_background_scrub_data_check_interval; #ifdef UNIV_DEBUG +my_bool innodb_evict_tables_on_commit_debug; extern my_bool srv_scrub_force_testing; #endif +/** File format constraint for ALTER TABLE */ +ulong innodb_instant_alter_column_allowed; + /** Note we cannot use rec_format_enum because we do not allow COMPRESSED row format for innodb_default_row_format option. */ enum default_row_format_enum { @@ -276,10 +253,10 @@ static void innodb_max_purge_lag_wait_update(THD *thd, st_mysql_sys_var *, void *, const void *limit) { const uint l= *static_cast<const uint*>(limit); - if (trx_sys->rseg_history_len <= l) + if (trx_sys.history_size() <= l) return; mysql_mutex_unlock(&LOCK_global_system_variables); - while (trx_sys->rseg_history_len > l) + while (trx_sys.history_size() > l) { if (thd_kill_level(thd)) break; @@ -351,7 +328,7 @@ thd_destructor_proxy(void *) MY_MEMORY_ORDER_RELAXED); while (srv_fast_shutdown == 0 && - (trx_sys_any_active_transactions() || + (trx_sys.any_active_transactions() || (uint)thread_count > srv_n_purge_threads + 1)) { thd_proc_info(thd, "InnoDB slow shutdown wait"); os_thread_sleep(1000); @@ -465,6 +442,30 @@ static TYPELIB innodb_lock_schedule_algorithm_typelib = { NULL }; +/** Names of allowed values of innodb_flush_method */ +const char* innodb_flush_method_names[] = { + "fsync", + "O_DSYNC", + "littlesync", + "nosync", + "O_DIRECT", + "O_DIRECT_NO_FSYNC", +#ifdef _WIN32 + "unbuffered", + "async_unbuffered" /* alias for "unbuffered" */, + "normal" /* alias for "fsync" */, +#endif + NullS +}; + +/** Enumeration of innodb_flush_method */ +TYPELIB innodb_flush_method_typelib = { + array_elements(innodb_flush_method_names) - 1, + "innodb_flush_method_typelib", + innodb_flush_method_names, + NULL +}; + /* The following counter is used to convey information to InnoDB about server activity: in case of normal DML ops it is not sensible to call srv_active_wake_master_thread after each @@ -474,13 +475,37 @@ operation, we only do it every INNOBASE_WAKE_INTERVAL'th step. 
*/ static ulong innobase_active_counter = 0; /** Allowed values of innodb_change_buffering */ -static const char* innobase_change_buffering_values[IBUF_USE_COUNT] = { +static const char* innodb_change_buffering_names[] = { "none", /* IBUF_USE_NONE */ "inserts", /* IBUF_USE_INSERT */ "deletes", /* IBUF_USE_DELETE_MARK */ "changes", /* IBUF_USE_INSERT_DELETE_MARK */ "purges", /* IBUF_USE_DELETE */ - "all" /* IBUF_USE_ALL */ + "all", /* IBUF_USE_ALL */ + NullS +}; + +/** Enumeration of innodb_change_buffering */ +static TYPELIB innodb_change_buffering_typelib = { + array_elements(innodb_change_buffering_names) - 1, + "innodb_change_buffering_typelib", + innodb_change_buffering_names, + NULL +}; + +/** Allowed values of innodb_instant_alter_column_allowed */ +const char* innodb_instant_alter_column_allowed_names[] = { + "never", /* compatible with MariaDB 5.5 to 10.2 */ + "add_last",/* allow instant ADD COLUMN */ + NullS +}; + +/** Enumeration of innodb_instant_alter_column_allowed */ +static TYPELIB innodb_instant_alter_column_allowed_typelib = { + array_elements(innodb_instant_alter_column_allowed_names) - 1, + "innodb_instant_alter_column_allowed_typelib", + innodb_instant_alter_column_allowed_names, + NULL }; /** Retrieve the FTS Relevance Ranking result for doc with doc_id @@ -601,7 +626,6 @@ static PSI_mutex_info all_innodb_mutexes[] = { PSI_KEY(dict_foreign_err_mutex), PSI_KEY(dict_sys_mutex), PSI_KEY(recalc_pool_mutex), - PSI_KEY(file_format_max_mutex), PSI_KEY(fil_system_mutex), PSI_KEY(flush_list_mutex), PSI_KEY(fts_delete_mutex), @@ -630,7 +654,6 @@ static PSI_mutex_info all_innodb_mutexes[] = { PSI_KEY(srv_misc_tmpfile_mutex), PSI_KEY(srv_monitor_file_mutex), PSI_KEY(buf_dblwr_mutex), - PSI_KEY(trx_undo_mutex), PSI_KEY(trx_pool_mutex), PSI_KEY(trx_pool_manager_mutex), PSI_KEY(srv_sys_mutex), @@ -791,7 +814,7 @@ static int innodb_tmpdir_validate( THD* thd, - struct st_mysql_sys_var* var, + struct st_mysql_sys_var*, void* save, struct st_mysql_value* value) { @@ -940,42 +963,6 @@ innodb_encrypt_tables_validate( static const char innobase_hton_name[]= "InnoDB"; -static const char* deprecated_innodb_support_xa - = "Using innodb_support_xa is deprecated and the" - " parameter may be removed in future releases."; - -static const char* deprecated_innodb_support_xa_off - = "Using innodb_support_xa is deprecated and the" - " parameter may be removed in future releases." - " Only innodb_support_xa=ON is allowed."; - -/** Update the session variable innodb_support_xa. -@param[in] thd current session -@param[in] var the system variable innodb_support_xa -@param[in,out] var_ptr the contents of the variable -@param[in] save the to-be-updated value */ -static -void -innodb_support_xa_update( - THD* thd, - struct st_mysql_sys_var* var, - void* var_ptr, - const void* save) -{ - my_bool innodb_support_xa = *static_cast<const my_bool*>(save); - - push_warning(thd, Sql_condition::WARN_LEVEL_WARN, - HA_ERR_WRONG_COMMAND, - innodb_support_xa - ? 
deprecated_innodb_support_xa - : deprecated_innodb_support_xa_off); -} - -static MYSQL_THDVAR_BOOL(support_xa, PLUGIN_VAR_OPCMDARG, - "Enable InnoDB support for the XA two-phase commit", - /* check_func */ NULL, innodb_support_xa_update, - /* default */ TRUE); - static MYSQL_THDVAR_BOOL(table_locks, PLUGIN_VAR_OPCMDARG, "Enable InnoDB locking in LOCK TABLES", /* check_func */ NULL, /* update_func */ NULL, @@ -992,7 +979,7 @@ static MYSQL_THDVAR_BOOL(ft_enable_stopword, PLUGIN_VAR_OPCMDARG, static MYSQL_THDVAR_ULONG(lock_wait_timeout, PLUGIN_VAR_RQCMDARG, "Timeout in seconds an InnoDB transaction may wait for a lock before being rolled back. Values above 100000000 disable the timeout.", - NULL, NULL, 50, 1, 1024 * 1024 * 1024, 0); + NULL, NULL, 50, 0, 1024 * 1024 * 1024, 0); static MYSQL_THDVAR_STR(ft_user_stopword_table, PLUGIN_VAR_OPCMDARG|PLUGIN_VAR_MEMALLOC, @@ -1011,6 +998,8 @@ static SHOW_VAR innodb_status_variables[]= { (char*) &export_vars.innodb_buffer_pool_load_status, SHOW_CHAR}, {"buffer_pool_resize_status", (char*) &export_vars.innodb_buffer_pool_resize_status, SHOW_CHAR}, + {"buffer_pool_load_incomplete", + &export_vars.innodb_buffer_pool_load_incomplete, SHOW_BOOL}, {"buffer_pool_pages_data", (char*) &export_vars.innodb_buffer_pool_pages_data, SHOW_LONG}, {"buffer_pool_bytes_data", @@ -1164,6 +1153,9 @@ static SHOW_VAR innodb_status_variables[]= { {"defragment_count", (char*) &export_vars.innodb_defragment_count, SHOW_LONG}, + {"instant_alter_column", + (char*) &export_vars.innodb_instant_alter_column, SHOW_LONG}, + /* Online alter table status variables */ {"onlineddl_rowlog_rows", (char*) &export_vars.innodb_onlineddl_rowlog_rows, SHOW_LONG}, @@ -1369,25 +1361,6 @@ static void innodb_params_adjust(); -/************************************************************//** -Validate the file format name and return its corresponding id. -@return valid file format id */ -static -uint -innobase_file_format_name_lookup( -/*=============================*/ - const char* format_name); /*!< in: pointer to file format - name */ -/************************************************************//** -Validate the file format check config parameters, as a side effect it -sets the srv_max_file_format_at_startup variable. -@return the format_id if valid config value, otherwise, return -1 */ -static -int -innobase_file_format_validate_and_set( -/*==================================*/ - const char* format_max); /*!< in: parameter value */ - /*******************************************************************//** This function is used to prepare an X/Open XA distributed transaction. @return 0 or error number */ @@ -1568,9 +1541,7 @@ static int innobase_commit_concurrency_validate( /*=================================*/ - THD* thd, /*!< in: thread handle */ - struct st_mysql_sys_var* var, /*!< in: pointer to system - variable */ + THD*, st_mysql_sys_var*, void* save, /*!< out: immediate result for update function */ struct st_mysql_value* value) /*!< in: incoming string */ @@ -1856,19 +1827,6 @@ thd_lock_wait_timeout( return(THDVAR(thd, lock_wait_timeout)); } -/******************************************************************//** -Set the time waited for the lock for the current query. */ -void -thd_set_lock_wait_time( -/*===================*/ - THD* thd, /*!< in/out: thread handle */ - ulint value) /*!< in: time waited for the lock */ -{ - if (thd) { - thd_storage_lock_wait(thd, value); - } -} - /** Get the value of innodb_tmpdir. @param[in] thd thread handle, or NULL to query the global innodb_tmpdir. 
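
(A minimal standalone sketch, not part of the patch: it illustrates the NullS-terminated name-array convention behind the TYPELIB initializers added a few hunks above — innodb_flush_method_typelib, innodb_change_buffering_typelib, innodb_instant_alter_column_allowed_typelib — where the element count is the array size minus the terminating entry. Plain C++ with illustrative names; no MariaDB headers are assumed.)

#include <cassert>
#include <cstddef>

// Name list in the same style as innodb_flush_method_names: the last
// entry is a null terminator (NullS in MariaDB) and is not a real value.
static const char* flush_method_names[] = {
    "fsync", "O_DSYNC", "littlesync", "nosync",
    "O_DIRECT", "O_DIRECT_NO_FSYNC",
    nullptr
};

// Stand-in for the array_elements() macro used in the TYPELIB initializers.
template <typename T, std::size_t N>
constexpr std::size_t array_elements(T (&)[N]) { return N; }

int main()
{
    // "array_elements(names) - 1": the terminator is excluded, so the
    // count handed to TYPELIB matches the six real flush-method values.
    assert(array_elements(flush_method_names) - 1 == 6);
    return 0;
}
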
@@ -1931,7 +1889,7 @@ Converts an InnoDB error code to a MySQL error code and also tells to MySQL about a possible transaction rollback inside InnoDB caused by a lock wait timeout or a deadlock. @return MySQL error code */ -int +static int convert_error_code_to_mysql( /*========================*/ dberr_t error, /*!< in: InnoDB error code */ @@ -2059,7 +2017,7 @@ convert_error_code_to_mysql( /* If prefix is true then a 768-byte prefix is stored locally for BLOB fields. Refer to dict_table_get_format(). We limit max record size to 16k for 64k page size. */ - bool prefix = (dict_tf_get_format(flags) == UNIV_FORMAT_A); + bool prefix = !DICT_TF_HAS_ATOMIC_BLOBS(flags); bool comp = !!(flags & DICT_TF_COMPACT); ulint free_space = page_get_free_space_of_empty(comp) / 2; @@ -2151,17 +2109,6 @@ innobase_mysql_print_thd( } /******************************************************************//** -Get the error message format string. -@return the format string or 0 if not found. */ -const char* -innobase_get_err_msg( -/*=================*/ - int error_code) /*!< in: MySQL error code */ -{ - return(my_get_err_msg(error_code)); -} - -/******************************************************************//** Get the variable length bounds of the given character set. */ void innobase_get_cset_width( @@ -2398,24 +2345,8 @@ static int mysql_tmpfile_path(const char *path, const char *prefix) DBUG_ASSERT((strlen(path) + strlen(prefix)) <= FN_REFLEN); char filename[FN_REFLEN]; - File fd = create_temp_file(filename, path, prefix, -#ifdef __WIN__ - O_BINARY | O_TRUNC | O_SEQUENTIAL | - O_SHORT_LIVED | -#endif /* __WIN__ */ - O_CREAT | O_EXCL | O_RDWR | O_TEMPORARY, - MYF(MY_WME)); - if (fd >= 0) { -#ifndef __WIN__ - /* - This can be removed once the following bug is fixed: - Bug #28903 create_temp_file() doesn't honor O_TEMPORARY option - (file not removed) (Unix) - */ - unlink(filename); -#endif /* !__WIN__ */ - } - + File fd = create_temp_file(filename, path, prefix, O_BINARY | O_SEQUENTIAL, + MYF(MY_WME | MY_TEMPORARY)); return fd; } @@ -2423,19 +2354,18 @@ static int mysql_tmpfile_path(const char *path, const char *prefix) path. If the path is NULL, then it will be created in tmpdir. @param[in] path location for creating temporary file @return temporary file descriptor, or < 0 on error */ -int +os_file_t innobase_mysql_tmpfile( const char* path) { #ifdef WITH_INNODB_DISALLOW_WRITES os_event_wait(srv_allow_writes_event); #endif /* WITH_INNODB_DISALLOW_WRITES */ - int fd2 = -1; File fd; DBUG_EXECUTE_IF( "innobase_tmpfile_creation_failure", - return(-1); + return(OS_FILE_CLOSED); ); if (path == NULL) { @@ -2444,54 +2374,59 @@ innobase_mysql_tmpfile( fd = mysql_tmpfile_path(path, "ib"); } - if (fd >= 0) { - /* Copy the file descriptor, so that the additional resources - allocated by create_temp_file() can be freed by invoking - my_close(). + if (fd < 0) + return OS_FILE_CLOSED; - Because the file descriptor returned by this function - will be passed to fdopen(), it will be closed by invoking - fclose(), which in turn will invoke close() instead of - my_close(). */ + /* Copy the file descriptor, so that the additional resources + allocated by create_temp_file() can be freed by invoking + my_close(). + + Because the file descriptor returned by this function + will be passed to fdopen(), it will be closed by invoking + fclose(), which in turn will invoke close() instead of + my_close(). */ #ifdef _WIN32 - /* Note that on Windows, the integer returned by mysql_tmpfile - has no relation to C runtime file descriptor. 
Here, we need - to call my_get_osfhandle to get the HANDLE and then convert it - to C runtime filedescriptor. */ - { - HANDLE hFile = my_get_osfhandle(fd); - HANDLE hDup; - BOOL bOK = DuplicateHandle( - GetCurrentProcess(), - hFile, GetCurrentProcess(), - &hDup, 0, FALSE, DUPLICATE_SAME_ACCESS); - if (bOK) { - fd2 = _open_osfhandle((intptr_t) hDup, 0); - } else { - my_osmaperr(GetLastError()); - fd2 = -1; - } - } + /* Note that on Windows, the integer returned by mysql_tmpfile + has no relation to C runtime file descriptor. Here, we need + to call my_get_osfhandle to get the HANDLE and then convert it + to C runtime filedescriptor. */ + + HANDLE hFile = my_get_osfhandle(fd); + HANDLE hDup; + BOOL bOK = DuplicateHandle( + GetCurrentProcess(), + hFile, GetCurrentProcess(), + &hDup, 0, FALSE, DUPLICATE_SAME_ACCESS); + my_close(fd, MYF(MY_WME)); + + if (!bOK) { + my_osmaperr(GetLastError()); + goto error; + } + return hDup; #else #ifdef F_DUPFD_CLOEXEC - fd2 = fcntl(fd, F_DUPFD_CLOEXEC, 0); + int fd2 = fcntl(fd, F_DUPFD_CLOEXEC, 0); #else - fd2 = dup(fd); -#endif + int fd2 = dup(fd); #endif - if (fd2 < 0) { - char errbuf[MYSYS_STRERROR_SIZE]; - DBUG_PRINT("error",("Got error %d on dup",fd2)); - set_my_errno(errno); - my_error(EE_OUT_OF_FILERESOURCES, - MYF(0), - "ib*", errno, - my_strerror(errbuf, sizeof(errbuf), errno)); - } - my_close(fd, MYF(MY_WME)); + my_close(fd, MYF(MY_WME)); + if (fd2 < 0) { + set_my_errno(errno); + goto error; } - return(fd2); + return fd2; +#endif + +error: + char errbuf[MYSYS_STRERROR_SIZE]; + + my_error(EE_OUT_OF_FILERESOURCES, + MYF(0), + "ib*", errno, + my_strerror(errbuf, sizeof(errbuf), errno)); + return (OS_FILE_CLOSED); } /*********************************************************************//** @@ -2840,7 +2775,7 @@ innobase_trx_allocate( DBUG_ASSERT(thd != NULL); DBUG_ASSERT(EQ_CURRENT_THD(thd)); - trx = trx_allocate_for_mysql(); + trx = trx_create(); trx->mysql_thd = thd; @@ -2871,13 +2806,17 @@ check_trx_exists( } } -/************************************************************************* -Gets current trx. */ -trx_t* -innobase_get_trx() +/** + Gets current trx. + + This function may be called during InnoDB initialisation, when + innodb_hton_ptr->slot is not yet set to meaningful value. +*/ + +trx_t *current_trx() { THD *thd=current_thd; - if (likely(thd != 0)) { + if (likely(thd != 0) && innodb_hton_ptr->slot != HA_SLOT_UNDEF) { return thd_to_trx(thd); } else { return(NULL); @@ -2935,7 +2874,8 @@ innobase_copy_frm_flags_from_create_info( ibool ps_on; ibool ps_off; - if (dict_table_is_temporary(innodb_table)) { + if (innodb_table->is_temporary() + || innodb_table->no_rollback()) { /* Temp tables do not use persistent stats. */ ps_on = FALSE; ps_off = TRUE; @@ -2970,7 +2910,7 @@ innobase_copy_frm_flags_from_table_share( ibool ps_on; ibool ps_off; - if (dict_table_is_temporary(innodb_table)) { + if (innodb_table->is_temporary()) { /* Temp tables do not use persistent stats */ ps_on = FALSE; ps_off = TRUE; @@ -3020,8 +2960,9 @@ ha_innobase::ha_innobase( */ | HA_CAN_EXPORT | HA_CAN_RTREEKEYS + | HA_CAN_TABLES_WITHOUT_ROLLBACK | HA_CONCURRENT_OPTIMIZE - | (srv_force_primary_key ? HA_REQUIRE_PRIMARY_KEY : 0) + | (srv_force_primary_key ? 
HA_WANTS_PRIMARY_KEY : 0) ), m_start_of_scan(), m_mysql_has_locked() @@ -3192,8 +3133,8 @@ static bool innobase_query_caching_table_check_low( return false; } - return !MVCC::is_view_active(trx->read_view) - || trx->read_view->low_limit_id() + return !trx->read_view.is_open() + || trx->read_view.low_limit_id() >= table->query_cache_inv_trx_id; } @@ -3227,12 +3168,12 @@ static bool innobase_query_caching_table_check( if (trx->isolation_level >= TRX_ISO_REPEATABLE_READ && !srv_read_only_mode - && !MVCC::is_view_active(trx->read_view)) { + && !trx->read_view.is_open()) { /* Start the transaction if it is not started yet */ trx_start_if_not_started(trx, false); - trx_sys->mvcc->view_open(trx->read_view, trx); + trx->read_view.open(trx); } } @@ -3254,9 +3195,9 @@ read view to it if there is no read view yet. Why a deadlock of threads is not possible: the query cache calls this function at the start of a SELECT processing. Then the calling thread cannot be holding any InnoDB semaphores. The calling thread is holding the -query cache mutex, and this function will reserve the InnoDB trx_sys->mutex. +query cache mutex, and this function will reserve the InnoDB trx_sys.mutex. Thus, the 'rank' in sync0mutex.h of the MySQL query cache mutex is above -the InnoDB trx_sys->mutex. +the InnoDB trx_sys.mutex. @return TRUE if permitted, FALSE if not; note that the value FALSE does not mean we should invalidate the query cache: invalidation is called explicitly */ @@ -3267,10 +3208,10 @@ innobase_query_caching_of_table_permitted( THD* thd, /*!< in: thd of the user who is trying to store a result to the query cache or retrieve it */ - char* full_name, /*!< in: normalized path to the table */ + const char* full_name, /*!< in: normalized path to the table */ uint full_name_len, /*!< in: length of the normalized path to the table */ - ulonglong *unused) /*!< unused for this engine */ + ulonglong *) { char norm_name[1000]; trx_t* trx = check_trx_exists(thd); @@ -3324,13 +3265,11 @@ innobase_invalidate_query_cache( /*============================*/ trx_t* trx, /*!< in: transaction which modifies the table */ - const char* full_name, /*!< in: concatenation of + const char* full_name) /*!< in: concatenation of database name, path separator, table name, null char NUL; NOTE that in Windows this is always in LOWER CASE! */ - ulint full_name_len) /*!< in: full name length where - also the null chars count */ { /* Note that the sync0mutex.h rank of the query cache mutex is just above the InnoDB trx_sys_t->lock. The caller of this function must @@ -3341,12 +3280,12 @@ innobase_invalidate_query_cache( char db_name[NAME_CHAR_LEN * MY_CS_MBMAXLEN + 1]; const char *key_ptr; size_t tabname_len; - size_t dbname_len; // Extract the database name. 
key_ptr= strchr(full_name, '/'); DBUG_ASSERT(key_ptr != NULL); // Database name should be present - memcpy(db_name, full_name, (dbname_len= (key_ptr - full_name))); + size_t dbname_len= size_t(key_ptr - full_name); + memcpy(db_name, full_name, dbname_len); db_name[dbname_len]= '\0'; /* Construct the key("db-name\0table$name\0") for the query cache using @@ -3362,7 +3301,7 @@ innobase_invalidate_query_cache( /* Argument TRUE below means we are using transactions */ mysql_query_cache_invalidate4(trx->mysql_thd, qcache_key_name, - (dbname_len + tabname_len + 2), + uint(dbname_len + tabname_len + 2), TRUE); #endif } @@ -3415,9 +3354,9 @@ innobase_quote_identifier( if (q == EOF) { quoted_identifier.append(id); } else { - quoted_identifier += (unsigned char)q; + quoted_identifier += char(q); quoted_identifier.append(id); - quoted_identifier += (unsigned char)q; + quoted_identifier += char(q); } return (quoted_identifier); @@ -3485,12 +3424,13 @@ innobase_convert_name( } /* Print the database name and table name separately. */ - s = innobase_convert_identifier(s, bufend - s, id, slash - id, thd); + s = innobase_convert_identifier(s, ulint(bufend - s), + id, ulint(slash - id), thd); if (s < bufend) { *s++ = '.'; - s = innobase_convert_identifier(s, bufend - s, + s = innobase_convert_identifier(s, ulint(bufend - s), slash + 1, idlen - - (slash - id) - 1, + - ulint(slash - id) - 1, thd); } @@ -3519,8 +3459,8 @@ innobase_format_name( /**********************************************************************//** Determines if the currently running transaction has been interrupted. -@return TRUE if interrupted */ -ibool +@return true if interrupted */ +bool trx_is_interrupted( /*===============*/ const trx_t* trx) /*!< in: transaction */ @@ -3545,7 +3485,7 @@ ha_innobase::reset_template(void) } ); - m_prebuilt->keep_other_fields_on_keyread = 0; + m_prebuilt->keep_other_fields_on_keyread = false; m_prebuilt->read_just_key = 0; m_prebuilt->in_fts_query = 0; @@ -3588,7 +3528,7 @@ ha_innobase::init_table_handle_for_HANDLER(void) /* Assign a read view if the transaction does not have it yet */ - trx_assign_read_view(m_prebuilt->trx); + m_prebuilt->trx->read_view.open(m_prebuilt->trx); innobase_register_trx(ht, m_user_thd, m_prebuilt->trx); @@ -3618,12 +3558,16 @@ ha_innobase::init_table_handle_for_HANDLER(void) /*********************************************************************//** Free any resources that were allocated and return failure. @return always return 1 */ -static int innobase_init_abort() +static int innodb_init_abort() { - DBUG_ENTER("innobase_init_abort"); + DBUG_ENTER("innodb_init_abort"); + + if (fil_system.temp_space) { + fil_system.temp_space->close(); + } + srv_sys_space.shutdown(); if (srv_tmp_space.get_sanity_check_status()) { - fil_space_close(srv_tmp_space.name()); srv_tmp_space.delete_files(); } srv_tmp_space.shutdown(); @@ -3634,59 +3578,15 @@ static int innobase_init_abort() DBUG_RETURN(1); } -/** Return partitioning flags. */ -static uint innobase_partition_flags() -{ - return (0); -} - -/** Deprecation message about InnoDB file format related parameters */ -#define DEPRECATED_FORMAT_PARAMETER(x) \ - "Using " x " is deprecated and the parameter" \ - " may be removed in future releases." 
\ - " See https://mariadb.com/kb/en/library/xtradbinnodb-file-format/" - -/** Deprecation message about innodb_file_format */ -static const char* deprecated_file_format - = DEPRECATED_FORMAT_PARAMETER("innodb_file_format"); - -/** Deprecation message about innodb_large_prefix */ -static const char* deprecated_large_prefix - = DEPRECATED_FORMAT_PARAMETER("innodb_large_prefix"); - -/** Deprecation message about innodb_file_format_check */ -static const char* deprecated_file_format_check - = DEPRECATED_FORMAT_PARAMETER("innodb_file_format_check"); - -/** Deprecation message about innodb_file_format_max */ -static const char* deprecated_file_format_max - = DEPRECATED_FORMAT_PARAMETER("innodb_file_format_max"); - -/** Deprecation message about innodb_use_trim */ -static const char* deprecated_use_trim - = "Using innodb_use_trim is deprecated" - " and the parameter will be removed in MariaDB 10.3."; - -/** Deprecation message about innodb_instrument_semaphores */ -static const char* deprecated_instrument_semaphores - = "Using innodb_instrument_semaphores is deprecated" - " and the parameter will be removed in MariaDB 10.3."; - -static const char* deprecated_use_mtflush - = "Using innodb_use_mtflush is deprecated" - " and the parameter will be removed in MariaDB 10.3." - " Use innodb-page-cleaners instead. "; - -static const char* deprecated_mtflush_threads - = "Using innodb_mtflush_threads is deprecated" - " and the parameter will be removed in MariaDB 10.3." - " Use innodb-page-cleaners instead. "; - /** Deprecation message about innodb_idle_flush_pct */ static const char* deprecated_idle_flush_pct = "innodb_idle_flush_pct is DEPRECATED and has no effect."; -static my_bool innodb_instrument_semaphores; +static const char* deprecated_innodb_checksum_algorithm + = "Setting innodb_checksum_algorithm to values other than" + " crc32 or strict_crc32 is UNSAFE and DEPRECATED." + " These deprecated values will be disallowed in MariaDB 10.6."; + static ulong innodb_idle_flush_pct; /** If applicable, emit a message that log checksums cannot be disabled. @@ -3715,6 +3615,21 @@ innodb_log_checksums_func_update(THD* thd, bool check) return(check); } +static void innodb_checksum_algorithm_update(THD *thd, st_mysql_sys_var*, + void *, const void *save) +{ + srv_checksum_algorithm= *static_cast<const ulong*>(save); + switch (srv_checksum_algorithm) { + case SRV_CHECKSUM_ALGORITHM_CRC32: + case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32: + break; + default: + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + HA_ERR_UNSUPPORTED, + deprecated_innodb_checksum_algorithm); + } +} + /****************************************************************//** Gives the file extension of an InnoDB single-table tablespace. */ static const char* ha_innobase_exts[] = { @@ -3723,94 +3638,118 @@ static const char* ha_innobase_exts[] = { NullS }; -/*********************************************************************//** -Opens an InnoDB database. -@return 0 on success, 1 on failure */ -static -int -innobase_init( -/*==========*/ - void *p) /*!< in: InnoDB handlerton */ +/** Determine if system-versioned data was modified by the transaction. 
+@param[in,out] thd current session +@param[out] trx_id transaction start ID +@return transaction commit ID +@retval 0 if no system-versioned data was affected by the transaction */ +static ulonglong innodb_prepare_commit_versioned(THD* thd, ulonglong *trx_id) { - static char current_dir[3]; /*!< Set if using current lib */ - int err; - char *default_path; - uint format_id; - ulong num_pll_degree; + if (const trx_t* trx = thd_to_trx(thd)) { + *trx_id = trx->id; - DBUG_ENTER("innobase_init"); - handlerton* innobase_hton= (handlerton*) p; - innodb_hton_ptr = innobase_hton; + for (trx_mod_tables_t::const_iterator t + = trx->mod_tables.begin(); + t != trx->mod_tables.end(); t++) { + if (t->second.is_versioned()) { + DBUG_ASSERT(t->first->versioned_by_id()); + DBUG_ASSERT(trx->rsegs.m_redo.rseg); - innobase_hton->state = SHOW_OPTION_YES; - innobase_hton->db_type = DB_TYPE_INNODB; - innobase_hton->savepoint_offset = sizeof(trx_named_savept_t); - innobase_hton->close_connection = innobase_close_connection; - innobase_hton->kill_query = innobase_kill_query; - innobase_hton->savepoint_set = innobase_savepoint; - innobase_hton->savepoint_rollback = innobase_rollback_to_savepoint; - - innobase_hton->savepoint_rollback_can_release_mdl = - innobase_rollback_to_savepoint_can_release_mdl; - - innobase_hton->savepoint_release = innobase_release_savepoint; - innobase_hton->prepare_ordered= NULL; - innobase_hton->commit_ordered= innobase_commit_ordered; - innobase_hton->commit = innobase_commit; - innobase_hton->rollback = innobase_rollback; - innobase_hton->prepare = innobase_xa_prepare; - innobase_hton->recover = innobase_xa_recover; - innobase_hton->commit_by_xid = innobase_commit_by_xid; - innobase_hton->rollback_by_xid = innobase_rollback_by_xid; - innobase_hton->commit_checkpoint_request=innobase_checkpoint_request; - innobase_hton->create = innobase_create_handler; - - innobase_hton->drop_database = innobase_drop_database; - innobase_hton->panic = innobase_end; - innobase_hton->partition_flags= innobase_partition_flags; - - innobase_hton->start_consistent_snapshot = - innobase_start_trx_and_assign_read_view; - - innobase_hton->flush_logs = innobase_flush_logs; - innobase_hton->show_status = innobase_show_status; - innobase_hton->flags = - HTON_SUPPORTS_EXTENDED_KEYS | HTON_SUPPORTS_FOREIGN_KEYS | - HTON_REQUIRES_CLOSE_AFTER_TRUNCATE; - -#ifdef WITH_WSREP - innobase_hton->abort_transaction=wsrep_abort_transaction; - innobase_hton->set_checkpoint=innobase_wsrep_set_checkpoint; - innobase_hton->get_checkpoint=innobase_wsrep_get_checkpoint; - innobase_hton->fake_trx_id=wsrep_fake_trx_id; -#endif /* WITH_WSREP */ + return trx_sys.get_new_trx_id(); + } + } - innobase_hton->tablefile_extensions = ha_innobase_exts; - innobase_hton->table_options = innodb_table_option_list; + return 0; + } - innodb_remember_check_sysvar_funcs(); + *trx_id = 0; + return 0; +} - ut_a(DATA_MYSQL_TRUE_VARCHAR == (ulint)MYSQL_TYPE_VARCHAR); +/** Initialize and normalize innodb_buffer_pool_size. */ +static void innodb_buffer_pool_size_init() +{ + if (srv_buf_pool_size >= BUF_POOL_SIZE_THRESHOLD) { + + if (srv_buf_pool_instances == srv_buf_pool_instances_default) { +#if defined(_WIN32) && !defined(_WIN64) + /* Do not allocate too large of a buffer pool on + Windows 32-bit systems, which can have trouble + allocating larger single contiguous memory blocks. 
*/ + srv_buf_pool_size = ulint( + ut_uint64_align_up(srv_buf_pool_size, + srv_buf_pool_chunk_unit)); + srv_buf_pool_instances = std::min<ulong>( + MAX_BUFFER_POOLS, + ulong(srv_buf_pool_size + / srv_buf_pool_chunk_unit)); +#else /* defined(_WIN32) && !defined(_WIN64) */ + /* Default to 8 instances when size > 1GB. */ + srv_buf_pool_instances = 8; +#endif /* defined(_WIN32) && !defined(_WIN64) */ + } + } else { + /* If buffer pool is less than 1 GiB, assume fewer + threads. Also use only one buffer pool instance. */ + if (srv_buf_pool_instances != srv_buf_pool_instances_default + && srv_buf_pool_instances != 1) { + /* We can't distinguish whether the user has explicitly + started mysqld with --innodb-buffer-pool-instances=0, + (srv_buf_pool_instances_default is 0) or has not + specified that option at all. Thus we have the + limitation that if the user started with =0, we + will not emit a warning here, but we should actually + do so. */ + ib::info() + << "Adjusting innodb_buffer_pool_instances" + " from " << srv_buf_pool_instances << " to 1" + " since innodb_buffer_pool_size is less than " + << BUF_POOL_SIZE_THRESHOLD / (1024 * 1024) + << " MiB"; + } + + srv_buf_pool_instances = 1; + } + + if (srv_buf_pool_chunk_unit * srv_buf_pool_instances + > srv_buf_pool_size) { + /* Size unit of buffer pool is larger than srv_buf_pool_size. + adjust srv_buf_pool_chunk_unit for srv_buf_pool_size. */ + srv_buf_pool_chunk_unit + = static_cast<ulong>(srv_buf_pool_size) + / srv_buf_pool_instances; + if (srv_buf_pool_size % srv_buf_pool_instances != 0) { + ++srv_buf_pool_chunk_unit; + } + } + + srv_buf_pool_size = buf_pool_size_align(srv_buf_pool_size); + innobase_buffer_pool_size = srv_buf_pool_size; +} + +/** Initialize, validate and normalize the InnoDB startup parameters. +@return failure code +@retval 0 on success +@retval HA_ERR_OUT_OF_MEM when out of memory +@retval HA_ERR_INITIALIZATION when some parameters are out of range */ +static int innodb_init_params() +{ + DBUG_ENTER("innodb_init_params"); + + static char current_dir[3]; + char *default_path; + ulong num_pll_degree; -#ifndef DBUG_OFF - static const char test_filename[] = "-@"; - char test_tablename[sizeof test_filename - + sizeof(srv_mysql50_table_name_prefix) - 1]; - if ((sizeof(test_tablename)) - 1 - != filename_to_tablename(test_filename, - test_tablename, - sizeof(test_tablename), true) - || strncmp(test_tablename, - srv_mysql50_table_name_prefix, - sizeof(srv_mysql50_table_name_prefix) - 1) - || strcmp(test_tablename - + sizeof(srv_mysql50_table_name_prefix) - 1, - test_filename)) { - - sql_print_error("tablename encoding has been changed"); - DBUG_RETURN(innobase_init_abort()); + if (innodb_large_prefix || innodb_file_format) { + const char* p = innodb_file_format + ? "file_format" + : "large_prefix"; + sql_print_warning("The parameter innodb_%s is deprecated" + " and has no effect." + " It may be removed in future releases." + " See https://mariadb.com/kb/en/library/" + "xtradbinnodb-file-format/", p); } -#endif /* DBUG_OFF */ /* Check that values don't overflow on 32-bit systems. */ if (sizeof(ulint) == 4) { @@ -3818,26 +3757,19 @@ innobase_init( sql_print_error( "innodb_buffer_pool_size can't be over 4GB" " on 32-bit systems"); - - DBUG_RETURN(innobase_init_abort()); + DBUG_RETURN(HA_ERR_OUT_OF_MEM); } } - os_file_set_umask(my_umask); - - /* Setup the memory alloc/free tracing mechanisms before calling - any functions that could possibly allocate memory. 
*/ - ut_new_boot(); - /* The buffer pool needs to be able to accommodate enough many pages, even for larger pages */ - if (UNIV_PAGE_SIZE > UNIV_PAGE_SIZE_DEF + if (srv_page_size > UNIV_PAGE_SIZE_DEF && innobase_buffer_pool_size < (24 * 1024 * 1024)) { ib::error() << "innodb_page_size=" - << UNIV_PAGE_SIZE << " requires " + << srv_page_size << " requires " << "innodb_buffer_pool_size > 24M current " << innobase_buffer_pool_size; - goto error; + DBUG_RETURN(HA_ERR_INITIALIZATION); } if (innodb_lock_schedule_algorithm == INNODB_LOCK_SCHEDULE_ALGORITHM_VATS) { @@ -3864,14 +3796,6 @@ innobase_init( || !strcmp(wsrep_sst_method, "xtrabackup-v2"))) { ib::info() << "Galera SST method xtrabackup is deprecated and the " " support for it may be removed in future releases."; - - /* We can't blindly turn on this as it will cause a - modification of the redo log format identifier. See - MDEV-13564 for more information. */ - if (!srv_safe_truncate) { - ib::info() << "Requested xtrabackup based SST for Galera but" - << "innodb_safe_truncate is disabled."; - } } #endif /* WITH_WSREP */ @@ -3880,7 +3804,7 @@ innobase_init( sql_print_error("InnoDB: innodb_compression_algorithm = %lu unsupported.\n" "InnoDB: liblz4 is not installed. \n", innodb_compression_algorithm); - goto error; + DBUG_RETURN(HA_ERR_INITIALIZATION); } #endif @@ -3889,7 +3813,7 @@ innobase_init( sql_print_error("InnoDB: innodb_compression_algorithm = %lu unsupported.\n" "InnoDB: liblzo is not installed. \n", innodb_compression_algorithm); - goto error; + DBUG_RETURN(HA_ERR_INITIALIZATION); } #endif @@ -3898,7 +3822,7 @@ innobase_init( sql_print_error("InnoDB: innodb_compression_algorithm = %lu unsupported.\n" "InnoDB: liblzma is not installed. \n", innodb_compression_algorithm); - goto error; + DBUG_RETURN(HA_ERR_INITIALIZATION); } #endif @@ -3907,7 +3831,7 @@ innobase_init( sql_print_error("InnoDB: innodb_compression_algorithm = %lu unsupported.\n" "InnoDB: libbz2 is not installed. \n", innodb_compression_algorithm); - goto error; + DBUG_RETURN(HA_ERR_INITIALIZATION); } #endif @@ -3916,7 +3840,7 @@ innobase_init( sql_print_error("InnoDB: innodb_compression_algorithm = %lu unsupported.\n" "InnoDB: libsnappy is not installed. \n", innodb_compression_algorithm); - goto error; + DBUG_RETURN(HA_ERR_INITIALIZATION); } #endif @@ -3925,11 +3849,17 @@ innobase_init( && !encryption_key_id_exists(FIL_DEFAULT_ENCRYPTION_KEY)) { sql_print_error("InnoDB: cannot enable encryption, " "encryption plugin is not available"); - goto error; + DBUG_RETURN(HA_ERR_INITIALIZATION); } - innodb_check_deprecated(); - +#ifdef _WIN32 + if (!is_filename_allowed(srv_buf_dump_filename, + strlen(srv_buf_dump_filename), FALSE)) { + sql_print_error("InnoDB: innodb_buffer_pool_filename" + " cannot have colon (:) in the file name."); + DBUG_RETURN(HA_ERR_INITIALIZATION); + } +#endif /* First calculate the default path for innodb_data_home_dir etc., in case the user has not given any value. @@ -3974,13 +3904,7 @@ innobase_init( if (!srv_page_size_shift) { sql_print_error("InnoDB: Invalid page size=%lu.\n", srv_page_size); - DBUG_RETURN(innobase_init_abort()); - } - - /* Set default InnoDB temp data file size to 12 MB and let it be - auto-extending. */ - if (!innobase_data_file_path) { - innobase_data_file_path = (char*) "ibdata1:12M:autoextend"; + DBUG_RETURN(HA_ERR_INITIALIZATION); } /* This is the first time univ_page_size is used. 
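
(A standalone sketch, assuming nothing beyond standard C++: it mirrors the condition behind the "Invalid page size" branch above, where srv_page_size_shift is non-zero only for a power-of-two innodb_page_size in the supported 4096..65536 range. The helper name below is illustrative, not the actual server function.)

#include <cstdio>

// Returns the base-2 logarithm of page_size if it is a supported power
// of two (4 KiB .. 64 KiB), and 0 for anything else -- the case that
// triggers the "Invalid page size" startup error above.
static unsigned long page_size_shift(unsigned long page_size)
{
    for (unsigned long shift = 12; shift <= 16; shift++) {
        if (page_size == (1UL << shift)) {
            return shift;
        }
    }
    return 0;
}

int main()
{
    std::printf("%lu\n", page_size_shift(16384)); /* 14: the 16 KiB default */
    std::printf("%lu\n", page_size_shift(10000)); /* 0: rejected at startup */
    return 0;
}
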
@@ -3997,19 +3921,9 @@ innobase_init( if (!srv_sys_space.parse_params(innobase_data_file_path, true)) { ib::error() << "Unable to parse innodb_data_file_path=" << innobase_data_file_path; - DBUG_RETURN(innobase_init_abort()); + DBUG_RETURN(HA_ERR_INITIALIZATION); } - /* Set default InnoDB temp data file size to 12 MB and let it be - auto-extending. */ - - if (!innobase_temp_data_file_path) { - innobase_temp_data_file_path = (char*) "ibtmp1:12M:autoextend"; - } - - /* We set the temporary tablspace id later, after recovery. - The temp tablespace doesn't support raw devices. - Set the name and path. */ srv_tmp_space.set_name("innodb_temporary"); srv_tmp_space.set_path(srv_data_home); srv_tmp_space.set_flags(FSP_FLAGS_PAGE_SSIZE()); @@ -4017,16 +3931,19 @@ innobase_init( if (!srv_tmp_space.parse_params(innobase_temp_data_file_path, false)) { ib::error() << "Unable to parse innodb_temp_data_file_path=" << innobase_temp_data_file_path; - DBUG_RETURN(innobase_init_abort()); + DBUG_RETURN(HA_ERR_INITIALIZATION); } /* Perform all sanity check before we take action of deleting files*/ if (srv_sys_space.intersection(&srv_tmp_space)) { sql_print_error("%s and %s file names seem to be the same.", srv_tmp_space.name(), srv_sys_space.name()); - DBUG_RETURN(innobase_init_abort()); + DBUG_RETURN(HA_ERR_INITIALIZATION); } + srv_sys_space.normalize_size(); + srv_tmp_space.normalize_size(); + /* ------------ UNDO tablespaces files ---------------------*/ if (!srv_undo_dir) { srv_undo_dir = default_path; @@ -4036,7 +3953,7 @@ innobase_init( if (strchr(srv_undo_dir, ';')) { sql_print_error("syntax error in innodb_undo_directory"); - DBUG_RETURN(innobase_init_abort()); + DBUG_RETURN(HA_ERR_INITIALIZATION); } /* -------------- All log files ---------------------------*/ @@ -4051,119 +3968,19 @@ innobase_init( if (strchr(srv_log_group_home_dir, ';')) { sql_print_error("syntax error in innodb_log_group_home_dir"); - DBUG_RETURN(innobase_init_abort()); - } - - if (!innobase_large_prefix) { - ib::warn() << deprecated_large_prefix; - } - - if (!THDVAR(NULL, support_xa)) { - ib::warn() << deprecated_innodb_support_xa_off; - THDVAR(NULL, support_xa) = TRUE; - } - - if (innobase_file_format_name != innodb_file_format_default) { - ib::warn() << deprecated_file_format; - } - - if (innodb_instrument_semaphores) { - ib::warn() << deprecated_instrument_semaphores; - } - - if (srv_use_mtflush) { - ib::warn() << deprecated_use_mtflush; - } - - if (srv_use_mtflush && srv_mtflush_threads != MTFLUSH_DEFAULT_WORKER) { - ib::warn() << deprecated_mtflush_threads; - } - - /* Validate the file format by animal name */ - if (innobase_file_format_name != NULL) { - - format_id = innobase_file_format_name_lookup( - innobase_file_format_name); - - if (format_id > UNIV_FORMAT_MAX) { - - sql_print_error("InnoDB: wrong innodb_file_format."); - - DBUG_RETURN(innobase_init_abort()); - } - } else { - /* Set it to the default file format id. Though this - should never happen. */ - format_id = 0; - } - - srv_file_format = format_id; - - /* Given the type of innobase_file_format_name we have little - choice but to cast away the constness from the returned name. - innobase_file_format_name is used in the MySQL set variable - interface and so can't be const. */ - - innobase_file_format_name = - (char*) trx_sys_file_format_id_to_name(format_id); - - /* Check innobase_file_format_check variable */ - if (!innobase_file_format_check) { - ib::warn() << deprecated_file_format_check; - - /* Set the value to disable checking. 
*/ - srv_max_file_format_at_startup = UNIV_FORMAT_MAX + 1; - - } else { - - /* Set the value to the lowest supported format. */ - srv_max_file_format_at_startup = UNIV_FORMAT_MIN; + DBUG_RETURN(HA_ERR_INITIALIZATION); } - if (innobase_file_format_max != innodb_file_format_max_default) { - ib::warn() << deprecated_file_format_max; + if (srv_n_log_files * srv_log_file_size >= log_group_max_size) { + /* Log group size is limited by the size of page number. + Remove this limitation when fil_io() is not used for + recovery log io. */ + ib::error() << "Combined size of log files must be < " + << log_group_max_size; + DBUG_RETURN(HA_ERR_INITIALIZATION); } - /* Did the user specify a format name that we support? - As a side effect it will update the variable - srv_max_file_format_at_startup */ - if (innobase_file_format_validate_and_set( - innobase_file_format_max) < 0) { - - sql_print_error("InnoDB: invalid" - " innodb_file_format_max value:" - " should be any value up to %s or its" - " equivalent numeric id", - trx_sys_file_format_id_to_name( - UNIV_FORMAT_MAX)); - - DBUG_RETURN(innobase_init_abort()); - } - - if (innobase_change_buffering) { - ulint use; - - for (use = 0; - use < UT_ARR_SIZE(innobase_change_buffering_values); - use++) { - if (!innobase_strcasecmp( - innobase_change_buffering, - innobase_change_buffering_values[use])) { - ibuf_use = (ibuf_use_t) use; - goto innobase_change_buffering_inited_ok; - } - } - - sql_print_error("InnoDB: invalid value" - " innodb_change_buffering=%s", - innobase_change_buffering); - DBUG_RETURN(innobase_init_abort()); - } - -innobase_change_buffering_inited_ok: - ut_a((ulint) ibuf_use < UT_ARR_SIZE(innobase_change_buffering_values)); - innobase_change_buffering = (char*) - innobase_change_buffering_values[ibuf_use]; + DBUG_ASSERT(innodb_change_buffering <= IBUF_USE_ALL); /* Check that interdependent parameters have sane values. */ if (srv_max_buf_pool_modified_pct < srv_max_dirty_pages_pct_lwm) { @@ -4199,24 +4016,13 @@ innobase_change_buffering_inited_ok: srv_io_capacity = srv_max_io_capacity; } - if (!is_filename_allowed(srv_buf_dump_filename, - strlen(srv_buf_dump_filename), FALSE)) { - sql_print_error("InnoDB: innodb_buffer_pool_filename" - " cannot have colon (:) in the file name."); - DBUG_RETURN(innobase_init_abort()); - } - - /* --------------------------------------------------*/ - - srv_file_flush_method_str = innobase_file_flush_method; - if (UNIV_PAGE_SIZE_DEF != srv_page_size) { ib::info() << "innodb_page_size=" << srv_page_size; srv_max_undo_log_size = std::max( srv_max_undo_log_size, ulonglong(SRV_UNDO_TABLESPACE_SIZE_IN_PAGES) - * srv_page_size); + << srv_page_size_shift); } if (srv_log_write_ahead_size > srv_page_size) { @@ -4236,20 +4042,20 @@ innobase_change_buffering_inited_ok: } } - srv_log_buffer_size = (ulint) innobase_log_buffer_size; - - srv_buf_pool_size = (ulint) innobase_buffer_pool_size; - - srv_n_read_io_threads = (ulint) innobase_read_io_threads; - srv_n_write_io_threads = (ulint) innobase_write_io_threads; - - srv_use_doublewrite_buf = (ibool) innobase_use_doublewrite; + srv_buf_pool_size = ulint(innobase_buffer_pool_size); if (!innobase_use_checksums) { ib::warn() << "Setting innodb_checksums to OFF is DEPRECATED." - " This option may be removed in future releases. 
You" - " should set innodb_checksum_algorithm=NONE instead."; + " This option was removed in MariaDB 10.5."; srv_checksum_algorithm = SRV_CHECKSUM_ALGORITHM_NONE; + } else { + switch (srv_checksum_algorithm) { + case SRV_CHECKSUM_ALGORITHM_CRC32: + case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32: + break; + default: + ib::warn() << deprecated_innodb_checksum_algorithm; + } } innodb_log_checksums = innodb_log_checksums_func_update( @@ -4272,16 +4078,16 @@ innobase_change_buffering_inited_ok: } } - if (innobase_open_files > (long) open_files_limit) { + if (innobase_open_files > open_files_limit) { ib::warn() << "innodb_open_files " << innobase_open_files << " should not be greater" << " than the open_files_limit " << open_files_limit; - if (innobase_open_files > (long) tc_size) { + if (innobase_open_files > tc_size) { innobase_open_files = tc_size; } } - srv_max_n_open_files = (ulint) innobase_open_files; + srv_max_n_open_files = innobase_open_files; srv_innodb_status = (ibool) innobase_create_status_file; srv_print_verbose_log = mysqld_embedded ? 0 : 1; @@ -4303,12 +4109,6 @@ innobase_change_buffering_inited_ok: innobase_commit_concurrency_init_default(); - if (innobase_use_fallocate) { - ib::warn() << "innodb_use_fallocate is DEPRECATED" - " and has no effect in MariaDB 10.2." - " It will be removed in MariaDB 10.3."; - } - if (innodb_idle_flush_pct != 100) { ib::warn() << deprecated_idle_flush_pct; } @@ -4319,15 +4119,154 @@ innobase_change_buffering_inited_ok: Force O_DIRECT on Unixes (on Windows writes are always unbuffered) */ - if (!innobase_file_flush_method || - !strstr(innobase_file_flush_method, "O_DIRECT")) { - innobase_file_flush_method = - srv_file_flush_method_str = (char*)"O_DIRECT"; + switch (innodb_flush_method) { + case SRV_O_DIRECT: + case SRV_O_DIRECT_NO_FSYNC: + break; + default: + innodb_flush_method = SRV_O_DIRECT; fprintf(stderr, "InnoDB: using O_DIRECT due to atomic writes.\n"); } } #endif + if (srv_read_only_mode) { + ib::info() << "Started in read only mode"; + srv_use_doublewrite_buf = FALSE; + } + +#ifdef LINUX_NATIVE_AIO + if (srv_use_native_aio) { + ib::info() << "Using Linux native AIO"; + } +#elif !defined _WIN32 + /* Currently native AIO is supported only on windows and linux + and that also when the support is compiled in. In all other + cases, we ignore the setting of innodb_use_native_aio. */ + srv_use_native_aio = FALSE; +#endif + +#ifndef _WIN32 + ut_ad(innodb_flush_method <= SRV_O_DIRECT_NO_FSYNC); +#else + switch (innodb_flush_method) { + case SRV_ALL_O_DIRECT_FSYNC + 1 /* "async_unbuffered"="unbuffered" */: + innodb_flush_method = SRV_ALL_O_DIRECT_FSYNC; + break; + case SRV_ALL_O_DIRECT_FSYNC + 2 /* "normal"="fsync" */: + innodb_flush_method = SRV_FSYNC; + break; + default: + ut_ad(innodb_flush_method <= SRV_ALL_O_DIRECT_FSYNC); + } +#endif + srv_file_flush_method = srv_flush_t(innodb_flush_method); + + innodb_buffer_pool_size_init(); + + if (srv_n_page_cleaners > srv_buf_pool_instances) { + /* limit of page_cleaner parallelizability + is number of buffer pool instances. */ + srv_n_page_cleaners = srv_buf_pool_instances; + } + + srv_lock_table_size = 5 * (srv_buf_pool_size >> srv_page_size_shift); + DBUG_RETURN(0); +} + +/** Initialize the InnoDB storage engine plugin. 
+@param[in,out] p InnoDB handlerton +@return error code +@retval 0 on success */ +static int innodb_init(void* p) +{ + DBUG_ENTER("innodb_init"); + handlerton* innobase_hton= static_cast<handlerton*>(p); + innodb_hton_ptr = innobase_hton; + + innobase_hton->state = SHOW_OPTION_YES; + innobase_hton->db_type = DB_TYPE_INNODB; + innobase_hton->savepoint_offset = sizeof(trx_named_savept_t); + innobase_hton->close_connection = innobase_close_connection; + innobase_hton->kill_query = innobase_kill_query; + innobase_hton->savepoint_set = innobase_savepoint; + innobase_hton->savepoint_rollback = innobase_rollback_to_savepoint; + + innobase_hton->savepoint_rollback_can_release_mdl = + innobase_rollback_to_savepoint_can_release_mdl; + + innobase_hton->savepoint_release = innobase_release_savepoint; + innobase_hton->prepare_ordered= NULL; + innobase_hton->commit_ordered= innobase_commit_ordered; + innobase_hton->commit = innobase_commit; + innobase_hton->rollback = innobase_rollback; + innobase_hton->prepare = innobase_xa_prepare; + innobase_hton->recover = innobase_xa_recover; + innobase_hton->commit_by_xid = innobase_commit_by_xid; + innobase_hton->rollback_by_xid = innobase_rollback_by_xid; + innobase_hton->commit_checkpoint_request=innobase_checkpoint_request; + innobase_hton->create = innobase_create_handler; + + innobase_hton->drop_database = innobase_drop_database; + innobase_hton->panic = innobase_end; + + innobase_hton->start_consistent_snapshot = + innobase_start_trx_and_assign_read_view; + + innobase_hton->flush_logs = innobase_flush_logs; + innobase_hton->show_status = innobase_show_status; + innobase_hton->flags = + HTON_SUPPORTS_EXTENDED_KEYS | HTON_SUPPORTS_FOREIGN_KEYS | + HTON_NATIVE_SYS_VERSIONING | + HTON_REQUIRES_CLOSE_AFTER_TRUNCATE; + +#ifdef WITH_WSREP + innobase_hton->abort_transaction=wsrep_abort_transaction; + innobase_hton->set_checkpoint=innobase_wsrep_set_checkpoint; + innobase_hton->get_checkpoint=innobase_wsrep_get_checkpoint; + innobase_hton->fake_trx_id=wsrep_fake_trx_id; +#endif /* WITH_WSREP */ + + innobase_hton->tablefile_extensions = ha_innobase_exts; + innobase_hton->table_options = innodb_table_option_list; + + /* System Versioning */ + innobase_hton->prepare_commit_versioned + = innodb_prepare_commit_versioned; + + innodb_remember_check_sysvar_funcs(); + + compile_time_assert(DATA_MYSQL_TRUE_VARCHAR == MYSQL_TYPE_VARCHAR); + +#ifndef DBUG_OFF + static const char test_filename[] = "-@"; + char test_tablename[sizeof test_filename + + sizeof(srv_mysql50_table_name_prefix) - 1]; + DBUG_ASSERT(sizeof test_tablename - 1 + == filename_to_tablename(test_filename, + test_tablename, + sizeof test_tablename, true)); + DBUG_ASSERT(!strncmp(test_tablename, + srv_mysql50_table_name_prefix, + sizeof srv_mysql50_table_name_prefix - 1)); + DBUG_ASSERT(!strcmp(test_tablename + + sizeof srv_mysql50_table_name_prefix - 1, + test_filename)); +#endif /* DBUG_OFF */ + + os_file_set_umask(my_umask); + + /* Setup the memory alloc/free tracing mechanisms before calling + any functions that could possibly allocate memory. */ + ut_new_boot(); + + if (int error = innodb_init_params()) { + DBUG_RETURN(error); + } + + /* After this point, error handling has to use + innodb_init_abort(). 
*/ + #ifdef HAVE_PSI_INTERFACE /* Register keys with MySQL performance schema */ int count; @@ -4359,13 +4298,20 @@ innobase_change_buffering_inited_ok: mysql_cond_register("innodb", all_innodb_conds, count); #endif /* HAVE_PSI_INTERFACE */ - err = innobase_start_or_create_for_mysql(); + bool create_new_db = false; + + /* Check whether the data files exist. */ + dberr_t err = srv_sys_space.check_file_spec(&create_new_db, 5U << 20); - innobase_buffer_pool_size = static_cast<long long>(srv_buf_pool_size); + if (err != DB_SUCCESS) { + DBUG_RETURN(innodb_init_abort()); + } + + err = srv_start(create_new_db); if (err != DB_SUCCESS) { innodb_shutdown(); - DBUG_RETURN(innobase_init_abort()); + DBUG_RETURN(innodb_init_abort()); } else if (!srv_read_only_mode) { mysql_thread_create(thd_destructor_thread_key, &thd_destructor_thread, @@ -4397,9 +4343,6 @@ innobase_change_buffering_inited_ok: } #endif /* MYSQL_DYNAMIC_PLUGIN */ - /* Get the current high water mark format. */ - innobase_file_format_max = (char*) trx_sys_file_format_max_get(); - /* Currently, monitor counter information are not persistent. */ memset(monitor_set_tbl, 0, sizeof monitor_set_tbl); @@ -4416,7 +4359,6 @@ innobase_change_buffering_inited_ok: /* Turn on monitor counters that are default on */ srv_mon_default_on(); - /* Unit Tests */ #ifdef UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR unit_test_os_file_get_parent_dir(); @@ -4437,9 +4379,6 @@ innobase_change_buffering_inited_ok: #endif /* UNIV_ENABLE_UNIT_TEST_ROW_RAW_FORMAT_INT */ DBUG_RETURN(0); - -error: - DBUG_RETURN(1); } /** Shut down the InnoDB storage engine. @@ -4453,9 +4392,8 @@ innobase_end(handlerton*, ha_panic_function) if (srv_was_started) { THD *thd= current_thd; if (thd) { // may be UNINSTALL PLUGIN statement - trx_t* trx = thd_to_trx(thd); - if (trx) { - trx_free_for_mysql(trx); + if (trx_t* trx = thd_to_trx(thd)) { + trx->free(); } } @@ -4552,7 +4490,7 @@ innobase_start_trx_and_assign_read_view( thd_get_trx_isolation(thd)); if (trx->isolation_level == TRX_ISO_REPEATABLE_READ) { - trx_assign_read_view(trx); + trx->read_view.open(trx); } else { push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, HA_ERR_UNSUPPORTED, @@ -4614,11 +4552,8 @@ innobase_commit_ordered_2( If the binary log is not enabled, or the transaction is not written to the binary log, the file name will be a NULL pointer. */ - ulonglong pos; - - thd_binlog_pos(thd, &trx->mysql_log_file_name, &pos); - - trx->mysql_log_offset = static_cast<int64_t>(pos); + thd_binlog_pos(thd, &trx->mysql_log_file_name, + &trx->mysql_log_offset); /* Don't do write + flush right now. For group commit to work we want to do the flush later. 
*/ @@ -4948,7 +4883,6 @@ UNIV_INTERN void innobase_mysql_log_notify( /*======================*/ - ib_uint64_t write_lsn, /*!< in: LSN written to log file */ ib_uint64_t flush_lsn) /*!< in: LSN flushed to disk */ { struct pending_checkpoint * pending; @@ -5038,7 +4972,7 @@ innobase_rollback_to_savepoint( char name[64]; - longlong2str((ulint) savepoint, name, 36); + longlong2str(longlong(savepoint), name, 36); int64_t mysql_binlog_cache_pos; @@ -5107,7 +5041,7 @@ innobase_release_savepoint( /* TODO: use provided savepoint data area to store savepoint data */ - longlong2str((ulint) savepoint, name, 36); + longlong2str(longlong(savepoint), name, 36); error = trx_release_savepoint_for_mysql(trx, name); @@ -5146,12 +5080,12 @@ innobase_savepoint( /* TODO: use provided savepoint data area to store savepoint data */ char name[64]; - longlong2str((ulint) savepoint,name,36); + longlong2str(longlong(savepoint), name, 36); dberr_t error = trx_savepoint_for_mysql(trx, name, 0); if (error == DB_SUCCESS && trx->fts_trx != NULL) { - fts_savepoint_take(trx, trx->fts_trx, name); + fts_savepoint_take(trx->fts_trx, name); } DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL)); @@ -5221,7 +5155,7 @@ innobase_close_connection( } else { rollback_and_free: innobase_rollback_trx(trx); - trx_free_for_mysql(trx); + trx->free(); } } @@ -5307,9 +5241,7 @@ ha_innobase::table_flags() const /* Need to use tx_isolation here since table flags is (also) called before prebuilt is inited. */ - ulong const tx_isolation = thd_tx_isolation(thd); - - if (tx_isolation <= ISO_READ_COMMITTED) { + if (thd_tx_isolation(thd) <= ISO_READ_COMMITTED) { return(flags); } @@ -5428,7 +5360,7 @@ ha_innobase::max_supported_key_length() const Note: Handle 16k and 32k pages the same here since the limits are higher than imposed by MySQL. */ - switch (UNIV_PAGE_SIZE) { + switch (srv_page_size) { case 4096: /* Hack: allow mysql.innodb_index_stats to be created. */ /* FIXME: rewrite this API, and in sql_table.cc consider @@ -5587,7 +5519,8 @@ create_table_info_t::create_table_info_t( m_form(form), m_default_row_format(innodb_default_row_format), m_create_info(create_info), - m_table_name(table_name), m_drop_before_rollback(false), + m_table_name(table_name), m_table(NULL), + m_drop_before_rollback(false), m_remote_path(remote_path), m_innodb_file_per_table(file_per_table) { @@ -5900,7 +5833,7 @@ innobase_build_v_templ( const dict_add_v_col_t* add_v, bool locked) { - ulint ncol = ib_table->n_cols - DATA_N_SYS_COLS; + ulint ncol = unsigned(ib_table->n_cols) - DATA_N_SYS_COLS; ulint n_v_col = ib_table->n_v_cols; bool marker[REC_MAX_N_FIELDS]; @@ -5977,7 +5910,7 @@ innobase_build_v_templ( } ut_ad(!my_strcasecmp(system_charset_info, name, - field->field_name)); + field->field_name.str)); #endif const dict_v_col_t* vcol; @@ -6011,7 +5944,7 @@ innobase_build_v_templ( ut_ad(!my_strcasecmp(system_charset_info, dict_table_get_col_name( ib_table, j), - field->field_name)); + field->field_name.str)); s_templ->vtempl[j] = static_cast< mysql_row_templ_t*>( @@ -6058,14 +5991,13 @@ check_index_consistency(const TABLE* table, const dict_table_t* ib_table) corresponding InnoDB index pointer into index_mapping array. 
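The walk itself is a plain name match between the keys MySQL believes exist and the indexes in the engine dictionary; if a lookup fails, the open is refused. A simplified stand-in with toy structs (not the real TABLE or dict_table_t):

#include <cstdio>
#include <cstring>
#include <vector>

struct mysql_key    { const char* name; };
struct innodb_index { const char* name; };

static bool indexes_consistent(const std::vector<mysql_key>& keys,
			       const std::vector<innodb_index>& indexes)
{
	for (const mysql_key& key : keys) {
		bool found = false;
		for (const innodb_index& index : indexes) {
			if (!std::strcmp(key.name, index.name)) {
				found = true;
				break;
			}
		}
		if (!found) {
			std::printf("Cannot find index %s in the engine dictionary.\n",
				    key.name);
			return false;
		}
	}
	return true;
}

int main()
{
	std::vector<mysql_key>    keys    = {{"PRIMARY"}, {"idx_a"}};
	std::vector<innodb_index> indexes = {{"PRIMARY"}};
	std::printf("consistent: %d\n", indexes_consistent(keys, indexes));
}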
*/ for (ulint count = 0; count < mysql_num_index; count++) { - const dict_index_t* index = dict_table_get_index_on_name( - ib_table, table->key_info[count].name); + ib_table, table->key_info[count].name.str); if (index == NULL) { sql_print_error("Cannot find index %s in InnoDB" " index dictionary.", - table->key_info[count].name); + table->key_info[count].name.str); ret = false; goto func_exit; } @@ -6076,7 +6008,7 @@ check_index_consistency(const TABLE* table, const dict_table_t* ib_table) index)) { sql_print_error("Found index %s whose column info" " does not match that of MariaDB.", - table->key_info[count].name); + table->key_info[count].name.str); ret = false; goto func_exit; } @@ -6161,7 +6093,7 @@ static void initialize_auto_increment(dict_table_t* table, const Field* field) { - ut_ad(!dict_table_is_temporary(table)); + ut_ad(!table->is_temporary()); const unsigned col_no = innodb_col_no(field); @@ -6236,9 +6168,9 @@ no_such_table: DBUG_RETURN(HA_ERR_NO_SUCH_TABLE); } - uint n_fields = omits_virtual_cols(*table_share) + size_t n_fields = omits_virtual_cols(*table_share) ? table_share->stored_fields : table_share->fields; - uint n_cols = dict_table_get_n_user_cols(ib_table) + size_t n_cols = dict_table_get_n_user_cols(ib_table) + dict_table_get_n_v_cols(ib_table) - !!DICT_TF2_FLAG_IS_SET(ib_table, DICT_TF2_FTS_HAS_DOC_ID); @@ -6272,11 +6204,7 @@ no_such_table: MONITOR_INC(MONITOR_TABLE_OPEN); - bool no_tablespace = false; - bool encrypted = false; - FilSpace space; - - if (dict_table_is_discarded(ib_table)) { + if ((ib_table->flags2 & DICT_TF2_DISCARDED)) { ib_senderrf(thd, IB_LOG_LEVEL_WARN, ER_TABLESPACE_DISCARDED, @@ -6286,77 +6214,37 @@ no_such_table: all the flags and index root page numbers to FIL_NULL that should prevent any DML from running but it should allow DDL operations. */ - - no_tablespace = false; - } else if (!ib_table->is_readable()) { - space = fil_space_acquire_silent(ib_table->space); - - if (space()) { - if (space()->crypt_data && space()->crypt_data->is_encrypted()) { - /* This means that tablespace was found but we could not - decrypt encrypted page. */ - no_tablespace = true; - encrypted = true; - } else { - no_tablespace = true; - } - } else { + const fil_space_t* space = ib_table->space; + if (!space) { ib_senderrf( thd, IB_LOG_LEVEL_WARN, ER_TABLESPACE_MISSING, norm_name); - - /* This means we have no idea what happened to the tablespace - file, best to play it safe. */ - - no_tablespace = true; } - } else { - no_tablespace = false; - } - - if (!thd_tablespace_op(thd) && no_tablespace) { - set_my_errno(ENOENT); - int ret_err = HA_ERR_TABLESPACE_MISSING; - /* If table has no talespace but it has crypt data, check - is tablespace made unaccessible because encryption service - or used key_id is not available. */ - if (encrypted) { - bool warning_pushed = false; + if (!thd_tablespace_op(thd)) { + set_my_errno(ENOENT); + int ret_err = HA_ERR_TABLESPACE_MISSING; - if (!encryption_key_id_exists(space()->crypt_data->key_id)) { + if (space && space->crypt_data + && space->crypt_data->is_encrypted()) { push_warning_printf( - thd, Sql_condition::WARN_LEVEL_WARN, + thd, + Sql_condition::WARN_LEVEL_WARN, HA_ERR_DECRYPTION_FAILED, - "Table %s in file %s is encrypted but encryption service or" + "Table %s in file %s is encrypted" + " but encryption service or" " used key_id %u is not available. 
" " Can't continue reading table.", table_share->table_name.str, - space()->chain.start->name, - space()->crypt_data->key_id); + space->chain.start->name, + space->crypt_data->key_id); ret_err = HA_ERR_DECRYPTION_FAILED; - warning_pushed = true; } - /* If table is marked as encrypted then we push - warning if it has not been already done as used - key_id might be found but it is incorrect. */ - if (!warning_pushed) { - push_warning_printf( - thd, Sql_condition::WARN_LEVEL_WARN, - HA_ERR_DECRYPTION_FAILED, - "Table %s in file %s is encrypted but encryption service or" - " used key_id is not available. " - " Can't continue reading table.", - table_share->table_name.str, - space()->chain.start->name); - ret_err = HA_ERR_DECRYPTION_FAILED; - } + dict_table_close(ib_table, FALSE, FALSE); + DBUG_RETURN(ret_err); } - - dict_table_close(ib_table, FALSE, FALSE); - DBUG_RETURN(ret_err); } m_prebuilt = row_create_prebuilt(ib_table, table->s->reclength); @@ -6480,19 +6368,10 @@ no_such_table: } /* Index block size in InnoDB: used by MySQL in query optimization */ - stats.block_size = UNIV_PAGE_SIZE; - - if (m_prebuilt->table != NULL) { - /* We update the highest file format in the system table - space, if this table has higher file format setting. */ - - trx_sys_file_format_max_upgrade( - (const char**) &innobase_file_format_max, - dict_table_get_format(m_prebuilt->table)); - } + stats.block_size = srv_page_size; if (m_prebuilt->table == NULL - || dict_table_is_temporary(m_prebuilt->table) + || m_prebuilt->table->is_temporary() || m_prebuilt->table->persistent_autoinc || !m_prebuilt->table->is_readable()) { } else if (const Field* ai = table->found_next_number_field) { @@ -6515,7 +6394,11 @@ no_such_table: } } - info(HA_STATUS_NO_LOCK | HA_STATUS_VARIABLE | HA_STATUS_CONST); + if (table && m_prebuilt->table) { + ut_ad(table->versioned() == m_prebuilt->table->versioned()); + } + + info(HA_STATUS_NO_LOCK | HA_STATUS_VARIABLE | HA_STATUS_CONST | HA_STATUS_OPEN); DBUG_RETURN(0); } @@ -6547,7 +6430,11 @@ platforms. @return dictionary table object or NULL if not found */ dict_table_t* ha_innobase::open_dict_table( - const char* table_name, + const char* +#ifdef _WIN32 + table_name +#endif + , const char* norm_name, bool is_partition, dict_err_ignore_t ignore_err) @@ -6634,7 +6521,7 @@ ha_innobase::clone( DBUG_ENTER("ha_innobase::clone"); ha_innobase* new_handler = static_cast<ha_innobase*>( - handler::clone(name, mem_root)); + handler::clone(m_prebuilt->table->name.m_name, mem_root)); if (new_handler != NULL) { DBUG_ASSERT(new_handler->m_prebuilt != NULL); @@ -6653,9 +6540,7 @@ ha_innobase::max_supported_key_part_length() const { /* A table format specific index column length check will be performed at ha_innobase::add_index() and row_create_index_for_mysql() */ - return(innobase_large_prefix - ? 
REC_VERSION_56_MAX_INDEX_COL_LEN - : REC_ANTELOPE_MAX_INDEX_COL_LEN - 1); + return(REC_VERSION_56_MAX_INDEX_COL_LEN); } /******************************************************************//** @@ -6691,7 +6576,7 @@ ha_innobase::close() #ifdef WITH_WSREP UNIV_INTERN -int +ulint wsrep_innobase_mysql_sort( /*======================*/ /* out: str contains sort string */ @@ -6705,7 +6590,7 @@ wsrep_innobase_mysql_sort( { CHARSET_INFO* charset; enum_field_types mysql_tp; - int ret_length = str_length; + ulint ret_length = str_length; DBUG_ASSERT(str_length != UNIV_SQL_NULL); @@ -6947,7 +6832,7 @@ innobase_mysql_fts_get_token( for (;;) { if (doc >= end) { - return(doc - start); + return ulint(doc - start); } int ctype; @@ -6989,7 +6874,7 @@ innobase_mysql_fts_get_token( token->f_len = (uint) (doc - token->f_str) - mwc; token->f_n_char = length; - return(doc - start); + return ulint(doc - start); } /** Converts a MySQL type to an InnoDB type. Note that this function returns @@ -7544,9 +7429,7 @@ build_template_field( ut_ad(clust_index->table == index->table); templ = prebuilt->mysql_template + prebuilt->n_template++; -#ifdef HAVE_valgrind_or_MSAN MEM_UNDEFINED(templ, sizeof *templ); -#endif /* HAVE_valgrind_or_MSAN */ templ->rec_field_is_prefix = FALSE; templ->rec_prefix_field_no = ULINT_UNDEFINED; templ->is_virtual = !field->stored_in_db(); @@ -7589,7 +7472,7 @@ build_template_field( ib::info() << "MySQL table " << table->s->table_name.str << " field " << j << " name " - << table->field[j]->field_name; + << table->field[j]->field_name.str; } ib::fatal() << "Clustered record field for column " << i @@ -7711,10 +7594,11 @@ ha_innobase::build_template( ibool fetch_all_in_key = FALSE; ibool fetch_primary_key_cols = FALSE; - if (m_prebuilt->select_lock_type == LOCK_X) { + if (m_prebuilt->select_lock_type == LOCK_X || m_prebuilt->table->no_rollback()) { /* We always retrieve the whole clustered index record if we use exclusive row level locks, for example, if the read is - done in an UPDATE statement. */ + done in an UPDATE statement or if we are using a no rollback + table */ whole_row = true; } else if (!whole_row) { @@ -7753,6 +7637,7 @@ ha_innobase::build_template( index = whole_row ? clust_index : m_prebuilt->index; + m_prebuilt->versioned_write = table->versioned_write(VERS_TRX_ID); m_prebuilt->need_to_access_clustered = (index == clust_index); /* Either m_prebuilt->index should be a secondary index, or it @@ -8174,19 +8059,11 @@ ha_innobase::write_row( if (high_level_read_only) { ib_senderrf(ha_thd(), IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE); DBUG_RETURN(HA_ERR_TABLE_READONLY); - } else if (UNIV_UNLIKELY(m_prebuilt->trx != trx)) { - ib::error() << "The transaction object for the table handle is" - " at " << static_cast<const void*>(m_prebuilt->trx) - << ", but for the current thread it is at " - << static_cast<const void*>(trx); - - fputs("InnoDB: Dump of 200 bytes around m_prebuilt: ", stderr); - ut_print_buf(stderr, ((const byte*) m_prebuilt) - 100, 200); - fputs("\nInnoDB: Dump of 200 bytes around ha_data: ", stderr); - ut_print_buf(stderr, ((const byte*) trx) - 100, 200); - putc('\n', stderr); - ut_error; - } else if (!trx_is_started(trx)) { + } + + ut_a(m_prebuilt->trx == trx); + + if (!trx_is_started(trx)) { ++trx->will_lock; } @@ -8200,6 +8077,7 @@ ha_innobase::write_row( } #endif /* WITH_WSREP */ + ins_mode_t vers_set_fields; /* Handling of Auto-Increment Columns. 
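The generated value is stepped so that it lands on the configured auto_increment_offset modulo auto_increment_increment. The helper below is a simplified illustration of that stepping, not the engine's actual innobase_next_autoinc() routine, which additionally has to guard against overflow:

#include <cstdio>

static unsigned long long next_autoinc(unsigned long long current,
				       unsigned long long increment,
				       unsigned long long offset)
{
	unsigned long long candidate = current + 1;
	if (candidate <= offset) {
		return offset;
	}
	// round candidate up to the next value of the form offset + k * increment
	unsigned long long steps = (candidate - offset + increment - 1) / increment;
	return offset + steps * increment;
}

int main()
{
	// auto_increment_increment=5, auto_increment_offset=3 gives 3, 8, 13, 18
	unsigned long long value = 0;
	for (int i = 0; i < 4; i++) {
		value = next_autoinc(value, 5, 3);
		std::printf("%llu ", value);
	}
	std::printf("\n");
}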
*/ if (table->next_number_field && record == table->record[0]) { @@ -8248,8 +8126,11 @@ ha_innobase::write_row( innobase_srv_conc_enter_innodb(m_prebuilt); + vers_set_fields = table->versioned_write(VERS_TRX_ID) ? + ROW_INS_VERSIONED : ROW_INS_NORMAL; + /* Execute insert graph that will result in actual insert. */ - error = row_insert_for_mysql((byte*) record, m_prebuilt); + error = row_insert_for_mysql((byte*) record, m_prebuilt, vers_set_fields); DEBUG_SYNC(m_user_thd, "ib_after_row_insert"); @@ -8461,7 +8342,7 @@ dberr_t calc_row_difference( upd_t* uvect, const uchar* old_row, - uchar* new_row, + const uchar* new_row, TABLE* table, uchar* upd_buff, ulint buff_len, @@ -8565,7 +8446,7 @@ calc_row_difference( if (field_mysql_type == MYSQL_TYPE_LONGLONG && prebuilt->table->fts && innobase_strcasecmp( - field->field_name, FTS_DOC_ID_COL_NAME) == 0) { + field->field_name.str, FTS_DOC_ID_COL_NAME) == 0) { doc_id = (doc_id_t) mach_read_from_n_little_endian( n_ptr, 8); if (doc_id == 0) { @@ -8646,9 +8527,7 @@ calc_row_difference( /* The field has changed */ ufield = uvect->fields + n_changed; -#ifdef HAVE_valgrind_or_MSAN MEM_UNDEFINED(ufield, sizeof *ufield); -#endif /* HAVE_valgrind_or_MSAN */ /* Let us use a dummy dfield to make the conversion from the MySQL column format to the InnoDB format */ @@ -8864,8 +8743,7 @@ wsrep_calc_row_hash( const uchar* row, /*!< in: row in MySQL format */ TABLE* table, /*!< in: table in MySQL data dictionary */ - row_prebuilt_t* prebuilt, /*!< in: InnoDB prebuilt struct */ - THD* thd) /*!< in: user thread */ + row_prebuilt_t* prebuilt) /*!< in: InnoDB prebuilt struct */ { ulint len; const byte* ptr; @@ -8947,7 +8825,7 @@ if its index columns are updated! int ha_innobase::update_row( const uchar* old_row, - uchar* new_row) + const uchar* new_row) { int err; @@ -9006,15 +8884,48 @@ ha_innobase::update_row( MySQL that the row is not really updated and it should not increase the count of updated rows. This is fix for http://bugs.mysql.com/29157 */ + if (m_prebuilt->versioned_write + && thd_sql_command(m_user_thd) != SQLCOM_ALTER_TABLE + /* Multiple UPDATE of same rows in single transaction create + historical rows only once. */ + && trx->id != table->vers_start_id()) { + error = row_insert_for_mysql((byte*) old_row, + m_prebuilt, + ROW_INS_HISTORICAL); + if (error != DB_SUCCESS) { + goto func_exit; + } + innobase_srv_conc_exit_innodb(m_prebuilt); + innobase_active_small(); + } DBUG_RETURN(HA_ERR_RECORD_IS_THE_SAME); - } + } else { + const bool vers_set_fields = m_prebuilt->versioned_write + && m_prebuilt->upd_node->update->affects_versioned(); + const bool vers_ins_row = vers_set_fields + && thd_sql_command(m_user_thd) != SQLCOM_ALTER_TABLE; + + /* This is not a delete */ + m_prebuilt->upd_node->is_delete = + (vers_set_fields && !vers_ins_row) || + (thd_sql_command(m_user_thd) == SQLCOM_DELETE && + table->versioned(VERS_TIMESTAMP)) + ? VERSIONED_DELETE + : NO_DELETE; - /* This is not a delete */ - m_prebuilt->upd_node->is_delete = FALSE; + innobase_srv_conc_enter_innodb(m_prebuilt); - innobase_srv_conc_enter_innodb(m_prebuilt); + error = row_update_for_mysql(m_prebuilt); - error = row_update_for_mysql(m_prebuilt); + if (error == DB_SUCCESS && vers_ins_row + /* Multiple UPDATE of same rows in single transaction create + historical rows only once. 
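In other words, the old version of the row is copied into history only when it was last written by some other transaction; once the current transaction owns the row, further updates inside it keep changing the same version in place. A small model of that guard, with simplified types instead of the real row and undo machinery:

#include <cstdio>
#include <cstdint>

struct row     { std::uint64_t row_start; int value; };	// row_start: creating trx id
struct history { int copies = 0; };

static void versioned_update(row& r, int new_value,
			     std::uint64_t trx_id, history& h)
{
	if (r.row_start != trx_id) {
		h.copies++;		// like row_insert_for_mysql(..., ROW_INS_HISTORICAL)
		r.row_start = trx_id;	// the current version now belongs to this trx
	}
	r.value = new_value;		// the UPDATE itself
}

int main()
{
	row r{1, 10};
	history h;
	versioned_update(r, 11, 42, h);	// first UPDATE in trx 42: one history copy
	versioned_update(r, 12, 42, h);	// second UPDATE in the same trx: no new copy
	std::printf("history copies: %d, value: %d\n", h.copies, r.value);
}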
*/ + && trx->id != table->vers_start_id()) { + error = row_insert_for_mysql((byte*) old_row, + m_prebuilt, + ROW_INS_HISTORICAL); + } + } if (error == DB_SUCCESS && autoinc) { /* A value for an AUTO_INCREMENT column @@ -9116,8 +9027,11 @@ ha_innobase::delete_row( } /* This is a delete */ - - m_prebuilt->upd_node->is_delete = TRUE; + m_prebuilt->upd_node->is_delete = table->versioned_write(VERS_TRX_ID) + && table->vers_end_field()->is_max() + && trx->id != table->vers_start_id() + ? VERSIONED_DELETE + : PLAIN_DELETE; innobase_srv_conc_enter_innodb(m_prebuilt); @@ -9233,8 +9147,7 @@ int ha_innobase::index_init( /*====================*/ uint keynr, /*!< in: key (index) number */ - bool sorted) /*!< in: 1 if result MUST be sorted - according to index */ + bool) { DBUG_ENTER("index_init"); @@ -9420,8 +9333,7 @@ ha_innobase::index_read( m_prebuilt->srch_key_val_len, index, (byte*) key_ptr, - (ulint) key_len, - m_prebuilt->trx); + (ulint) key_len); DBUG_ASSERT(m_prebuilt->search_tuple->n_fields > 0); } else { @@ -9556,7 +9468,7 @@ ha_innobase::innobase_get_index( if (keynr != MAX_KEY && table->s->keys > 0) { key = &table->key_info[keynr]; - index = dict_table_get_index_on_name(ib_table, key->name); + index = dict_table_get_index_on_name(ib_table, key->name.str); } else { index = dict_table_get_first_index(ib_table); } @@ -9565,7 +9477,7 @@ ha_innobase::innobase_get_index( sql_print_error( "InnoDB could not find key no %u with name %s" " from dict cache for table %s", - keynr, key ? key->name : "NULL", + keynr, key ? key->name.str : "NULL", ib_table->name.m_name); } @@ -9651,7 +9563,7 @@ ha_innobase::change_active_index( for (uint i = 0; i < table->s->fields; i++) { if (m_prebuilt->read_just_key && bitmap_is_set(table->read_set, i) - && !strcmp(table->s->field[i]->field_name, + && !strcmp(table->s->field[i]->field_name.str, FTS_DOC_ID_COL_NAME)) { m_prebuilt->fts_doc_id_in_read_set = true; break; @@ -9710,9 +9622,7 @@ ha_innobase::general_fetch( } else if (m_prebuilt->table->corrupted) { DBUG_RETURN(HA_ERR_CRASHED); } else { - FilSpace space(m_prebuilt->table->space, true); - - DBUG_RETURN(space() + DBUG_RETURN(m_prebuilt->table->space ? 
HA_ERR_DECRYPTION_FAILED : HA_ERR_NO_SUCH_TABLE); } @@ -9798,8 +9708,7 @@ int ha_innobase::index_next_same( /*=========================*/ uchar* buf, /*!< in/out: buffer for the row */ - const uchar* key, /*!< in: key value */ - uint keylen) /*!< in: key value length */ + const uchar*, uint) { return(general_fetch(buf, ROW_SEL_NEXT, m_last_match_mode)); } @@ -9957,7 +9866,7 @@ ha_innobase::rnd_pos( /* Note that we assume the length of the row reference is fixed for the table, and it is == ref_length */ - int error = index_read(buf, pos, ref_length, HA_READ_KEY_EXACT); + int error = index_read(buf, pos, (uint)ref_length, HA_READ_KEY_EXACT); if (error != 0) { DBUG_PRINT("error", ("Got error: %d", error)); @@ -10062,7 +9971,7 @@ ha_innobase::ft_init_ext( } /* If tablespace is discarded, we should return here */ - if (dict_table_is_discarded(ft_table)) { + if (!ft_table->space) { my_error(ER_TABLESPACE_MISSING, MYF(0), table->s->db.str, table->s->table_name.str); return(NULL); @@ -10419,7 +10328,7 @@ wsrep_append_foreign_key( } ut_a(idx); - key[0] = (char)i; + key[0] = byte(i); rcode = wsrep_rec_get_foreign_key( &key[1], &len, rec, index, idx, @@ -10507,7 +10416,6 @@ wsrep_append_key( THD *thd, trx_t *trx, TABLE_SHARE *table_share, - TABLE *table, const uchar* key, uint16_t key_len, wsrep_key_type key_type /*!< in: access type of this key @@ -10622,12 +10530,11 @@ ha_innobase::wsrep_append_keys( if (!is_null) { rcode = wsrep_append_key( - thd, trx, table_share, table, keyval, + thd, trx, table_share, keyval, len, key_type); + if (rcode) DBUG_RETURN(rcode); - } - else - { + } else { WSREP_DEBUG("NULL key skipped (proto 0): %s", wsrep_thd_query(thd)); } @@ -10661,7 +10568,7 @@ ha_innobase::wsrep_append_keys( if (!tab) { WSREP_WARN("MariaDB-InnoDB key mismatch %s %s", table->s->table_name.str, - key_info->name); + key_info->name.str); } /* !hasPK == table with no PK, must append all non-unique keys */ if (!hasPK || key_info->flags & HA_NOSAME || @@ -10675,7 +10582,7 @@ ha_innobase::wsrep_append_keys( record0, &is_null); if (!is_null) { rcode = wsrep_append_key( - thd, trx, table_share, table, + thd, trx, table_share, keyval0, len+1, key_type); if (rcode) DBUG_RETURN(rcode); @@ -10697,9 +10604,8 @@ ha_innobase::wsrep_append_keys( && memcmp(key0, key1, len)) { rcode = wsrep_append_key( thd, trx, table_share, - table, keyval1, len+1, - key_type); + key_type); if (rcode) DBUG_RETURN(rcode); } } @@ -10710,22 +10616,20 @@ ha_innobase::wsrep_append_keys( /* if no PK, calculate hash of full row, to be the key value */ if (!key_appended && wsrep_certify_nonPK) { uchar digest[16]; - int rcode; - wsrep_calc_row_hash(digest, record0, table, m_prebuilt, thd); - if ((rcode = wsrep_append_key(thd, trx, table_share, table, - (const uchar*) digest, 16, - key_type))) { + wsrep_calc_row_hash(digest, record0, table, m_prebuilt); + + if (int rcode = wsrep_append_key(thd, trx, table_share, + digest, 16, key_type)) { DBUG_RETURN(rcode); } if (record1) { wsrep_calc_row_hash( - digest, record1, table, m_prebuilt, thd); - if ((rcode = wsrep_append_key(thd, trx, table_share, - table, - (const uchar*) digest, - 16, key_type))) { + digest, record1, table, m_prebuilt); + if (int rcode = wsrep_append_key(thd, trx, table_share, + digest, 16, + key_type)) { DBUG_RETURN(rcode); } } @@ -10806,7 +10710,7 @@ create_table_check_doc_id_col( col_len = field->pack_length(); - if (innobase_strcasecmp(field->field_name, + if (innobase_strcasecmp(field->field_name.str, FTS_DOC_ID_COL_NAME) == 0) { /* Note the name is case sensitive due to 
@@ -10814,7 +10718,7 @@ create_table_check_doc_id_col( if (col_type == DATA_INT && !field->real_maybe_null() && col_len == sizeof(doc_id_t) - && (strcmp(field->field_name, + && (strcmp(field->field_name.str, FTS_DOC_ID_COL_NAME) == 0)) { *doc_id_col = i; } else { @@ -10826,7 +10730,7 @@ create_table_check_doc_id_col( " of BIGINT NOT NULL type, and named" " in all capitalized characters"); my_error(ER_WRONG_COLUMN_NAME, MYF(0), - field->field_name); + field->field_name.str); *doc_id_col = ULINT_UNDEFINED; } @@ -10884,7 +10788,7 @@ innodb_base_col_setup( const Field* field, dict_v_col_t* v_col) { - int n = 0; + ulint n = 0; prepare_vcol_for_base_setup(table, field, v_col); @@ -10897,7 +10801,7 @@ innodb_base_col_setup( for (z = 0; z < table->n_cols; z++) { const char* name = dict_table_get_col_name(table, z); if (!innobase_strcasecmp(name, - base_field->field_name)) { + base_field->field_name.str)) { break; } } @@ -10936,7 +10840,7 @@ innodb_base_col_setup_for_stored( const char* name = dict_table_get_col_name( table, z); if (!innobase_strcasecmp( - name, base_field->field_name)) { + name, base_field->field_name.str)) { break; } } @@ -10971,7 +10875,6 @@ create_table_info_t::create_table_def() ulint doc_id_col = 0; ibool has_doc_id_col = FALSE; mem_heap_t* heap; - ulint space_id = 0; ha_table_option_struct *options= m_form->s->option_struct; dberr_t err = DB_SUCCESS; @@ -11019,24 +10922,17 @@ create_table_info_t::create_table_def() /* Raise error if the Doc ID column is of wrong type or name */ if (doc_id_col == ULINT_UNDEFINED) { - - err = DB_ERROR; -error_ret: - DBUG_RETURN(convert_error_code_to_mysql(err, m_flags, - m_thd)); + DBUG_RETURN(HA_ERR_GENERIC); } else { has_doc_id_col = TRUE; } } - /* For single-table tablespaces, we pass 0 as the space id, and then - determine the actual space id when the tablespace is created. */ - /* Adjust the number of columns for the FTS hidden field */ const ulint actual_n_cols = n_cols + (m_flags2 & DICT_TF2_FTS && !has_doc_id_col); - table = dict_mem_table_create(m_table_name, space_id, + table = dict_mem_table_create(m_table_name, NULL, actual_n_cols, num_v, m_flags, m_flags2); /* Set the hidden doc_id column. */ @@ -11057,8 +10953,25 @@ error_ret: heap = mem_heap_create(1000); + ut_d(bool have_vers_start = false); + ut_d(bool have_vers_end = false); + for (ulint i = 0, j = 0; j < n_cols; i++) { Field* field = m_form->field[i]; + ulint vers_row = 0; + + if (m_form->versioned()) { + if (i == m_form->s->row_start_field) { + vers_row = DATA_VERS_START; + ut_d(have_vers_start = true); + } else if (i == m_form->s->row_end_field) { + vers_row = DATA_VERS_END; + ut_d(have_vers_end = true); + } else if (!(field->flags + & VERS_UPDATE_UNVERSIONED_FLAG)) { + vers_row = DATA_VERSIONED; + } + } col_type = get_innobase_type_from_mysql_type( &unsigned_type, field); @@ -11072,7 +10985,7 @@ error_ret: " column type and try to re-create" " the table with an appropriate" " column type.", - table->name.m_name, field->field_name); + table->name.m_name, field->field_name.str); goto err_col; } @@ -11125,35 +11038,36 @@ error_ret: /* First check whether the column to be added has a system reserved name. 
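The reserved names in question are the hidden system columns that InnoDB itself adds to every clustered index, DB_ROW_ID, DB_TRX_ID and DB_ROLL_PTR. A simplified, case-insensitive stand-in for that check (the real dict_col_name_is_reserved() may cover further names):

#include <cctype>
#include <cstdio>

static bool eq_nocase(const char* a, const char* b)
{
	for (; *a && *b; a++, b++) {
		if (std::tolower((unsigned char) *a) != std::tolower((unsigned char) *b)) {
			return false;
		}
	}
	return *a == *b;
}

static bool col_name_is_reserved(const char* name)
{
	static const char* const reserved[] = {"DB_ROW_ID", "DB_TRX_ID", "DB_ROLL_PTR"};
	for (const char* r : reserved) {
		if (eq_nocase(name, r)) {
			return true;
		}
	}
	return false;
}

int main()
{
	std::printf("%d %d\n",
		    col_name_is_reserved("db_trx_id"),		// 1: rejected
		    col_name_is_reserved("customer_id"));	// 0: allowed
}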
*/ - if (dict_col_name_is_reserved(field->field_name)){ + if (dict_col_name_is_reserved(field->field_name.str)){ my_error(ER_WRONG_COLUMN_NAME, MYF(0), - field->field_name); + field->field_name.str); err_col: dict_mem_table_free(table); mem_heap_free(heap); - - err = DB_ERROR; - goto error_ret; + ut_ad(trx_state_eq(m_trx, TRX_STATE_NOT_STARTED)); + DBUG_RETURN(HA_ERR_GENERIC); } ulint is_virtual = !field->stored_in_db() ? DATA_VIRTUAL : 0; if (!is_virtual) { dict_mem_table_add_col(table, heap, - field->field_name, col_type, + field->field_name.str, col_type, dtype_form_prtype( (ulint) field->type() | nulls_allowed | unsigned_type - | binary_type | long_true_varchar, + | binary_type | long_true_varchar + | vers_row, charset_no), col_len); } else if (!omit_virtual) { dict_mem_table_add_v_col(table, heap, - field->field_name, col_type, + field->field_name.str, col_type, dtype_form_prtype( (ulint) field->type() | nulls_allowed | unsigned_type | binary_type | long_true_varchar + | vers_row | is_virtual, charset_no), col_len, i, 0); @@ -11173,6 +11087,10 @@ err_col: j++; } + ut_ad(have_vers_start == have_vers_end); + ut_ad(table->versioned() == have_vers_start); + ut_ad(!table->versioned() || table->vers_start != table->vers_end); + if (num_v) { for (ulint i = 0, j = 0; i < n_cols; i++) { dict_v_col_t* v_col; @@ -11219,36 +11137,28 @@ err_col: fts_add_doc_id_column(table, heap); } - /* If temp table, then we avoid creation of entries in SYSTEM TABLES. - Given that temp table lifetime is limited to connection/server lifetime - on re-start we don't need to restore temp-table and so no entry is - needed in SYSTEM tables. */ - if (table->is_temporary()) { + dict_table_add_system_columns(table, heap); + if (table->is_temporary()) { if ((options->encryption == 1 && !innodb_encrypt_temporary_tables) || (options->encryption == 2 && innodb_encrypt_temporary_tables)) { - push_warning_printf(m_thd, Sql_condition::WARN_LEVEL_WARN, + push_warning_printf(m_thd, + Sql_condition::WARN_LEVEL_WARN, ER_ILLEGAL_HA_CREATE_OPTION, "Ignoring encryption parameter during " "temporary table creation."); } - /* Get a new table ID */ + /* Get a new table ID. FIXME: Make this a private + sequence, not shared with persistent tables! */ dict_table_assign_new_id(table, m_trx); - table->space = SRV_TMP_SPACE_ID; - - /* Temp-table are maintained in memory and so - can_be_evicted is FALSE. 
*/ - mem_heap_t* temp_table_heap = mem_heap_create(256); - - dict_table_add_to_cache(table, FALSE, temp_table_heap); - - DBUG_EXECUTE_IF("ib_ddl_crash_during_create2", - DBUG_SUICIDE();); - - mem_heap_free(temp_table_heap); + ut_ad(dict_tf_get_rec_format(table->flags) + != REC_FORMAT_COMPRESSED); + table->space_id = SRV_TMP_SPACE_ID; + table->space = fil_system.temp_space; + table->add_to_cache(); } else { if (err == DB_SUCCESS) { err = row_create_table_for_mysql( @@ -11267,7 +11177,15 @@ err_col: DBUG_EXECUTE_IF("ib_create_err_tablespace_exist", err = DB_TABLESPACE_EXISTS;); - if (err == DB_DUPLICATE_KEY || err == DB_TABLESPACE_EXISTS) { + switch (err) { + case DB_SUCCESS: + ut_ad(table); + m_table = table; + DBUG_RETURN(0); + default: + break; + case DB_DUPLICATE_KEY: + case DB_TABLESPACE_EXISTS: char display_name[FN_REFLEN]; char* buf_end = innobase_convert_identifier( display_name, sizeof(display_name) - 1, @@ -11281,7 +11199,7 @@ err_col: : ER_TABLESPACE_EXISTS, MYF(0), display_name); } - goto error_ret; + DBUG_RETURN(convert_error_code_to_mysql(err, m_flags, m_thd)); } /*****************************************************************//** @@ -11293,14 +11211,12 @@ create_index( trx_t* trx, /*!< in: InnoDB transaction handle */ const TABLE* form, /*!< in: information on table columns and indexes */ - ulint flags, /*!< in: InnoDB table flags */ - const char* table_name, /*!< in: table name */ + dict_table_t* table, /*!< in,out: table */ uint key_num) /*!< in: index number */ { dict_index_t* index; int error; const KEY* key; - ulint ind_type; ulint* field_lengths; DBUG_ENTER("create_index"); @@ -11308,19 +11224,15 @@ create_index( key = form->key_info + key_num; /* Assert that "GEN_CLUST_INDEX" cannot be used as non-primary index */ - ut_a(innobase_strcasecmp(key->name, innobase_index_reserve_name) != 0); - - ind_type = 0; - if (key->flags & HA_SPATIAL) { - ind_type = DICT_SPATIAL; - } else if (key->flags & HA_FULLTEXT) { - ind_type = DICT_FTS; - } - - if (ind_type != 0) - { - index = dict_mem_index_create(table_name, key->name, 0, - ind_type, + ut_a(innobase_strcasecmp(key->name.str, innobase_index_reserve_name) != 0); + + if (key->flags & (HA_SPATIAL | HA_FULLTEXT)) { + /* Only one of these can be specified at a time. */ + ut_ad(~key->flags & (HA_SPATIAL | HA_FULLTEXT)); + ut_ad(!(key->flags & HA_NOSAME)); + index = dict_mem_index_create(table, key->name.str, + (key->flags & HA_SPATIAL) + ? 
DICT_SPATIAL : DICT_FTS, key->user_defined_key_parts); for (ulint i = 0; i < key->user_defined_key_parts; i++) { @@ -11333,17 +11245,17 @@ create_index( DBUG_RETURN(HA_ERR_UNSUPPORTED); } - dict_mem_index_add_field(index, field->field_name, 0); + dict_mem_index_add_field(index, field->field_name.str, + 0); } DBUG_RETURN(convert_error_code_to_mysql( row_create_index_for_mysql( index, trx, NULL), - flags, NULL)); - + table->flags, NULL)); } - ind_type = 0; + ulint ind_type = 0; if (key_num == form->s->primary_key) { ind_type |= DICT_CLUSTERED; @@ -11360,7 +11272,7 @@ create_index( /* We pass 0 as the space id, and determine at a lower level the space id where to store the table */ - index = dict_mem_index_create(table_name, key->name, 0, + index = dict_mem_index_create(table, key->name.str, ind_type, key->user_defined_key_parts); for (ulint i = 0; i < key->user_defined_key_parts; i++) { @@ -11385,7 +11297,7 @@ create_index( if (field == NULL) ut_error; - const char* field_name = key_part->field->field_name; + const char* field_name = key_part->field->field_name.str; col_type = get_innobase_type_from_mysql_type( &is_unsigned, key_part->field); @@ -11410,8 +11322,8 @@ create_index( " prefix index field, on an" " inappropriate data type. Table" " name %s, column name %s.", - table_name, - key_part->field->field_name); + form->s->table_name.str, + key_part->field->field_name.str); prefix_len = 0; } @@ -11433,6 +11345,7 @@ create_index( /* Even though we've defined max_supported_key_part_length, we still do our own checking using field_lengths to be absolutely sure we don't create too long indexes. */ + ulint flags = table->flags; error = convert_error_code_to_mysql( row_create_index_for_mysql(index, trx, field_lengths), @@ -11443,31 +11356,6 @@ create_index( DBUG_RETURN(error); } -/*****************************************************************//** -Creates an index to an InnoDB table when the user has defined no -primary index. */ -inline -int -create_clustered_index_when_no_primary( -/*===================================*/ - trx_t* trx, /*!< in: InnoDB transaction handle */ - ulint flags, /*!< in: InnoDB table flags */ - const char* table_name) /*!< in: table name */ -{ - dict_index_t* index; - dberr_t error; - - /* We pass 0 as the space id, and determine at a lower level the space - id where to store the table */ - index = dict_mem_index_create(table_name, - innobase_index_reserve_name, - 0, DICT_CLUSTERED, 0); - - error = row_create_index_for_mysql(index, trx, NULL); - - return(convert_error_code_to_mysql(error, flags, NULL)); -} - /** Return a display name for the row format @param[in] row_format Row Format @return row format name */ @@ -11572,12 +11460,12 @@ create_table_info_t::create_options_are_invalid() case 8: case 16: /* The maximum KEY_BLOCK_SIZE (KBS) is - UNIV_PAGE_SIZE_MAX. But if UNIV_PAGE_SIZE is + UNIV_PAGE_SIZE_MAX. But if srv_page_size is smaller than UNIV_PAGE_SIZE_MAX, the maximum KBS is also smaller. 
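The cap is derived from "ssize" shift values, where a size of 512 << ssize bytes is stored as ssize, so 1U << (ssize - 1) is the same size expressed in KiB. A standalone illustration of that arithmetic, assuming a 16 KiB page; the real limits come from UNIV_PAGE_SSIZE_MAX and PAGE_ZIP_SSIZE_MAX:

#include <algorithm>
#include <cstdio>

static unsigned ssize_of(unsigned bytes)	// inverse of: bytes = 512 << ssize
{
	unsigned ssize = 0;
	while ((512u << ssize) < bytes) {
		ssize++;
	}
	return ssize;
}

int main()
{
	const unsigned page_ssize_max = ssize_of(16384);	// like UNIV_PAGE_SSIZE_MAX
	const unsigned zip_ssize_max  = ssize_of(16384);	// like PAGE_ZIP_SSIZE_MAX
	const unsigned kbs_max_kib    = std::min(1u << (page_ssize_max - 1),
						 1u << (zip_ssize_max - 1));
	std::printf("maximum KEY_BLOCK_SIZE: %u KiB\n", kbs_max_kib);	// 16
}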
*/ kbs_max = ut_min( - 1 << (UNIV_PAGE_SSIZE_MAX - 1), - 1 << (PAGE_ZIP_SSIZE_MAX - 1)); + 1U << (UNIV_PAGE_SSIZE_MAX - 1), + 1U << (PAGE_ZIP_SSIZE_MAX - 1)); if (m_create_info->key_block_size > kbs_max) { push_warning_printf( m_thd, Sql_condition::WARN_LEVEL_WARN, @@ -11598,14 +11486,6 @@ create_table_info_t::create_options_are_invalid() " innodb_file_per_table."); ret = "KEY_BLOCK_SIZE"; } - if (srv_file_format < UNIV_FORMAT_B) { - push_warning( - m_thd, Sql_condition::WARN_LEVEL_WARN, - ER_ILLEGAL_HA_CREATE_OPTION, - "InnoDB: KEY_BLOCK_SIZE requires" - " innodb_file_format > Antelope."); - ret = "KEY_BLOCK_SIZE"; - } break; default: push_warning_printf( @@ -11637,28 +11517,8 @@ create_table_info_t::create_options_are_invalid() get_row_format_name(row_format)); ret = "ROW_FORMAT"; } - if (srv_file_format < UNIV_FORMAT_B) { - push_warning_printf( - m_thd, Sql_condition::WARN_LEVEL_WARN, - ER_ILLEGAL_HA_CREATE_OPTION, - "InnoDB: ROW_FORMAT=%s requires" - " innodb_file_format > Antelope.", - get_row_format_name(row_format)); - ret = "ROW_FORMAT"; - } break; case ROW_TYPE_DYNAMIC: - if (!is_temp && srv_file_format < UNIV_FORMAT_B) { - push_warning_printf( - m_thd, Sql_condition::WARN_LEVEL_WARN, - ER_ILLEGAL_HA_CREATE_OPTION, - "InnoDB: ROW_FORMAT=%s requires" - " innodb_file_format > Antelope.", - get_row_format_name(row_format)); - ret = "ROW_FORMAT"; - } - /* ROW_FORMAT=DYNAMIC also shuns KEY_BLOCK_SIZE */ - /* fall through */ case ROW_TYPE_COMPACT: case ROW_TYPE_REDUNDANT: if (has_key_block_size) { @@ -11701,7 +11561,7 @@ create_table_info_t::create_options_are_invalid() /* Don't support compressed table when page size > 16k. */ if ((has_key_block_size || row_format == ROW_TYPE_COMPRESSED) - && UNIV_PAGE_SIZE > UNIV_PAGE_SIZE_DEF) { + && srv_page_size > UNIV_PAGE_SIZE_DEF) { push_warning(m_thd, Sql_condition::WARN_LEVEL_WARN, ER_ILLEGAL_HA_CREATE_OPTION, "InnoDB: Cannot create a COMPRESSED table" @@ -11826,15 +11686,6 @@ create_table_info_t::check_table_options() return "PAGE_COMPRESSED"; } - if (srv_file_format < UNIV_FORMAT_B) { - push_warning( - m_thd, Sql_condition::WARN_LEVEL_WARN, - HA_WRONG_CREATE_OPTION, - "InnoDB: PAGE_COMPRESSED requires" - " innodb_file_format > Antelope."); - return "PAGE_COMPRESSED"; - } - if (m_create_info->key_block_size) { push_warning( m_thd, Sql_condition::WARN_LEVEL_WARN, @@ -11884,7 +11735,7 @@ ha_innobase::update_create_info( create_info->auto_increment_value = stats.auto_increment_value; } - if (dict_table_is_temporary(m_prebuilt->table)) { + if (m_prebuilt->table->is_temporary()) { return; } @@ -11924,7 +11775,11 @@ innobase_fts_load_stopword( @return 0 if successful, otherwise, error number */ int create_table_info_t::parse_table_name( - const char* name) + const char* +#ifdef _WIN32 + name +#endif + ) { DBUG_ENTER("parse_table_name"); @@ -12006,10 +11861,6 @@ bool create_table_info_t::innobase_table_flags() ut_min(static_cast<ulint>(UNIV_PAGE_SSIZE_MAX), static_cast<ulint>(PAGE_ZIP_SSIZE_MAX)); - /* Cache the value of innodb_file_format, in case it is - modified by another thread while the table is being created. */ - const ulint file_format_allowed = srv_file_format; - /* Cache the value of innobase_compression_level, in case it is modified by another thread while the table is being created. 
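Copying a dynamically settable global into a local at the start of the statement is the usual way to keep a long operation self-consistent. A minimal model of that pattern, with std::atomic standing in for the server's plugin variable handling:

#include <atomic>
#include <cstdio>

static std::atomic<unsigned> page_zip_level{6};	// stand-in for the global setting

static void create_table_like_operation()
{
	const unsigned default_compression_level = page_zip_level.load();
	// only the local copy is consulted from here on, so a concurrent
	// SET GLOBAL cannot change the level half-way through the operation
	std::printf("using compression level %u\n", default_compression_level);
}

int main()
{
	create_table_like_operation();
}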
*/ const ulint default_compression_level = page_zip_level; @@ -12038,16 +11889,16 @@ bool create_table_info_t::innobase_table_flags() } } - if (innobase_strcasecmp(key->name, FTS_DOC_ID_INDEX_NAME)) { + if (innobase_strcasecmp(key->name.str, FTS_DOC_ID_INDEX_NAME)) { continue; } /* Do a pre-check on FTS DOC ID index */ if (!(key->flags & HA_NOSAME) - || strcmp(key->name, FTS_DOC_ID_INDEX_NAME) - || strcmp(key->key_part[0].field->field_name, + || strcmp(key->name.str, FTS_DOC_ID_INDEX_NAME) + || strcmp(key->key_part[0].field->field_name.str, FTS_DOC_ID_COL_NAME)) { - fts_doc_id_index_bad = key->name; + fts_doc_id_index_bad = key->name.str; } if (fts_doc_id_index_bad && (m_flags2 & DICT_TF2_FTS)) { @@ -12091,15 +11942,6 @@ index_bad: zip_allowed = false; } - if (file_format_allowed < UNIV_FORMAT_B) { - push_warning( - m_thd, Sql_condition::WARN_LEVEL_WARN, - ER_ILLEGAL_HA_CREATE_OPTION, - "InnoDB: KEY_BLOCK_SIZE requires" - " innodb_file_format > Antelope."); - zip_allowed = false; - } - if (!zip_allowed || zssize > zip_ssize_max) { push_warning_printf( @@ -12137,7 +11979,7 @@ index_bad: if (row_type == ROW_TYPE_COMPRESSED && zip_allowed) { /* ROW_FORMAT=COMPRESSED without KEY_BLOCK_SIZE implies half the maximum KEY_BLOCK_SIZE(*1k) or - UNIV_PAGE_SIZE, whichever is less. */ + srv_page_size, whichever is less. */ zip_ssize = zip_ssize_max - 1; } } @@ -12165,13 +12007,6 @@ index_bad: ER_ILLEGAL_HA_CREATE_OPTION, "InnoDB: ROW_FORMAT=COMPRESSED requires" " innodb_file_per_table."); - - } else if (file_format_allowed == UNIV_FORMAT_A) { - push_warning_printf( - m_thd, Sql_condition::WARN_LEVEL_WARN, - ER_ILLEGAL_HA_CREATE_OPTION, - "InnoDB: ROW_FORMAT=COMPRESSED requires" - " innodb_file_format > Antelope."); } else { innodb_row_format = REC_FORMAT_COMPRESSED; break; @@ -12195,7 +12030,7 @@ index_bad: } /* Don't support compressed table when page size > 16k. */ - if (zip_allowed && zip_ssize && UNIV_PAGE_SIZE > UNIV_PAGE_SIZE_DEF) { + if (zip_allowed && zip_ssize && srv_page_size > UNIV_PAGE_SIZE_DEF) { push_warning(m_thd, Sql_condition::WARN_LEVEL_WARN, ER_ILLEGAL_HA_CREATE_OPTION, "InnoDB: Cannot create a COMPRESSED table" @@ -12223,8 +12058,11 @@ index_bad: m_use_data_dir, options->page_compressed, options->page_compression_level == 0 ? 
- default_compression_level : static_cast<ulint>(options->page_compression_level), - 0); + default_compression_level : ulint(options->page_compression_level)); + + if (m_form->s->table_type == TABLE_TYPE_SEQUENCE) { + m_flags |= DICT_TF_MASK_NO_ROLLBACK; + } /* Set the flags2 when create table or alter tables */ m_flags2 |= DICT_TF2_FTS_AUX_HEX_NAME; @@ -12319,7 +12157,7 @@ innobase_parse_hint_from_comment( } /* update SYS_INDEX table */ - if (!dict_table_is_temporary(table)) { + if (!table->is_temporary()) { for (uint i = 0; i < table_share->keys; i++) { is_found[i] = false; } @@ -12345,7 +12183,7 @@ innobase_parse_hint_from_comment( KEY* key_info = &table_share->key_info[i]; if (innobase_strcasecmp( - index->name, key_info->name) == 0) { + index->name, key_info->name.str) == 0) { dict_index_set_merge_threshold( index, @@ -12387,7 +12225,7 @@ innobase_parse_hint_from_comment( KEY* key_info = &table_share->key_info[i]; if (innobase_strcasecmp( - index->name, key_info->name) == 0) { + index->name, key_info->name.str) == 0) { /* x-lock index is needed to exclude concurrent pessimistic tree operations */ @@ -12547,7 +12385,6 @@ int create_table_info_t::create_table(bool create_fk) int error; int primary_key_no; uint i; - dict_table_t* innobase_table = NULL; DBUG_ENTER("create_table"); @@ -12574,9 +12411,13 @@ int create_table_info_t::create_table(bool create_fk) /* Create an index which is used as the clustered index; order the rows by their row id which is internally generated by InnoDB */ - - error = create_clustered_index_when_no_primary( - m_trx, m_flags, m_table_name); + ulint flags = m_table->flags; + dict_index_t* index = dict_mem_index_create( + m_table, innobase_index_reserve_name, + DICT_CLUSTERED, 0); + error = convert_error_code_to_mysql( + row_create_index_for_mysql(index, m_trx, NULL), + flags, m_thd); if (error) { DBUG_RETURN(error); } @@ -12585,7 +12426,7 @@ int create_table_info_t::create_table(bool create_fk) if (primary_key_no != -1) { /* In InnoDB the clustered index must always be created first */ - if ((error = create_index(m_trx, m_form, m_flags, m_table_name, + if ((error = create_index(m_trx, m_form, m_table, (uint) primary_key_no))) { DBUG_RETURN(error); } @@ -12596,11 +12437,6 @@ int create_table_info_t::create_table(bool create_fk) if (m_flags2 & DICT_TF2_FTS) { fts_doc_id_index_enum ret; - innobase_table = dict_table_open_on_name( - m_table_name, TRUE, FALSE, DICT_ERR_IGNORE_NONE); - - ut_a(innobase_table); - /* Check whether there already exists FTS_DOC_ID_INDEX */ ret = innobase_fts_check_doc_id_index_in_def( m_form->s->keys, m_form->key_info); @@ -12619,13 +12455,12 @@ int create_table_info_t::create_table(bool create_fk) " make sure it is of correct" " type\n", FTS_DOC_ID_INDEX_NAME, - innobase_table->name.m_name); + m_table->name.m_name); - if (innobase_table->fts) { - fts_free(innobase_table); + if (m_table->fts) { + fts_free(m_table); } - dict_table_close(innobase_table, TRUE, FALSE); my_error(ER_WRONG_NAME_FOR_INDEX, MYF(0), FTS_DOC_ID_INDEX_NAME); DBUG_RETURN(-1); @@ -12635,37 +12470,31 @@ int create_table_info_t::create_table(bool create_fk) } dberr_t err = fts_create_common_tables( - m_trx, innobase_table, m_table_name, + m_trx, m_table, (ret == FTS_EXIST_DOC_ID_INDEX)); error = convert_error_code_to_mysql(err, 0, NULL); - dict_table_close(innobase_table, TRUE, FALSE); - if (error) { DBUG_RETURN(error); } } for (i = 0; i < m_form->s->keys; i++) { - - if (i != static_cast<uint>(primary_key_no)) { - - if ((error = create_index(m_trx, m_form, m_flags, - 
m_table_name, i))) { - DBUG_RETURN(error); - } + if (i != uint(primary_key_no) + && (error = create_index(m_trx, m_form, m_table, i))) { + DBUG_RETURN(error); } } /* Cache all the FTS indexes on this table in the FTS specific structure. They are used for FTS indexed column update handling. */ if (m_flags2 & DICT_TF2_FTS) { - fts_t* fts = innobase_table->fts; + fts_t* fts = m_table->fts; ut_a(fts != NULL); - dict_table_get_all_fts_indexes(innobase_table, fts->indexes); + dict_table_get_all_fts_indexes(m_table, fts->indexes); } size_t stmt_len; @@ -12728,18 +12557,9 @@ int create_table_info_t::create_table(bool create_fk) } } - innobase_table = dict_table_open_on_name(m_table_name, true, false, - DICT_ERR_IGNORE_NONE); - ut_ad(innobase_table); - /* In TRUNCATE TABLE, we will merely warn about the maximum row size being too large. */ - const bool is_acceptable = row_size_is_acceptable(*innobase_table, - create_fk); - - dict_table_close(innobase_table, true, false); - - if (!is_acceptable) { + if (!row_size_is_acceptable(*m_table, create_fk)) { DBUG_RETURN(convert_error_code_to_mysql( DB_TOO_BIG_RECORD, m_flags, NULL)); } @@ -12765,7 +12585,8 @@ dict_index_t::record_size_info_t dict_index_t::record_size_info() const /* maximum allowed size of a node pointer record */ ulint page_ptr_max; const bool comp= dict_table_is_comp(table); - const page_size_t page_size(dict_table_page_size(table)); + /* table->space == NULL after DISCARD TABLESPACE */ + const page_size_t page_size(dict_tf_get_page_size(table->flags)); record_size_info_t result; if (page_size.is_compressed() && @@ -12915,7 +12736,7 @@ static void ib_warn_row_too_big(THD *thd, const dict_table_t *table) /* FIXME: this row size check should be improved */ /* If prefix is true then a 768-byte prefix is stored locally for BLOB fields. Refer to dict_table_get_format() */ - const bool prefix= (dict_tf_get_format(table->flags) == UNIV_FORMAT_A); + const bool prefix= !dict_table_has_atomic_blobs(table); const ulint free_space= page_get_free_space_of_empty(table->flags & DICT_TF_COMPACT) / 2; @@ -13004,21 +12825,12 @@ create_table_info_t::create_table_update_dict() dict_stats_update(innobase_table, DICT_STATS_EMPTY_TABLE); - if (innobase_table) { - /* We update the highest file format in the system table - space, if this table has higher file format setting. */ - - trx_sys_file_format_max_upgrade( - (const char**) &innobase_file_format_max, - dict_table_get_format(innobase_table)); - } - /* Load server stopword into FTS cache */ if (m_flags2 & DICT_TF2_FTS) { if (!innobase_fts_load_stopword(innobase_table, NULL, m_thd)) { dict_table_close(innobase_table, FALSE, FALSE); srv_active_wake_master_thread(); - trx_free_for_mysql(m_trx); + m_trx->free(); DBUG_RETURN(-1); } @@ -13039,7 +12851,7 @@ create_table_info_t::create_table_update_dict() dict_table_autoinc_lock(innobase_table); dict_table_autoinc_initialize(innobase_table, autoinc); - if (dict_table_is_temporary(innobase_table)) { + if (innobase_table->is_temporary()) { /* AUTO_INCREMENT is not persistent for TEMPORARY TABLE. Temporary tables are never evicted. Keep the counter in memory only. 
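The distinction can be pictured with a toy table object: the counter is always set under a lock, but only a non-temporary table would ever write it back to persistent storage. The types and the persistent_autoinc field below are simplified stand-ins, not the real dictionary objects:

#include <cstdio>
#include <mutex>

struct table_def {
	bool			temporary;
	unsigned long long	autoinc = 0;
	unsigned long long	persistent_autoinc = 0;	// stand-in for the on-disk copy
};

static std::mutex autoinc_mutex;	// one lock for brevity; the engine locks per table

static void autoinc_initialize(table_def& t, unsigned long long value)
{
	std::lock_guard<std::mutex> lock(autoinc_mutex);
	t.autoinc = value;
	if (!t.temporary) {
		t.persistent_autoinc = value;	// a regular table persists the counter
	}
}

int main()
{
	table_def tmp{true};
	table_def normal{false};
	autoinc_initialize(tmp, 100);
	autoinc_initialize(normal, 100);
	std::printf("temporary persisted=%llu, normal persisted=%llu\n",
		    tmp.persistent_autoinc, normal.persistent_autoinc);
}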
*/ @@ -13101,6 +12913,10 @@ ha_innobase::create( DBUG_ENTER("ha_innobase::create"); + DBUG_ASSERT(form->s == table_share); + DBUG_ASSERT(table_share->table_type == TABLE_TYPE_SEQUENCE + || table_share->table_type == TABLE_TYPE_NORMAL); + create_table_info_t info(ha_thd(), form, create_info, @@ -13142,7 +12958,7 @@ ha_innobase::create( trx_rollback_for_mysql(trx); row_mysql_unlock_data_dictionary(trx); if (own_trx) { - trx_free_for_mysql(trx); + trx->free(); } DBUG_RETURN(error); } @@ -13151,7 +12967,7 @@ ha_innobase::create( row_mysql_unlock_data_dictionary(trx); if (own_trx) { - trx_free_for_mysql(trx); + trx->free(); } /* Flush the log to reduce probability that the .frm files and @@ -13205,10 +13021,7 @@ ha_innobase::discard_or_import_tablespace( DBUG_RETURN(HA_ERR_TABLE_READONLY); } - dict_table_t* dict_table = m_prebuilt->table; - - if (dict_table_is_temporary(dict_table)) { - + if (m_prebuilt->table->is_temporary()) { ib_senderrf( m_prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR, ER_CANNOT_DISCARD_TEMPORARY_TABLE); @@ -13216,11 +13029,11 @@ ha_innobase::discard_or_import_tablespace( DBUG_RETURN(HA_ERR_TABLE_NEEDS_UPGRADE); } - if (dict_table->space == srv_sys_space.space_id()) { + if (m_prebuilt->table->space == fil_system.sys_space) { ib_senderrf( m_prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR, ER_TABLE_IN_SYSTEM_TABLESPACE, - dict_table->name.m_name); + m_prebuilt->table->name.m_name); DBUG_RETURN(HA_ERR_TABLE_NEEDS_UPGRADE); } @@ -13229,7 +13042,7 @@ ha_innobase::discard_or_import_tablespace( /* Obtain an exclusive lock on the table. */ dberr_t err = row_mysql_lock_table( - m_prebuilt->trx, dict_table, LOCK_X, + m_prebuilt->trx, m_prebuilt->table, LOCK_X, discard ? "setting table lock for DISCARD TABLESPACE" : "setting table lock for IMPORT TABLESPACE"); @@ -13242,32 +13055,32 @@ ha_innobase::discard_or_import_tablespace( user may want to set the DISCARD flag in order to IMPORT a new tablespace. */ - if (!dict_table->is_readable()) { + if (!m_prebuilt->table->is_readable()) { ib_senderrf( m_prebuilt->trx->mysql_thd, IB_LOG_LEVEL_WARN, ER_TABLESPACE_MISSING, - dict_table->name.m_name); + m_prebuilt->table->name.m_name); } err = row_discard_tablespace_for_mysql( - dict_table->name.m_name, m_prebuilt->trx); + m_prebuilt->table->name.m_name, m_prebuilt->trx); - } else if (dict_table->is_readable()) { + } else if (m_prebuilt->table->is_readable()) { /* Commit the transaction in order to release the table lock. */ trx_commit_for_mysql(m_prebuilt->trx); ib::error() << "Unable to import tablespace " - << dict_table->name << " because it already" + << m_prebuilt->table->name << " because it already" " exists. Please DISCARD the tablespace" " before IMPORT."; ib_senderrf( m_prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR, - ER_TABLESPACE_EXISTS, dict_table->name.m_name); + ER_TABLESPACE_EXISTS, m_prebuilt->table->name.m_name); DBUG_RETURN(HA_ERR_TABLE_EXIST); } else { - err = row_import_for_mysql(dict_table, m_prebuilt); + err = row_import_for_mysql(m_prebuilt->table, m_prebuilt); if (err == DB_SUCCESS) { @@ -13283,12 +13096,35 @@ ha_innobase::discard_or_import_tablespace( /* Commit the transaction in order to release the table lock. */ trx_commit_for_mysql(m_prebuilt->trx); - if (err == DB_SUCCESS && !discard - && dict_stats_is_persistent_enabled(dict_table)) { + if (discard || err != DB_SUCCESS) { + DBUG_RETURN(convert_error_code_to_mysql( + err, m_prebuilt->table->flags, NULL)); + } + + /* Evict and reload the table definition in order to invoke + btr_cur_instant_init(). 
*/ + table_id_t id = m_prebuilt->table->id; + ut_ad(id); + mutex_enter(&dict_sys->mutex); + dict_table_close(m_prebuilt->table, TRUE, FALSE); + dict_table_remove_from_cache(m_prebuilt->table); + m_prebuilt->table = dict_table_open_on_id(id, TRUE, + DICT_TABLE_OP_NORMAL); + mutex_exit(&dict_sys->mutex); + if (!m_prebuilt->table) { + err = DB_TABLE_NOT_FOUND; + } else { + if (const Field* ai = table->found_next_number_field) { + initialize_auto_increment(m_prebuilt->table, ai); + } + dict_stats_init(m_prebuilt->table); + } + + if (dict_stats_is_persistent_enabled(m_prebuilt->table)) { dberr_t ret; /* Adjust the persistent statistics. */ - ret = dict_stats_update(dict_table, + ret = dict_stats_update(m_prebuilt->table, DICT_STATS_RECALC_PERSISTENT); if (ret != DB_SUCCESS) { @@ -13298,11 +13134,12 @@ ha_innobase::discard_or_import_tablespace( ER_ALTER_INFO, "Error updating stats for table '%s'" " after table rebuild: %s", - dict_table->name.m_name, ut_strerr(ret)); + m_prebuilt->table->name.m_name, + ut_strerr(ret)); } } - DBUG_RETURN(convert_error_code_to_mysql(err, dict_table->flags, NULL)); + DBUG_RETURN(0); } /** @@ -13353,7 +13190,7 @@ inline int ha_innobase::delete_table(const char* name, enum_sql_command sqlcom) iter != parent_trx->mod_tables.end(); ++iter) { - dict_table_t* table_to_drop = *iter; + dict_table_t* table_to_drop = iter->first; if (strcmp(norm_name, table_to_drop->name.m_name) == 0) { parent_trx->mod_tables.erase(table_to_drop); @@ -13473,7 +13310,7 @@ inline int ha_innobase::delete_table(const char* name, enum_sql_command sqlcom) innobase_commit_low(trx); - trx_free_for_mysql(trx); + trx->free(); DBUG_RETURN(convert_error_code_to_mysql(err, 0, NULL)); } @@ -13573,7 +13410,7 @@ innobase_drop_database( innobase_commit_low(trx); - trx_free_for_mysql(trx); + trx->free(); } /** Rename an InnoDB table. @@ -13643,11 +13480,6 @@ inline dberr_t innobase_rename_table(trx_t *trx, const char *from, goto func_exit; } - /* Transaction must be flagged as a locking transaction or it hasn't - been started yet. */ - - ut_a(trx->will_lock > 0); - error = row_rename_table_for_mysql(norm_from, norm_to, trx, commit, commit); @@ -13730,37 +13562,6 @@ int ha_innobase::truncate() update_thd(); - if (!srv_safe_truncate) { - if (!trx_is_started(m_prebuilt->trx)) { - ++m_prebuilt->trx->will_lock; - } - - dberr_t err = row_truncate_table_for_mysql( - m_prebuilt->table, m_prebuilt->trx); - - int error; - - switch (err) { - case DB_TABLESPACE_DELETED: - case DB_TABLESPACE_NOT_FOUND: - ib_senderrf( - m_prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR, - err == DB_TABLESPACE_DELETED - ? 
ER_TABLESPACE_DISCARDED - : ER_TABLESPACE_MISSING, - table->s->table_name.str); - error = HA_ERR_TABLESPACE_MISSING; - break; - default: - error = convert_error_code_to_mysql( - err, m_prebuilt->table->flags, - m_prebuilt->trx->mysql_thd); - break; - } - table->status = STATUS_NOT_FOUND; - DBUG_RETURN(error); - } - HA_CREATE_INFO info; mem_heap_t* heap = mem_heap_create(1000); dict_table_t* ib_table = m_prebuilt->table; @@ -13769,7 +13570,7 @@ int ha_innobase::truncate() info.init(); update_create_info_from_table(&info, table); - if (dict_table_is_temporary(ib_table)) { + if (ib_table->is_temporary()) { info.options|= HA_LEX_CREATE_TMP_TABLE; } else { dict_get_and_save_data_dir_path(ib_table, false); @@ -13820,7 +13621,7 @@ int ha_innobase::truncate() || dict_table_is_file_per_table(ib_table), trx); } - trx_free_for_mysql(trx); + trx->free(); if (!err) { /* Reopen the newly created table, and drop the @@ -13883,7 +13684,7 @@ ha_innobase::rename_table( innobase_commit_low(trx); - trx_free_for_mysql(trx); + trx->free(); if (error == DB_SUCCESS) { char norm_from[MAX_FULL_NAME_LEN]; @@ -13947,7 +13748,7 @@ ha_innobase::records_in_range( dict_index_t* index; dtuple_t* range_start; dtuple_t* range_end; - int64_t n_rows; + ha_rows n_rows; page_cur_mode_t mode1; page_cur_mode_t mode2; mem_heap_t* heap; @@ -13967,7 +13768,7 @@ ha_innobase::records_in_range( /* There exists possibility of not being able to find requested index due to inconsistency between MySQL and InoDB dictionary info. Necessary message should have been printed in innobase_get_index() */ - if (dict_table_is_discarded(m_prebuilt->table)) { + if (!m_prebuilt->table->space) { n_rows = HA_POS_ERROR; goto func_exit; } @@ -13999,8 +13800,7 @@ ha_innobase::records_in_range( m_prebuilt->srch_key_val_len, index, (byte*) (min_key ? min_key->key : (const uchar*) 0), - (ulint) (min_key ? min_key->length : 0), - m_prebuilt->trx); + (ulint) (min_key ? min_key->length : 0)); DBUG_ASSERT(min_key ? range_start->n_fields > 0 @@ -14012,8 +13812,7 @@ ha_innobase::records_in_range( m_prebuilt->srch_key_val_len, index, (byte*) (max_key ? max_key->key : (const uchar*) 0), - (ulint) (max_key ? max_key->length : 0), - m_prebuilt->trx); + (ulint) (max_key ? max_key->length : 0)); DBUG_ASSERT(max_key ? range_end->n_fields > 0 @@ -14097,8 +13896,8 @@ ha_innobase::estimate_rows_upper_bound() ut_a(stat_n_leaf_pages > 0); - local_data_file_length = - ((ulonglong) stat_n_leaf_pages) * UNIV_PAGE_SIZE; + local_data_file_length = ulonglong(stat_n_leaf_pages) + << srv_page_size_shift; /* Calculate a minimum length for a clustered index record and from that an upper bound for the number of rows. Since we only calculate @@ -14191,16 +13990,6 @@ ha_innobase::read_time( return(ranges + (double) rows / (double) total_rows * time_for_scan); } -/******************************************************************//** -Return the size of the InnoDB memory buffer. */ - -longlong -ha_innobase::get_memory_buffer_size() const -/*=======================================*/ -{ - return(innobase_buffer_pool_size); -} - /** Update the system variable with the given value of the InnoDB buffer pool size. @param[in] buf_pool_size given value of buffer pool size.*/ @@ -14214,7 +14003,7 @@ innodb_set_buf_pool_size(ulonglong buf_pool_size) Calculates the key number used inside MySQL for an Innobase index. 
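The mapping runs in the opposite direction from most lookups: given an engine-side index, find the MySQL key number by matching names, and report ~0U when the index has no MySQL counterpart, for example the automatically generated clustered index of a table without a primary key. A simplified stand-in with toy types:

#include <cstdio>
#include <cstring>
#include <vector>

struct mysql_key { const char* name; };

static unsigned key_number_for_index(const char* index_name,
				     bool auto_gen_clust,
				     const std::vector<mysql_key>& keys)
{
	if (auto_gen_clust) {
		return ~0U;	// GEN_CLUST_INDEX has no MySQL key number
	}
	for (unsigned i = 0; i < keys.size(); i++) {
		if (!std::strcmp(index_name, keys[i].name)) {
			return i;
		}
	}
	return ~0U;		// not visible to MySQL (e.g. an FTS auxiliary index)
}

int main()
{
	std::vector<mysql_key> keys = {{"PRIMARY"}, {"idx_a"}};
	std::printf("%u %u\n",
		    key_number_for_index("idx_a", false, keys),
		    key_number_for_index("GEN_CLUST_INDEX", true, keys));
}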
@return the key number used inside MySQL */ static -int +unsigned innobase_get_mysql_key_number_for_index( /*====================================*/ const TABLE* table, /*!< in: table in MySQL data @@ -14238,8 +14027,7 @@ innobase_get_mysql_key_number_for_index( i++; } - if (dict_index_is_clust(index) - && dict_index_is_auto_gen_clust(index)) { + if (dict_index_is_auto_gen_clust(index)) { ut_a(i > 0); i--; } @@ -14251,7 +14039,7 @@ innobase_get_mysql_key_number_for_index( structure and InnoDB dict_index_t list */ for (i = 0; i < table->s->keys; i++) { ind = dict_table_get_index_on_name( - ib_table, table->key_info[i].name); + ib_table, table->key_info[i].name.str); if (index == ind) { return(i); @@ -14274,13 +14062,13 @@ innobase_get_mysql_key_number_for_index( " index.", index->name()); } - return(-1); + return(~0U); } } ut_error; - return(-1); + return(~0U); } /*********************************************************************//** @@ -14358,6 +14146,48 @@ innodb_rec_per_key( return(rec_per_key); } +/** Calculate how many KiB of new data we will be able to insert to the +tablespace without running out of space. Start with a space object that has +been acquired by the caller who holds it for the calculation, +@param[in] space tablespace object from fil_space_acquire() +@return available space in KiB */ +static uintmax_t +fsp_get_available_space_in_free_extents(const fil_space_t& space) +{ + ulint size_in_header = space.size_in_header; + if (size_in_header < FSP_EXTENT_SIZE) { + return 0; /* TODO: count free frag pages and + return a value based on that */ + } + + /* Below we play safe when counting free extents above the free limit: + some of them will contain extent descriptor pages, and therefore + will not be free extents */ + ut_ad(size_in_header >= space.free_limit); + ulint n_free_up = + (size_in_header - space.free_limit) / FSP_EXTENT_SIZE; + + const ulint size = page_size_t(space.flags).physical(); + if (n_free_up > 0) { + n_free_up--; + n_free_up -= n_free_up / (size / FSP_EXTENT_SIZE); + } + + /* We reserve 1 extent + 0.5 % of the space size to undo logs + and 1 extent + 0.5 % to cleaning operations; NOTE: this source + code is duplicated in the function above! */ + + ulint reserve = 2 + ((size_in_header / FSP_EXTENT_SIZE) * 2) / 200; + ulint n_free = space.free_len + n_free_up; + + if (reserve > n_free) { + return(0); + } + + return(static_cast<uintmax_t>(n_free - reserve) + * FSP_EXTENT_SIZE * (size / 1024)); +} + /*********************************************************************//** Returns statistics information of the table to the MySQL interpreter, in various fields of the handle object. @@ -14445,6 +14275,8 @@ ha_innobase::info_low( stats.update_time = (ulong) ib_table->update_time; } + DBUG_EXECUTE_IF("dict_sys_mutex_avoid", goto func_exit;); + if (flag & HA_STATUS_VARIABLE) { ulint stat_clustered_index_size; @@ -14474,7 +14306,7 @@ ha_innobase::info_low( set. That way SHOW TABLE STATUS will show the best estimate, while the optimizer never sees the table empty. 
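The other figure assembled here is free space: fsp_get_available_space_in_free_extents(), added above, feeds stats.delete_length. The function below is a standalone model of the same arithmetic, with plain integers in place of the fil_space_t fields, so the sample numbers are illustrative only:

#include <cstdio>
#include <cstdint>

static std::uintmax_t available_space_kib(unsigned long size_in_header,
					  unsigned long free_limit,
					  unsigned long free_len,
					  unsigned long page_size,
					  unsigned long extent_size)
{
	if (size_in_header < extent_size) {
		return 0;	// too small to contain a full extent
	}

	unsigned long n_free_up = (size_in_header - free_limit) / extent_size;
	if (n_free_up > 0) {
		n_free_up--;	// play safe: some of these hold descriptor pages
		n_free_up -= n_free_up / (page_size / extent_size);
	}

	// 2 extents + 0.5% of the size are reserved for undo logs and cleanup
	unsigned long reserve = 2 + ((size_in_header / extent_size) * 2) / 200;
	unsigned long n_free = free_len + n_free_up;
	if (reserve > n_free) {
		return 0;
	}

	return std::uintmax_t(n_free - reserve) * extent_size * (page_size / 1024);
}

int main()
{
	// e.g. 40960 pages of 16 KiB (640 MiB), 64 pages per extent
	std::printf("%ju KiB available\n",
		    available_space_kib(40960, 38912, 10, 16384, 64));
}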
*/ - if (n_rows == 0 && !(flag & HA_STATUS_TIME)) { + if (n_rows == 0 && !(flag & (HA_STATUS_TIME | HA_STATUS_OPEN))) { n_rows++; } @@ -14495,69 +14327,23 @@ ha_innobase::info_low( m_prebuilt->autoinc_last_value = 0; } - const page_size_t& page_size - = dict_table_page_size(ib_table); - stats.records = (ha_rows) n_rows; stats.deleted = 0; - stats.data_file_length - = ((ulonglong) stat_clustered_index_size) - * page_size.physical(); - stats.index_file_length - = ((ulonglong) stat_sum_of_other_index_sizes) - * page_size.physical(); - - /* Since fsp_get_available_space_in_free_extents() is - acquiring latches inside InnoDB, we do not call it if we - are asked by MySQL to avoid locking. Another reason to - avoid the call is that it uses quite a lot of CPU. - See Bug#38185. */ - if (flag & HA_STATUS_NO_LOCK - || !(flag & HA_STATUS_VARIABLE_EXTRA)) { - /* We do not update delete_length if no - locking is requested so the "old" value can - remain. delete_length is initialized to 0 in - the ha_statistics' constructor. Also we only - need delete_length to be set when - HA_STATUS_VARIABLE_EXTRA is set */ - } else if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE) { - /* Avoid accessing the tablespace if - innodb_crash_recovery is set to a high value. */ - stats.delete_length = 0; - } else { - uintmax_t avail_space; - - avail_space = fsp_get_available_space_in_free_extents( - ib_table->space); - - if (avail_space == UINTMAX_MAX) { - THD* thd; - char errbuf[MYSYS_STRERROR_SIZE]; - - thd = ha_thd(); - - push_warning_printf( - thd, - Sql_condition::WARN_LEVEL_WARN, - ER_CANT_GET_STAT, - "InnoDB: Trying to get the free" - " space for table %s but its" - " tablespace has been discarded or" - " the .ibd file is missing. Setting" - " the free space to zero." - " (errno: %d - %s)", - ib_table->name.m_name, errno, - my_strerror(errbuf, sizeof(errbuf), - errno)); - - stats.delete_length = 0; - } else { - stats.delete_length = avail_space * 1024; - } + if (fil_space_t* space = ib_table->space) { + const ulint size = page_size_t(space->flags) + .physical(); + stats.data_file_length + = ulonglong(stat_clustered_index_size) + * size; + stats.index_file_length + = ulonglong(stat_sum_of_other_index_sizes) + * size; + stats.delete_length = 1024 + * fsp_get_available_space_in_free_extents( + *space); } - stats.check_time = 0; - stats.mrr_length_per_rec= ref_length + 8; // 8 = max(sizeof(void *)); + stats.mrr_length_per_rec= (uint)ref_length + 8; // 8 = max(sizeof(void *)); if (stats.records == 0) { stats.mean_rec_length = 0; @@ -14730,7 +14516,7 @@ ha_innobase::info_low( errkey = (unsigned int) ( (m_prebuilt->trx->error_key_num == ULINT_UNDEFINED) - ? ~0 + ? ~0U : m_prebuilt->trx->error_key_num); } } @@ -14764,10 +14550,7 @@ each index tree. This does NOT calculate exact statistics on the table. 
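The info_low() hunk above now derives the handler statistics directly from the attached tablespace object. A small standalone sketch of the unit conversions involved; the page counts, page size and free-space figure are illustrative stand-ins, not values from the diff:

#include <cassert>
#include <cstdint>

int main()
{
    // Illustrative inputs: index sizes as page counts (as kept by the
    // persistent statistics) and free space in KiB from the helper above.
    const uint64_t physical_page_size       = 16384;   // assumed 16 KiB pages
    const uint64_t clustered_index_pages    = 100;
    const uint64_t other_index_pages_total  = 40;
    const uint64_t available_free_space_kib = 57344;

    // Same conversions as the hunk: pages -> bytes for data_file_length and
    // index_file_length, KiB -> bytes for delete_length.
    assert(clustered_index_pages * physical_page_size == 1638400);
    assert(other_index_pages_total * physical_page_size == 655360);
    assert(available_free_space_kib * 1024 == 58720256);
    return 0;
}
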
@return HA_ADMIN_* error code or HA_ADMIN_OK */ int -ha_innobase::analyze( -/*=================*/ - THD* thd, /*!< in: connection thread handle */ - HA_CHECK_OPT* check_opt) /*!< in: currently ignored */ +ha_innobase::analyze(THD*, HA_CHECK_OPT*) { /* Simply call info_low() with all the flags and request recalculation of the statistics */ @@ -14879,7 +14662,7 @@ int ha_innobase::optimize( /*==================*/ THD* thd, /*!< in: connection thread handle */ - HA_CHECK_OPT* check_opt) /*!< in: currently ignored */ + HA_CHECK_OPT*) { /* FTS-FIXME: Since MySQL doesn't support engine-specific commands, @@ -14899,7 +14682,7 @@ ha_innobase::optimize( try_alter = false; } else { push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, - err, + uint(err), "InnoDB: Cannot defragment table %s: returned error code %d\n", m_prebuilt->table->name.m_name, err); @@ -14911,7 +14694,7 @@ ha_innobase::optimize( if (innodb_optimize_fulltext_only) { if (m_prebuilt->table->fts && m_prebuilt->table->fts->cache - && !dict_table_is_discarded(m_prebuilt->table)) { + && m_prebuilt->table->space) { fts_sync_table(m_prebuilt->table); fts_optimize_table(m_prebuilt->table); } @@ -14952,7 +14735,7 @@ ha_innobase::check( build_template(true); } - if (dict_table_is_discarded(m_prebuilt->table)) { + if (!m_prebuilt->table->space) { ib_senderrf( thd, @@ -14963,7 +14746,7 @@ ha_innobase::check( DBUG_RETURN(HA_ADMIN_CORRUPT); } else if (!m_prebuilt->table->is_readable() && - !fil_space_get(m_prebuilt->table->space)) { + !m_prebuilt->table->space) { ib_senderrf( thd, IB_LOG_LEVEL_ERROR, @@ -15005,8 +14788,11 @@ ha_innobase::check( /* We must run the index record counts at an isolation level >= READ COMMITTED, because a dirty read can see a wrong number of records in some index; to play safe, we use always - REPEATABLE READ here */ - m_prebuilt->trx->isolation_level = TRX_ISO_REPEATABLE_READ; + REPEATABLE READ here (except when undo logs are unavailable) */ + m_prebuilt->trx->isolation_level = srv_force_recovery + >= SRV_FORCE_NO_UNDO_LOG_SCAN + ? 
TRX_ISO_READ_UNCOMMITTED + : TRX_ISO_REPEATABLE_READ; ut_ad(!m_prebuilt->table->corrupted); @@ -15210,9 +14996,11 @@ ha_innobase::update_table_comment( #define SSTR( x ) reinterpret_cast< std::ostringstream & >( \ ( std::ostringstream() << std::dec << x ) ).str() - fk_str.append("InnoDB free: "); - fk_str.append(SSTR(fsp_get_available_space_in_free_extents( - m_prebuilt->table->space))); + if (m_prebuilt->table->space) { + fk_str.append("InnoDB free: "); + fk_str.append(SSTR(fsp_get_available_space_in_free_extents( + *m_prebuilt->table->space))); + } fk_str.append(dict_print_info_on_foreign_keys( FALSE, m_prebuilt->trx, @@ -15304,8 +15092,8 @@ get_foreign_key_info( char tmp_buff[NAME_LEN+1]; char name_buff[NAME_LEN+1]; const char* ptr; - LEX_STRING* referenced_key_name; - LEX_STRING* name = NULL; + LEX_CSTRING* referenced_key_name; + LEX_CSTRING* name = NULL; if (dict_table_t::is_temporary_name(foreign->foreign_table_name)) { return NULL; @@ -15785,6 +15573,16 @@ ha_innobase::extra( break; case HA_EXTRA_BEGIN_ALTER_COPY: m_prebuilt->table->skip_alter_undo = 1; + if (m_prebuilt->table->is_temporary() + || !m_prebuilt->table->versioned_by_id()) { + break; + } + trx_start_if_not_started(m_prebuilt->trx, true); + m_prebuilt->trx->mod_tables.insert( + trx_mod_tables_t::value_type( + const_cast<dict_table_t*>(m_prebuilt->table), + 0)) + .first->second.set_versioned(0); break; case HA_EXTRA_END_ALTER_COPY: m_prebuilt->table->skip_alter_undo = 0; @@ -15872,7 +15670,7 @@ ha_innobase::start_stmt( m_prebuilt->hint_need_to_fetch_extra_cols = 0; reset_template(); - if (dict_table_is_temporary(m_prebuilt->table) + if (m_prebuilt->table->is_temporary() && m_mysql_has_locked && m_prebuilt->select_lock_type == LOCK_NONE) { dberr_t error; @@ -15946,6 +15744,10 @@ innobase_map_isolation_level( /*=========================*/ enum_tx_isolation iso) /*!< in: MySQL isolation level code */ { + if (UNIV_UNLIKELY(srv_force_recovery >= SRV_FORCE_NO_UNDO_LOG_SCAN) + || UNIV_UNLIKELY(srv_read_only_mode)) { + return TRX_ISO_READ_UNCOMMITTED; + } switch (iso) { case ISO_REPEATABLE_READ: return(TRX_ISO_REPEATABLE_READ); case ISO_READ_COMMITTED: return(TRX_ISO_READ_COMMITTED); @@ -16012,24 +15814,24 @@ ha_innobase::external_lock( } /* Check for UPDATEs in read-only mode. 
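Both changes above fall back to READ UNCOMMITTED when undo logs cannot be scanned (srv_force_recovery >= SRV_FORCE_NO_UNDO_LOG_SCAN) or the server runs read-only. A minimal sketch of that mapping; the plain enums and boolean flags stand in for MySQL's enum_tx_isolation values, the TRX_ISO_* constants and the two server-state checks:

// Stand-ins for enum_tx_isolation (SQL layer) and TRX_ISO_* (InnoDB).
enum class MyIso  { read_uncommitted, read_committed, repeatable_read, serializable };
enum class TrxIso { read_uncommitted, read_committed, repeatable_read, serializable };

static TrxIso map_isolation_level(MyIso iso, bool no_undo_log_scan, bool read_only)
{
    if (no_undo_log_scan || read_only) {
        // Without undo logs (or any writes), older row versions cannot be
        // reconstructed, so only the latest, uncommitted version is readable.
        return TrxIso::read_uncommitted;
    }
    switch (iso) {
    case MyIso::repeatable_read:  return TrxIso::repeatable_read;
    case MyIso::read_committed:   return TrxIso::read_committed;
    case MyIso::serializable:     return TrxIso::serializable;
    case MyIso::read_uncommitted: return TrxIso::read_uncommitted;
    }
    return TrxIso::repeatable_read;  // not reached; silences compiler warnings
}
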
*/ - if (srv_read_only_mode - && (thd_sql_command(thd) == SQLCOM_UPDATE - || thd_sql_command(thd) == SQLCOM_INSERT - || thd_sql_command(thd) == SQLCOM_REPLACE - || thd_sql_command(thd) == SQLCOM_DROP_TABLE - || thd_sql_command(thd) == SQLCOM_ALTER_TABLE - || thd_sql_command(thd) == SQLCOM_OPTIMIZE - || (thd_sql_command(thd) == SQLCOM_CREATE_TABLE - && lock_type == F_WRLCK) - || thd_sql_command(thd) == SQLCOM_CREATE_INDEX - || thd_sql_command(thd) == SQLCOM_DROP_INDEX - || thd_sql_command(thd) == SQLCOM_DELETE)) { - - if (thd_sql_command(thd) == SQLCOM_CREATE_TABLE) { - ib_senderrf(thd, IB_LOG_LEVEL_WARN, - ER_READ_ONLY_MODE); - DBUG_RETURN(HA_ERR_TABLE_READONLY); - } else { + if (srv_read_only_mode) { + switch (thd_sql_command(thd)) { + case SQLCOM_CREATE_TABLE: + if (lock_type != F_WRLCK) { + break; + } + /* fall through */ + case SQLCOM_UPDATE: + case SQLCOM_INSERT: + case SQLCOM_REPLACE: + case SQLCOM_DROP_TABLE: + case SQLCOM_ALTER_TABLE: + case SQLCOM_OPTIMIZE: + case SQLCOM_CREATE_INDEX: + case SQLCOM_DROP_INDEX: + case SQLCOM_CREATE_SEQUENCE: + case SQLCOM_DROP_SEQUENCE: + case SQLCOM_DELETE: ib_senderrf(thd, IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE); DBUG_RETURN(HA_ERR_TABLE_READONLY); @@ -16048,7 +15850,7 @@ ha_innobase::external_lock( && thd_sql_command(thd) == SQLCOM_FLUSH && lock_type == F_RDLCK) { - if (dict_table_is_discarded(m_prebuilt->table)) { + if (!m_prebuilt->table->space) { ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_ERROR, ER_TABLESPACE_DISCARDED, table->s->table_name.str); @@ -16185,14 +15987,8 @@ ha_innobase::external_lock( innobase_commit(ht, thd, TRUE); } - } else if (trx->isolation_level <= TRX_ISO_READ_COMMITTED - && MVCC::is_view_active(trx->read_view)) { - - mutex_enter(&trx_sys->mutex); - - trx_sys->mvcc->view_close(trx->read_view, true); - - mutex_exit(&trx_sys->mutex); + } else if (trx->isolation_level <= TRX_ISO_READ_COMMITTED) { + trx->read_view.close(); } } @@ -16257,7 +16053,7 @@ innodb_show_status( bytes of text. */ char* str; - ssize_t flen; + size_t flen; mutex_enter(&srv_monitor_file_mutex); rewind(srv_monitor_file); @@ -16267,11 +16063,12 @@ innodb_show_status( os_file_set_eof(srv_monitor_file); - if ((flen = ftell(srv_monitor_file)) < 0) { + flen = size_t(ftell(srv_monitor_file)); + if (ssize_t(flen) < 0) { flen = 0; } - ssize_t usable_len; + size_t usable_len; if (flen > MAX_STATUS_SIZE) { usable_len = MAX_STATUS_SIZE; @@ -16284,7 +16081,7 @@ innodb_show_status( read the contents of the temporary file */ if (!(str = (char*) my_malloc(//PSI_INSTRUMENT_ME, - usable_len + 1, MYF(0)))) { + usable_len + 1, MYF(0)))) { mutex_exit(&srv_monitor_file_mutex); DBUG_RETURN(1); } @@ -16294,19 +16091,18 @@ innodb_show_status( if (flen < MAX_STATUS_SIZE) { /* Display the entire output. */ flen = fread(str, 1, flen, srv_monitor_file); - } else if (trx_list_end < (ulint) flen + } else if (trx_list_end < flen && trx_list_start < trx_list_end - && trx_list_start + (flen - trx_list_end) + && trx_list_start + flen - trx_list_end < MAX_STATUS_SIZE - sizeof truncated_msg - 1) { /* Omit the beginning of the list of active transactions. 
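The innodb_show_status() rework around this point keeps the monitor output within MAX_STATUS_SIZE by dropping the start of the transaction list: everything up to trx_list_start is kept, a truncation marker is spliced in, and only as much of the tail as still fits is read back. A standalone sketch of that splicing on an in-memory string; the marker text and sizes are illustrative, and the real code streams from a temporary file rather than a std::string:

#include <cassert>
#include <string>

// Keep the head of `full` up to `trx_list_start`, add a marker, then append
// as much of the tail of `full` as still fits into `max_size` characters.
static std::string truncate_status(const std::string& full,
                                   std::string::size_type trx_list_start,
                                   std::string::size_type max_size)
{
    static const char truncated_msg[] = "... truncated...\n";  // illustrative
    if (full.size() <= max_size) {
        return full;
    }
    std::string out = full.substr(0, trx_list_start);
    out += truncated_msg;
    if (out.size() >= max_size) {
        return out.substr(0, max_size);
    }
    const std::string::size_type tail = max_size - out.size();
    out += full.substr(full.size() - tail, tail);
    return out;
}

int main()
{
    const std::string status(1000, 'x');     // oversized status text
    assert(truncate_status(status, 100, 200).size() == 200);
    return 0;
}
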
*/ - ssize_t len = fread(str, 1, trx_list_start, srv_monitor_file); + size_t len = fread(str, 1, trx_list_start, srv_monitor_file); memcpy(str + len, truncated_msg, sizeof truncated_msg - 1); len += sizeof truncated_msg - 1; usable_len = (MAX_STATUS_SIZE - 1) - len; - fseek(srv_monitor_file, - static_cast<long>(flen - usable_len), SEEK_SET); + fseek(srv_monitor_file, long(flen - usable_len), SEEK_SET); len += fread(str + len, 1, usable_len, srv_monitor_file); flen = len; } else { @@ -16440,12 +16236,10 @@ struct ShowStatus { spins=N,waits=N,calls=N" The user has to parse the dataunfortunately - @param[in,out] hton the innodb handlerton @param[in,out] thd the MySQL query thread of the caller @param[in,out] stat_print function for printing statistics @return true on success. */ bool to_string( - handlerton* hton, THD* thd, stat_print_fn* stat_print) UNIV_NOTHROW; @@ -16461,13 +16255,11 @@ We store the metrics in the "Status" column as: spins=N,waits=N,calls=N" The user has to parse the dataunfortunately -@param[in,out] hton the innodb handlerton @param[in,out] thd the MySQL query thread of the caller @param[in,out] stat_print function for printing statistics @return true on success. */ bool ShowStatus::to_string( - handlerton* hton, THD* thd, stat_print_fn* stat_print) UNIV_NOTHROW @@ -16516,7 +16308,11 @@ ShowStatus::to_string( static int innodb_show_mutex_status( - handlerton* hton, + handlerton* +#ifdef DBUG_ASSERT_EXISTS + hton +#endif + , THD* thd, stat_print_fn* stat_print) { @@ -16528,7 +16324,7 @@ innodb_show_mutex_status( mutex_monitor.iterate(collector); - if (!collector.to_string(hton, thd, stat_print)) { + if (!collector.to_string(thd, stat_print)) { DBUG_RETURN(1); } @@ -16543,7 +16339,11 @@ innodb_show_mutex_status( static int innodb_show_rwlock_status( - handlerton* hton, + handlerton* +#ifdef DBUG_ASSERT_EXISTS + hton +#endif + , THD* thd, stat_print_fn* stat_print) { @@ -16745,23 +16545,17 @@ ha_innobase::store_lock( trx->isolation_level = innobase_map_isolation_level( (enum_tx_isolation) thd_tx_isolation(thd)); - if (trx->isolation_level <= TRX_ISO_READ_COMMITTED - && MVCC::is_view_active(trx->read_view)) { + if (trx->isolation_level <= TRX_ISO_READ_COMMITTED) { /* At low transaction isolation levels we let each consistent read set its own snapshot */ - - mutex_enter(&trx_sys->mutex); - - trx_sys->mvcc->view_close(trx->read_view, true); - - mutex_exit(&trx_sys->mutex); + trx->read_view.close(); } } DBUG_ASSERT(EQ_CURRENT_THD(thd)); const bool in_lock_tables = thd_in_lock_tables(thd); - const uint sql_command = thd_sql_command(thd); + const int sql_command = thd_sql_command(thd); if (srv_read_only_mode && (sql_command == SQLCOM_UPDATE @@ -16775,6 +16569,8 @@ ha_innobase::store_lock( && lock_type <= TL_WRITE)) || sql_command == SQLCOM_CREATE_INDEX || sql_command == SQLCOM_DROP_INDEX + || sql_command == SQLCOM_CREATE_SEQUENCE + || sql_command == SQLCOM_DROP_SEQUENCE || sql_command == SQLCOM_DELETE)) { ib_senderrf(trx->mysql_thd, @@ -16804,7 +16600,8 @@ ha_innobase::store_lock( } /* Check for DROP TABLE */ - } else if (sql_command == SQLCOM_DROP_TABLE) { + } else if (sql_command == SQLCOM_DROP_TABLE || + sql_command == SQLCOM_DROP_SEQUENCE) { /* MySQL calls this function in DROP TABLE though this table handle may belong to another thd that is running a query. 
Let @@ -16839,7 +16636,8 @@ ha_innobase::store_lock( /* Use consistent read for checksum table */ if (sql_command == SQLCOM_CHECKSUM - || (sql_command == SQLCOM_ANALYZE && lock_type == TL_READ) + || sql_command == SQLCOM_CREATE_SEQUENCE + || (sql_command == SQLCOM_ANALYZE && lock_type == TL_READ) || ((srv_locks_unsafe_for_binlog || trx->isolation_level <= TRX_ISO_READ_COMMITTED) && trx->isolation_level != TRX_ISO_SERIALIZABLE @@ -16848,6 +16646,7 @@ ha_innobase::store_lock( && (sql_command == SQLCOM_INSERT_SELECT || sql_command == SQLCOM_REPLACE_SELECT || sql_command == SQLCOM_UPDATE + || sql_command == SQLCOM_CREATE_SEQUENCE || sql_command == SQLCOM_CREATE_TABLE))) { /* If we either have innobase_locks_unsafe_for_binlog @@ -17253,7 +17052,7 @@ my_bool ha_innobase::register_query_cache_table( /*====================================*/ THD* thd, /*!< in: user thread handle */ - char* table_key, /*!< in: normalized path to the + const char* table_key, /*!< in: normalized path to the table */ uint key_length, /*!< in: length of the normalized path to the table */ @@ -17325,8 +17124,7 @@ innobase_get_at_most_n_mbchars( characters, and we can store in the column prefix index the whole string. */ - char_length = my_charpos(charset, str, - str + data_len, (int) n_chars); + char_length= my_charpos(charset, str, str + data_len, n_chars); if (char_length > data_len) { char_length = data_len; } @@ -17461,7 +17259,7 @@ innobase_commit_by_xid( ut_ad(trx->mysql_thd == NULL); trx_deregister_from_2pc(trx); ut_ad(!trx->will_lock); /* trx cache requirement */ - trx_free_for_background(trx); + trx->free(); return(XA_OK); } else { @@ -17496,7 +17294,7 @@ int innobase_rollback_by_xid(handlerton* hton, XID* xid) int ret = innobase_rollback_trx(trx); trx_deregister_from_2pc(trx); ut_ad(!trx->will_lock); - trx_free_for_background(trx); + trx->free(); return(ret); } else { @@ -17560,10 +17358,7 @@ void innodb_io_capacity_max_update( /*===========================*/ THD* thd, /*!< in: thread handle */ - struct st_mysql_sys_var* var, /*!< in: pointer to - system variable */ - void* var_ptr,/*!< out: where the - formal string goes */ + st_mysql_sys_var*, void*, const void* save) /*!< in: immediate result from check function */ { @@ -17595,10 +17390,7 @@ void innodb_io_capacity_update( /*======================*/ THD* thd, /*!< in: thread handle */ - struct st_mysql_sys_var* var, /*!< in: pointer to - system variable */ - void* var_ptr,/*!< out: where the - formal string goes */ + st_mysql_sys_var*, void*, const void* save) /*!< in: immediate result from check function */ { @@ -17631,10 +17423,7 @@ void innodb_max_dirty_pages_pct_update( /*==============================*/ THD* thd, /*!< in: thread handle */ - struct st_mysql_sys_var* var, /*!< in: pointer to - system variable */ - void* var_ptr,/*!< out: where the - formal string goes */ + st_mysql_sys_var*, void*, const void* save) /*!< in: immediate result from check function */ { @@ -17665,10 +17454,7 @@ void innodb_max_dirty_pages_pct_lwm_update( /*==================================*/ THD* thd, /*!< in: thread handle */ - struct st_mysql_sys_var* var, /*!< in: pointer to - system variable */ - void* var_ptr,/*!< out: where the - formal string goes */ + st_mysql_sys_var*, void*, const void* save) /*!< in: immediate result from check function */ { @@ -17690,117 +17476,6 @@ innodb_max_dirty_pages_pct_lwm_update( srv_max_dirty_pages_pct_lwm = in_val; } -/************************************************************//** -Validate the file format name and return its 
corresponding id. -@return valid file format id */ -static -uint -innobase_file_format_name_lookup( -/*=============================*/ - const char* format_name) /*!< in: pointer to file format name */ -{ - char* endp; - uint format_id; - - ut_a(format_name != NULL); - - /* The format name can contain the format id itself instead of - the name and we check for that. */ - format_id = (uint) strtoul(format_name, &endp, 10); - - /* Check for valid parse. */ - if (*endp == '\0' && *format_name != '\0') { - - if (format_id <= UNIV_FORMAT_MAX) { - - return(format_id); - } - } else { - - for (format_id = 0; format_id <= UNIV_FORMAT_MAX; - format_id++) { - const char* name; - - name = trx_sys_file_format_id_to_name(format_id); - - if (!innobase_strcasecmp(format_name, name)) { - - return(format_id); - } - } - } - - return(UNIV_FORMAT_MAX + 1); -} - -/************************************************************//** -Validate the file format check config parameters, as a side effect it -sets the srv_max_file_format_at_startup variable. -@return the format_id if valid config value, otherwise, return -1 */ -static -int -innobase_file_format_validate_and_set( -/*==================================*/ - const char* format_max) /*!< in: parameter value */ -{ - uint format_id; - - format_id = innobase_file_format_name_lookup(format_max); - - if (format_id < UNIV_FORMAT_MAX + 1) { - srv_max_file_format_at_startup = format_id; - - return((int) format_id); - } else { - return(-1); - } -} - -/*************************************************************//** -Check if it is a valid file format. This function is registered as -a callback with MySQL. -@return 0 for valid file format */ -static -int -innodb_file_format_name_validate( -/*=============================*/ - THD* thd, /*!< in: thread handle */ - struct st_mysql_sys_var* var, /*!< in: pointer to system - variable */ - void* save, /*!< out: immediate result - for update function */ - struct st_mysql_value* value) /*!< in: incoming string */ -{ - const char* file_format_input; - char buff[STRING_BUFFER_USUAL_SIZE]; - int len = sizeof(buff); - - ut_a(save != NULL); - ut_a(value != NULL); - - file_format_input = value->val_str(value, buff, &len); - - if (file_format_input != NULL) { - uint format_id; - - format_id = innobase_file_format_name_lookup( - file_format_input); - - if (format_id <= UNIV_FORMAT_MAX) { - - /* Save a pointer to the name in the - 'file_format_name_map' constant array. */ - *static_cast<const char**>(save) = - trx_sys_file_format_id_to_name(format_id); - - return(0); - } - } - - *static_cast<const char**>(save) = NULL; - return(1); -} - /*************************************************************//** Don't allow to set innodb_fast_shutdown=0 if purge threads are already down. @@ -17832,172 +17507,6 @@ fast_shutdown_validate( return(0); } -/****************************************************************//** -Update the system variable innodb_file_format using the "saved" -value. This function is registered as a callback with MySQL. 
*/ -static -void -innodb_file_format_name_update( -/*===========================*/ - THD* thd, /*!< in: thread handle */ - struct st_mysql_sys_var* var, /*!< in: pointer to - system variable */ - void* var_ptr, /*!< out: where the - formal string goes */ - const void* save) /*!< in: immediate result - from check function */ -{ - const char* format_name; - - ut_a(var_ptr != NULL); - ut_a(save != NULL); - - - push_warning(thd, Sql_condition::WARN_LEVEL_WARN, - HA_ERR_WRONG_COMMAND, deprecated_file_format); - - format_name = *static_cast<const char*const*>(save); - - if (format_name) { - uint format_id; - - format_id = innobase_file_format_name_lookup(format_name); - - if (format_id <= UNIV_FORMAT_MAX) { - srv_file_format = format_id; - } - } - - *static_cast<const char**>(var_ptr) - = trx_sys_file_format_id_to_name(srv_file_format); -} - -/*************************************************************//** -Check if valid argument to innodb_file_format_max. This function -is registered as a callback with MySQL. -@return 0 for valid file format */ -static -int -innodb_file_format_max_validate( -/*============================*/ - THD* thd, /*!< in: thread handle */ - struct st_mysql_sys_var* var, /*!< in: pointer to system - variable */ - void* save, /*!< out: immediate result - for update function */ - struct st_mysql_value* value) /*!< in: incoming string */ -{ - const char* file_format_input; - char buff[STRING_BUFFER_USUAL_SIZE]; - int len = sizeof(buff); - int format_id; - - ut_a(save != NULL); - ut_a(value != NULL); - - file_format_input = value->val_str(value, buff, &len); - - if (file_format_input != NULL) { - - format_id = innobase_file_format_validate_and_set( - file_format_input); - - if (format_id >= 0) { - /* Save a pointer to the name in the - 'file_format_name_map' constant array. */ - *static_cast<const char**>(save) = - trx_sys_file_format_id_to_name( - (uint) format_id); - - return(0); - - } else { - push_warning_printf(thd, - Sql_condition::WARN_LEVEL_WARN, - ER_WRONG_ARGUMENTS, - "InnoDB: invalid innodb_file_format_max" - " value; can be any format up to %s" - " or equivalent id of %d", - trx_sys_file_format_id_to_name(UNIV_FORMAT_MAX), - UNIV_FORMAT_MAX); - } - } - - *static_cast<const char**>(save) = NULL; - return(1); -} - -/****************************************************************//** -Update the system variable innodb_file_format_max using the "saved" -value. This function is registered as a callback with MySQL. */ -static -void -innodb_file_format_max_update( -/*==========================*/ - THD* thd, /*!< in: thread handle */ - struct st_mysql_sys_var* var, /*!< in: pointer to - system variable */ - void* var_ptr,/*!< out: where the - formal string goes */ - const void* save) /*!< in: immediate result - from check function */ -{ - const char* format_name_in; - const char** format_name_out; - uint format_id; - - ut_a(save != NULL); - ut_a(var_ptr != NULL); - - - push_warning(thd, Sql_condition::WARN_LEVEL_WARN, - HA_ERR_WRONG_COMMAND, deprecated_file_format_max); - - format_name_in = *static_cast<const char*const*>(save); - - if (!format_name_in) { - - return; - } - - format_id = innobase_file_format_name_lookup(format_name_in); - - if (format_id > UNIV_FORMAT_MAX) { - /* DEFAULT is "on", which is invalid at runtime. 
*/ - push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, - ER_WRONG_ARGUMENTS, - "Ignoring SET innodb_file_format=%s", - format_name_in); - return; - } - - format_name_out = static_cast<const char**>(var_ptr); - - /* Update the max format id in the system tablespace. */ - if (trx_sys_file_format_max_set(format_id, format_name_out)) { - ib::info() << "The file format in the system tablespace is now" - " set to " << *format_name_out << "."; - } -} - -/** Update innodb_large_prefix. -@param[in,out] thd MySQL client connection -@param[out] var_ptr current value -@param[in] save to-be-assigned value */ -static -void -innodb_large_prefix_update( - THD* thd, - st_mysql_sys_var*, - void* var_ptr, - const void* save) -{ - push_warning(thd, Sql_condition::WARN_LEVEL_WARN, - HA_ERR_WRONG_COMMAND, deprecated_large_prefix); - - *static_cast<my_bool*>(var_ptr) = *static_cast<const my_bool*>(save); -} - /*************************************************************//** Check whether valid argument given to innobase_*_stopword_table. This function is registered as a callback with MySQL. @@ -18007,8 +17516,7 @@ int innodb_stopword_table_validate( /*===========================*/ THD* thd, /*!< in: thread handle */ - struct st_mysql_sys_var* var, /*!< in: pointer to system - variable */ + st_mysql_sys_var*, void* save, /*!< out: immediate result for update function */ struct st_mysql_value* value) /*!< in: incoming string */ @@ -18050,17 +17558,10 @@ innodb_stopword_table_validate( /** Update the system variable innodb_buffer_pool_size using the "saved" value. This function is registered as a callback with MySQL. -@param[in] thd thread handle -@param[in] var pointer to system variable -@param[out] var_ptr where the formal string goes @param[in] save immediate result from check function */ static void -innodb_buffer_pool_size_update( - THD* thd, - struct st_mysql_sys_var* var, - void* var_ptr, - const void* save) +innodb_buffer_pool_size_update(THD*,st_mysql_sys_var*,void*, const void* save) { longlong in_val = *static_cast<const longlong*>(save); @@ -18123,15 +17624,8 @@ Update the system variable innodb_adaptive_hash_index using the "saved" value. This function is registered as a callback with MySQL. */ static void -innodb_adaptive_hash_index_update( -/*==============================*/ - THD* thd, /*!< in: thread handle */ - struct st_mysql_sys_var* var, /*!< in: pointer to - system variable */ - void* var_ptr,/*!< out: where the - formal string goes */ - const void* save) /*!< in: immediate result - from check function */ +innodb_adaptive_hash_index_update(THD*, st_mysql_sys_var*, void*, + const void* save) { mysql_mutex_unlock(&LOCK_global_system_variables); if (*(my_bool*) save) { @@ -18148,15 +17642,7 @@ Update the system variable innodb_cmp_per_index using the "saved" value. This function is registered as a callback with MySQL. */ static void -innodb_cmp_per_index_update( -/*========================*/ - THD* thd, /*!< in: thread handle */ - struct st_mysql_sys_var* var, /*!< in: pointer to - system variable */ - void* var_ptr,/*!< out: where the - formal string goes */ - const void* save) /*!< in: immediate result - from check function */ +innodb_cmp_per_index_update(THD*, st_mysql_sys_var*, void*, const void* save) { /* Reset the stats whenever we enable the table INFORMATION_SCHEMA.innodb_cmp_per_index. */ @@ -18174,15 +17660,7 @@ Update the system variable innodb_old_blocks_pct using the "saved" value. This function is registered as a callback with MySQL. 
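Several hunks above and below compact the system-variable update callbacks to the bare plugin signature, leaving unused parameters unnamed and reading only `save`, the value accepted by the check function. A minimal sketch of that shape; the global and the function name are made up for illustration:

// Opaque in this sketch; the real types come from the server plugin headers.
struct THD;
struct st_mysql_sys_var;

static unsigned long example_threshold;  // hypothetical stand-in for a global

static void example_threshold_update(THD*, st_mysql_sys_var*, void*,
                                     const void* save)
{
    example_threshold = *static_cast<const unsigned long*>(save);
}

int main()
{
    const unsigned long new_value = 42;
    example_threshold_update(nullptr, nullptr, nullptr, &new_value);
    return example_threshold == 42 ? 0 : 1;
}
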
*/ static void -innodb_old_blocks_pct_update( -/*=========================*/ - THD* thd, /*!< in: thread handle */ - struct st_mysql_sys_var* var, /*!< in: pointer to - system variable */ - void* var_ptr,/*!< out: where the - formal string goes */ - const void* save) /*!< in: immediate result - from check function */ +innodb_old_blocks_pct_update(THD*, st_mysql_sys_var*, void*, const void* save) { mysql_mutex_unlock(&LOCK_global_system_variables); uint ratio = buf_LRU_old_ratio_update(*static_cast<const uint*>(save), @@ -18196,15 +17674,8 @@ Update the system variable innodb_old_blocks_pct using the "saved" value. This function is registered as a callback with MySQL. */ static void -innodb_change_buffer_max_size_update( -/*=================================*/ - THD* thd, /*!< in: thread handle */ - struct st_mysql_sys_var* var, /*!< in: pointer to - system variable */ - void* var_ptr,/*!< out: where the - formal string goes */ - const void* save) /*!< in: immediate result - from check function */ +innodb_change_buffer_max_size_update(THD*, st_mysql_sys_var*, void*, + const void* save) { srv_change_buffer_max_size = *static_cast<const uint*>(save); mysql_mutex_unlock(&LOCK_global_system_variables); @@ -18220,15 +17691,7 @@ static ulong srv_saved_page_number_debug = 0; Save an InnoDB page number. */ static void -innodb_save_page_no( -/*================*/ - THD* thd, /*!< in: thread handle */ - struct st_mysql_sys_var* var, /*!< in: pointer to - system variable */ - void* var_ptr,/*!< out: where the - formal string goes */ - const void* save) /*!< in: immediate result - from check function */ +innodb_save_page_no(THD*, st_mysql_sys_var*, void*, const void* save) { srv_saved_page_number_debug = *static_cast<const ulong*>(save); @@ -18240,15 +17703,7 @@ innodb_save_page_no( Make the first page of given user tablespace dirty. */ static void -innodb_make_page_dirty( -/*===================*/ - THD* thd, /*!< in: thread handle */ - struct st_mysql_sys_var* var, /*!< in: pointer to - system variable */ - void* var_ptr,/*!< out: where the - formal string goes */ - const void* save) /*!< in: immediate result - from check function */ +innodb_make_page_dirty(THD*, st_mysql_sys_var*, void*, const void* save) { mtr_t mtr; ulong space_id = *static_cast<const ulong*>(save); @@ -18263,7 +17718,7 @@ func_exit_no_space: if (srv_saved_page_number_debug >= space->size) { func_exit: - fil_space_release(space); + space->release(); goto func_exit_no_space; } @@ -18289,108 +17744,6 @@ func_exit: } #endif // UNIV_DEBUG /*************************************************************//** -Find the corresponding ibuf_use_t value that indexes into -innobase_change_buffering_values[] array for the input -change buffering option name. -@return corresponding IBUF_USE_* value for the input variable -name, or IBUF_USE_COUNT if not able to find a match */ -static -ibuf_use_t -innodb_find_change_buffering_value( -/*===============================*/ - const char* input_name) /*!< in: input change buffering - option name */ -{ - for (ulint i = 0; - i < UT_ARR_SIZE(innobase_change_buffering_values); - ++i) { - - /* found a match */ - if (!innobase_strcasecmp( - input_name, innobase_change_buffering_values[i])) { - return(static_cast<ibuf_use_t>(i)); - } - } - - /* Did not find any match */ - return(IBUF_USE_COUNT); -} - -/*************************************************************//** -Check if it is a valid value of innodb_change_buffering. This function is -registered as a callback with MySQL. 
-@return 0 for valid innodb_change_buffering */ -static -int -innodb_change_buffering_validate( -/*=============================*/ - THD* thd, /*!< in: thread handle */ - struct st_mysql_sys_var* var, /*!< in: pointer to system - variable */ - void* save, /*!< out: immediate result - for update function */ - struct st_mysql_value* value) /*!< in: incoming string */ -{ - const char* change_buffering_input; - char buff[STRING_BUFFER_USUAL_SIZE]; - int len = sizeof(buff); - - ut_a(save != NULL); - ut_a(value != NULL); - - change_buffering_input = value->val_str(value, buff, &len); - - if (change_buffering_input != NULL) { - ibuf_use_t use; - - use = innodb_find_change_buffering_value( - change_buffering_input); - - if (use != IBUF_USE_COUNT) { - /* Find a matching change_buffering option value. */ - *static_cast<const char**>(save) = - innobase_change_buffering_values[use]; - - return(0); - } - } - - /* No corresponding change buffering option for user supplied - "change_buffering_input" */ - return(1); -} - -/****************************************************************//** -Update the system variable innodb_change_buffering using the "saved" -value. This function is registered as a callback with MySQL. */ -static -void -innodb_change_buffering_update( -/*===========================*/ - THD* thd, /*!< in: thread handle */ - struct st_mysql_sys_var* var, /*!< in: pointer to - system variable */ - void* var_ptr,/*!< out: where the - formal string goes */ - const void* save) /*!< in: immediate result - from check function */ -{ - ibuf_use_t use; - - ut_a(var_ptr != NULL); - ut_a(save != NULL); - - use = innodb_find_change_buffering_value( - *static_cast<const char*const*>(save)); - - ut_a(use < IBUF_USE_COUNT); - - ibuf_use = use; - *static_cast<const char**>(var_ptr) = - *static_cast<const char*const*>(save); -} - -/*************************************************************//** Just emit a warning that the usage of the variable is deprecated. @return 0 */ static @@ -18398,10 +17751,7 @@ void innodb_stats_sample_pages_update( /*=============================*/ THD* thd, /*!< in: thread handle */ - struct st_mysql_sys_var* var, /*!< in: pointer to - system variable */ - void* var_ptr,/*!< out: where the - formal string goes */ + st_mysql_sys_var*, void*, const void* save) /*!< in: immediate result from check function */ { @@ -18677,9 +18027,7 @@ static int innodb_monitor_validate( /*====================*/ - THD* thd, /*!< in: thread handle */ - struct st_mysql_sys_var* var, /*!< in: pointer to system - variable */ + THD*, st_mysql_sys_var*, void* save, /*!< out: immediate result for update function */ struct st_mysql_value* value) /*!< in: incoming string */ @@ -18918,13 +18266,8 @@ SET GLOBAL innodb_buffer_pool_evict='uncompressed' evicts all uncompressed page frames of compressed tablespaces. 
*/ static void -innodb_buffer_pool_evict_update( -/*============================*/ - THD* thd, /*!< in: thread handle */ - struct st_mysql_sys_var*var, /*!< in: pointer to system variable */ - void* var_ptr,/*!< out: ignored */ - const void* save) /*!< in: immediate result - from check function */ +innodb_buffer_pool_evict_update(THD*, st_mysql_sys_var*, void*, + const void* save) { if (const char* op = *static_cast<const char*const*>(save)) { if (!strcmp(op, "uncompressed")) { @@ -18955,8 +18298,7 @@ void innodb_enable_monitor_update( /*=========================*/ THD* thd, /*!< in: thread handle */ - struct st_mysql_sys_var* var, /*!< in: pointer to - system variable */ + st_mysql_sys_var*, void* var_ptr,/*!< out: where the formal string goes */ const void* save) /*!< in: immediate result @@ -18973,8 +18315,7 @@ void innodb_disable_monitor_update( /*==========================*/ THD* thd, /*!< in: thread handle */ - struct st_mysql_sys_var* var, /*!< in: pointer to - system variable */ + st_mysql_sys_var*, void* var_ptr,/*!< out: where the formal string goes */ const void* save) /*!< in: immediate result @@ -18992,8 +18333,7 @@ void innodb_reset_monitor_update( /*========================*/ THD* thd, /*!< in: thread handle */ - struct st_mysql_sys_var* var, /*!< in: pointer to - system variable */ + st_mysql_sys_var*, void* var_ptr,/*!< out: where the formal string goes */ const void* save) /*!< in: immediate result @@ -19011,8 +18351,7 @@ void innodb_reset_all_monitor_update( /*============================*/ THD* thd, /*!< in: thread handle */ - struct st_mysql_sys_var* var, /*!< in: pointer to - system variable */ + st_mysql_sys_var*, void* var_ptr,/*!< out: where the formal string goes */ const void* save) /*!< in: immediate result @@ -19024,15 +18363,8 @@ innodb_reset_all_monitor_update( static void -innodb_defragment_frequency_update( -/*===============================*/ - THD* thd, /*!< in: thread handle */ - struct st_mysql_sys_var* var, /*!< in: pointer to - system variable */ - void* var_ptr,/*!< out: where the - formal string goes */ - const void* save) /*!< in: immediate result - from check function */ +innodb_defragment_frequency_update(THD*, st_mysql_sys_var*, void*, + const void* save) { srv_defragment_frequency = (*static_cast<const uint*>(save)); srv_defragment_interval = 1000000000ULL / srv_defragment_frequency; @@ -19071,13 +18403,8 @@ innodb_enable_monitor_at_startup( for (char* option = my_strtok_r(str, sep, &last); option; option = my_strtok_r(NULL, sep, &last)) { - ulint ret; char* option_name; - - ret = innodb_monitor_valid_byname(&option_name, option); - - /* The name is validated if ret == 0 */ - if (!ret) { + if (!innodb_monitor_valid_byname(&option_name, option)) { innodb_monitor_update(NULL, NULL, &option, MONITOR_TURN_ON, FALSE); } else { @@ -19090,13 +18417,7 @@ innodb_enable_monitor_at_startup( /****************************************************************//** Callback function for accessing the InnoDB variables from MySQL: SHOW VARIABLES. 
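The innodb_defragment_frequency_update() hunk above recomputes srv_defragment_interval as 1000000000ULL divided by the new frequency. A one-line worked example, assuming the interval is a nanosecond count (the unit is not stated in the hunk itself):

#include <cassert>
#include <cstdint>

int main()
{
    // 40 defragmentation passes per second are spaced 25 ms (25,000,000 ns) apart.
    const uint64_t frequency = 40;
    const uint64_t interval_ns = 1000000000ULL / frequency;
    assert(interval_ns == 25000000ULL);
    return 0;
}
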
*/ -static -int -show_innodb_vars( -/*=============*/ - THD* thd, - SHOW_VAR* var, - char* buff) +static int show_innodb_vars(THD*, SHOW_VAR* var, char*) { innodb_export_status(); var->type = SHOW_ARRAY; @@ -19126,7 +18447,7 @@ innobase_index_name_is_reserved( for (key_num = 0; key_num < num_of_keys; key_num++) { key = &key_info[key_num]; - if (innobase_strcasecmp(key->name, + if (innobase_strcasecmp(key->name.str, innobase_index_reserve_name) == 0) { /* Push warning to mysql */ push_warning_printf(thd, @@ -19215,17 +18536,7 @@ static uint innodb_merge_threshold_set_all_debug /** Wait for the background drop list to become empty. */ static void -wait_background_drop_list_empty( - THD* thd /*!< in: thread handle */ - MY_ATTRIBUTE((unused)), - struct st_mysql_sys_var* var /*!< in: pointer to system - variable */ - MY_ATTRIBUTE((unused)), - void* var_ptr /*!< out: where the formal - string goes */ - MY_ATTRIBUTE((unused)), - const void* save) /*!< in: immediate result from - check function */ +wait_background_drop_list_empty(THD*, st_mysql_sys_var*, void*, const void*) { row_wait_for_background_drop_list_empty(); } @@ -19234,32 +18545,21 @@ wait_background_drop_list_empty( Force innodb to checkpoint. */ static void -checkpoint_now_set( -/*===============*/ - THD* thd /*!< in: thread handle */ - MY_ATTRIBUTE((unused)), - struct st_mysql_sys_var* var /*!< in: pointer to system - variable */ - MY_ATTRIBUTE((unused)), - void* var_ptr /*!< out: where the formal - string goes */ - MY_ATTRIBUTE((unused)), - const void* save) /*!< in: immediate result from - check function */ +checkpoint_now_set(THD*, st_mysql_sys_var*, void*, const void* save) { if (*(my_bool*) save) { mysql_mutex_unlock(&LOCK_global_system_variables); - while (log_sys->last_checkpoint_lsn + while (log_sys.last_checkpoint_lsn + SIZE_OF_MLOG_CHECKPOINT - + (log_sys->append_on_checkpoint != NULL - ? log_sys->append_on_checkpoint->size() : 0) - < log_sys->lsn) { + + (log_sys.append_on_checkpoint != NULL + ? log_sys.append_on_checkpoint->size() : 0) + < log_sys.lsn) { log_make_checkpoint(); fil_flush_file_spaces(FIL_TYPE_LOG); } - dberr_t err = fil_write_flushed_lsn(log_sys->lsn); + dberr_t err = fil_write_flushed_lsn(log_sys.lsn); if (err != DB_SUCCESS) { ib::warn() << "Checkpoint set failed " << err; @@ -19273,18 +18573,7 @@ checkpoint_now_set( Force a dirty pages flush now. */ static void -buf_flush_list_now_set( -/*===================*/ - THD* thd /*!< in: thread handle */ - MY_ATTRIBUTE((unused)), - struct st_mysql_sys_var* var /*!< in: pointer to system - variable */ - MY_ATTRIBUTE((unused)), - void* var_ptr /*!< out: where the formal - string goes */ - MY_ATTRIBUTE((unused)), - const void* save) /*!< in: immediate result from - check function */ +buf_flush_list_now_set(THD*, st_mysql_sys_var*, void*, const void* save) { if (*(my_bool*) save) { mysql_mutex_unlock(&LOCK_global_system_variables); @@ -19295,17 +18584,11 @@ buf_flush_list_now_set( /** Override current MERGE_THRESHOLD setting for all indexes at dictionary now. 
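The checkpoint_now_set() hunk above, now using the log_sys object directly, loops until the last checkpoint LSN has caught up to within one checkpoint record of the current LSN, then persists the flushed LSN. A toy sketch of that loop; the 9-byte record size stands in for SIZE_OF_MLOG_CHECKPOINT, and the struct is a stand-in rather than the real log_t (which also flushes the redo log files on every iteration):

#include <cassert>
#include <cstdint>

// Toy stand-in for the log subsystem: taking a "checkpoint" simply advances
// last_checkpoint_lsn to the current lsn.
struct ToyLog {
    uint64_t lsn = 5000;
    uint64_t last_checkpoint_lsn = 100;
    void make_checkpoint() { last_checkpoint_lsn = lsn; }
};

int main()
{
    const uint64_t checkpoint_record_size = 9;  // assumed SIZE_OF_MLOG_CHECKPOINT
    ToyLog log;
    while (log.last_checkpoint_lsn + checkpoint_record_size < log.lsn) {
        log.make_checkpoint();  // real code: log_make_checkpoint() + file flush
    }
    assert(log.last_checkpoint_lsn + checkpoint_record_size >= log.lsn);
    return 0;
}
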
-@param[in] thd thread handle -@param[in] var pointer to system variable -@param[out] var_ptr where the formal string goes @param[in] save immediate result from check function */ static void -innodb_merge_threshold_set_all_debug_update( - THD* thd, - struct st_mysql_sys_var* var, - void* var_ptr, - const void* save) +innodb_merge_threshold_set_all_debug_update(THD*, st_mysql_sys_var*, void*, + const void* save) { innodb_merge_threshold_set_all_debug = (*static_cast<const uint*>(save)); @@ -19438,10 +18721,7 @@ void innodb_log_write_ahead_size_update( /*===============================*/ THD* thd, /*!< in: thread handle */ - struct st_mysql_sys_var* var, /*!< in: pointer to - system variable */ - void* var_ptr,/*!< out: where the - formal string goes */ + st_mysql_sys_var*, void*, const void* save) /*!< in: immediate result from check function */ { @@ -19452,8 +18732,8 @@ innodb_log_write_ahead_size_update( val = val * 2; } - if (val > UNIV_PAGE_SIZE) { - val = UNIV_PAGE_SIZE; + if (val > srv_page_size) { + val = srv_page_size; push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, ER_WRONG_ARGUMENTS, "innodb_log_write_ahead_size cannot" @@ -19462,7 +18742,7 @@ innodb_log_write_ahead_size_update( ER_WRONG_ARGUMENTS, "Setting innodb_log_write_ahead_size" " to %lu", - UNIV_PAGE_SIZE); + srv_page_size); } else if (val != in_val) { push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, ER_WRONG_ARGUMENTS, @@ -19511,8 +18791,8 @@ innodb_encryption_threads_update(THD*,st_mysql_sys_var*,void*,const void*save) @param[in] save to-be-assigned value */ static void -innodb_encryption_rotate_key_age_update(THD*,st_mysql_sys_var*,void*, - const void*save) +innodb_encryption_rotate_key_age_update(THD*, st_mysql_sys_var*, void*, + const void* save) { mysql_mutex_unlock(&LOCK_global_system_variables); fil_crypt_set_rotate_key_age(*static_cast<const uint*>(save)); @@ -19523,8 +18803,8 @@ innodb_encryption_rotate_key_age_update(THD*,st_mysql_sys_var*,void*, @param[in] save to-be-assigned value */ static void -innodb_encryption_rotation_iops_update(THD*,st_mysql_sys_var*,void*, - const void*save) +innodb_encryption_rotation_iops_update(THD*, st_mysql_sys_var*, void*, + const void* save) { mysql_mutex_unlock(&LOCK_global_system_variables); fil_crypt_set_rotation_iops(*static_cast<const uint*>(save)); @@ -19535,7 +18815,7 @@ innodb_encryption_rotation_iops_update(THD*,st_mysql_sys_var*,void*, @param[in] save to-be-assigned value */ static void -innodb_encrypt_tables_update(THD*,st_mysql_sys_var*,void*,const void*save) +innodb_encrypt_tables_update(THD*, st_mysql_sys_var*, void*, const void* save) { mysql_mutex_unlock(&LOCK_global_system_variables); fil_crypt_set_encrypt_tables(*static_cast<const ulong*>(save)); @@ -19543,17 +18823,13 @@ innodb_encrypt_tables_update(THD*,st_mysql_sys_var*,void*,const void*save) } /** Update the innodb_log_checksums parameter. 
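The innodb_log_write_ahead_size_update() hunk above doubles the requested value up to a power of two and now caps it at srv_page_size instead of UNIV_PAGE_SIZE. A standalone sketch of that rounding; the 512-byte starting point is an assumption, since only the doubling loop and the cap are visible in the diff:

#include <cassert>
#include <cstdint>

// Round a requested write-ahead size up to a power of two, capped at the
// page size.
static uint64_t round_write_ahead_size(uint64_t requested, uint64_t page_size)
{
    uint64_t val = 512;          // assumed lower bound (one log block)
    while (val < requested) {
        val *= 2;
    }
    return val > page_size ? page_size : val;
}

int main()
{
    assert(round_write_ahead_size(1000, 16384) == 1024);    // next power of two
    assert(round_write_ahead_size(65536, 16384) == 16384);  // capped at page size
    return 0;
}
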
-@param[in] thd thread handle -@param[in] var system variable +@param[in,out] thd client connection @param[out] var_ptr current value @param[in] save immediate result from check function */ static void -innodb_log_checksums_update( - THD* thd, - struct st_mysql_sys_var* var, - void* var_ptr, - const void* save) +innodb_log_checksums_update(THD* thd, st_mysql_sys_var*, void* var_ptr, + const void* save) { *static_cast<my_bool*>(var_ptr) = innodb_log_checksums_func_update( thd, *static_cast<const my_bool*>(save)); @@ -19601,6 +18877,7 @@ static void bg_wsrep_kill_trx( DBUG_ENTER("bg_wsrep_kill_trx"); if (thd) { + wsrep_thd_LOCK(thd); victim_trx= thd_to_trx(thd); /* Victim trx might not exist e.g. on MDL-conflict. */ if (victim_trx) { @@ -19611,23 +18888,18 @@ static void bg_wsrep_kill_trx( { /* Victim was meanwhile rolled back or committed */ - trx_mutex_exit(victim_trx); lock_mutex_exit(); - wsrep_thd_UNLOCK(thd); - victim_trx= NULL; + trx_mutex_exit(victim_trx); + goto no_victim; } } else { - /* find_thread_by_id locked - THD::LOCK_thd_data */ +no_victim: wsrep_thd_UNLOCK(thd); + /* find_thread_by_id() acquired THD::LOCK_kill_data */ + wsrep_thd_kill_UNLOCK(thd); + goto ret; } - } - - if (!victim_trx) { - /* Victim trx might not exist (MDL-conflict) or victim - was meanwhile rolled back or committed because of - a KILL statement or a disconnect. */ - goto ret; + wsrep_thd_UNLOCK(thd); } WSREP_DEBUG("BF kill (" ULINTPF ", seqno: " INT64PF @@ -19789,7 +19061,7 @@ ret_unlock: lock_mutex_exit(); if (awake) wsrep_thd_awake(thd, arg->signal); - wsrep_thd_UNLOCK(thd); + wsrep_thd_kill_UNLOCK(thd); ret: free(arg); @@ -19841,7 +19113,7 @@ static void wsrep_abort_transaction( /*====================*/ - handlerton* hton, + handlerton*, THD *bf_thd, THD *victim_thd, my_bool signal) @@ -19866,10 +19138,12 @@ wsrep_abort_transaction( DBUG_VOID_RETURN; } else { WSREP_DEBUG("victim does not have transaction"); + wsrep_thd_kill_LOCK(victim_thd); wsrep_thd_LOCK(victim_thd); wsrep_thd_set_conflict_state(victim_thd, MUST_ABORT); - wsrep_thd_awake(victim_thd, signal); wsrep_thd_UNLOCK(victim_thd); + wsrep_thd_awake(victim_thd, signal); + wsrep_thd_kill_UNLOCK(victim_thd); } DBUG_VOID_RETURN; @@ -19884,17 +19158,14 @@ innobase_wsrep_set_checkpoint( { DBUG_ASSERT(hton == innodb_hton_ptr); - if (wsrep_is_wsrep_xid(xid)) { - mtr_t mtr; - mtr_start(&mtr); - trx_sysf_t* sys_header = trx_sysf_get(&mtr); - trx_sys_update_wsrep_checkpoint(xid, sys_header, &mtr); - mtr_commit(&mtr); - innobase_flush_logs(hton, false); - return 0; - } else { - return 1; - } + if (wsrep_is_wsrep_xid(xid)) { + + trx_rseg_update_wsrep_checkpoint(xid); + innobase_flush_logs(hton, false); + return 0; + } else { + return 1; + } } static @@ -19905,20 +19176,13 @@ innobase_wsrep_get_checkpoint( XID* xid) { DBUG_ASSERT(hton == innodb_hton_ptr); - trx_sys_read_wsrep_checkpoint(xid); - return 0; + trx_rseg_read_wsrep_checkpoint(*xid); + return 0; } -static -void -wsrep_fake_trx_id( -/*==============*/ - handlerton *hton, - THD *thd) /*!< in: user thread handle */ +static void wsrep_fake_trx_id(handlerton *, THD *thd) { - mutex_enter(&trx_sys->mutex); - trx_id_t trx_id = trx_sys_get_new_trx_id(); - mutex_exit(&trx_sys->mutex); + trx_id_t trx_id = trx_sys.get_new_trx_id(); WSREP_DEBUG("innodb fake trx id: " TRX_ID_FMT " thd: %s", trx_id, wsrep_thd_query(thd)); wsrep_ws_handle_for_trx(wsrep_thd_ws_handle(thd), trx_id); @@ -19926,44 +19190,6 @@ wsrep_fake_trx_id( #endif /* WITH_WSREP */ -/** Update the innodb_use_trim parameter. 
-@param[in] thd thread handle -@param[in] var system variable -@param[out] var_ptr current value -@param[in] save immediate result from check function */ -static -void -innodb_use_trim_update( - THD* thd, - struct st_mysql_sys_var* var, - void* var_ptr, - const void* save) -{ - srv_use_trim = *static_cast<const my_bool*>(save); - - push_warning(thd, Sql_condition::WARN_LEVEL_WARN, - HA_ERR_WRONG_COMMAND, deprecated_use_trim); -} - -/** Update the innodb_instrument_sempahores parameter. -@param[in] thd thread handle -@param[in] var system variable -@param[out] var_ptr current value -@param[in] save immediate result from check function */ -static -void -innodb_instrument_semaphores_update( - THD* thd, - struct st_mysql_sys_var* var, - void* var_ptr, - const void* save) -{ - innodb_instrument_semaphores = *static_cast<const my_bool*>(save); - - push_warning(thd, Sql_condition::WARN_LEVEL_WARN, - HA_ERR_WRONG_COMMAND, deprecated_instrument_semaphores); -} - static void innodb_idle_flush_pct_update(THD *thd, st_mysql_sys_var *var, void*, const void *save) { @@ -19995,7 +19221,7 @@ static MYSQL_SYSVAR_ENUM(checksum_algorithm, srv_checksum_algorithm, " magic number when reading;" " Files updated when this option is set to crc32 or strict_crc32 will" " not be readable by MariaDB versions older than 10.0.4", - NULL, NULL, SRV_CHECKSUM_ALGORITHM_CRC32, + NULL, innodb_checksum_algorithm_update, SRV_CHECKSUM_ALGORITHM_CRC32, &innodb_checksum_algorithm_typelib); static MYSQL_SYSVAR_BOOL(log_checksums, innodb_log_checksums, @@ -20016,7 +19242,7 @@ static MYSQL_SYSVAR_STR(data_home_dir, innobase_data_home_dir, "The common part for InnoDB table spaces.", NULL, NULL, NULL); -static MYSQL_SYSVAR_BOOL(doublewrite, innobase_use_doublewrite, +static MYSQL_SYSVAR_BOOL(doublewrite, srv_use_doublewrite_buf, PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, "Enable InnoDB doublewrite buffer (enabled by default)." " Disable with --skip-innodb-doublewrite.", @@ -20030,17 +19256,18 @@ static MYSQL_SYSVAR_BOOL(use_atomic_writes, srv_use_atomic_writes, "the directFS filesystem or with Shannon cards using any file system.", NULL, NULL, TRUE); -static MYSQL_SYSVAR_BOOL(use_fallocate, innobase_use_fallocate, - PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, - "Use posix_fallocate() to allocate files. DEPRECATED, has no effect.", - NULL, NULL, FALSE); - static MYSQL_SYSVAR_BOOL(stats_include_delete_marked, srv_stats_include_delete_marked, PLUGIN_VAR_OPCMDARG, "Include delete marked records when calculating persistent statistics", NULL, NULL, FALSE); +static MYSQL_SYSVAR_ENUM(instant_alter_column_allowed, + innodb_instant_alter_column_allowed, + PLUGIN_VAR_RQCMDARG, + "File format constraint for ALTER TABLE", NULL, NULL, 1/*add_last*/, + &innodb_instant_alter_column_allowed_typelib); + static MYSQL_SYSVAR_ULONG(io_capacity, srv_io_capacity, PLUGIN_VAR_RQCMDARG, "Number of IOPs the server can do. Tunes the background IO rate", @@ -20112,41 +19339,14 @@ static MYSQL_SYSVAR_ULONG(sync_array_size, srv_sync_array_size, static MYSQL_SYSVAR_UINT(fast_shutdown, srv_fast_shutdown, PLUGIN_VAR_OPCMDARG, "Speeds up the shutdown process of the InnoDB storage engine. 
Possible" - " values are 0, 1 (faster) or 2 (fastest - crash-like).", - fast_shutdown_validate, NULL, 1, 0, 2, 0); + " values are 0, 1 (faster), 2 (crash-like), 3 (fastest clean).", + fast_shutdown_validate, NULL, 1, 0, 3, 0); static MYSQL_SYSVAR_BOOL(file_per_table, srv_file_per_table, PLUGIN_VAR_NOCMDARG, "Stores each InnoDB table to an .ibd file in the database dir.", NULL, NULL, TRUE); -static MYSQL_SYSVAR_STR(file_format, innobase_file_format_name, - PLUGIN_VAR_RQCMDARG, - "File format to use for new tables in .ibd files.", - innodb_file_format_name_validate, - innodb_file_format_name_update, innodb_file_format_default); - -/* "innobase_file_format_check" decides whether we would continue -booting the server if the file format stamped on the system -table space exceeds the maximum file format supported -by the server. Can be set during server startup at command -line or configure file, and a read only variable after -server startup */ -static MYSQL_SYSVAR_BOOL(file_format_check, innobase_file_format_check, - PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, - "Whether to perform system file format check.", - NULL, NULL, TRUE); - -/* If a new file format is introduced, the file format -name needs to be updated accordingly. Please refer to -file_format_name_map[] defined in trx0sys.cc for the next -file format name. */ -static MYSQL_SYSVAR_STR(file_format_max, innobase_file_format_max, - PLUGIN_VAR_OPCMDARG, - "The highest file format in the tablespace.", - innodb_file_format_max_validate, - innodb_file_format_max_update, innodb_file_format_max_default); - static MYSQL_SYSVAR_STR(ft_server_stopword_table, innobase_server_stopword_table, PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_MEMALLOC, "The user supplied stopword table name.", @@ -20173,14 +19373,19 @@ static MYSQL_SYSVAR_ULONG(flush_log_at_trx_commit, srv_flush_log_at_trx_commit, " guarantees in case of crash. 0 and 2 can be faster than 1 or 3.", NULL, NULL, 1, 0, 3, 0); -static MYSQL_SYSVAR_STR(flush_method, innobase_file_flush_method, +static MYSQL_SYSVAR_ENUM(flush_method, innodb_flush_method, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, - "With which method to flush data.", NULL, NULL, NULL); + "With which method to flush data.", + NULL, NULL, IF_WIN(SRV_ALL_O_DIRECT_FSYNC, SRV_FSYNC), + &innodb_flush_method_typelib); -static MYSQL_SYSVAR_BOOL(large_prefix, innobase_large_prefix, - PLUGIN_VAR_NOCMDARG, - "Support large index prefix length of REC_VERSION_56_MAX_INDEX_COL_LEN (3072) bytes.", - NULL, innodb_large_prefix_update, TRUE); +static MYSQL_SYSVAR_STR(file_format, innodb_file_format, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Deprecated parameter with no effect.", NULL, NULL, NULL); + +static MYSQL_SYSVAR_STR(large_prefix, innodb_large_prefix, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Deprecated parameter with no effect.", NULL, NULL, NULL); static MYSQL_SYSVAR_BOOL(force_load_corrupted, srv_load_corrupted, PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, @@ -20198,10 +19403,20 @@ static MYSQL_SYSVAR_STR(log_group_home_dir, srv_log_group_home_dir, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, "Path to InnoDB log files.", NULL, NULL, NULL); +/** Update innodb_page_cleaners. 
+@param[in] save the new value of innodb_page_cleaners */ +static +void +innodb_page_cleaners_threads_update(THD*, struct st_mysql_sys_var*, void*, const void *save) +{ + buf_flush_set_page_cleaner_thread_cnt(*static_cast<const ulong*>(save)); +} + static MYSQL_SYSVAR_ULONG(page_cleaners, srv_n_page_cleaners, - PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY, + PLUGIN_VAR_RQCMDARG, "Page cleaner threads can be from 1 to 64. Default is 4.", - NULL, NULL, 4, 1, 64, 0); + NULL, + innodb_page_cleaners_threads_update, 4, 1, 64, 0); static MYSQL_SYSVAR_DOUBLE(max_dirty_pages_pct, srv_max_buf_pool_modified_pct, PLUGIN_VAR_RQCMDARG, @@ -20384,13 +19599,13 @@ BUF_POOL_SIZE_THRESHOLD (srv/srv0start.cc), then srv_buf_pool_instances_default can be removed and 8 used instead. The problem with the current setup is that with 128MiB default buffer pool size and 8 instances by default we would emit a warning when no options are specified. */ -static MYSQL_SYSVAR_LONGLONG(buffer_pool_size, innobase_buffer_pool_size, +static MYSQL_SYSVAR_ULONGLONG(buffer_pool_size, innobase_buffer_pool_size, PLUGIN_VAR_RQCMDARG, "The size of the memory buffer InnoDB uses to cache data and indexes of its tables.", innodb_buffer_pool_size_validate, innodb_buffer_pool_size_update, - static_cast<longlong>(srv_buf_pool_def_size), - static_cast<longlong>(srv_buf_pool_min_size), + srv_buf_pool_def_size, + srv_buf_pool_min_size, LLONG_MAX, 1024*1024L); static MYSQL_SYSVAR_ULONG(buffer_pool_chunk_size, srv_buf_pool_chunk_unit, @@ -20451,6 +19666,12 @@ static MYSQL_SYSVAR_ULONG(buffer_pool_dump_pct, srv_buf_pool_dump_pct, NULL, NULL, 25, 1, 100, 0); #ifdef UNIV_DEBUG +/* Added to test the innodb_buffer_pool_load_incomplete status variable. */ +static MYSQL_SYSVAR_ULONG(buffer_pool_load_pages_abort, srv_buf_pool_load_pages_abort, + PLUGIN_VAR_RQCMDARG, + "Number of pages during a buffer pool load to process before signaling innodb_buffer_pool_load_abort=1", + NULL, NULL, LONG_MAX, 1, LONG_MAX, 0); + static MYSQL_SYSVAR_STR(buffer_pool_evict, srv_buffer_pool_evict, PLUGIN_VAR_RQCMDARG, "Evict pages from the buffer pool", @@ -20557,7 +19778,7 @@ static MYSQL_SYSVAR_BOOL(deadlock_detect, innobase_deadlock_detect, " and we rely on innodb_lock_wait_timeout in case of deadlock.", NULL, NULL, TRUE); -static MYSQL_SYSVAR_LONG(fill_factor, innobase_fill_factor, +static MYSQL_SYSVAR_UINT(fill_factor, innobase_fill_factor, PLUGIN_VAR_RQCMDARG, "Percentage of B-tree page filled during bulk insert", NULL, NULL, 100, 10, 100, 0); @@ -20627,12 +19848,12 @@ static MYSQL_SYSVAR_BOOL(optimize_fulltext_only, innodb_optimize_fulltext_only, "Only optimize the Fulltext index of the table", NULL, NULL, FALSE); -static MYSQL_SYSVAR_ULONG(read_io_threads, innobase_read_io_threads, +static MYSQL_SYSVAR_ULONG(read_io_threads, srv_n_read_io_threads, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, "Number of background read I/O threads in InnoDB.", NULL, NULL, 4, 1, 64, 0); -static MYSQL_SYSVAR_ULONG(write_io_threads, innobase_write_io_threads, +static MYSQL_SYSVAR_ULONG(write_io_threads, srv_n_write_io_threads, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, "Number of background write I/O threads in InnoDB.", NULL, NULL, 4, 1, 64, 0); @@ -20648,10 +19869,10 @@ static MYSQL_SYSVAR_ULONG(page_size, srv_page_size, NULL, NULL, UNIV_PAGE_SIZE_DEF, UNIV_PAGE_SIZE_MIN, UNIV_PAGE_SIZE_MAX, 0); -static MYSQL_SYSVAR_LONG(log_buffer_size, innobase_log_buffer_size, +static MYSQL_SYSVAR_ULONG(log_buffer_size, srv_log_buffer_size, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, "The size of the 
buffer which InnoDB uses to write log to the log files on disk.", - NULL, NULL, 16*1024*1024L, 256*1024L, LONG_MAX, 1024); + NULL, NULL, 16L << 20, 256L << 10, LONG_MAX, 1024); static MYSQL_SYSVAR_ULONGLONG(log_file_size, srv_log_file_size, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, @@ -20684,10 +19905,10 @@ static MYSQL_SYSVAR_UINT(old_blocks_time, buf_LRU_old_threshold_ms, " The timeout is disabled if 0.", NULL, NULL, 1000, 0, UINT_MAX32, 0); -static MYSQL_SYSVAR_LONG(open_files, innobase_open_files, +static MYSQL_SYSVAR_ULONG(open_files, innobase_open_files, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, "How many files at the maximum InnoDB keeps open at the same time.", - NULL, NULL, 0L, 0L, LONG_MAX, 0); + NULL, NULL, 0, 0, LONG_MAX, 0); static MYSQL_SYSVAR_ULONG(sync_spin_loops, srv_n_spin_wait_rounds, PLUGIN_VAR_RQCMDARG, @@ -20731,12 +19952,12 @@ static MYSQL_SYSVAR_ULONG(thread_sleep_delay, srv_thread_sleep_delay, static MYSQL_SYSVAR_STR(data_file_path, innobase_data_file_path, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, "Path to individual files and their sizes.", - NULL, NULL, NULL); + NULL, NULL, "ibdata1:12M:autoextend"); static MYSQL_SYSVAR_STR(temp_data_file_path, innobase_temp_data_file_path, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, "Path to files and their sizes making temp-tablespace.", - NULL, NULL, NULL); + NULL, NULL, "ibtmp1:12M:autoextend"); static MYSQL_SYSVAR_STR(undo_directory, srv_undo_dir, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, @@ -20814,12 +20035,10 @@ static MYSQL_SYSVAR_BOOL(numa_interleave, srv_numa_interleave, NULL, NULL, FALSE); #endif /* HAVE_LIBNUMA */ -static MYSQL_SYSVAR_STR(change_buffering, innobase_change_buffering, +static MYSQL_SYSVAR_ENUM(change_buffering, innodb_change_buffering, PLUGIN_VAR_RQCMDARG, - "Buffer changes to reduce random access:" - " OFF, ON, inserting, deleting, changing, or purging.", - innodb_change_buffering_validate, - innodb_change_buffering_update, "all"); + "Buffer changes to secondary indexes.", + NULL, NULL, IBUF_USE_ALL, &innodb_change_buffering_typelib); static MYSQL_SYSVAR_UINT(change_buffer_max_size, srv_change_buffer_max_size, @@ -20874,14 +20093,8 @@ static my_bool innobase_disallow_writes = FALSE; An "update" method for innobase_disallow_writes variable. 
*/ static void -innobase_disallow_writes_update( -/*============================*/ - THD* thd, /* in: thread handle */ - st_mysql_sys_var* var, /* in: pointer to system - variable */ - void* var_ptr, /* out: pointer to dynamic - variable */ - const void* save) /* in: temporary storage */ +innobase_disallow_writes_update(THD*, st_mysql_sys_var*, + void* var_ptr, const void* save) { const my_bool val = *static_cast<const my_bool*>(save); *static_cast<my_bool*>(var_ptr) = val; @@ -20968,11 +20181,6 @@ static MYSQL_SYSVAR_BOOL(read_only, srv_read_only_mode, "Start InnoDB in read only mode (off by default)", NULL, NULL, FALSE); -static MYSQL_SYSVAR_BOOL(safe_truncate, srv_safe_truncate, - PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY, - "Use backup-safe TRUNCATE TABLE and crash-safe RENAME (incompatible with older MariaDB 10.2; ON by default)", - NULL, NULL, TRUE); - static MYSQL_SYSVAR_BOOL(cmp_per_index_enabled, srv_cmp_per_index_enabled, PLUGIN_VAR_OPCMDARG, "Enable INFORMATION_SCHEMA.innodb_cmp_per_index," @@ -21005,6 +20213,11 @@ static MYSQL_SYSVAR_BOOL(trx_purge_view_update_only_debug, " but the each purges were not done yet.", NULL, NULL, FALSE); +static MYSQL_SYSVAR_BOOL(evict_tables_on_commit_debug, + innodb_evict_tables_on_commit_debug, PLUGIN_VAR_OPCMDARG, + "On transaction commit, try to evict tables from the data dictionary cache.", + NULL, NULL, FALSE); + static MYSQL_SYSVAR_UINT(data_file_size_debug, srv_sys_space_size_debug, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, @@ -21056,11 +20269,6 @@ static MYSQL_SYSVAR_BOOL(force_primary_key, "Do not allow to create table without primary key (off by default)", NULL, NULL, FALSE); -static MYSQL_SYSVAR_BOOL(use_trim, srv_use_trim, - PLUGIN_VAR_OPCMDARG, - "Deallocate (punch_hole|trim) unused portions of the page compressed page (on by default)", - NULL, innodb_use_trim_update, TRUE); - static const char *page_compression_algorithms[]= { "none", "zlib", "lz4", "lzo", "lzma", "bzip2", "snappy", 0 }; static TYPELIB page_compression_algorithms_typelib= { @@ -21077,20 +20285,6 @@ static MYSQL_SYSVAR_ENUM(compression_algorithm, innodb_compression_algorithm, PAGE_ZLIB_ALGORITHM, &page_compression_algorithms_typelib); -static MYSQL_SYSVAR_LONG(mtflush_threads, srv_mtflush_threads, - PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, - "DEPRECATED. Number of multi-threaded flush threads", - NULL, NULL, - MTFLUSH_DEFAULT_WORKER, /* Default setting */ - 1, /* Minimum setting */ - MTFLUSH_MAX_WORKER, /* Max setting */ - 0); - -static MYSQL_SYSVAR_BOOL(use_mtflush, srv_use_mtflush, - PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, - "DEPRECATED. Use multi-threaded flush. Default FALSE.", - NULL, NULL, FALSE); - static MYSQL_SYSVAR_ULONG(fatal_semaphore_wait_threshold, srv_fatal_semaphore_wait_threshold, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, "Maximum number of seconds that semaphore times out in InnoDB.", @@ -21207,18 +20401,11 @@ static MYSQL_SYSVAR_BOOL(debug_force_scrubbing, NULL, NULL, FALSE); #endif /* UNIV_DEBUG */ -static MYSQL_SYSVAR_BOOL(instrument_semaphores, innodb_instrument_semaphores, - PLUGIN_VAR_OPCMDARG, - "DEPRECATED. 
This setting has no effect.", - NULL, innodb_instrument_semaphores_update, FALSE); - static MYSQL_SYSVAR_BOOL(encrypt_temporary_tables, innodb_encrypt_temporary_tables, PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY, "Enrypt the temporary table data.", NULL, NULL, false); -#include "ha_xtradb.h" - static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(autoextend_increment), MYSQL_SYSVAR(buffer_pool_size), @@ -21233,6 +20420,9 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { #endif /* UNIV_DEBUG */ MYSQL_SYSVAR(buffer_pool_load_now), MYSQL_SYSVAR(buffer_pool_load_abort), +#ifdef UNIV_DEBUG + MYSQL_SYSVAR(buffer_pool_load_pages_abort), +#endif /* UNIV_DEBUG */ MYSQL_SYSVAR(buffer_pool_load_at_startup), MYSQL_SYSVAR(defragment), MYSQL_SYSVAR(defragment_n_pages), @@ -21254,14 +20444,11 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(doublewrite), MYSQL_SYSVAR(stats_include_delete_marked), MYSQL_SYSVAR(use_atomic_writes), - MYSQL_SYSVAR(use_fallocate), MYSQL_SYSVAR(fast_shutdown), MYSQL_SYSVAR(read_io_threads), MYSQL_SYSVAR(write_io_threads), MYSQL_SYSVAR(file_per_table), - MYSQL_SYSVAR(file_format), - MYSQL_SYSVAR(file_format_check), - MYSQL_SYSVAR(file_format_max), + MYSQL_SYSVAR(file_format), /* deprecated in MariaDB 10.2; no effect */ MYSQL_SYSVAR(flush_log_at_timeout), MYSQL_SYSVAR(flush_log_at_trx_commit), MYSQL_SYSVAR(flush_method), @@ -21275,7 +20462,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(ft_min_token_size), MYSQL_SYSVAR(ft_num_word_optimize), MYSQL_SYSVAR(ft_sort_pll_degree), - MYSQL_SYSVAR(large_prefix), + MYSQL_SYSVAR(large_prefix), /* deprecated in MariaDB 10.2; no effect */ MYSQL_SYSVAR(force_load_corrupted), MYSQL_SYSVAR(lock_schedule_algorithm), MYSQL_SYSVAR(locks_unsafe_for_binlog), @@ -21324,7 +20511,6 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(replication_delay), MYSQL_SYSVAR(status_file), MYSQL_SYSVAR(strict_mode), - MYSQL_SYSVAR(support_xa), MYSQL_SYSVAR(sort_buffer_size), MYSQL_SYSVAR(online_alter_log_max_size), MYSQL_SYSVAR(sync_spin_loops), @@ -21354,7 +20540,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(random_read_ahead), MYSQL_SYSVAR(read_ahead_threshold), MYSQL_SYSVAR(read_only), - MYSQL_SYSVAR(safe_truncate), + MYSQL_SYSVAR(instant_alter_column_allowed), MYSQL_SYSVAR(io_capacity), MYSQL_SYSVAR(io_capacity_max), MYSQL_SYSVAR(page_cleaners), @@ -21394,6 +20580,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(trx_rseg_n_slots_debug), MYSQL_SYSVAR(limit_optimistic_insert_debug), MYSQL_SYSVAR(trx_purge_view_update_only_debug), + MYSQL_SYSVAR(evict_tables_on_commit_debug), MYSQL_SYSVAR(data_file_size_debug), MYSQL_SYSVAR(fil_make_page_dirty_debug), MYSQL_SYSVAR(saved_page_number_debug), @@ -21406,11 +20593,8 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(force_primary_key), MYSQL_SYSVAR(fatal_semaphore_wait_threshold), /* Table page compression feature */ - MYSQL_SYSVAR(use_trim), MYSQL_SYSVAR(compression_default), MYSQL_SYSVAR(compression_algorithm), - MYSQL_SYSVAR(mtflush_threads), - MYSQL_SYSVAR(use_mtflush), /* Encryption feature */ MYSQL_SYSVAR(encrypt_tables), MYSQL_SYSVAR(encryption_threads), @@ -21429,15 +20613,10 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { #ifdef UNIV_DEBUG MYSQL_SYSVAR(debug_force_scrubbing), #endif - MYSQL_SYSVAR(instrument_semaphores), MYSQL_SYSVAR(buf_dump_status_frequency), 
MYSQL_SYSVAR(background_thread), MYSQL_SYSVAR(encrypt_temporary_tables), - /* XtraDB compatibility system variables */ -#define HA_XTRADB_SYSVARS -#include "ha_xtradb.h" - NULL }; @@ -21449,7 +20628,7 @@ maria_declare_plugin(innobase) plugin_author, "Supports transactions, row-level locking, foreign keys and encryption for tables", PLUGIN_LICENSE_GPL, - innobase_init, /* Plugin Init */ + innodb_init, /* Plugin Init */ NULL, /* Plugin Deinit */ INNODB_VERSION_SHORT, innodb_status_variables_export,/* status variables */ @@ -21522,13 +20701,13 @@ innodb_params_adjust() = MYSQL_SYSVAR_NAME(undo_logs).def_val = srv_available_undo_logs; MYSQL_SYSVAR_NAME(max_undo_log_size).max_val - = 1ULL << (32 + UNIV_PAGE_SIZE_SHIFT); + = 1ULL << (32U + srv_page_size_shift); MYSQL_SYSVAR_NAME(max_undo_log_size).min_val = MYSQL_SYSVAR_NAME(max_undo_log_size).def_val = ulonglong(SRV_UNDO_TABLESPACE_SIZE_IN_PAGES) - * srv_page_size; + << srv_page_size_shift; MYSQL_SYSVAR_NAME(max_undo_log_size).max_val - = 1ULL << (32 + UNIV_PAGE_SIZE_SHIFT); + = 1ULL << (32U + srv_page_size_shift); } /**************************************************************************** @@ -21842,7 +21021,7 @@ innobase_rename_vc_templ( if (is_part != NULL) { *is_part = '\0'; - tbnamelen = is_part - tbname; + tbnamelen = ulint(is_part - tbname); } dbnamelen = filename_to_tablename(dbname, t_dbname, @@ -22022,7 +21201,7 @@ innobase_get_computed_value( if (!heap || index->table->vc_templ->rec_len >= REC_VERSION_56_MAX_INDEX_COL_LEN) { if (*local_heap == NULL) { - *local_heap = mem_heap_create(UNIV_PAGE_SIZE); + *local_heap = mem_heap_create(srv_page_size); } buf = static_cast<byte*>(mem_heap_alloc( @@ -22055,7 +21234,7 @@ innobase_get_computed_value( if (row_field->ext) { if (*local_heap == NULL) { - *local_heap = mem_heap_create(UNIV_PAGE_SIZE); + *local_heap = mem_heap_create(srv_page_size); } data = btr_copy_externally_stored_field( @@ -22186,7 +21365,7 @@ ib_senderrf( ...) /*!< Args */ { va_list args; - const char* format = innobase_get_err_msg(code); + const char* format = my_get_err_msg(code); /* If the caller wants to push a message to the client then the caller must pass a valid session handle. */ @@ -22332,7 +21511,7 @@ innobase_convert_to_filename_charset( CHARSET_INFO* cs_from = system_charset_info; return(static_cast<uint>(strconvert( - cs_from, from, strlen(from), + cs_from, from, uint(strlen(from)), cs_to, to, static_cast<uint>(len), &errors))); } @@ -22351,14 +21530,13 @@ innobase_convert_to_system_charset( CHARSET_INFO* cs2 = system_charset_info; return(static_cast<uint>(strconvert( - cs1, from, strlen(from), + cs1, from, static_cast<uint>(strlen(from)), cs2, to, static_cast<uint>(len), errors))); } /** Validate the requested buffer pool size. Also, reserve the necessary memory needed for buffer pool resize. @param[in] thd thread handle -@param[in] var pointer to system variable @param[out] save immediate result for update function @param[in] value incoming string @return 0 on success, 1 on failure. 
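The function reworked in the next hunk, innodb_buffer_pool_size_validate(), follows the standard plugin sysvar protocol: the check callback pulls the proposed value out of st_mysql_value, normalizes it, and writes the result through *save; only if it returns 0 does the server later call the update callback with that saved value. A hedged, self-contained sketch of the protocol — the variable and the rounding rule are invented, not InnoDB's:

#include <mysql/plugin.h>	/* st_mysql_value, st_mysql_sys_var, THD */

static unsigned long long example_pool_size;	/* illustrative target variable */

/* check callback: reject or normalize the requested value */
static int example_pool_size_validate(THD*, st_mysql_sys_var*,
				       void* save, st_mysql_value* value)
{
	long long requested;
	value->val_int(value, &requested);
	if (requested < 1024) {
		return 1;	/* refuse the assignment */
	}
	/* round down to a 1KiB multiple before handing it to update() */
	*static_cast<unsigned long long*>(save)
		= (unsigned long long) requested & ~1023ULL;
	return 0;		/* accepted; update() will see *save */
}

/* update callback: runs only after a successful check */
static void example_pool_size_update(THD*, st_mysql_sys_var*,
				     void* var_ptr, const void* save)
{
	*static_cast<unsigned long long*>(var_ptr)
		= *static_cast<const unsigned long long*>(save);
}

The hunk below keeps exactly this division of labour: the validate function stores the aligned size into *save (now as ulonglong), and the separate innodb_buffer_pool_size_update callback later requests the actual resize.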
@@ -22367,13 +21545,11 @@ static int innodb_buffer_pool_size_validate( THD* thd, - struct st_mysql_sys_var* var, + st_mysql_sys_var*, void* save, struct st_mysql_value* value) { longlong intbuf; - - value->val_int(value, &intbuf); if (!srv_was_started) { @@ -22419,12 +21595,11 @@ innodb_buffer_pool_size_validate( return(1); } - ulint requested_buf_pool_size - = buf_pool_size_align(static_cast<ulint>(intbuf)); + ulint requested_buf_pool_size = buf_pool_size_align(ulint(intbuf)); - *static_cast<longlong*>(save) = requested_buf_pool_size; + *static_cast<ulonglong*>(save) = requested_buf_pool_size; - if (srv_buf_pool_size == static_cast<ulint>(intbuf)) { + if (srv_buf_pool_size == ulint(intbuf)) { buf_pool_mutex_exit_all(); /* nothing to do */ return(0); @@ -22472,7 +21647,7 @@ innodb_compression_algorithm_validate( for update function */ struct st_mysql_value* value) /*!< in: incoming string */ { - long compression_algorithm; + ulong compression_algorithm; DBUG_ENTER("innobase_compression_algorithm_validate"); if (check_sysvar_enum(thd, var, save, value)) { @@ -22584,7 +21759,7 @@ UNIV_INTERN void ib_push_warning( trx_t* trx, /*!< in: trx */ - ulint error, /*!< in: error code to push as warning */ + dberr_t error, /*!< in: error code to push as warning */ const char *format,/*!< in: warning message */ ...) { @@ -22598,9 +21773,9 @@ ib_push_warning( buf = (char *)my_malloc(MAX_BUF_SIZE, MYF(MY_WME)); vsprintf(buf,format, args); - push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, - convert_error_code_to_mysql((dberr_t)error, 0, thd), - buf); + push_warning_printf( + thd, Sql_condition::WARN_LEVEL_WARN, + uint(convert_error_code_to_mysql(error, 0, thd)), buf); my_free(buf); va_end(args); } @@ -22612,7 +21787,7 @@ UNIV_INTERN void ib_push_warning( void* ithd, /*!< in: thd */ - ulint error, /*!< in: error code to push as warning */ + dberr_t error, /*!< in: error code to push as warning */ const char *format,/*!< in: warning message */ ...) 
{ @@ -22630,9 +21805,9 @@ ib_push_warning( buf = (char *)my_malloc(MAX_BUF_SIZE, MYF(MY_WME)); vsprintf(buf,format, args); - push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, - convert_error_code_to_mysql((dberr_t)error, 0, thd), - buf); + push_warning_printf( + thd, Sql_condition::WARN_LEVEL_WARN, + uint(convert_error_code_to_mysql(error, 0, thd)), buf); my_free(buf); va_end(args); } @@ -22728,3 +21903,70 @@ ib_push_frm_error( break; } } + +/** Writes 8 bytes to nth tuple field +@param[in] tuple where to write +@param[in] nth index in tuple +@param[in] data what to write +@param[in] buf field data buffer */ +static void set_tuple_col_8(dtuple_t *tuple, int col, uint64_t data, byte *buf) +{ + dfield_t *dfield= dtuple_get_nth_field(tuple, col); + ut_ad(dfield->type.len == 8); + if (dfield->len == UNIV_SQL_NULL) + { + dfield_set_data(dfield, buf, 8); + } + ut_ad(dfield->len == dfield->type.len && dfield->data); + mach_write_to_8(dfield->data, data); +} + +void ins_node_t::vers_update_end(row_prebuilt_t *prebuilt, bool history_row) +{ + ut_ad(prebuilt->ins_node == this); + trx_t *trx= prebuilt->trx; +#ifndef DBUG_OFF + ut_ad(table->vers_start != table->vers_end); + const mysql_row_templ_t *t= prebuilt->get_template_by_col(table->vers_end); + ut_ad(t); + ut_ad(t->mysql_col_len == 8); +#endif + + if (history_row) + { + set_tuple_col_8(row, table->vers_end, trx->id, vers_end_buf); + } + else /* ROW_INS_VERSIONED */ + { + set_tuple_col_8(row, table->vers_end, TRX_ID_MAX, vers_end_buf); +#ifndef DBUG_OFF + t= prebuilt->get_template_by_col(table->vers_start); + ut_ad(t); + ut_ad(t->mysql_col_len == 8); +#endif + set_tuple_col_8(row, table->vers_start, trx->id, vers_start_buf); + } + dict_index_t *clust_index= dict_table_get_first_index(table); + THD *thd= trx->mysql_thd; + TABLE *mysql_table= prebuilt->m_mysql_table; + mem_heap_t *local_heap= NULL; + for (ulint col_no= 0; col_no < dict_table_get_n_v_cols(table); col_no++) + { + + const dict_v_col_t *v_col= dict_table_get_nth_v_col(table, col_no); + for (ulint i= 0; i < unsigned(v_col->num_base); i++) + { + dict_col_t *base_col= v_col->base_col[i]; + if (base_col->ind == table->vers_end) + { + innobase_get_computed_value(row, v_col, clust_index, &local_heap, + table->heap, NULL, thd, mysql_table, + mysql_table->record[0], NULL, NULL, NULL); + } + } + } + if (local_heap) + { + mem_heap_free(local_heap); + } +} diff --git a/storage/innobase/handler/ha_innodb.h b/storage/innobase/handler/ha_innodb.h index 274c6761d4f..038528cc2e4 100644 --- a/storage/innobase/handler/ha_innodb.h +++ b/storage/innobase/handler/ha_innodb.h @@ -113,13 +113,11 @@ public: double read_time(uint index, uint ranges, ha_rows rows); - longlong get_memory_buffer_size() const; - int delete_all_rows(); int write_row(uchar * buf); - int update_row(const uchar * old_data, uchar * new_data); + int update_row(const uchar * old_data, const uchar * new_data); int delete_row(const uchar * buf); @@ -269,7 +267,7 @@ public: */ my_bool register_query_cache_table( THD* thd, - char* table_key, + const char* table_key, uint key_length, qc_engine_callback* call_back, ulonglong* engine_data); @@ -286,12 +284,24 @@ public: by ALTER TABLE and holding data used during in-place alter. @retval HA_ALTER_INPLACE_NOT_SUPPORTED Not supported - @retval HA_ALTER_INPLACE_NO_LOCK Supported - @retval HA_ALTER_INPLACE_SHARED_LOCK_AFTER_PREPARE - Supported, but requires lock during main phase and - exclusive lock during prepare phase. 
- @retval HA_ALTER_INPLACE_NO_LOCK_AFTER_PREPARE - Supported, prepare phase requires exclusive lock. */ + @retval HA_ALTER_INPLACE_INSTANT + MDL_EXCLUSIVE is needed for executing prepare_inplace_alter_table() + and commit_inplace_alter_table(). inplace_alter_table() + will not be called. + @retval HA_ALTER_INPLACE_COPY_NO_LOCK + MDL_EXCLUSIVE in prepare_inplace_alter_table(), which can be downgraded + to LOCK=NONE for rebuilding the table in inplace_alter_table() + @retval HA_ALTER_INPLACE_COPY_LOCK + MDL_EXCLUSIVE in prepare_inplace_alter_table(), which can be downgraded + to LOCK=SHARED for rebuilding the table in inplace_alter_table() + @retval HA_ALTER_INPLACE_NOCOPY_NO_LOCK + MDL_EXCLUSIVE in prepare_inplace_alter_table(), which can be downgraded + to LOCK=NONE for inplace_alter_table() which will not rebuild the table + @retval HA_ALTER_INPLACE_NOCOPY_LOCK + MDL_EXCLUSIVE in prepare_inplace_alter_table(), which can be downgraded + to LOCK=SHARED for inplace_alter_table() which will not rebuild + the table. */ + enum_alter_inplace_result check_if_supported_inplace_alter( TABLE* altered_table, Alter_inplace_info* ha_alter_info); @@ -502,9 +512,6 @@ the definitions are bracketed with #ifdef INNODB_COMPATIBILITY_HOOKS */ #error InnoDB needs MySQL to be built with #define INNODB_COMPATIBILITY_HOOKS #endif -LEX_STRING* thd_query_string(MYSQL_THD thd); -size_t thd_query_safe(MYSQL_THD thd, char *buf, size_t buflen); - extern "C" { /** Check if a user thread is a replication slave thread @@ -757,6 +764,8 @@ private: /** Table name */ char* m_table_name; + /** Table */ + dict_table_t* m_table; /** Whether the table needs to be dropped before rollback */ bool m_drop_before_rollback; @@ -870,19 +879,6 @@ innodb_base_col_setup_for_stored( create_table_info_t::normalize_table_name_low(norm_name, name, FALSE) #endif /* _WIN32 */ -/** Converts an InnoDB error code to a MySQL error code. -Also tells to MySQL about a possible transaction rollback inside InnoDB caused -by a lock wait timeout or a deadlock. -@param[in] error InnoDB error code. -@param[in] flags InnoDB table flags or 0. -@param[in] thd MySQL thread or NULL. -@return MySQL error code */ -int -convert_error_code_to_mysql( - dberr_t error, - ulint flags, - THD* thd); - /** Converts a search mode flag understood by MySQL to a flag understood by InnoDB. @param[in] find_flag MySQL search mode flag. diff --git a/storage/innobase/handler/ha_xtradb.h b/storage/innobase/handler/ha_xtradb.h deleted file mode 100644 index b049905613c..00000000000 --- a/storage/innobase/handler/ha_xtradb.h +++ /dev/null @@ -1,988 +0,0 @@ -/***************************************************************************** - -Copyright (c) 2000, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2020, MariaDB Corporation. -Copyright (c) 2009, Percona Inc. - -Portions of this file contain modifications contributed and copyrighted -by Percona Inc.. Those modifications are -gratefully acknowledged and are described briefly in the InnoDB -documentation. The contributions by Percona Inc. are incorporated with -their permission, and subject to the conditions contained in the file -COPYING.Percona. - -This program is free software; you can redistribute it and/or modify it under -the terms of the GNU General Public License as published by the Free Software -Foundation; version 2 of the License. 
- -This program is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - -You should have received a copy of the GNU General Public License along with -this program; if not, write to the Free Software Foundation, Inc., -51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA - -*****************************************************************************/ - -/** @file ha_xtradb.h */ - -#ifndef HA_XTRADB_H -#define HA_XTRADB_H - -static -void -innodb_print_deprecation(const char* param); - -/* XtraDB compatibility system variables. Note that default value and -minimum value can be different compared to original to detect has user -really used the parameter or not. */ - -static my_bool innodb_buffer_pool_populate; -#if defined UNIV_DEBUG || defined UNIV_PERF_DEBUG -static ulong srv_cleaner_max_lru_time; -static ulong srv_cleaner_max_flush_time; -static ulong srv_cleaner_flush_chunk_size; -static ulong srv_cleaner_lru_chunk_size; -static ulong srv_cleaner_free_list_lwm; -static my_bool srv_cleaner_eviction_factor; -#endif /* defined UNIV_DEBUG || defined UNIV_PERF_DEBUG */ -static ulong srv_pass_corrupt_table; -static ulong srv_empty_free_list_algorithm; -static ulong innobase_file_io_threads; -static ulong srv_foreground_preflush; -static longlong srv_kill_idle_transaction; -static my_bool srv_fake_changes_locks; -static my_bool innobase_log_archive; -static char* innobase_log_arch_dir = NULL; -static ulong srv_log_arch_expire_sec; -static ulong innobase_log_block_size; -static ulong srv_log_checksum_algorithm; -static ulonglong srv_max_bitmap_file_size; -static ulonglong srv_max_changed_pages; -static ulong innobase_mirrored_log_groups; -#ifdef UNIV_LINUX -static ulong srv_sched_priority_cleaner; -#if defined UNIV_DEBUG || defined UNIV_PERF_DEBUG -static my_bool srv_cleaner_thread_priority; -static my_bool srv_io_thread_priority; -static my_bool srv_master_thread_priority; -static my_bool srv_purge_thread_priority; -static ulong srv_sched_priority_io; -static ulong srv_sched_priority_master; -static ulong srv_sched_priority_purge; -#endif /* UNIV_DEBUG || UNIV_PERF_DEBUG */ -#endif /* UNIV_LINUX */ -static ulong srv_cleaner_lsn_age_factor; -static ulong srv_show_locks_held; -static ulong srv_show_verbose_locks; -static my_bool srv_track_changed_pages; -static my_bool innodb_track_redo_log_now; -static my_bool srv_use_global_flush_log_at_trx_commit; -static my_bool srv_use_stacktrace; - - -static const char innodb_deprecated_msg[]= "Using %s is deprecated and the" - " parameter may be removed in future releases." - " Ignoning the parameter."; - - -#ifdef BTR_CUR_HASH_ADAPT -/* it is just alias for innodb_adaptive_hash_index_parts */ -/** Number of distinct partitions of AHI. -Each partition is protected by its own latch and so we have parts number -of latches protecting complete search system. 
*/ -static MYSQL_SYSVAR_ULONG(adaptive_hash_index_partitions, btr_ahi_parts, - PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY, - "It is an alias for innodb_adaptive_hash_index_parts; " - "only exists to allow easier upgrade from earlier XtraDB versions.", - NULL, NULL, 8, 1, 512, 0); -#endif /* BTR_CUR_HASH_ADAPT */ - -static MYSQL_SYSVAR_BOOL(buffer_pool_populate, innodb_buffer_pool_populate, - PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, - "Deprecated and ignored; only exists to allow easier upgrade from " - "earlier XtraDB versions.", - NULL, NULL, FALSE); - -#if defined UNIV_DEBUG || defined UNIV_PERF_DEBUG -static -void -set_cleaner_max_lru_time(THD*thd, st_mysql_sys_var*, void*, const void*) -{ - push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, - ER_WARN_DEPRECATED_SYNTAX, - innodb_deprecated_msg, - "innodb_cleaner_max_lru_time"); -} -/* Original default 1000 */ -static MYSQL_SYSVAR_ULONG(cleaner_max_lru_time, srv_cleaner_max_lru_time, - PLUGIN_VAR_RQCMDARG, - "Deprecated and ignored; only exists to allow easier upgrade from " - "earlier XtraDB versions.", - NULL, set_cleaner_max_lru_time, 0, 0, ~0UL, 0); - -static -void -set_cleaner_max_flush_time(THD*thd, st_mysql_sys_var*, void*, const void*) -{ - push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, - ER_WARN_DEPRECATED_SYNTAX, - innodb_deprecated_msg, - "innodb_cleaner_max_flush_time"); -} -/* Original default 1000 */ -static MYSQL_SYSVAR_ULONG(cleaner_max_flush_time, srv_cleaner_max_flush_time, - PLUGIN_VAR_RQCMDARG, - "Deprecated and ignored; only exists to allow easier upgrade from " - "earlier XtraDB versions.", - NULL, &set_cleaner_max_flush_time, 0, 0, ~0UL, 0); - -static -void -set_cleaner_flush_chunk_size(THD*thd, st_mysql_sys_var*, void*, const void*) -{ - push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, - ER_WARN_DEPRECATED_SYNTAX, - innodb_deprecated_msg, - "innodb_cleaner_flush_chunk_size"); -} -/* Original default 100 */ -static MYSQL_SYSVAR_ULONG(cleaner_flush_chunk_size, - srv_cleaner_flush_chunk_size, - PLUGIN_VAR_RQCMDARG, - "Deprecated and ignored; only exists to allow easier upgrade from " - "earlier XtraDB versions.", - NULL, &set_cleaner_flush_chunk_size, 0, 0, ~0UL, 0); - -static -void -set_cleaner_lru_chunk_size(THD*thd, st_mysql_sys_var*, void*, const void*) -{ - push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, - ER_WARN_DEPRECATED_SYNTAX, - innodb_deprecated_msg, - "innodb_cleaner_lru_chunk_size"); -} -/* Original default 100 */ -static MYSQL_SYSVAR_ULONG(cleaner_lru_chunk_size, - srv_cleaner_lru_chunk_size, - PLUGIN_VAR_RQCMDARG, - "Deprecated and ignored; only exists to allow easier upgrade from " - "earlier XtraDB versions.", - NULL, &set_cleaner_lru_chunk_size, 0, 0, ~0UL, 0); - -static -void -set_cleaner_free_list_lwm(THD* thd, st_mysql_sys_var*, void*, const void*) -{ - push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, - ER_WARN_DEPRECATED_SYNTAX, - innodb_deprecated_msg, - "innodb_cleaner_free_list_lwm"); -} -/* Original default 10 */ -static MYSQL_SYSVAR_ULONG(cleaner_free_list_lwm, srv_cleaner_free_list_lwm, - PLUGIN_VAR_RQCMDARG, - "Deprecated and ignored; only exists to allow easier upgrade from " - "earlier XtraDB versions.", - NULL, &set_cleaner_free_list_lwm, 0, 0, 100, 0); - -static -void -set_cleaner_eviction_factor(THD* thd, st_mysql_sys_var*, void*, const void*) -{ - push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, - ER_WARN_DEPRECATED_SYNTAX, - innodb_deprecated_msg, - "innodb_cleaner_eviction_factor"); -} -static 
MYSQL_SYSVAR_BOOL(cleaner_eviction_factor, srv_cleaner_eviction_factor, - PLUGIN_VAR_OPCMDARG, - "Deprecated and ignored; only exists to allow easier upgrade from " - "earlier XtraDB versions.", - NULL, &set_cleaner_eviction_factor, FALSE); - -#endif /* defined UNIV_DEBUG || defined UNIV_PERF_DEBUG */ - -/* Added new default DEPRECATED */ -/** Possible values for system variable "innodb_cleaner_lsn_age_factor". */ -static const char* innodb_cleaner_lsn_age_factor_names[] = { - "LEGACY", - "HIGH_CHECKPOINT", - "DEPRECATED", - NullS -}; - -/** Enumeration for innodb_cleaner_lsn_age_factor. */ -static TYPELIB innodb_cleaner_lsn_age_factor_typelib = { - array_elements(innodb_cleaner_lsn_age_factor_names) - 1, - "innodb_cleaner_lsn_age_factor_typelib", - innodb_cleaner_lsn_age_factor_names, - NULL -}; - -/** Alternatives for srv_cleaner_lsn_age_factor, set through -innodb_cleaner_lsn_age_factor variable */ -enum srv_cleaner_lsn_age_factor_t { - SRV_CLEANER_LSN_AGE_FACTOR_LEGACY, /*!< Original Oracle MySQL 5.6 - formula */ - SRV_CLEANER_LSN_AGE_FACTOR_HIGH_CHECKPOINT, - /*!< Percona Server 5.6 formula - that returns lower values than - legacy option for low - checkpoint ages, and higher - values for high ages. This has - the effect of stabilizing the - checkpoint age higher. */ - SRV_CLEANER_LSN_AGE_FACTOR_DEPRECATED /*!< Deprecated, do not use */ -}; - -/** Alternatives for srv_foreground_preflush, set through -innodb_foreground_preflush variable */ -enum srv_foreground_preflush_t { - SRV_FOREGROUND_PREFLUSH_SYNC_PREFLUSH, /*!< Original Oracle MySQL 5.6 - behavior of performing a sync - flush list flush */ - SRV_FOREGROUND_PREFLUSH_EXP_BACKOFF, /*!< Exponential backoff wait - for the page cleaner to flush - for us */ - SRV_FOREGROUND_PREFLUSH_DEPRECATED /*!< Deprecated, do not use */ -}; - -/** Alternatives for srv_empty_free_list_algorithm, set through -innodb_empty_free_list_algorithm variable */ -enum srv_empty_free_list_t { - SRV_EMPTY_FREE_LIST_LEGACY, /*!< Original Oracle MySQL 5.6 - algorithm */ - SRV_EMPTY_FREE_LIST_BACKOFF, /*!< Percona Server 5.6 algorithm that - loops in a progressive backoff until a - free page is produced by the cleaner - thread */ - SRV_EMPTY_FREE_LIST_DEPRECATED /*!< Deprecated, do not use */ -}; - -#define SRV_CHECKSUM_ALGORITHM_DEPRECATED 6 - -static -void -set_cleaner_lsn_age_factor(THD* thd, st_mysql_sys_var*, void*, const void*) -{ - push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, - ER_WARN_DEPRECATED_SYNTAX, - innodb_deprecated_msg, - "innodb_cleaner_lsn_age_factor"); -} -static MYSQL_SYSVAR_ENUM(cleaner_lsn_age_factor, - srv_cleaner_lsn_age_factor, - PLUGIN_VAR_OPCMDARG, - "Deprecated and ignored; only exists to allow easier upgrade from " - "earlier XtraDB versions.", - NULL, &set_cleaner_lsn_age_factor, SRV_CLEANER_LSN_AGE_FACTOR_DEPRECATED, - &innodb_cleaner_lsn_age_factor_typelib); - -/* Added new default drepcated, 3 */ -const char *corrupt_table_action_names[]= -{ - "assert", /* 0 */ - "warn", /* 1 */ - "salvage", /* 2 */ - "deprecated", /* 3 */ - NullS -}; - -TYPELIB corrupt_table_action_typelib= -{ - array_elements(corrupt_table_action_names) - 1, "corrupt_table_action_typelib", - corrupt_table_action_names, NULL -}; - -static -void -set_corrupt_table_action(THD* thd, st_mysql_sys_var*, void*, const void*) -{ - push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, - ER_WARN_DEPRECATED_SYNTAX, - innodb_deprecated_msg, - "innodb_corrupt_table_action"); -} -static MYSQL_SYSVAR_ENUM(corrupt_table_action, srv_pass_corrupt_table, - 
PLUGIN_VAR_RQCMDARG, - "Deprecated and ignored; only exists to allow easier upgrade from " - "earlier XtraDB versions.", - NULL, &set_corrupt_table_action, 3, &corrupt_table_action_typelib); - -/* Added new default DEPRECATED */ -/** Possible values for system variable "innodb_empty_free_list_algorithm". */ -static const char* innodb_empty_free_list_algorithm_names[] = { - "LEGACY", - "BACKOFF", - "DEPRECATED", - NullS -}; - -/** Enumeration for innodb_empty_free_list_algorithm. */ -static TYPELIB innodb_empty_free_list_algorithm_typelib = { - array_elements(innodb_empty_free_list_algorithm_names) - 1, - "innodb_empty_free_list_algorithm_typelib", - innodb_empty_free_list_algorithm_names, - NULL -}; - -static -void -set_empty_free_list_algorithm(THD* thd, st_mysql_sys_var*, void*, const void*) -{ - push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, - ER_WARN_DEPRECATED_SYNTAX, - innodb_deprecated_msg, - "innodb_empty_free_list_algorithm"); -} -static MYSQL_SYSVAR_ENUM(empty_free_list_algorithm, - srv_empty_free_list_algorithm, - PLUGIN_VAR_OPCMDARG, - "Deprecated and ignored; only exists to allow easier upgrade from " - "earlier XtraDB versions.", - NULL, &set_empty_free_list_algorithm, SRV_EMPTY_FREE_LIST_DEPRECATED, - &innodb_empty_free_list_algorithm_typelib); - -static -void -set_fake_changes(THD* thd, st_mysql_sys_var*, void*, const void*) -{ - push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, - ER_WARN_DEPRECATED_SYNTAX, - innodb_deprecated_msg, - "innodb_fake_changes"); -} -static MYSQL_THDVAR_BOOL(fake_changes, PLUGIN_VAR_OPCMDARG, - "Deprecated and ignored; only exists to allow easier upgrade from " - "earlier XtraDB versions.", - NULL, &set_fake_changes, FALSE); - -/* Original default, min 4. */ -static MYSQL_SYSVAR_ULONG(file_io_threads, innobase_file_io_threads, - PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY | PLUGIN_VAR_NOSYSVAR, - "Deprecated and ignored; only exists to allow easier upgrade from " - "earlier XtraDB versions.", - NULL, NULL, 0, 0, 64, 0); - -/** Possible values for system variable "innodb_foreground_preflush". */ -static const char* innodb_foreground_preflush_names[] = { - "SYNC_PREFLUSH", - "EXPONENTIAL_BACKOFF", - "DEPRECATED", - NullS -}; - -/* Enumeration for innodb_foreground_preflush. */ -static TYPELIB innodb_foreground_preflush_typelib = { - array_elements(innodb_foreground_preflush_names) - 1, - "innodb_foreground_preflush_typelib", - innodb_foreground_preflush_names, - NULL -}; - -static -void -set_foreground_preflush(THD* thd, st_mysql_sys_var*, void*, const void*) -{ - push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, - ER_WARN_DEPRECATED_SYNTAX, - innodb_deprecated_msg, - "innodb_foreground_preflush"); -} -static MYSQL_SYSVAR_ENUM(foreground_preflush, srv_foreground_preflush, - PLUGIN_VAR_OPCMDARG, - "Deprecated and ignored; only exists to allow easier upgrade from " - "earlier XtraDB versions.", - NULL, &set_foreground_preflush, SRV_FOREGROUND_PREFLUSH_DEPRECATED, - &innodb_foreground_preflush_typelib); - -#ifdef EXTENDED_FOR_KILLIDLE -#define kill_idle_help_text "If non-zero value, the idle session with transaction which is idle over the value in seconds is killed by InnoDB." -#else -#define kill_idle_help_text "No effect for this build." 
-#endif -static -void -set_kill_idle_transaction(THD* thd, st_mysql_sys_var*, void*, const void*) -{ - push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, - ER_WARN_DEPRECATED_SYNTAX, - innodb_deprecated_msg, - "innodb_kill_idle_transaction"); -} -static MYSQL_SYSVAR_LONGLONG(kill_idle_transaction, srv_kill_idle_transaction, - PLUGIN_VAR_RQCMDARG, kill_idle_help_text, - NULL, &set_kill_idle_transaction, 0, 0, LONG_MAX, 0); - -static -void -set_locking_fake_changes(THD* thd, st_mysql_sys_var*, void*, const void*) -{ - push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, - ER_WARN_DEPRECATED_SYNTAX, - innodb_deprecated_msg, - "innodb_locking_fake_changes"); -} -/* Original default: TRUE */ -static MYSQL_SYSVAR_BOOL(locking_fake_changes, srv_fake_changes_locks, - PLUGIN_VAR_NOCMDARG, - "Deprecated and ignored; only exists to allow easier upgrade from " - "earlier XtraDB versions.", - NULL, &set_locking_fake_changes, FALSE); - -static MYSQL_SYSVAR_STR(log_arch_dir, innobase_log_arch_dir, - PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, - "Deprecated and ignored; only exists to allow easier upgrade from " - "earlier XtraDB versions.", - NULL, NULL, NULL); - -static -void -set_log_archive(THD* thd, st_mysql_sys_var*, void*, const void*) -{ - push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, - ER_WARN_DEPRECATED_SYNTAX, - innodb_deprecated_msg, - "innodb_log_archive"); -} -static MYSQL_SYSVAR_BOOL(log_archive, innobase_log_archive, - PLUGIN_VAR_OPCMDARG, - "Deprecated and ignored; only exists to allow easier upgrade from " - "earlier XtraDB versions.", - NULL, &set_log_archive, FALSE); - -static -void -set_log_arch_expire_sec(THD* thd, st_mysql_sys_var*, void*, const void*) -{ - push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, - ER_WARN_DEPRECATED_SYNTAX, - innodb_deprecated_msg, - "innodb_log_arch_expire_sec"); -} -static MYSQL_SYSVAR_ULONG(log_arch_expire_sec, - srv_log_arch_expire_sec, PLUGIN_VAR_OPCMDARG, - "Deprecated and ignored; only exists to allow easier upgrade from " - "earlier XtraDB versions.", - NULL, &set_log_arch_expire_sec, 0, 0, ~0UL, 0); - -/* Original default, min 512 */ -static MYSQL_SYSVAR_ULONG(log_block_size, innobase_log_block_size, - PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, - "Deprecated and ignored; only exists to allow easier upgrade from " - "earlier XtraDB versions.", - NULL, NULL, 0, 0, - (1 << UNIV_PAGE_SIZE_SHIFT_MAX), 0); - -/* Added new default deprecated */ -/** Possible values for system variables "innodb_checksum_algorithm" and -"innodb_log_checksum_algorithm". */ -static const char* innodb_checksum_algorithm_names2[] = { - "CRC32", - "STRICT_CRC32", - "INNODB", - "STRICT_INNODB", - "NONE", - "STRICT_NONE", - "DEPRECATED", - NullS -}; - -/** Used to define an enumerate type of the system variables -innodb_checksum_algorithm and innodb_log_checksum_algorithm. 
*/ -static TYPELIB innodb_checksum_algorithm_typelib2 = { - array_elements(innodb_checksum_algorithm_names2) - 1, - "innodb_checksum_algorithm_typelib2", - innodb_checksum_algorithm_names2, - NULL -}; -static -void -set_log_checksum_algorithm(THD* thd, st_mysql_sys_var*, void*, const void* save) -{ - push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, - ER_WARN_DEPRECATED_SYNTAX, - innodb_deprecated_msg, - "innodb_log_checksum_algorithm"); -} -static MYSQL_SYSVAR_ENUM(log_checksum_algorithm, srv_log_checksum_algorithm, - PLUGIN_VAR_RQCMDARG, - "Deprecated and translated to innodb_log_checksums (NONE to OFF, " - "everything else to ON); only exists to allow easier upgrade from " - "earlier XtraDB versions.", - NULL, &set_log_checksum_algorithm, SRV_CHECKSUM_ALGORITHM_DEPRECATED, - &innodb_checksum_algorithm_typelib2); - -static -void -set_max_bitmap_file_size(THD* thd, st_mysql_sys_var*, void*, const void*) -{ - push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, - ER_WARN_DEPRECATED_SYNTAX, - innodb_deprecated_msg, - "innodb_max_bitmap_file_size"); -} -/* Original default 100M, min 4K */ -static MYSQL_SYSVAR_ULONGLONG(max_bitmap_file_size, srv_max_bitmap_file_size, - PLUGIN_VAR_RQCMDARG, - "Deprecated and ignored; only exists to allow easier upgrade from " - "earlier XtraDB versions.", - NULL, &set_max_bitmap_file_size, 0, 0, ULONGLONG_MAX, 0); - -static -void -set_max_changed_pages(THD* thd, st_mysql_sys_var*, void*, const void*) -{ - push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, - ER_WARN_DEPRECATED_SYNTAX, - innodb_deprecated_msg, - "innodb_max_changed_pages"); -} -/* Original default 1000000 */ -static MYSQL_SYSVAR_ULONGLONG(max_changed_pages, srv_max_changed_pages, - PLUGIN_VAR_RQCMDARG, - "Deprecated and ignored; only exists to allow easier upgrade from " - "earlier XtraDB versions.", - NULL, &set_max_changed_pages, 0, 0, ~0ULL, 0); - -/* Note that the default and minimum values are set to 0 to -detect if the option is passed and print deprecation message */ -static MYSQL_SYSVAR_ULONG(mirrored_log_groups, innobase_mirrored_log_groups, - PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, - "Deprecated and ignored; only exists to allow easier upgrade from " - "earlier XtraDB versions.", - NULL, NULL, 0, 0, 10, 0); - -#ifdef UNIV_LINUX - -static -void -set_sched_priority_cleaner(THD* thd, st_mysql_sys_var*, void*, const void*) -{ - push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, - ER_WARN_DEPRECATED_SYNTAX, - innodb_deprecated_msg, - "innodb_sched_priority_cleaner"); -} -/* Original default 19 */ -static MYSQL_SYSVAR_ULONG(sched_priority_cleaner, srv_sched_priority_cleaner, - PLUGIN_VAR_RQCMDARG, - "Deprecated and ignored; only exists to allow easier upgrade from " - "earlier XtraDB versions.", - NULL, &set_sched_priority_cleaner, 0, 0, 39, 0); - -#if defined UNIV_DEBUG || defined UNIV_PERF_DEBUG -static -void -set_priority_cleaner(THD* thd, st_mysql_sys_var*, void*, const void*) -{ - push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, - ER_WARN_DEPRECATED_SYNTAX, - innodb_deprecated_msg, - "innodb_priority_cleaner"); -} -static MYSQL_SYSVAR_BOOL(priority_cleaner, srv_cleaner_thread_priority, - PLUGIN_VAR_OPCMDARG, - "Deprecated and ignored; only exists to allow easier upgrade from " - "earlier XtraDB versions.", - NULL, &set_priority_cleaner, FALSE); - -static -void -set_priority_io(THD* thd, st_mysql_sys_var*, void*, const void*) -{ - push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, - ER_WARN_DEPRECATED_SYNTAX, - innodb_deprecated_msg, - 
"innodb_priority_io"); -} -static MYSQL_SYSVAR_BOOL(priority_io, srv_io_thread_priority, - PLUGIN_VAR_OPCMDARG, - "Deprecated and ignored; only exists to allow easier upgrade from " - "earlier XtraDB versions.", - NULL, &set_priority_io, FALSE); - -static -void -set_priority_master(THD* thd, st_mysql_sys_var*, void*, const void*) -{ - push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, - ER_WARN_DEPRECATED_SYNTAX, - innodb_deprecated_msg, - "innodb_priority_master"); -} -static MYSQL_SYSVAR_BOOL(priority_master, srv_master_thread_priority, - PLUGIN_VAR_OPCMDARG, - "Deprecated and ignored; only exists to allow easier upgrade from " - "earlier XtraDB versions.", - NULL, &set_priority_master, FALSE); - -static -void -set_priority_purge(THD* thd, st_mysql_sys_var*, void*, const void*) -{ - push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, - ER_WARN_DEPRECATED_SYNTAX, - innodb_deprecated_msg, - "innodb_priority_purge"); -} -static MYSQL_SYSVAR_BOOL(priority_purge, srv_purge_thread_priority, - PLUGIN_VAR_OPCMDARG, - "Deprecated and ignored; only exists to allow easier upgrade from " - "earlier XtraDB versions.", - NULL, &set_priority_purge, FALSE); - -static -void -set_sched_priority_io(THD* thd, st_mysql_sys_var*, void*, const void*) -{ - push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, - ER_WARN_DEPRECATED_SYNTAX, - innodb_deprecated_msg, - "innodb_sched_priority_io"); -} -/* Original default 19 */ -static MYSQL_SYSVAR_ULONG(sched_priority_io, srv_sched_priority_io, - PLUGIN_VAR_RQCMDARG, - "Deprecated and ignored; only exists to allow easier upgrade from " - "earlier XtraDB versions.", - NULL, &set_sched_priority_io, 0, 0, 39, 0); - -static -void -set_sched_priority_master(THD* thd, st_mysql_sys_var*, void*, const void*) -{ - push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, - ER_WARN_DEPRECATED_SYNTAX, - innodb_deprecated_msg, - "innodb_sched_priority_master"); -} -/* Original default 19 */ -static MYSQL_SYSVAR_ULONG(sched_priority_master, srv_sched_priority_master, - PLUGIN_VAR_RQCMDARG, - "Deprecated and ignored; only exists to allow easier upgrade from " - "earlier XtraDB versions.", - NULL, &set_sched_priority_master, 0, 0, 39, 0); - -static -void -set_sched_priority_purge(THD* thd, st_mysql_sys_var*, void*, const void*) -{ - push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, - ER_WARN_DEPRECATED_SYNTAX, - innodb_deprecated_msg, - "innodb_sched_priority_purge"); -} -/* Original default 19 */ -static MYSQL_SYSVAR_ULONG(sched_priority_purge, srv_sched_priority_purge, - PLUGIN_VAR_RQCMDARG, - "Deprecated and ignored; only exists to allow easier upgrade from " - "earlier XtraDB versions.", - NULL, &set_sched_priority_purge, 0, 0, 39, 0); -#endif /* UNIV_DEBUG || UNIV_PERF_DEBUG */ -#endif /* UNIV_LINUX */ - -static -void -set_show_locks_held(THD* thd, st_mysql_sys_var*, void*, const void*) -{ - push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, - ER_WARN_DEPRECATED_SYNTAX, - innodb_deprecated_msg, - "innodb_show_locks_held"); -} -/* TODO: Implement */ -static MYSQL_SYSVAR_ULONG(show_locks_held, srv_show_locks_held, - PLUGIN_VAR_RQCMDARG, - "Deprecated and ignored; only exists to allow easier upgrade from " - "earlier XtraDB versions.", - NULL, &set_show_locks_held, 0, 0, 1000, 0); - -static -void -set_show_verbose_locks(THD* thd, st_mysql_sys_var*, void*, const void*) -{ - push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, - ER_WARN_DEPRECATED_SYNTAX, - innodb_deprecated_msg, - "innodb_show_verbose_locks"); -} -/* TODO: Implement */ -static 
MYSQL_SYSVAR_ULONG(show_verbose_locks, srv_show_verbose_locks, - PLUGIN_VAR_RQCMDARG, - "Deprecated and ignored; only exists to allow easier upgrade from " - "earlier XtraDB versions.", - NULL, &set_show_verbose_locks, 0, 0, 1, 0); - -static MYSQL_SYSVAR_BOOL(track_changed_pages, srv_track_changed_pages, - PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, - "Deprecated and ignored; only exists to allow easier upgrade from " - "earlier XtraDB versions.", - NULL, NULL, FALSE); - -static -void -set_track_redo_log_now(THD* thd, st_mysql_sys_var*, void*, const void*) -{ - push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, - ER_WARN_DEPRECATED_SYNTAX, - innodb_deprecated_msg, - "innodb_track_redo_log_now"); -} -static MYSQL_SYSVAR_BOOL(track_redo_log_now, - innodb_track_redo_log_now, - PLUGIN_VAR_OPCMDARG, - "Deprecated and ignored; only exists to allow easier upgrade from " - "earlier XtraDB versions.", - NULL, &set_track_redo_log_now, FALSE); - -static -void -set_use_global_flush_log_at_trx_commit(THD* thd, st_mysql_sys_var*, void*, const void*) -{ - push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, - ER_WARN_DEPRECATED_SYNTAX, - innodb_deprecated_msg, - "innodb_use_global_flush_log_at_trx_commit"); -} -static MYSQL_SYSVAR_BOOL(use_global_flush_log_at_trx_commit, srv_use_global_flush_log_at_trx_commit, - PLUGIN_VAR_NOCMDARG, - "Deprecated and ignored; only exists to allow easier upgrade from " - "earlier XtraDB versions.", - NULL, &set_use_global_flush_log_at_trx_commit, FALSE); - -static MYSQL_SYSVAR_BOOL(use_stacktrace, srv_use_stacktrace, - PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, - "Deprecated and ignored; only exists to allow easier upgrade from " - "earlier XtraDB versions.", - NULL, NULL, FALSE); - -/** Print deprecation message for a given system variable. -@param[in] param System parameter name */ -static -void -innodb_print_deprecation(const char* param) -{ - ib::warn() << "Using " << param << " is deprecated and the" - " parameter may be removed in future releases." - " Ignoning the parameter."; -} - -/** Check if user has used xtradb extended system variable that -is not currently supported by innodb or marked as deprecated. 
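Every stub in this deleted header follows one shape: a throwaway storage variable, an update callback whose only effect is pushing ER_WARN_DEPRECATED_SYNTAX to the client, and the innodb_check_deprecated() pass below that logs a warning at startup if the option was given. A hedged sketch of that compatibility-stub pattern, using an invented option name and assuming the usual ha_innodb.cc includes:

/* Illustrative deprecation stub; example_compat_option is not a real option. */
static ulong example_compat_option;	/* value is accepted but never applied */

static void set_example_compat_option(THD* thd, st_mysql_sys_var*,
				       void*, const void*)
{
	push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
			    ER_WARN_DEPRECATED_SYNTAX,
			    "Using %s is deprecated and the parameter may be"
			    " removed in future releases."
			    " Ignoring the parameter.",
			    "example_compat_option");
}

static MYSQL_SYSVAR_ULONG(example_compat_option, example_compat_option,
	PLUGIN_VAR_RQCMDARG,
	"Deprecated and ignored; only exists to allow easier upgrade.",
	NULL, set_example_compat_option, 0, 0, ~0UL, 0);

Because the update callback never touches var_ptr, SET GLOBAL succeeds, warns, and changes nothing — which is the whole point of these upgrade-compatibility variables.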
*/ -static -void -innodb_check_deprecated(void) -{ - if (innodb_buffer_pool_populate) { - innodb_print_deprecation("innodb-buffer-pool-populate"); - } - -#if defined UNIV_DEBUG || defined UNIV_PERF_DEBUG - if (srv_cleaner_max_lru_time) { - innodb_print_deprecation("innodb-cleaner-max-lru-time"); - } - - if (srv_cleaner_max_flush_time) { - innodb_print_deprecation("innodb-cleaner-max-flush-time"); - } - - if (srv_cleaner_flush_chunk_size) { - innodb_print_deprecation("innodb-cleaner-flush-chunk-size"); - } - - if (srv_cleaner_lru_chunk_size) { - innodb_print_deprecation("innodb-cleaner-lru_chunk_size"); - } - if (srv_cleaner_free_list_lwm) { - innodb_print_deprecation("innodb-cleaner-free-list-lwm"); - } - - if (srv_cleaner_eviction_factor) { - innodb_print_deprecation("innodb-cleaner-eviction-factor"); - } - -#endif /* defined UNIV_DEBUG || defined UNIV_PERF_DEBUG */ - - if (srv_cleaner_lsn_age_factor != SRV_CLEANER_LSN_AGE_FACTOR_DEPRECATED) { - innodb_print_deprecation("innodb-cleaner-lsn-age-factor"); - } - - if (srv_pass_corrupt_table != 3) { - innodb_print_deprecation("innodb-pass-corrupt-table"); - } - - if (srv_empty_free_list_algorithm != SRV_EMPTY_FREE_LIST_DEPRECATED) { - innodb_print_deprecation("innodb-empty-free-list-algorithm"); - } - - if (THDVAR((THD*) NULL, fake_changes)) { - innodb_print_deprecation("innodb-fake-changes"); - } - - if (innobase_file_io_threads) { - innodb_print_deprecation("innodb-file-io-threads"); - } - - if (srv_foreground_preflush != SRV_FOREGROUND_PREFLUSH_DEPRECATED) { - innodb_print_deprecation("innodb-foreground-preflush"); - } - - if (srv_kill_idle_transaction != 0) { - innodb_print_deprecation("innodb-kill-idle-transaction"); - } - - if (srv_fake_changes_locks) { - innodb_print_deprecation("innodb-fake-changes-locks"); - } - - if (innobase_log_arch_dir) { - innodb_print_deprecation("innodb-log-arch-dir"); - } - - if (innobase_log_archive) { - innodb_print_deprecation("innodb-log-archive"); - } - - if (srv_log_arch_expire_sec) { - innodb_print_deprecation("innodb-log-arch-expire-sec"); - } - - if (innobase_log_block_size) { - innodb_print_deprecation("innodb-log-block-size"); - } - - if (srv_log_checksum_algorithm != SRV_CHECKSUM_ALGORITHM_DEPRECATED) { - innodb_print_deprecation("innodb-log-checksum-algorithm"); - } - - if (srv_max_changed_pages) { - innodb_print_deprecation("innodb-max-changed-pages"); - } - - if (innobase_mirrored_log_groups) { - innodb_print_deprecation("innodb-mirrored-log-groups"); - } - -#ifdef UNIV_LINUX - if (srv_sched_priority_cleaner) { - innodb_print_deprecation("innodb-sched-priority-cleaner"); - } - -#if defined UNIV_DEBUG || defined UNIV_PERF_DEBUG - if (srv_cleaner_thread_priority) { - innodb_print_deprecation("innodb-cleaner-thread-priority"); - } - - if (srv_io_thread_priority) { - innodb_print_deprecation("innodb-io-thread-priority"); - } - - if (srv_master_thread_priority) { - innodb_print_deprecation("inodb-master-thread-priority"); - } - - if (srv_purge_thread_priority) { - innodb_print_deprecation("inodb-purge-thread-priority"); - } - - if (srv_sched_priority_io) { - innodb_print_deprecation("innodb-sched-priority-io"); - } - - if (srv_sched_priority_master) { - innodb_print_deprecation("innodb-sched-priority-master"); - } - - if (srv_sched_priority_purge) { - innodb_print_deprecation("innodb-sched-priority-purge"); - } -#endif /* defined UNIV_DEBUG || defined UNIV_PERF_DEBUG */ -#endif /* UNIV_LINUX */ - - if (srv_track_changed_pages) { - innodb_print_deprecation("innodb-track-changed-pages"); - } - - if 
(innodb_track_redo_log_now) { - innodb_print_deprecation("innodb-track-redo-log-now"); - } - - if (srv_use_global_flush_log_at_trx_commit) { - innodb_print_deprecation("innodb-use-global-flush-log-at-trx-commit"); - } - - if (srv_use_stacktrace) { - innodb_print_deprecation("innodb-use-stacktrace"); - } - - if (srv_max_bitmap_file_size) { - innodb_print_deprecation("innodb-max-bitmap-file-size"); - } - - if (srv_show_locks_held) { - innodb_print_deprecation("innodb-show-locks-held"); - } - - if (srv_show_verbose_locks) { - innodb_print_deprecation("innodb-show-verbose-locks"); - } -} - -#endif /* HA_XTRADB_H */ - -#ifdef HA_XTRADB_SYSVARS - /* XtraDB compatibility system variables */ -#ifdef BTR_CUR_HASH_ADAPT - MYSQL_SYSVAR(adaptive_hash_index_partitions), -#endif /* BTR_CUR_HASH_ADAPT */ - MYSQL_SYSVAR(buffer_pool_populate), -#if defined UNIV_DEBUG || defined UNIV_PERF_DEBUG - MYSQL_SYSVAR(cleaner_eviction_factor), - MYSQL_SYSVAR(cleaner_flush_chunk_size), - MYSQL_SYSVAR(cleaner_free_list_lwm), - MYSQL_SYSVAR(cleaner_lru_chunk_size), - MYSQL_SYSVAR(cleaner_max_lru_time), - MYSQL_SYSVAR(cleaner_max_flush_time), -#endif /* defined UNIV_DEBUG || defined UNIV_PERF_DEBUG */ - MYSQL_SYSVAR(cleaner_lsn_age_factor), - MYSQL_SYSVAR(corrupt_table_action), - MYSQL_SYSVAR(empty_free_list_algorithm), - MYSQL_SYSVAR(fake_changes), - MYSQL_SYSVAR(file_io_threads), - MYSQL_SYSVAR(foreground_preflush), - MYSQL_SYSVAR(kill_idle_transaction), - MYSQL_SYSVAR(locking_fake_changes), - MYSQL_SYSVAR(log_arch_dir), - MYSQL_SYSVAR(log_archive), - MYSQL_SYSVAR(log_arch_expire_sec), - MYSQL_SYSVAR(log_block_size), - MYSQL_SYSVAR(log_checksum_algorithm), - MYSQL_SYSVAR(max_bitmap_file_size), - MYSQL_SYSVAR(max_changed_pages), - MYSQL_SYSVAR(mirrored_log_groups), -#ifdef UNIV_LINUX - MYSQL_SYSVAR(sched_priority_cleaner), -#endif -#if defined UNIV_DEBUG || defined UNIV_PERF_DEBUG -#ifdef UNIV_LINUX - MYSQL_SYSVAR(priority_cleaner), - MYSQL_SYSVAR(priority_io), - MYSQL_SYSVAR(priority_master), - MYSQL_SYSVAR(priority_purge), - MYSQL_SYSVAR(sched_priority_io), - MYSQL_SYSVAR(sched_priority_master), - MYSQL_SYSVAR(sched_priority_purge), -#endif /* UNIV_LINUX */ -#endif /* defined UNIV_DEBUG || defined UNIV_PERF_DEBUG */ - MYSQL_SYSVAR(show_locks_held), - MYSQL_SYSVAR(show_verbose_locks), - MYSQL_SYSVAR(track_changed_pages), - MYSQL_SYSVAR(track_redo_log_now), - MYSQL_SYSVAR(use_global_flush_log_at_trx_commit), - MYSQL_SYSVAR(use_stacktrace), - -#endif /* HA_XTRADB_SYSVARS */ diff --git a/storage/innobase/handler/handler0alter.cc b/storage/innobase/handler/handler0alter.cc index 3ac4f4fe4c1..0e17488e8aa 100644 --- a/storage/innobase/handler/handler0alter.cc +++ b/storage/innobase/handler/handler0alter.cc @@ -42,10 +42,14 @@ Smart ALTER TABLE #include "rem0types.h" #include "row0log.h" #include "row0merge.h" +#include "row0ins.h" +#include "row0row.h" +#include "row0upd.h" #include "trx0trx.h" #include "trx0roll.h" #include "handler0alter.h" #include "srv0mon.h" +#include "srv0srv.h" #include "fts0priv.h" #include "fts0plugin.h" #include "pars0pars.h" @@ -55,68 +59,94 @@ Smart ALTER TABLE #include "span.h" using st_::span; +/** File format constraint for ALTER TABLE */ +extern ulong innodb_instant_alter_column_allowed; static const char *MSG_UNSUPPORTED_ALTER_ONLINE_ON_VIRTUAL_COLUMN= "INPLACE ADD or DROP of virtual columns cannot be " "combined with other ALTER TABLE actions"; /** Operations for creating secondary indexes (no rebuild needed) */ -static const Alter_inplace_info::HA_ALTER_FLAGS INNOBASE_ONLINE_CREATE - = 
Alter_inplace_info::ADD_INDEX - | Alter_inplace_info::ADD_UNIQUE_INDEX; +static const alter_table_operations INNOBASE_ONLINE_CREATE + = ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX + | ALTER_ADD_UNIQUE_INDEX; + +/** Operations that require filling in default values for columns */ +static const alter_table_operations INNOBASE_DEFAULTS + = ALTER_COLUMN_NOT_NULLABLE + | ALTER_ADD_STORED_BASE_COLUMN; + + +/** Operations that require knowledge about row_start, row_end values */ +static const alter_table_operations INNOBASE_ALTER_VERSIONED_REBUILD + = ALTER_ADD_SYSTEM_VERSIONING + | ALTER_DROP_SYSTEM_VERSIONING; /** Operations for rebuilding a table in place */ -static const Alter_inplace_info::HA_ALTER_FLAGS INNOBASE_ALTER_REBUILD - = Alter_inplace_info::ADD_PK_INDEX - | Alter_inplace_info::DROP_PK_INDEX - | Alter_inplace_info::CHANGE_CREATE_OPTION - /* CHANGE_CREATE_OPTION needs to check innobase_need_rebuild() */ - | Alter_inplace_info::ALTER_COLUMN_NULLABLE - | Alter_inplace_info::ALTER_COLUMN_NOT_NULLABLE - | Alter_inplace_info::ALTER_STORED_COLUMN_ORDER - | Alter_inplace_info::DROP_STORED_COLUMN - | Alter_inplace_info::ADD_STORED_BASE_COLUMN - | Alter_inplace_info::RECREATE_TABLE +static const alter_table_operations INNOBASE_ALTER_REBUILD + = ALTER_ADD_PK_INDEX + | ALTER_DROP_PK_INDEX + | ALTER_OPTIONS + /* ALTER_OPTIONS needs to check alter_options_need_rebuild() */ + | ALTER_COLUMN_NULLABLE + | INNOBASE_DEFAULTS + | ALTER_STORED_COLUMN_ORDER + | ALTER_DROP_STORED_COLUMN + | ALTER_RECREATE_TABLE /* - | Alter_inplace_info::ALTER_STORED_COLUMN_TYPE + | ALTER_STORED_COLUMN_TYPE */ + | INNOBASE_ALTER_VERSIONED_REBUILD ; /** Operations that require changes to data */ -static const Alter_inplace_info::HA_ALTER_FLAGS INNOBASE_ALTER_DATA +static const alter_table_operations INNOBASE_ALTER_DATA = INNOBASE_ONLINE_CREATE | INNOBASE_ALTER_REBUILD; /** Operations for altering a table that InnoDB does not care about */ -static const Alter_inplace_info::HA_ALTER_FLAGS INNOBASE_INPLACE_IGNORE - = Alter_inplace_info::ALTER_COLUMN_DEFAULT - | Alter_inplace_info::ALTER_PARTITIONED - | Alter_inplace_info::ALTER_COLUMN_COLUMN_FORMAT - | Alter_inplace_info::ALTER_COLUMN_STORAGE_TYPE - | Alter_inplace_info::ALTER_VIRTUAL_GCOL_EXPR - | Alter_inplace_info::ALTER_RENAME - | Alter_inplace_info::ALTER_COLUMN_INDEX_LENGTH; +static const alter_table_operations INNOBASE_INPLACE_IGNORE + = ALTER_COLUMN_DEFAULT + | ALTER_PARTITIONED + | ALTER_COLUMN_COLUMN_FORMAT + | ALTER_COLUMN_STORAGE_TYPE + | ALTER_VIRTUAL_GCOL_EXPR + | ALTER_DROP_CHECK_CONSTRAINT + | ALTER_RENAME + | ALTER_COLUMN_INDEX_LENGTH; /** Operations on foreign key definitions (changing the schema only) */ -static const Alter_inplace_info::HA_ALTER_FLAGS INNOBASE_FOREIGN_OPERATIONS - = Alter_inplace_info::DROP_FOREIGN_KEY - | Alter_inplace_info::ADD_FOREIGN_KEY; +static const alter_table_operations INNOBASE_FOREIGN_OPERATIONS + = ALTER_DROP_FOREIGN_KEY + | ALTER_ADD_FOREIGN_KEY; + +/** Operations that InnoDB cares about and can perform without creating data */ +static const alter_table_operations INNOBASE_ALTER_NOCREATE + = ALTER_DROP_NON_UNIQUE_NON_PRIM_INDEX + | ALTER_DROP_UNIQUE_INDEX; /** Operations that InnoDB cares about and can perform without validation */ -static const Alter_inplace_info::HA_ALTER_FLAGS INNOBASE_ALTER_NOVALIDATE - = INNOBASE_FOREIGN_OPERATIONS - | Alter_inplace_info::DROP_INDEX - | Alter_inplace_info::DROP_UNIQUE_INDEX - | Alter_inplace_info::ALTER_COLUMN_NAME - //| Alter_inplace_info::ALTER_INDEX_COMMENT - | 
Alter_inplace_info::DROP_VIRTUAL_COLUMN - | Alter_inplace_info::ALTER_VIRTUAL_COLUMN_ORDER; +static const alter_table_operations INNOBASE_ALTER_NOVALIDATE + = INNOBASE_ALTER_NOCREATE + | ALTER_VIRTUAL_COLUMN_ORDER + | ALTER_COLUMN_NAME + | INNOBASE_FOREIGN_OPERATIONS + | ALTER_COLUMN_UNVERSIONED + | ALTER_DROP_VIRTUAL_COLUMN; /** Operations that InnoDB cares about and can perform without rebuild */ -static const Alter_inplace_info::HA_ALTER_FLAGS INNOBASE_ALTER_NOREBUILD - = INNOBASE_ALTER_NOVALIDATE - | INNOBASE_ONLINE_CREATE - | Alter_inplace_info::ALTER_COLUMN_EQUAL_PACK_LENGTH - | Alter_inplace_info::ADD_VIRTUAL_COLUMN; +static const alter_table_operations INNOBASE_ALTER_NOREBUILD + = INNOBASE_ONLINE_CREATE + | INNOBASE_ALTER_NOCREATE; + +/** Operations that can be performed instantly, without inplace_alter_table() */ +static const alter_table_operations INNOBASE_ALTER_INSTANT + = ALTER_VIRTUAL_COLUMN_ORDER + | ALTER_COLUMN_NAME + | ALTER_ADD_VIRTUAL_COLUMN + | INNOBASE_FOREIGN_OPERATIONS + | ALTER_COLUMN_EQUAL_PACK_LENGTH + | ALTER_COLUMN_UNVERSIONED + | ALTER_DROP_VIRTUAL_COLUMN; struct ha_innobase_inplace_ctx : public inplace_alter_handler_ctx { @@ -156,14 +186,16 @@ struct ha_innobase_inplace_ctx : public inplace_alter_handler_ctx dict_table_t* old_table; /** table where the indexes are being created or dropped */ dict_table_t* new_table; + /** table definition for instant ADD COLUMN */ + dict_table_t* instant_table; /** mapping of old column numbers to new ones, or NULL */ const ulint* col_map; /** new column names, or NULL if nothing was renamed */ const char** col_names; /** added AUTO_INCREMENT column position, or ULINT_UNDEFINED */ const ulint add_autoinc; - /** default values of ADD COLUMN, or NULL */ - const dtuple_t* add_cols; + /** default values of ADD and CHANGE COLUMN, or NULL */ + const dtuple_t* defaults; /** autoinc sequence to use */ ib_sequence_t sequence; /** temporary table name to use for old table when renaming tables */ @@ -182,6 +214,22 @@ struct ha_innobase_inplace_ctx : public inplace_alter_handler_ctx const char** drop_vcol_name; /** ALTER TABLE stage progress recorder */ ut_stage_alter_t* m_stage; + /** original number of user columns in the table */ + const unsigned old_n_cols; + /** original columns of the table */ + dict_col_t* const old_cols; + /** original column names of the table */ + const char* const old_col_names; + + /** Allow non-null conversion. + (1) Alter ignore should allow the conversion + irrespective of sql mode. + (2) Don't allow the conversion in strict mode + (3) Allow the conversion only in non-strict mode. 
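The alter_table_operations masks introduced at the top of this hunk (INNOBASE_INPLACE_IGNORE, INNOBASE_ALTER_INSTANT, INNOBASE_ALTER_NOREBUILD, INNOBASE_ALTER_REBUILD) are meant to be consulted as bitmaps against ha_alter_info->handler_flags, much as the removed innobase_need_rebuild() code already masked with ~INNOBASE_INPLACE_IGNORE. A simplified, hedged sketch of that kind of subset test — not the actual check_if_supported_inplace_alter() logic:

/* Illustrative classification of a flag bitmap against masks like the above.
   alter_ops_t is a local stand-in so that the sketch is self-contained. */
typedef unsigned long long alter_ops_t;

enum alter_class {
	ALTER_CLASS_INSTANT,	/* no inplace_alter_table() call needed */
	ALTER_CLASS_NOREBUILD,	/* in-place work, but no table rebuild */
	ALTER_CLASS_REBUILD	/* the table must be rebuilt (copied) */
};

static alter_class classify_alter(alter_ops_t flags,
				  alter_ops_t ignore_mask,
				  alter_ops_t instant_mask,
				  alter_ops_t norebuild_mask)
{
	flags &= ~ignore_mask;			/* drop operations InnoDB ignores */
	if (!(flags & ~instant_mask)) {
		return ALTER_CLASS_INSTANT;	/* every remaining bit is instant-safe */
	}
	if (!(flags & ~norebuild_mask)) {
		return ALTER_CLASS_NOREBUILD;	/* nothing forces copying rows */
	}
	return ALTER_CLASS_REBUILD;
}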
*/ + const bool allow_not_null; + + /** The page_compression_level attribute, or 0 */ + const uint page_compression_level; ha_innobase_inplace_ctx(row_prebuilt_t*& prebuilt_arg, dict_index_t** drop_arg, @@ -199,7 +247,9 @@ struct ha_innobase_inplace_ctx : public inplace_alter_handler_ctx ulint add_autoinc_arg, ulonglong autoinc_col_min_value_arg, ulonglong autoinc_col_max_value_arg, - ulint num_to_drop_vcol_arg) : + bool allow_not_null_flag, + bool page_compressed, + ulonglong page_compression_level_arg) : inplace_alter_handler_ctx(), prebuilt (prebuilt_arg), add_index (0), add_key_numbers (0), num_to_add_index (0), @@ -209,10 +259,10 @@ struct ha_innobase_inplace_ctx : public inplace_alter_handler_ctx add_fk (add_fk_arg), num_to_add_fk (num_to_add_fk_arg), online (online_arg), heap (heap_arg), trx (0), old_table (prebuilt_arg->table), - new_table (new_table_arg), + new_table (new_table_arg), instant_table (0), col_map (0), col_names (col_names_arg), add_autoinc (add_autoinc_arg), - add_cols (0), + defaults (0), sequence(prebuilt->trx->mysql_thd, autoinc_col_min_value_arg, autoinc_col_max_value_arg), tmp_name (0), @@ -223,8 +273,19 @@ struct ha_innobase_inplace_ctx : public inplace_alter_handler_ctx num_to_drop_vcol(0), drop_vcol(0), drop_vcol_name(0), - m_stage(NULL) + m_stage(NULL), + old_n_cols(prebuilt_arg->table->n_cols), + old_cols(prebuilt_arg->table->cols), + old_col_names(prebuilt_arg->table->col_names), + allow_not_null(allow_not_null_flag), + page_compression_level(page_compressed + ? (page_compression_level_arg + ? uint(page_compression_level_arg) + : page_zip_level) + : 0) { + ut_ad(old_n_cols >= DATA_N_SYS_COLS); + ut_ad(page_compression_level <= 9); #ifdef UNIV_DEBUG for (ulint i = 0; i < num_to_add_index; i++) { ut_ad(!add_index[i]->to_be_dropped); @@ -241,6 +302,19 @@ struct ha_innobase_inplace_ctx : public inplace_alter_handler_ctx ~ha_innobase_inplace_ctx() { UT_DELETE(m_stage); + if (instant_table) { + ut_ad(!instant_table->id); + while (dict_index_t* index + = UT_LIST_GET_LAST(instant_table->indexes)) { + UT_LIST_REMOVE(instant_table->indexes, index); + rw_lock_free(&index->lock); + dict_mem_index_free(index); + } + if (instant_table->fts) { + fts_free(instant_table); + } + dict_mem_table_free(instant_table); + } mem_heap_free(heap); } @@ -248,6 +322,36 @@ struct ha_innobase_inplace_ctx : public inplace_alter_handler_ctx @return whether the table will be rebuilt */ bool need_rebuild () const { return(old_table != new_table); } + /** Convert table-rebuilding ALTER to instant ALTER. */ + void prepare_instant() + { + DBUG_ASSERT(need_rebuild()); + DBUG_ASSERT(!is_instant()); + DBUG_ASSERT(old_table->n_cols == old_table->n_def); + DBUG_ASSERT(new_table->n_cols == new_table->n_def); + DBUG_ASSERT(old_table->n_cols == old_n_cols); + DBUG_ASSERT(new_table->n_cols > old_table->n_cols); + instant_table = new_table; + + new_table = old_table; + export_vars.innodb_instant_alter_column++; + } + + /** Revert prepare_instant() if the transaction is rolled back. */ + void rollback_instant() + { + if (!is_instant()) return; + old_table->rollback_instant(old_n_cols, + old_cols, old_col_names); + } + + /** @return whether this is instant ALTER TABLE */ + bool is_instant() const + { + DBUG_ASSERT(!instant_table || !instant_table->can_be_evicted); + return instant_table; + } + /** Share context between partitions. 
@param[in] ctx context from another partition of the table */ void set_shared_data(const inplace_alter_handler_ctx& ctx) @@ -295,10 +399,8 @@ struct ha_innobase_inplace_ctx : public inplace_alter_handler_ctx if (!is_new_vcol(*vcol)) continue; - dict_v_col_t *drop_vcol= index->new_vcol_info-> - add_drop_v_col(index->heap, vcol, --n_drop_new_vcol); - /* Re-assign the index field with newly stored virtual column */ - index->fields[i].col= reinterpret_cast<dict_col_t*>(drop_vcol); + index->fields[i].col= &index->new_vcol_info-> + add_drop_v_col(index->heap, vcol, --n_drop_new_vcol)->m_col; } } } @@ -410,20 +512,18 @@ my_error_innodb( /** Determine if fulltext indexes exist in a given table. @param table MySQL table -@return whether fulltext indexes exist on the table */ -static -bool -innobase_fulltext_exist( -/*====================*/ - const TABLE* table) +@return number of fulltext indexes */ +static uint innobase_fulltext_exist(const TABLE* table) { + uint count = 0; + for (uint i = 0; i < table->s->keys; i++) { if (table->key_info[i].flags & HA_FULLTEXT) { - return(true); + count++; } } - return(false); + return count; } /** Determine whether indexed virtual columns exist in a table. @@ -464,44 +564,62 @@ innobase_spatial_exist( return(false); } -/** Determine if ALTER TABLE needs to rebuild the table. -@param ha_alter_info the DDL operation -@param table metadata before ALTER TABLE -@return whether it is necessary to rebuild the table */ -static MY_ATTRIBUTE((nonnull, warn_unused_result)) -bool -innobase_need_rebuild( +/** Determine if ALTER_OPTIONS requires rebuilding the table. +@param[in] ha_alter_info the ALTER TABLE operation +@param[in] table metadata before ALTER TABLE +@return whether it is mandatory to rebuild the table */ +static bool alter_options_need_rebuild( const Alter_inplace_info* ha_alter_info, const TABLE* table) { - Alter_inplace_info::HA_ALTER_FLAGS alter_inplace_flags = - ha_alter_info->handler_flags & ~INNOBASE_INPLACE_IGNORE; + DBUG_ASSERT(ha_alter_info->handler_flags & ALTER_OPTIONS); + + if (ha_alter_info->create_info->used_fields + & (HA_CREATE_USED_ROW_FORMAT + | HA_CREATE_USED_KEY_BLOCK_SIZE)) { + /* Specifying ROW_FORMAT or KEY_BLOCK_SIZE requires + rebuilding the table. (These attributes in the .frm + file may disagree with the InnoDB data dictionary, and + the interpretation of thse attributes depends on + InnoDB parameters. That is why we for now always + require a rebuild when these attributes are specified.) */ + return true; + } - if (alter_inplace_flags & Alter_inplace_info::CHANGE_CREATE_OPTION) { - const ha_table_option_struct& alt_opt= + const ha_table_option_struct& alt_opt= *ha_alter_info->create_info->option_struct; - const ha_table_option_struct& opt= *table->s->option_struct; + const ha_table_option_struct& opt= *table->s->option_struct; - if (alt_opt.page_compressed != opt.page_compressed - || alt_opt.page_compression_level - != opt.page_compression_level - || alt_opt.encryption != opt.encryption - || alt_opt.encryption_key_id != opt.encryption_key_id) { - return(true); - } + /* Allow an instant change to enable page_compressed, + and any change of page_compression_level. 
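A condensed restatement of the rule that alter_options_need_rebuild() spells out above and completes just below, using plain booleans instead of the real option structures (illustration only, not part of the patch):

	/* Sketch: which option changes force a rebuild.  Turning
	page_compressed ON, or changing page_compression_level, is the one
	combination that may proceed without rebuilding the table. */
	static bool sketch_options_need_rebuild(bool row_format_or_key_block_size,
						bool page_compressed_turned_off,
						bool encryption_or_key_id_changed)
	{
		return row_format_or_key_block_size
			|| page_compressed_turned_off
			|| encryption_or_key_id_changed;
	}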
*/ + if ((!alt_opt.page_compressed && opt.page_compressed) + || alt_opt.encryption != opt.encryption + || alt_opt.encryption_key_id != opt.encryption_key_id) { + return(true); } - if (alter_inplace_flags == Alter_inplace_info::CHANGE_CREATE_OPTION - && !(ha_alter_info->create_info->used_fields - & (HA_CREATE_USED_ROW_FORMAT - | HA_CREATE_USED_KEY_BLOCK_SIZE))) { - /* Any other CHANGE_CREATE_OPTION than changing - ROW_FORMAT or KEY_BLOCK_SIZE can be done without - rebuilding the table. */ - return(false); + return false; +} + +/** Determine if ALTER TABLE needs to rebuild the table +(or perform instant operation). +@param[in] ha_alter_info the ALTER TABLE operation +@param[in] table metadata before ALTER TABLE +@return whether it is necessary to rebuild the table or to alter columns */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +bool +innobase_need_rebuild( + const Alter_inplace_info* ha_alter_info, + const TABLE* table) +{ + if ((ha_alter_info->handler_flags & ~(INNOBASE_INPLACE_IGNORE + | INNOBASE_ALTER_NOREBUILD + | INNOBASE_ALTER_INSTANT)) + == ALTER_OPTIONS) { + return alter_options_need_rebuild(ha_alter_info, table); } - return(!!(alter_inplace_flags & INNOBASE_ALTER_REBUILD)); + return !!(ha_alter_info->handler_flags & INNOBASE_ALTER_REBUILD); } /** Check if virtual column in old and new table are in order, excluding @@ -525,7 +643,7 @@ check_v_col_in_order( /* We don't support any adding new virtual column before existed virtual column. */ if (ha_alter_info->handler_flags - & Alter_inplace_info::ADD_VIRTUAL_COLUMN) { + & ALTER_ADD_VIRTUAL_COLUMN) { bool has_new = false; List_iterator_fast<Create_field> cf_it( @@ -555,7 +673,7 @@ check_v_col_in_order( /* directly return true if ALTER_VIRTUAL_COLUMN_ORDER is not on */ if (!(ha_alter_info->handler_flags - & Alter_inplace_info::ALTER_VIRTUAL_COLUMN_ORDER)) { + & ALTER_VIRTUAL_COLUMN_ORDER)) { return(true); } @@ -581,8 +699,8 @@ check_v_col_in_order( } if (my_strcasecmp(system_charset_info, - field->field_name, - new_field->field_name) != 0) { + field->field_name.str, + new_field->field_name.str) != 0) { /* different column */ return(false); } else { @@ -602,29 +720,273 @@ check_v_col_in_order( return(true); } +/** Determine if an instant operation is possible for altering columns. +@param[in] ha_alter_info the ALTER TABLE operation +@param[in] table table definition before ALTER TABLE */ +static +bool +instant_alter_column_possible( + const Alter_inplace_info* ha_alter_info, + const TABLE* table) +{ + // Making table system-versioned instantly is not implemented yet. + if (ha_alter_info->handler_flags & ALTER_ADD_SYSTEM_VERSIONING) { + return false; + } + + if (~ha_alter_info->handler_flags & ALTER_ADD_STORED_BASE_COLUMN) { + return false; + } + + /* At the moment, we disallow ADD [UNIQUE] INDEX together with + instant ADD COLUMN. + + The main reason is that the work of instant ADD must be done + in commit_inplace_alter_table(). For the rollback_instant() + to work, we must add the columns to dict_table_t beforehand, + and roll back those changes in case the transaction is rolled + back. + + If we added the columns to the dictionary cache already in the + prepare_inplace_alter_table(), we would have to deal with + column number mismatch in ha_innobase::open(), write_row() and + other functions. */ + + /* FIXME: allow instant ADD COLUMN together with + INNOBASE_ONLINE_CREATE (ADD [UNIQUE] INDEX) on pre-existing + columns. 
*/ + if (ha_alter_info->handler_flags + & ((INNOBASE_ALTER_REBUILD | INNOBASE_ONLINE_CREATE) + & ~ALTER_ADD_STORED_BASE_COLUMN & ~ALTER_OPTIONS)) { + return false; + } + + return !(ha_alter_info->handler_flags & ALTER_OPTIONS) + || !alter_options_need_rebuild(ha_alter_info, table); +} + +/** Check whether the non-const default value for the field +@param[in] field field which could be added or changed +@return true if the non-const default is present. */ +static bool is_non_const_value(Field* field) +{ + return field->default_value + && field->default_value->flags + & uint(~(VCOL_SESSION_FUNC | VCOL_TIME_FUNC)); +} + +/** Set default value for the field. +@param[in] field field which could be added or changed +@return true if the default value is set. */ +static bool set_default_value(Field* field) +{ + /* The added/changed NOT NULL column lacks a DEFAULT value, + or the DEFAULT is the same for all rows. + (Time functions, such as CURRENT_TIMESTAMP(), + are evaluated from a timestamp that is assigned + at the start of the statement. Session + functions, such as USER(), always evaluate the + same within a statement.) */ + + ut_ad(!is_non_const_value(field)); + + /* Compute the DEFAULT values of non-constant columns + (VCOL_SESSION_FUNC | VCOL_TIME_FUNC). */ + switch (field->set_default()) { + case 0: /* OK */ + case 3: /* DATETIME to TIME or DATE conversion */ + return true; + case -1: /* OOM, or GEOMETRY type mismatch */ + case 1: /* A number adjusted to the min/max value */ + case 2: /* String truncation, or conversion problem */ + break; + } + + return false; +} + +/** Check whether the table has the FTS_DOC_ID column +@param[in] table InnoDB table with fulltext index +@param[in] altered_table MySQL table with fulltext index +@param[out] fts_doc_col_no The column number for Doc ID, + or ULINT_UNDEFINED if it is of wrong type +@param[out] num_v Number of virtual column +@param[in] check_only check only whether fts doc id exist. +@return whether there exists an FTS_DOC_ID column */ +static +bool +innobase_fts_check_doc_id_col( + const dict_table_t* table, + const TABLE* altered_table, + ulint* fts_doc_col_no, + ulint* num_v, + bool check_only=false) +{ + *fts_doc_col_no = ULINT_UNDEFINED; + + const uint n_cols = altered_table->s->fields; + ulint i; + int err = 0; + *num_v = 0; + + for (i = 0; i < n_cols; i++) { + const Field* field = altered_table->field[i]; + + if (!field->stored_in_db()) { + (*num_v)++; + } + + if (my_strcasecmp(system_charset_info, + field->field_name.str, FTS_DOC_ID_COL_NAME)) { + continue; + } + + if (strcmp(field->field_name.str, FTS_DOC_ID_COL_NAME)) { + err = ER_WRONG_COLUMN_NAME; + } else if (field->type() != MYSQL_TYPE_LONGLONG + || field->pack_length() != 8 + || field->real_maybe_null() + || !(field->flags & UNSIGNED_FLAG) + || !field->stored_in_db()) { + err = ER_INNODB_FT_WRONG_DOCID_COLUMN; + } else { + *fts_doc_col_no = i - *num_v; + } + + if (err && !check_only) { + my_error(err, MYF(0), field->field_name.str); + } + + return(true); + } + + if (!table) { + return(false); + } + + /* Not to count the virtual columns */ + i -= *num_v; + + for (; i + DATA_N_SYS_COLS < (uint) table->n_cols; i++) { + const char* name = dict_table_get_col_name(table, i); + + if (strcmp(name, FTS_DOC_ID_COL_NAME) == 0) { +#ifdef UNIV_DEBUG + const dict_col_t* col; + + col = dict_table_get_nth_col(table, i); + + /* Because the FTS_DOC_ID does not exist in + the MySQL data dictionary, this must be the + internally created FTS_DOC_ID column. 
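The helpers is_non_const_value() and set_default_value() introduced above are combined further down in check_if_supported_inplace_alter(); the sketch below only restates that combination for an added NOT NULL column (the function name is made up for illustration).

	/* Sketch: an added NOT NULL column can stay on the inplace/instant
	path only when its DEFAULT is effectively constant and can be
	materialized without truncation or conversion errors. */
	static bool sketch_added_default_usable(Field* field)
	{
		if (is_non_const_value(field)) {
			return false;	/* non-constant DEFAULT: use COPY */
		}
		/* true only for a clean (or DATETIME to DATE/TIME)
		conversion of the DEFAULT value */
		return set_default_value(field);
	}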
*/ + ut_ad(col->mtype == DATA_INT); + ut_ad(col->len == 8); + ut_ad(col->prtype & DATA_NOT_NULL); + ut_ad(col->prtype & DATA_UNSIGNED); +#endif /* UNIV_DEBUG */ + *fts_doc_col_no = i; + return(true); + } + } + + return(false); +} + +/** Check whether the table is empty. +@param[in] table table to be checked +@return true if table is empty */ +static bool innobase_table_is_empty(const dict_table_t *table) +{ + dict_index_t *clust_index= dict_table_get_first_index(table); + mtr_t mtr; + btr_pcur_t pcur; + buf_block_t *block; + page_cur_t *cur; + const rec_t *rec; + bool next_page= false; + + mtr.start(); + btr_pcur_open_at_index_side(true, clust_index, BTR_SEARCH_LEAF, + &pcur, true, 0, &mtr); + btr_pcur_move_to_next_user_rec(&pcur, &mtr); + if (!rec_is_metadata(btr_pcur_get_rec(&pcur), clust_index)) + btr_pcur_move_to_prev_on_page(&pcur); +scan_leaf: + cur= btr_pcur_get_page_cur(&pcur); + page_cur_move_to_next(cur); +next_page: + if (next_page) + { + uint32_t next_page_no= btr_page_get_next(page_cur_get_page(cur)); + if (next_page_no == FIL_NULL) + { + mtr.commit(); + return true; + } + + next_page= false; + block= page_cur_get_block(cur); + block= btr_block_get(page_id_t(block->page.id.space(), next_page_no), + block->page.size, BTR_SEARCH_LEAF, clust_index, + &mtr); + btr_leaf_page_release(page_cur_get_block(cur), BTR_SEARCH_LEAF, &mtr); + page_cur_set_before_first(block, cur); + page_cur_move_to_next(cur); + } + + rec= page_cur_get_rec(cur); + if (rec_get_deleted_flag(rec, dict_table_is_comp(table))); + else if (!page_rec_is_supremum(rec)) + { + mtr.commit(); + return false; + } + else + { + next_page= true; + goto next_page; + } + goto scan_leaf; +} + /** Check if InnoDB supports a particular alter table in-place @param altered_table TABLE object for new version of table. @param ha_alter_info Structure describing changes to be done by ALTER TABLE and holding data used during in-place alter. @retval HA_ALTER_INPLACE_NOT_SUPPORTED Not supported -@retval HA_ALTER_INPLACE_NO_LOCK Supported -@retval HA_ALTER_INPLACE_SHARED_LOCK_AFTER_PREPARE Supported, but requires -lock during main phase and exclusive lock during prepare phase. -@retval HA_ALTER_INPLACE_NO_LOCK_AFTER_PREPARE Supported, prepare phase -requires exclusive lock (any transactions that have accessed the table -must commit or roll back first, and no transactions can access the table -while prepare_inplace_alter_table() is executing) +@retval HA_ALTER_INPLACE_INSTANT +MDL_EXCLUSIVE is needed for executing prepare_inplace_alter_table() +and commit_inplace_alter_table(). inplace_alter_table() will not be called. 
+@retval HA_ALTER_INPLACE_COPY_NO_LOCK +MDL_EXCLUSIVE in prepare_inplace_alter_table(), which can be downgraded to +LOCK=NONE for rebuilding the table in inplace_alter_table() +@retval HA_ALTER_INPLACE_COPY_LOCK +MDL_EXCLUSIVE in prepare_inplace_alter_table(), which can be downgraded to +LOCK=SHARED for rebuilding the table in inplace_alter_table() +@retval HA_ALTER_INPLACE_NOCOPY_NO_LOCK +MDL_EXCLUSIVE in prepare_inplace_alter_table(), which can be downgraded to +LOCK=NONE for inplace_alter_table() which will not rebuild the table +@retval HA_ALTER_INPLACE_NOCOPY_LOCK +MDL_EXCLUSIVE in prepare_inplace_alter_table(), which can be downgraded to +LOCK=SHARED for inplace_alter_table() which will not rebuild the table */ enum_alter_inplace_result ha_innobase::check_if_supported_inplace_alter( -/*==========================================*/ TABLE* altered_table, Alter_inplace_info* ha_alter_info) { DBUG_ENTER("check_if_supported_inplace_alter"); + if ((ha_alter_info->handler_flags + & INNOBASE_ALTER_VERSIONED_REBUILD) + && altered_table->versioned(VERS_TIMESTAMP)) { + ha_alter_info->unsupported_reason = + "Not implemented for system-versioned timestamp tables"; + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + /* Before 10.2.2 information about virtual columns was not stored in system tables. We need to do a full alter to rebuild proper 10.2.2+ metadata with the information about virtual columns */ @@ -634,7 +996,7 @@ ha_innobase::check_if_supported_inplace_alter( if (high_level_read_only) { ha_alter_info->unsupported_reason = - innobase_get_err_msg(ER_READ_ONLY_MODE); + my_get_err_msg(ER_READ_ONLY_MODE); DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); } @@ -645,7 +1007,7 @@ ha_innobase::check_if_supported_inplace_alter( return an error too. This is how we effectively deny adding too many columns to a table. 
*/ ha_alter_info->unsupported_reason = - innobase_get_err_msg(ER_TOO_MANY_FIELDS); + my_get_err_msg(ER_TOO_MANY_FIELDS); DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); } @@ -653,40 +1015,61 @@ ha_innobase::check_if_supported_inplace_alter( if (ha_alter_info->handler_flags & ~(INNOBASE_INPLACE_IGNORE + | INNOBASE_ALTER_INSTANT | INNOBASE_ALTER_NOREBUILD | INNOBASE_ALTER_REBUILD)) { if (ha_alter_info->handler_flags - & Alter_inplace_info::ALTER_STORED_COLUMN_TYPE) { - ha_alter_info->unsupported_reason = innobase_get_err_msg( + & ALTER_STORED_COLUMN_TYPE) { + ha_alter_info->unsupported_reason = my_get_err_msg( ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_COLUMN_TYPE); } + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); } /* Only support online add foreign key constraint when check_foreigns is turned off */ - if ((ha_alter_info->handler_flags & Alter_inplace_info::ADD_FOREIGN_KEY) + if ((ha_alter_info->handler_flags & ALTER_ADD_FOREIGN_KEY) && m_prebuilt->trx->check_foreigns) { - ha_alter_info->unsupported_reason = innobase_get_err_msg( + ha_alter_info->unsupported_reason = my_get_err_msg( ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_FK_CHECK); DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); } - if (!(ha_alter_info->handler_flags & ~INNOBASE_INPLACE_IGNORE)) { - DBUG_RETURN(HA_ALTER_INPLACE_NO_LOCK); + const char* reason_rebuild = NULL; + + switch (innodb_instant_alter_column_allowed) { + case 0: /* never */ + if ((ha_alter_info->handler_flags + & ALTER_ADD_STORED_BASE_COLUMN) + || m_prebuilt->table->is_instant()) { + reason_rebuild = + "innodb_instant_alter_column_allowed=never"; + if (ha_alter_info->handler_flags + & ALTER_RECREATE_TABLE) { + reason_rebuild = NULL; + } else { + ha_alter_info->handler_flags + |= ALTER_RECREATE_TABLE; + ha_alter_info->unsupported_reason + = reason_rebuild; + } + } + break; } - /* Only support NULL -> NOT NULL change if strict table sql_mode - is set. Fall back to COPY for conversion if not strict tables. - In-Place will fail with an error when trying to convert - NULL to a NOT NULL value. */ - if ((ha_alter_info->handler_flags - & Alter_inplace_info::ALTER_COLUMN_NOT_NULLABLE) - && (ha_alter_info->ignore || !thd_is_strict_mode(m_user_thd))) { - ha_alter_info->unsupported_reason = innobase_get_err_msg( - ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_NOT_NULL); - DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + switch (ha_alter_info->handler_flags & ~INNOBASE_INPLACE_IGNORE) { + case ALTER_OPTIONS: + if (alter_options_need_rebuild(ha_alter_info, table)) { + reason_rebuild = my_get_err_msg( + ER_ALTER_OPERATION_TABLE_OPTIONS_NEED_REBUILD); + ha_alter_info->unsupported_reason = reason_rebuild; + break; + } + /* fall through */ + case 0: + DBUG_RETURN(HA_ALTER_INPLACE_INSTANT); } /* InnoDB cannot IGNORE when creating unique indexes. IGNORE @@ -694,9 +1077,8 @@ ha_innobase::check_if_supported_inplace_alter( code will not delete anything from existing indexes. */ if (ha_alter_info->ignore && (ha_alter_info->handler_flags - & (Alter_inplace_info::ADD_PK_INDEX - | Alter_inplace_info::ADD_UNIQUE_INDEX))) { - ha_alter_info->unsupported_reason = innobase_get_err_msg( + & (ALTER_ADD_PK_INDEX | ALTER_ADD_UNIQUE_INDEX))) { + ha_alter_info->unsupported_reason = my_get_err_msg( ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_IGNORE); DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); } @@ -704,27 +1086,27 @@ ha_innobase::check_if_supported_inplace_alter( /* DROP PRIMARY KEY is only allowed in combination with ADD PRIMARY KEY. 
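The switch above first masks out the flags InnoDB ignores; a bare options-only or no-op ALTER is answered immediately with HA_ALTER_INPLACE_INSTANT. The sketch below restates that test with stand-in bit values (the numeric constants are arbitrary, not the real alter_table_operations bits).

	/* Stand-in bit values, not the real flags. */
	typedef unsigned long long ops_sketch;
	static const ops_sketch SKETCH_IGNORED_OPS = 0x0f;	/* stands in for INNOBASE_INPLACE_IGNORE */
	static const ops_sketch SKETCH_ALTER_OPTIONS = 0x10;

	static bool sketch_answered_instantly(ops_sketch handler_flags,
					      bool options_need_rebuild)
	{
		switch (handler_flags & ~SKETCH_IGNORED_OPS) {
		case SKETCH_ALTER_OPTIONS:
			return !options_need_rebuild;
		case 0:
			return true;	/* nothing InnoDB cares about remains */
		default:
			return false;	/* fall through to the full analysis */
		}
	}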
*/ if ((ha_alter_info->handler_flags - & (Alter_inplace_info::ADD_PK_INDEX - | Alter_inplace_info::DROP_PK_INDEX)) - == Alter_inplace_info::DROP_PK_INDEX) { - ha_alter_info->unsupported_reason = innobase_get_err_msg( + & (ALTER_ADD_PK_INDEX | ALTER_DROP_PK_INDEX)) + == ALTER_DROP_PK_INDEX) { + ha_alter_info->unsupported_reason = my_get_err_msg( ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_NOPK); DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); } - /* If a column change from NOT NULL to NULL, - and there's a implict pk on this column. the - table should be rebuild. The change should - only go through the "Copy" method. */ - if ((ha_alter_info->handler_flags - & Alter_inplace_info::ALTER_COLUMN_NULLABLE)) { + if (ha_alter_info->handler_flags & ALTER_COLUMN_NULLABLE) { + /* If a NOT NULL attribute is going to be removed and + a UNIQUE INDEX on the column had been promoted to an + implicit PRIMARY KEY, the table should be rebuilt by + ALGORITHM=COPY. (Theoretically, we could support + rebuilding by ALGORITHM=INPLACE if a PRIMARY KEY is + going to be added, either explicitly or by promoting + another UNIQUE KEY.) */ const uint my_primary_key = altered_table->s->primary_key; - /* See if MYSQL table has no pk but we do. */ if (UNIV_UNLIKELY(my_primary_key >= MAX_KEY) && !dict_index_is_auto_gen_clust( dict_table_get_first_index(m_prebuilt->table))) { - ha_alter_info->unsupported_reason = innobase_get_err_msg( + ha_alter_info->unsupported_reason = my_get_err_msg( ER_PRIMARY_CANT_HAVE_NULL); DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); } @@ -743,8 +1125,9 @@ ha_innobase::check_if_supported_inplace_alter( */ for (ulint i = 0, icol= 0; i < table->s->fields; i++) { const Field* field = table->field[i]; - const dict_col_t* col = dict_table_get_nth_col(m_prebuilt->table, icol); - ulint unsigned_flag; + const dict_col_t* col = dict_table_get_nth_col( + m_prebuilt->table, icol); + ulint unsigned_flag; if (!field->stored_in_db()) { continue; @@ -752,7 +1135,8 @@ ha_innobase::check_if_supported_inplace_alter( icol++; - if (col->mtype != get_innobase_type_from_mysql_type(&unsigned_flag, field)) { + if (col->mtype != get_innobase_type_from_mysql_type( + &unsigned_flag, field)) { DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); } @@ -769,7 +1153,7 @@ ha_innobase::check_if_supported_inplace_alter( use "Copy" method. 
*/ if (m_prebuilt->table->dict_frm_mismatch) { - ha_alter_info->unsupported_reason = innobase_get_err_msg( + ha_alter_info->unsupported_reason = my_get_err_msg( ER_NO_SUCH_INDEX); ib_push_frm_error(m_user_thd, m_prebuilt->table, altered_table, n_indexes, true); @@ -777,33 +1161,40 @@ ha_innobase::check_if_supported_inplace_alter( DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); } + /* '0000-00-00' value isn't allowed for datetime datatype + for newly added column when table is not empty */ + if (ha_alter_info->error_if_not_empty + && !innobase_table_is_empty(m_prebuilt->table)) { + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + bool add_drop_v_cols = false; /* If there is add or drop virtual columns, we will support operations with these 2 options alone with inplace interface for now */ if (ha_alter_info->handler_flags - & (Alter_inplace_info::ADD_VIRTUAL_COLUMN - | Alter_inplace_info::DROP_VIRTUAL_COLUMN - | Alter_inplace_info::ALTER_VIRTUAL_COLUMN_ORDER)) { + & (ALTER_ADD_VIRTUAL_COLUMN + | ALTER_DROP_VIRTUAL_COLUMN + | ALTER_VIRTUAL_COLUMN_ORDER)) { ulonglong flags = ha_alter_info->handler_flags; /* TODO: uncomment the flags below, once we start to support them */ - flags &= ~(Alter_inplace_info::ADD_VIRTUAL_COLUMN - | Alter_inplace_info::DROP_VIRTUAL_COLUMN - | Alter_inplace_info::ALTER_VIRTUAL_COLUMN_ORDER - | Alter_inplace_info::ALTER_VIRTUAL_GCOL_EXPR - | Alter_inplace_info::ALTER_COLUMN_VCOL + flags &= ~(ALTER_ADD_VIRTUAL_COLUMN + | ALTER_DROP_VIRTUAL_COLUMN + | ALTER_VIRTUAL_COLUMN_ORDER + | ALTER_VIRTUAL_GCOL_EXPR + | ALTER_COLUMN_VCOL /* - | Alter_inplace_info::ADD_STORED_BASE_COLUMN - | Alter_inplace_info::DROP_STORED_COLUMN - | Alter_inplace_info::ALTER_STORED_COLUMN_ORDER - | Alter_inplace_info::ADD_UNIQUE_INDEX + | ALTER_ADD_STORED_BASE_COLUMN + | ALTER_DROP_STORED_COLUMN + | ALTER_STORED_COLUMN_ORDER + | ALTER_ADD_UNIQUE_INDEX */ - | Alter_inplace_info::ADD_INDEX - | Alter_inplace_info::DROP_INDEX); + | ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX + | ALTER_DROP_NON_UNIQUE_NON_PRIM_INDEX); if (flags != 0 || IF_PARTITIONING((altered_table->s->partition_info_str @@ -819,8 +1210,9 @@ ha_innobase::check_if_supported_inplace_alter( } /* We should be able to do the operation in-place. - See if we can do it online (LOCK=NONE). */ - bool online = true; + See if we can do it online (LOCK=NONE) or without rebuild. */ + bool online = true, need_rebuild = false; + const uint fulltext_indexes = innobase_fulltext_exist(altered_table); List_iterator_fast<Create_field> cf_it( ha_alter_info->alter_info->create_list); @@ -842,7 +1234,8 @@ ha_innobase::check_if_supported_inplace_alter( } for (KEY_PART_INFO* key_part = new_key->key_part; - key_part < new_key->key_part + new_key->user_defined_key_parts; + key_part < (new_key->key_part + + new_key->user_defined_key_parts); key_part++) { const Create_field* new_field; @@ -876,17 +1269,16 @@ ha_innobase::check_if_supported_inplace_alter( /* This is an added column. */ DBUG_ASSERT(ha_alter_info->handler_flags - & Alter_inplace_info::ADD_COLUMN); + & ALTER_ADD_COLUMN); /* We cannot replace a hidden FTS_DOC_ID with a user-visible FTS_DOC_ID. 
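The emptiness gate above is the consumer of innobase_table_is_empty() introduced earlier in this hunk; a condensed sketch of the decision, with the scan itself reduced to a boolean (illustration only):

	/* Sketch: the inplace/instant path is refused when the server has
	flagged that an added column's implicit default (e.g. '0000-00-00'
	for DATETIME) would be invalid for existing rows, unless the
	clustered-index leaf scan in innobase_table_is_empty() found no live
	user records (that scan skips the hidden metadata record and
	delete-marked records). */
	static bool sketch_refuse_added_default(bool error_if_not_empty,
						bool table_is_empty)
	{
		return error_if_not_empty && !table_is_empty;
	}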
*/ - if (m_prebuilt->table->fts - && innobase_fulltext_exist(altered_table) + if (fulltext_indexes && m_prebuilt->table->fts && !my_strcasecmp( system_charset_info, - key_part->field->field_name, + key_part->field->field_name.str, FTS_DOC_ID_COL_NAME)) { - ha_alter_info->unsupported_reason = innobase_get_err_msg( + ha_alter_info->unsupported_reason = my_get_err_msg( ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_HIDDEN_FTS); DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); } @@ -897,13 +1289,18 @@ ha_innobase::check_if_supported_inplace_alter( & AUTO_INCREMENT_FLAG)); if (key_part->field->flags & AUTO_INCREMENT_FLAG) { - /* We cannot assign an AUTO_INCREMENT - column values during online ALTER. */ + /* We cannot assign AUTO_INCREMENT values + during online or instant ALTER. */ DBUG_ASSERT(key_part->field == altered_table -> found_next_number_field); - ha_alter_info->unsupported_reason = innobase_get_err_msg( - ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_AUTOINC); + + if (ha_alter_info->online) { + ha_alter_info->unsupported_reason = my_get_err_msg( + ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_AUTOINC); + } + online = false; + need_rebuild = true; } if (!key_part->field->stored_in_db()) { @@ -911,36 +1308,41 @@ ha_innobase::check_if_supported_inplace_alter( virtual column, while there is also a drop virtual column in the same clause */ if (ha_alter_info->handler_flags - & Alter_inplace_info::DROP_VIRTUAL_COLUMN) { + & ALTER_DROP_VIRTUAL_COLUMN) { ha_alter_info->unsupported_reason = MSG_UNSUPPORTED_ALTER_ONLINE_ON_VIRTUAL_COLUMN; DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); } - ha_alter_info->unsupported_reason = - MSG_UNSUPPORTED_ALTER_ONLINE_ON_VIRTUAL_COLUMN; + if (ha_alter_info->online + && !ha_alter_info->unsupported_reason) { + ha_alter_info->unsupported_reason = + MSG_UNSUPPORTED_ALTER_ONLINE_ON_VIRTUAL_COLUMN; + } + online = false; } } } - DBUG_ASSERT(!m_prebuilt->table->fts || m_prebuilt->table->fts->doc_col - <= table->s->fields); - DBUG_ASSERT(!m_prebuilt->table->fts || m_prebuilt->table->fts->doc_col - < dict_table_get_n_user_cols(m_prebuilt->table)); + DBUG_ASSERT(!m_prebuilt->table->fts + || (m_prebuilt->table->fts->doc_col <= table->s->fields)); - if (m_prebuilt->table->fts - && innobase_fulltext_exist(altered_table)) { + DBUG_ASSERT(!m_prebuilt->table->fts + || (m_prebuilt->table->fts->doc_col + < dict_table_get_n_user_cols(m_prebuilt->table))); + + if (fulltext_indexes && m_prebuilt->table->fts) { /* FULLTEXT indexes are supposed to remain. */ /* Disallow DROP INDEX FTS_DOC_ID_INDEX */ for (uint i = 0; i < ha_alter_info->index_drop_count; i++) { if (!my_strcasecmp( system_charset_info, - ha_alter_info->index_drop_buffer[i]->name, + ha_alter_info->index_drop_buffer[i]->name.str, FTS_DOC_ID_INDEX_NAME)) { - ha_alter_info->unsupported_reason = innobase_get_err_msg( + ha_alter_info->unsupported_reason = my_get_err_msg( ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_CHANGE_FTS); DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); } @@ -958,9 +1360,9 @@ ha_innobase::check_if_supported_inplace_alter( if (!my_strcasecmp( system_charset_info, - (*fp)->field_name, + (*fp)->field_name.str, FTS_DOC_ID_COL_NAME)) { - ha_alter_info->unsupported_reason = innobase_get_err_msg( + ha_alter_info->unsupported_reason = my_get_err_msg( ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_CHANGE_FTS); DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); } @@ -969,91 +1371,30 @@ ha_innobase::check_if_supported_inplace_alter( m_prebuilt->trx->will_lock++; - if (!online) { - /* We already determined that only a non-locking - operation is possible. 
*/ - } else if (((ha_alter_info->handler_flags - & Alter_inplace_info::ADD_PK_INDEX) - || innobase_need_rebuild(ha_alter_info, table)) - && (innobase_fulltext_exist(altered_table) - || innobase_spatial_exist(altered_table) - || innobase_indexed_virtual_exist(altered_table))) { - /* Refuse to rebuild the table online, if - FULLTEXT OR SPATIAL indexes or indexed virtual columns - are to survive the rebuild. */ - online = false; - /* If the table already contains fulltext indexes, - refuse to rebuild the table natively altogether. */ - if (m_prebuilt->table->fts) { - ha_alter_info->unsupported_reason = innobase_get_err_msg( - ER_INNODB_FT_LIMIT); - DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); - } - - if (innobase_spatial_exist(altered_table)) { - ha_alter_info->unsupported_reason = - innobase_get_err_msg( - ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_GIS); - } else if (!innobase_fulltext_exist(altered_table)) { - /* MDEV-14341 FIXME: Remove this limitation. */ - ha_alter_info->unsupported_reason = - "online rebuild with indexed virtual columns"; - } else { - ha_alter_info->unsupported_reason = - innobase_get_err_msg( - ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_FTS); - } - } else if ((ha_alter_info->handler_flags - & Alter_inplace_info::ADD_INDEX)) { - /* ADD FULLTEXT|SPATIAL INDEX requires a lock. - - We could do ADD FULLTEXT INDEX without a lock if the - table already contains an FTS_DOC_ID column, but in - that case we would have to apply the modification log - to the full-text indexes. - - We could also do ADD SPATIAL INDEX by implementing - row_log_apply() for it. */ - - for (uint i = 0; i < ha_alter_info->index_add_count; i++) { - const KEY* key = - &ha_alter_info->key_info_buffer[ - ha_alter_info->index_add_buffer[i]]; - if (key->flags & HA_FULLTEXT) { - DBUG_ASSERT(!(key->flags & HA_KEYFLAG_MASK - & ~(HA_FULLTEXT - | HA_PACK_KEY - | HA_GENERATED_KEY - | HA_BINARY_PACK_KEY))); - ha_alter_info->unsupported_reason = innobase_get_err_msg( - ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_FTS); - online = false; - break; - } - if (key->flags & HA_SPATIAL) { - ha_alter_info->unsupported_reason = innobase_get_err_msg( - ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_GIS); - online = false; - break; - } - } - } - /* When changing a NULL column to NOT NULL and specifying a DEFAULT value, ensure that the DEFAULT expression is a constant. Also, in ADD COLUMN, for now we only support a constant DEFAULT expression. */ cf_it.rewind(); Field **af = altered_table->field; + bool add_column_not_last = false; + uint n_stored_cols = 0, n_add_cols = 0; while (Create_field* cf = cf_it++) { DBUG_ASSERT(cf->field || (ha_alter_info->handler_flags - & Alter_inplace_info::ADD_COLUMN)); + & ALTER_ADD_COLUMN)); if (const Field* f = cf->field) { - /* This could be changing an existing column + if (!f->real_maybe_null() || (*af)->real_maybe_null()) + goto next_column; + /* We are changing an existing column from NULL to NOT NULL. */ + DBUG_ASSERT(ha_alter_info->handler_flags + & ALTER_COLUMN_NOT_NULLABLE); + /* Virtual columns are never NOT NULL. */ + DBUG_ASSERT(f->stored_in_db()); + switch ((*af)->type()) { case MYSQL_TYPE_TIMESTAMP: case MYSQL_TYPE_TIMESTAMP2: @@ -1062,28 +1403,12 @@ ha_innobase::check_if_supported_inplace_alter( replaced. Ensure that the DEFAULT expression is not changing during ALTER TABLE. */ - if (!f->real_maybe_null() - || (*af)->real_maybe_null()) { - /* The column was NOT NULL, or it - will allow NULL after ALTER TABLE. 
*/ - goto next_column; - } - if (!(*af)->default_value && (*af)->is_real_null()) { /* No DEFAULT value is specified. We can report errors for any NULL values for - the TIMESTAMP. - - FIXME: Allow any DEFAULT - expression whose value does - not change during ALTER TABLE. - This would require a fix in - row_merge_read_clustered_index() - to try to replace the DEFAULT - value before reporting - DB_INVALID_NULL. */ + the TIMESTAMP. */ goto next_column; } break; @@ -1100,42 +1425,168 @@ ha_innobase::check_if_supported_inplace_alter( goto next_column; } - ha_alter_info->unsupported_reason - = innobase_get_err_msg( - ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_NOT_NULL); - } else if (!(*af)->default_value - || !((*af)->default_value->flags - & ~(VCOL_SESSION_FUNC | VCOL_TIME_FUNC))) { - /* The added NOT NULL column lacks a DEFAULT value, - or the DEFAULT is the same for all rows. - (Time functions, such as CURRENT_TIMESTAMP(), - are evaluated from a timestamp that is assigned - at the start of the statement. Session - functions, such as USER(), always evaluate the - same within a statement.) */ - - /* Compute the DEFAULT values of non-constant columns - (VCOL_SESSION_FUNC | VCOL_TIME_FUNC). */ - switch ((*af)->set_default()) { - case 0: /* OK */ - case 3: /* DATETIME to TIME or DATE conversion */ + ha_alter_info->unsupported_reason = my_get_err_msg( + ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_NOT_NULL); + } else if (!is_non_const_value(*af)) { + + n_add_cols++; + + if (af < &altered_table->field[table_share->fields]) { + add_column_not_last = true; + } + + if (set_default_value(*af)) { goto next_column; - case -1: /* OOM, or GEOMETRY type mismatch */ - case 1: /* A number adjusted to the min/max value */ - case 2: /* String truncation, or conversion problem */ - break; } } DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); next_column: - af++; + n_stored_cols += (*af++)->stored_in_db(); + } + + if (!add_column_not_last + && uint(m_prebuilt->table->n_cols) - DATA_N_SYS_COLS + n_add_cols + == n_stored_cols + && m_prebuilt->table->supports_instant() + && instant_alter_column_possible(ha_alter_info, table)) { + + DBUG_RETURN(HA_ALTER_INPLACE_INSTANT); + } + + if (!(ha_alter_info->handler_flags & ~(INNOBASE_ALTER_INSTANT + | INNOBASE_INPLACE_IGNORE))) { + DBUG_RETURN(HA_ALTER_INPLACE_INSTANT); + } + + bool fts_need_rebuild = false; + need_rebuild = need_rebuild + || innobase_need_rebuild(ha_alter_info, table); + + if (need_rebuild + && (fulltext_indexes + || innobase_spatial_exist(altered_table) + || innobase_indexed_virtual_exist(altered_table))) { + /* If the table already contains fulltext indexes, + refuse to rebuild the table natively altogether. */ + if (fulltext_indexes > 1) { +cannot_create_many_fulltext_index: + ha_alter_info->unsupported_reason = + my_get_err_msg(ER_INNODB_FT_LIMIT); + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + + if (!online || !ha_alter_info->online + || ha_alter_info->unsupported_reason != reason_rebuild) { + /* Either LOCK=NONE was not requested, or we already + gave specific reason to refuse it. */ + } else if (fulltext_indexes) { + ha_alter_info->unsupported_reason = my_get_err_msg( + ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_FTS); + } else if (innobase_spatial_exist(altered_table)) { + ha_alter_info->unsupported_reason = my_get_err_msg( + ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_GIS); + } else { + /* MDEV-14341 FIXME: Remove this limitation. 
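The instant eligibility test above combines a column-count identity with supports_instant() and instant_alter_column_possible(); the arithmetic part is restated here with plain integers (illustration only):

	/* Sketch: every stored user column of the old table must survive
	unchanged, and every added stored column must come after them, for
	the ALTER to qualify for HA_ALTER_INPLACE_INSTANT. */
	static bool sketch_adds_appended_at_end(unsigned old_stored_user_cols,
						unsigned n_added_stored_cols,
						unsigned new_stored_cols,
						bool	 add_column_not_last)
	{
		return !add_column_not_last
			&& old_stored_user_cols + n_added_stored_cols
			   == new_stored_cols;
	}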
*/ + ha_alter_info->unsupported_reason = + "online rebuild with indexed virtual columns"; + } + + online = false; + } + + if (ha_alter_info->handler_flags + & ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX) { + /* ADD FULLTEXT|SPATIAL INDEX requires a lock. + + We could do ADD FULLTEXT INDEX without a lock if the + table already contains an FTS_DOC_ID column, but in + that case we would have to apply the modification log + to the full-text indexes. + + We could also do ADD SPATIAL INDEX by implementing + row_log_apply() for it. */ + bool add_fulltext = false; + + for (uint i = 0; i < ha_alter_info->index_add_count; i++) { + const KEY* key = + &ha_alter_info->key_info_buffer[ + ha_alter_info->index_add_buffer[i]]; + if (key->flags & HA_FULLTEXT) { + DBUG_ASSERT(!(key->flags & HA_KEYFLAG_MASK + & ~(HA_FULLTEXT + | HA_PACK_KEY + | HA_GENERATED_KEY + | HA_BINARY_PACK_KEY))); + if (add_fulltext) { + goto cannot_create_many_fulltext_index; + } + + add_fulltext = true; + if (ha_alter_info->online + && !ha_alter_info->unsupported_reason) { + ha_alter_info->unsupported_reason = my_get_err_msg( + ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_FTS); + } + + online = false; + + /* Full text search index exists, check + whether the table already has DOC ID column. + If not, InnoDB have to rebuild the table to + add a Doc ID hidden column and change + primary index. */ + ulint fts_doc_col_no; + ulint num_v = 0; + + fts_need_rebuild = + !innobase_fts_check_doc_id_col( + m_prebuilt->table, + altered_table, + &fts_doc_col_no, &num_v, true); + } + + if (online && (key->flags & HA_SPATIAL)) { + + if (ha_alter_info->online) { + ha_alter_info->unsupported_reason = my_get_err_msg( + ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_GIS); + } + + online = false; + } + } + } + + // FIXME: implement Online DDL for system-versioned operations + if (ha_alter_info->handler_flags & INNOBASE_ALTER_VERSIONED_REBUILD) { + + if (ha_alter_info->online) { + ha_alter_info->unsupported_reason = + "Not implemented for system-versioned operations"; + } + + online = false; + } + + if (need_rebuild || fts_need_rebuild) { + ha_alter_info->handler_flags |= ALTER_RECREATE_TABLE; + DBUG_RETURN(online + ? HA_ALTER_INPLACE_COPY_NO_LOCK + : HA_ALTER_INPLACE_COPY_LOCK); + } + + if (ha_alter_info->unsupported_reason) { + } else if (ha_alter_info->handler_flags & INNOBASE_ONLINE_CREATE) { + ha_alter_info->unsupported_reason = "ADD INDEX"; + } else { + ha_alter_info->unsupported_reason = "DROP INDEX"; } DBUG_RETURN(online - ? HA_ALTER_INPLACE_NO_LOCK_AFTER_PREPARE - : HA_ALTER_INPLACE_SHARED_LOCK_AFTER_PREPARE); + ? HA_ALTER_INPLACE_NOCOPY_NO_LOCK + : HA_ALTER_INPLACE_NOCOPY_LOCK); } /*************************************************************//** @@ -1147,7 +1598,7 @@ innobase_init_foreign( /*==================*/ dict_foreign_t* foreign, /*!< in/out: structure to initialize */ - char* constraint_name, /*!< in/out: constraint name if + const char* constraint_name, /*!< in/out: constraint name if exists */ dict_table_t* table, /*!< in: foreign table */ dict_index_t* index, /*!< in: foreign key index */ @@ -1273,7 +1724,6 @@ innobase_set_foreign_key_option( ut_ad(!foreign->type); switch (fk_key->delete_opt) { - // JAN: TODO: ? 
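Stepping back to the tail of check_if_supported_inplace_alter() above: once the instant fast paths have been ruled out, the result reduces to a two-by-two choice between rebuilding (COPY) or not (NOCOPY) and LOCK=NONE or LOCK=SHARED. A condensed restatement, assuming the server's enum_alter_inplace_result declarations from handler.h:

	/* Condensed restatement of the two DBUG_RETURNs above, not a new
	function in the patch. */
	static enum_alter_inplace_result sketch_pick_result(bool need_rebuild,
							    bool fts_need_rebuild,
							    bool online)
	{
		if (need_rebuild || fts_need_rebuild) {
			return online ? HA_ALTER_INPLACE_COPY_NO_LOCK
				      : HA_ALTER_INPLACE_COPY_LOCK;
		}
		return online ? HA_ALTER_INPLACE_NOCOPY_NO_LOCK
			      : HA_ALTER_INPLACE_NOCOPY_LOCK;
	}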
MySQL 5.7 used enum fk_option directly from sql_lex.h case FK_OPTION_NO_ACTION: case FK_OPTION_RESTRICT: case FK_OPTION_SET_DEFAULT: @@ -1285,6 +1735,8 @@ innobase_set_foreign_key_option( case FK_OPTION_SET_NULL: foreign->type = DICT_FOREIGN_ON_DELETE_SET_NULL; break; + case FK_OPTION_UNDEF: + break; } switch (fk_key->update_opt) { @@ -1299,6 +1751,8 @@ innobase_set_foreign_key_option( case FK_OPTION_SET_NULL: foreign->type |= DICT_FOREIGN_ON_UPDATE_SET_NULL; break; + case FK_OPTION_UNDEF: + break; } return(innobase_check_fk_option(foreign)); @@ -1357,7 +1811,7 @@ no_match: } if (innobase_strcasecmp(col_names[j], - key_part.field->field_name)) { + key_part.field->field_name.str)) { /* Name mismatch */ goto no_match; } @@ -1373,12 +1827,10 @@ no_match: Find an index whose first fields are the columns in the array in the same order and is not marked for deletion @return matching index, NULL if not found */ -static MY_ATTRIBUTE((nonnull(1,2,5), warn_unused_result)) +static MY_ATTRIBUTE((nonnull(1,4), warn_unused_result)) dict_index_t* innobase_find_fk_index( /*===================*/ - Alter_inplace_info* ha_alter_info, - /*!< in: alter table info */ dict_table_t* table, /*!< in: table */ const char** col_names, /*!< in: column names, or NULL @@ -1545,7 +1997,6 @@ innobase_get_foreign_key_info( } index = innobase_find_fk_index( - ha_alter_info, table, col_names, span<dict_index_t*>(drop_index, n_drop_index), column_names, i); @@ -1677,7 +2128,7 @@ innobase_get_foreign_key_info( /* Not possible to add a foreign key without a referenced column */ mutex_exit(&dict_sys->mutex); - my_error(ER_CANNOT_ADD_FOREIGN, MYF(0)); + my_error(ER_CANNOT_ADD_FOREIGN, MYF(0), tbl_namep); goto err_exit; } @@ -1806,7 +2257,7 @@ innobase_col_to_mysql( case DATA_SYS: /* These column types should never be shipped to MySQL. */ ut_ad(0); - + /* fall through */ case DATA_FLOAT: case DATA_DOUBLE: case DATA_DECIMAL: @@ -1861,7 +2312,7 @@ null_field: continue; } - ifield = rec_get_nth_field(rec, offsets, ipos, &ilen); + ifield = rec_get_nth_cfield(rec, index, offsets, ipos, &ilen); /* Assign the NULL flag */ if (ilen == UNIV_SQL_NULL) { @@ -1981,21 +2432,6 @@ innobase_row_to_mysql( } } -/*************************************************************//** -Resets table->record[0]. */ -void -innobase_rec_reset( -/*===============*/ - TABLE* table) /*!< in/out: MySQL table */ -{ - uint n_fields = table->s->fields; - uint i; - - for (i = 0; i < n_fields; i++) { - table->field[i]->set_default(); - } -} - /*******************************************************************//** This function checks that index keys are sensible. 
@return 0 or error number */ @@ -2020,9 +2456,9 @@ innobase_check_index_keys( const KEY& key2 = info->key_info_buffer[ info->index_add_buffer[i]]; - if (0 == strcmp(key.name, key2.name)) { + if (0 == strcmp(key.name.str, key2.name.str)) { my_error(ER_WRONG_NAME_FOR_INDEX, MYF(0), - key.name); + key.name.str); return(ER_WRONG_NAME_FOR_INDEX); } @@ -2036,7 +2472,7 @@ innobase_check_index_keys( index; index = dict_table_get_next_index(index)) { if (index->is_committed() - && !strcmp(key.name, index->name)) { + && !strcmp(key.name.str, index->name)) { break; } } @@ -2061,15 +2497,15 @@ innobase_check_index_keys( const KEY* drop_key = info->index_drop_buffer[i]; - if (0 == strcmp(key.name, drop_key->name)) { + if (0 == strcmp(key.name.str, + drop_key->name.str)) { goto name_ok; } } my_error(ER_WRONG_NAME_FOR_INDEX, MYF(0), - key.name); - + key.name.str); return(ER_WRONG_NAME_FOR_INDEX); } @@ -2108,7 +2544,7 @@ name_ok: } my_error(ER_WRONG_KEY_COLUMN, MYF(0), "InnoDB", - field->field_name); + field->field_name.str); return(ER_WRONG_KEY_COLUMN); } @@ -2124,7 +2560,7 @@ name_ok: } my_error(ER_WRONG_KEY_COLUMN, MYF(0), "InnoDB", - field->field_name); + field->field_name.str); return(ER_WRONG_KEY_COLUMN); } } @@ -2221,7 +2657,7 @@ innobase_create_index_def( index->parser = NULL; index->key_number = key_number; index->n_fields = n_fields; - index->name = mem_heap_strdup(heap, key->name); + index->name = mem_heap_strdup(heap, key->name.str); index->rebuild = new_clustered; if (key_clustered) { @@ -2241,8 +2677,8 @@ innobase_create_index_def( if (key->flags & HA_USES_PARSER) { for (ulint j = 0; j < altered_table->s->keys; j++) { - if (ut_strcmp(altered_table->key_info[j].name, - key->name) == 0) { + if (ut_strcmp(altered_table->key_info[j].name.str, + key->name.str) == 0) { ut_ad(altered_table->key_info[j].flags & HA_USES_PARSER); @@ -2298,92 +2734,6 @@ innobase_create_index_def( } /*******************************************************************//** -Check whether the table has the FTS_DOC_ID column -@return whether there exists an FTS_DOC_ID column */ -static -bool -innobase_fts_check_doc_id_col( -/*==========================*/ - const dict_table_t* table, /*!< in: InnoDB table with - fulltext index */ - const TABLE* altered_table, - /*!< in: MySQL table with - fulltext index */ - ulint* fts_doc_col_no, - /*!< out: The column number for - Doc ID, or ULINT_UNDEFINED - if it is of wrong type */ - ulint* num_v) /*!< out: number of virtual column */ -{ - *fts_doc_col_no = ULINT_UNDEFINED; - - const uint n_cols = altered_table->s->fields; - ulint i; - - *num_v = 0; - - for (i = 0; i < n_cols; i++) { - const Field* field = altered_table->field[i]; - - if (!field->stored_in_db()) { - (*num_v)++; - } - - if (my_strcasecmp(system_charset_info, - field->field_name, FTS_DOC_ID_COL_NAME)) { - continue; - } - - if (strcmp(field->field_name, FTS_DOC_ID_COL_NAME)) { - my_error(ER_WRONG_COLUMN_NAME, MYF(0), - field->field_name); - } else if (field->type() != MYSQL_TYPE_LONGLONG - || field->pack_length() != 8 - || field->real_maybe_null() - || !(field->flags & UNSIGNED_FLAG) - || !field->stored_in_db()) { - my_error(ER_INNODB_FT_WRONG_DOCID_COLUMN, MYF(0), - field->field_name); - } else { - *fts_doc_col_no = i - *num_v; - } - - return(true); - } - - if (!table) { - return(false); - } - - /* Not to count the virtual columns */ - i -= *num_v; - - for (; i + DATA_N_SYS_COLS < (uint) table->n_cols; i++) { - const char* name = dict_table_get_col_name(table, i); - - if (strcmp(name, FTS_DOC_ID_COL_NAME) == 0) { -#ifdef 
UNIV_DEBUG - const dict_col_t* col; - - col = dict_table_get_nth_col(table, i); - - /* Because the FTS_DOC_ID does not exist in - the MySQL data dictionary, this must be the - internally created FTS_DOC_ID column. */ - ut_ad(col->mtype == DATA_INT); - ut_ad(col->len == 8); - ut_ad(col->prtype & DATA_NOT_NULL); - ut_ad(col->prtype & DATA_UNSIGNED); -#endif /* UNIV_DEBUG */ - *fts_doc_col_no = i; - return(true); - } - } - - return(false); -} - -/*******************************************************************//** Check whether the table has a unique index with FTS_DOC_ID_INDEX_NAME on the Doc ID column. @return the status of the FTS_DOC_ID index */ @@ -2409,14 +2759,14 @@ innobase_fts_check_doc_id_index( const KEY& key = altered_table->key_info[i]; if (innobase_strcasecmp( - key.name, FTS_DOC_ID_INDEX_NAME)) { + key.name.str, FTS_DOC_ID_INDEX_NAME)) { continue; } if ((key.flags & HA_NOSAME) && key.user_defined_key_parts == 1 - && !strcmp(key.name, FTS_DOC_ID_INDEX_NAME) - && !strcmp(key.key_part[0].field->field_name, + && !strcmp(key.name.str, FTS_DOC_ID_INDEX_NAME) + && !strcmp(key.key_part[0].field->field_name.str, FTS_DOC_ID_COL_NAME)) { if (fts_doc_col_no) { *fts_doc_col_no = ULINT_UNDEFINED; @@ -2458,7 +2808,7 @@ innobase_fts_check_doc_id_index( && field->col->mtype == DATA_INT && field->col->len == 8 && field->col->prtype & DATA_NOT_NULL - && !dict_col_is_virtual(field->col)) { + && !field->col->is_virtual()) { if (fts_doc_col_no) { *fts_doc_col_no = dict_col_get_no(field->col); } @@ -2488,7 +2838,7 @@ innobase_fts_check_doc_id_index_in_def( for (ulint j = 0; j < n_key; j++) { const KEY* key = &key_info[j]; - if (innobase_strcasecmp(key->name, FTS_DOC_ID_INDEX_NAME)) { + if (innobase_strcasecmp(key->name.str, FTS_DOC_ID_INDEX_NAME)) { continue; } @@ -2496,8 +2846,8 @@ innobase_fts_check_doc_id_index_in_def( named as "FTS_DOC_ID_INDEX" and on column "FTS_DOC_ID" */ if (!(key->flags & HA_NOSAME) || key->user_defined_key_parts != 1 - || strcmp(key->name, FTS_DOC_ID_INDEX_NAME) - || strcmp(key->key_part[0].field->field_name, + || strcmp(key->name.str, FTS_DOC_ID_INDEX_NAME) + || strcmp(key->key_part[0].field->field_name.str, FTS_DOC_ID_COL_NAME)) { return(FTS_INCORRECT_DOC_ID_INDEX); } @@ -2568,7 +2918,7 @@ innobase_create_key_defs( new_primary = n_add > 0 && !my_strcasecmp(system_charset_info, - key_info[*add].name, "PRIMARY"); + key_info[*add].name.str, "PRIMARY"); n_fts_add = 0; /* If there is a UNIQUE INDEX consisting entirely of NOT NULL @@ -2611,7 +2961,7 @@ innobase_create_key_defs( index->ind_type = DICT_CLUSTERED; index->name = innobase_index_reserve_name; index->rebuild = true; - index->key_number = ~0; + index->key_number = ~0U; primary_key_number = ULINT_UNDEFINED; goto created_clustered; } else { @@ -2776,7 +3126,7 @@ online_retry_drop_indexes( online_retry_drop_indexes_low(table, trx); trx_commit_for_mysql(trx); row_mysql_unlock_data_dictionary(trx); - trx_free_for_mysql(trx); + trx->free(); } ut_d(mutex_enter(&dict_sys->mutex)); @@ -2945,12 +3295,11 @@ column that is being dropped or modified to NOT NULL. 
@retval true Not allowed (will call my_error()) @retval false Allowed */ -MY_ATTRIBUTE((pure, nonnull(1,2,3,4), warn_unused_result)) +MY_ATTRIBUTE((pure, nonnull(1,2,3), warn_unused_result)) static bool innobase_check_foreigns( Alter_inplace_info* ha_alter_info, - const TABLE* altered_table, const TABLE* old_table, const dict_table_t* user_table, dict_foreign_t** drop_fk, @@ -2975,7 +3324,7 @@ innobase_check_foreigns( if (!new_field || (new_field->flags & NOT_NULL_FLAG)) { if (innobase_check_foreigns_low( user_table, drop_fk, n_drop_fk, - (*fp)->field_name, !new_field)) { + (*fp)->field_name.str, !new_field)) { return(true); } } @@ -2985,20 +3334,23 @@ innobase_check_foreigns( } /** Convert a default value for ADD COLUMN. - -@param heap Memory heap where allocated -@param dfield InnoDB data field to copy to -@param field MySQL value for the column -@param comp nonzero if in compact format */ -static MY_ATTRIBUTE((nonnull)) -void -innobase_build_col_map_add( -/*=======================*/ +@param[in,out] heap Memory heap where allocated +@param[out] dfield InnoDB data field to copy to +@param[in] field MySQL value for the column +@param[in] old_field Old field or NULL if new col is added +@param[in] comp nonzero if in compact format. */ +static void innobase_build_col_map_add( mem_heap_t* heap, dfield_t* dfield, const Field* field, + const Field* old_field, ulint comp) { + if (old_field && old_field->real_maybe_null() + && field->real_maybe_null()) { + return; + } + if (field->is_real_null()) { dfield_set_null(dfield); return; @@ -3008,7 +3360,7 @@ innobase_build_col_map_add( byte* buf = static_cast<byte*>(mem_heap_alloc(heap, size)); - const byte* mysql_data = field->ptr; + const byte* mysql_data = old_field ? old_field->ptr : field->ptr; row_mysql_store_col_in_innobase_format( dfield, buf, true, mysql_data, size, comp); @@ -3022,7 +3374,7 @@ adding columns. 
@param table MySQL table as it is before the ALTER operation @param new_table InnoDB table corresponding to MySQL altered_table @param old_table InnoDB table corresponding to MYSQL table -@param add_cols Default values for ADD COLUMN, or NULL if no ADD COLUMN +@param defaults Default values for ADD COLUMN, or NULL if no ADD COLUMN @param heap Memory heap where allocated @return array of integers, mapping column numbers in the table to column numbers in altered_table */ @@ -3035,7 +3387,7 @@ innobase_build_col_map( const TABLE* table, const dict_table_t* new_table, const dict_table_t* old_table, - dtuple_t* add_cols, + dtuple_t* defaults, mem_heap_t* heap) { DBUG_ENTER("innobase_build_col_map"); @@ -3048,9 +3400,9 @@ innobase_build_col_map( + dict_table_get_n_v_cols(old_table) >= table->s->fields + DATA_N_SYS_COLS || ha_innobase::omits_virtual_cols(*table->s)); - DBUG_ASSERT(!!add_cols == !!(ha_alter_info->handler_flags - & Alter_inplace_info::ADD_COLUMN)); - DBUG_ASSERT(!add_cols || dtuple_get_n_fields(add_cols) + DBUG_ASSERT(!!defaults == !!(ha_alter_info->handler_flags + & INNOBASE_DEFAULTS)); + DBUG_ASSERT(!defaults || dtuple_get_n_fields(defaults) == dict_table_get_n_cols(new_table)); const uint old_n_v_cols = uint(table->s->fields @@ -3102,6 +3454,21 @@ innobase_build_col_map( } if (new_field->field == field) { + + const Field* altered_field = + altered_table->field[i + num_v]; + + if (defaults) { + innobase_build_col_map_add( + heap, + dtuple_get_nth_field( + defaults, i), + altered_field, + field, + dict_table_is_comp( + new_table)); + } + col_map[old_i - num_old_v] = i; goto found_col; } @@ -3109,8 +3476,9 @@ innobase_build_col_map( ut_ad(!is_v); innobase_build_col_map_add( - heap, dtuple_get_nth_field(add_cols, i), + heap, dtuple_get_nth_field(defaults, i), altered_table->field[i + num_v], + NULL, dict_table_is_comp(new_table)); found_col: if (is_v) { @@ -3222,7 +3590,7 @@ innobase_get_col_names( DBUG_ENTER("innobase_get_col_names"); DBUG_ASSERT(user_table->n_t_def > table->s->fields); DBUG_ASSERT(ha_alter_info->handler_flags - & Alter_inplace_info::ALTER_COLUMN_NAME); + & ALTER_COLUMN_NAME); cols = static_cast<const char**>( mem_heap_zalloc(heap, user_table->n_def * sizeof *cols)); @@ -3242,7 +3610,7 @@ innobase_get_col_names( num_v += !table->field[old_i]->stored_in_db(); if (new_field->field == table->field[old_i]) { - cols[old_i - num_v] = new_field->field_name; + cols[old_i - num_v] = new_field->field_name.str; break; } } @@ -3393,14 +3761,15 @@ innobase_pk_order_preserved( const bool old_pk_column = old_field < old_n_uniq; if (old_pk_column) { - new_field_order = old_field; + new_field_order = lint(old_field); } else if (innobase_pk_col_is_existing(new_col_no, col_map, old_n_cols) || new_clust_index->table->persistent_autoinc == new_field + 1) { /* Adding an existing column or an AUTO_INCREMENT column may change the existing ordering. */ - new_field_order = old_n_uniq + existing_field_count++; + new_field_order = lint(old_n_uniq + + existing_field_count++); } else { /* Skip newly added column. 
*/ continue; @@ -3534,7 +3903,7 @@ innobase_check_gis_columns( ulint col_nr = dict_table_has_column( table, - key_part.field->field_name, + key_part.field->field_name.str, key_part.fieldnr); ut_ad(col_nr != table->n_def); dict_col_t* col = &table->cols[col_nr]; @@ -3657,7 +4026,7 @@ prepare_inplace_add_virtual( if (charset_no > MAX_CHAR_COLL_NUM) { my_error(ER_WRONG_KEY_COLUMN, MYF(0), "InnoDB", - field->field_name); + field->field_name.str); return(true); } } else { @@ -3688,7 +4057,7 @@ prepare_inplace_add_virtual( ctx->add_vcol[j].m_col.ind = i - 1; ctx->add_vcol[j].num_base = 0; - ctx->add_vcol_name[j] = field->field_name; + ctx->add_vcol_name[j] = field->field_name.str; ctx->add_vcol[j].base_col = NULL; ctx->add_vcol[j].v_pos = ctx->old_table->n_v_cols - ctx->num_to_drop_vcol + j; @@ -3704,7 +4073,6 @@ prepare_inplace_add_virtual( /** Collect virtual column info for its addition @param[in] ha_alter_info Data used during in-place alter -@param[in] altered_table MySQL table that is being altered to @param[in] table MySQL table as it is before the ALTER operation @retval true Failure @retval false Success */ @@ -3712,7 +4080,6 @@ static bool prepare_inplace_drop_virtual( Alter_inplace_info* ha_alter_info, - const TABLE* altered_table, const TABLE* table) { ha_innobase_inplace_ctx* ctx; @@ -3776,7 +4143,7 @@ prepare_inplace_drop_virtual( if (charset_no > MAX_CHAR_COLL_NUM) { my_error(ER_WRONG_KEY_COLUMN, MYF(0), "InnoDB", - field->field_name); + field->field_name.str); return(true); } } else { @@ -3807,7 +4174,7 @@ prepare_inplace_drop_virtual( ctx->drop_vcol[j].m_col.ind = i; - ctx->drop_vcol_name[j] = field->field_name; + ctx->drop_vcol_name[j] = field->field_name.str; dict_v_col_t* v_col = dict_table_get_nth_v_col_mysql( ctx->old_table, i); @@ -3907,40 +4274,38 @@ innobase_add_one_virtual( return(error); } -/** Update INNODB SYS_TABLES on number of virtual columns +/** Update SYS_TABLES.N_COLS in the data dictionary. 
@param[in] user_table InnoDB table -@param[in] n_col number of columns +@param[in] n_cols the new value of SYS_TABLES.N_COLS @param[in] trx transaction -@return DB_SUCCESS if successful, otherwise error code */ +@return whether the operation failed */ static -dberr_t -innobase_update_n_virtual( - const dict_table_t* table, - ulint n_col, - trx_t* trx) +bool +innodb_update_n_cols(const dict_table_t* table, ulint n_cols, trx_t* trx) { - dberr_t err = DB_SUCCESS; pars_info_t* info = pars_info_create(); - pars_info_add_int4_literal(info, "num_col", n_col); + pars_info_add_int4_literal(info, "n", n_cols); pars_info_add_ull_literal(info, "id", table->id); - err = que_eval_sql( - info, - "PROCEDURE RENUMBER_TABLE_ID_PROC () IS\n" - "BEGIN\n" - "UPDATE SYS_TABLES" - " SET N_COLS = :num_col\n" - " WHERE ID = :id;\n" - "END;\n", FALSE, trx); + dberr_t err = que_eval_sql(info, + "PROCEDURE UPDATE_N_COLS () IS\n" + "BEGIN\n" + "UPDATE SYS_TABLES SET N_COLS = :n" + " WHERE ID = :id;\n" + "END;\n", FALSE, trx); + + if (err != DB_SUCCESS) { + my_error(ER_INTERNAL_ERROR, MYF(0), + "InnoDB: Updating SYS_TABLES.N_COLS failed"); + return true; + } - return(err); + return false; } /** Update system table for adding virtual column(s) @param[in] ha_alter_info Data used during in-place alter -@param[in] altered_table MySQL table that is being altered -@param[in] table MySQL table as it is before the ALTER operation @param[in] user_table InnoDB table @param[in] trx transaction @retval true Failure @@ -3949,8 +4314,6 @@ static bool innobase_add_virtual_try( Alter_inplace_info* ha_alter_info, - const TABLE* altered_table, - const TABLE* table, const dict_table_t* user_table, trx_t* trx) { @@ -3974,27 +4337,279 @@ innobase_add_virtual_try( } - ulint n_col = user_table->n_cols; - ulint n_v_col = user_table->n_v_cols; + ulint n_col = unsigned(user_table->n_cols) - DATA_N_SYS_COLS; + ulint n_v_col = unsigned(user_table->n_v_cols) + + ctx->num_to_add_vcol - ctx->num_to_drop_vcol; + ulint new_n = dict_table_encode_n_col(n_col, n_v_col) + + (unsigned(user_table->flags & DICT_TF_COMPACT) << 31); + + return innodb_update_n_cols(user_table, new_n, trx); +} + +/** Insert into SYS_COLUMNS and insert/update the hidden metadata record +for instant ADD COLUMN. +@param[in,out] ctx ALTER TABLE context for the current partition +@param[in] altered_table MySQL table that is being altered +@param[in] table MySQL table as it is before the ALTER operation +@param[in,out] trx dictionary transaction +@retval true failure +@retval false success */ +static +bool +innobase_add_instant_try( + ha_innobase_inplace_ctx*ctx, + const TABLE* altered_table, + const TABLE* table, + trx_t* trx) +{ + DBUG_ASSERT(!ctx->need_rebuild()); + + if (!ctx->is_instant()) return false; + + DBUG_ASSERT(altered_table->s->fields > table->s->fields); + DBUG_ASSERT(ctx->old_table->n_cols == ctx->old_n_cols); - n_v_col += ctx->num_to_add_vcol; + dict_table_t* user_table = ctx->old_table; + user_table->instant_add_column(*ctx->instant_table); + dict_index_t* index = dict_table_get_first_index(user_table); + /* The table may have been emptied and may have lost its + 'instant-add-ness' during this instant ADD COLUMN. */ - n_col -= DATA_N_SYS_COLS; + /* Construct a table row of default values for the stored columns. 
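innobase_add_virtual_try() above (and the instant-ADD path further below) pack the column counts into a single SYS_TABLES.N_COLS value via dict_table_encode_n_col() and then fold in the DICT_TF_COMPACT bit. The sketch spells out that packing; the 16-bit position of the virtual-column count is an assumption about dict_table_encode_n_col(), only the top row-format bit is taken directly from the hunk above.

	#include <cstdint>
	/* Sketch of the SYS_TABLES.N_COLS packing (the <<16 split is an
	assumed detail; the <<31 flag mirrors the DICT_TF_COMPACT handling
	in the callers above). */
	static uint32_t sketch_encode_n_cols(uint32_t n_stored_user_cols,
					     uint32_t n_virtual_cols,
					     bool     not_redundant)
	{
		return n_stored_user_cols
			| (n_virtual_cols << 16)
			| (uint32_t(not_redundant) << 31);
	}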
*/ + dtuple_t* row = dtuple_create(ctx->heap, user_table->n_cols); + dict_table_copy_types(row, user_table); + Field** af = altered_table->field; + Field** const end = altered_table->field + altered_table->s->fields; - n_v_col -= ctx->num_to_drop_vcol; + for (uint i = 0; af < end; af++) { + if (!(*af)->stored_in_db()) { + continue; + } - ulint new_n = dict_table_encode_n_col(n_col, n_v_col) - + ((user_table->flags & DICT_TF_COMPACT) << 31); + dict_col_t* col = dict_table_get_nth_col(user_table, i); + DBUG_ASSERT(!strcmp((*af)->field_name.str, + dict_table_get_col_name(user_table, i))); - err = innobase_update_n_virtual(user_table, new_n, trx); + dfield_t* d = dtuple_get_nth_field(row, i); + + if (col->is_instant()) { + dfield_set_data(d, col->def_val.data, + col->def_val.len); + } else if ((*af)->real_maybe_null()) { + /* Store NULL for nullable 'core' columns. */ + dfield_set_null(d); + } else { + switch ((*af)->type()) { + case MYSQL_TYPE_VARCHAR: + case MYSQL_TYPE_GEOMETRY: + case MYSQL_TYPE_TINY_BLOB: + case MYSQL_TYPE_MEDIUM_BLOB: + case MYSQL_TYPE_BLOB: + case MYSQL_TYPE_LONG_BLOB: + variable_length: + /* Store the empty string for 'core' + variable-length NOT NULL columns. */ + dfield_set_data(d, field_ref_zero, 0); + break; + case MYSQL_TYPE_STRING: + if (col->mbminlen != col->mbmaxlen + && dict_table_is_comp(user_table)) { + goto variable_length; + } + /* fall through */ + default: + /* For fixed-length NOT NULL 'core' columns, + get a dummy default value from SQL. Note that + we will preserve the old values of these + columns when updating the metadata + record, to avoid unnecessary updates. */ + ulint len = (*af)->pack_length(); + DBUG_ASSERT(d->type.mtype != DATA_INT + || len <= 8); + row_mysql_store_col_in_innobase_format( + d, d->type.mtype == DATA_INT + ? 
static_cast<byte*>( + mem_heap_alloc(ctx->heap, len)) + : NULL, true, (*af)->ptr, len, + dict_table_is_comp(user_table)); + } + } + + if (i + DATA_N_SYS_COLS < ctx->old_n_cols) { + i++; + continue; + } + + pars_info_t* info = pars_info_create(); + pars_info_add_ull_literal(info, "id", user_table->id); + pars_info_add_int4_literal(info, "pos", i); + pars_info_add_str_literal(info, "name", (*af)->field_name.str); + pars_info_add_int4_literal(info, "mtype", d->type.mtype); + pars_info_add_int4_literal(info, "prtype", d->type.prtype); + pars_info_add_int4_literal(info, "len", d->type.len); + + dberr_t err = que_eval_sql( + info, + "PROCEDURE ADD_COL () IS\n" + "BEGIN\n" + "INSERT INTO SYS_COLUMNS VALUES" + "(:id,:pos,:name,:mtype,:prtype,:len,0);\n" + "END;\n", FALSE, trx); + if (err != DB_SUCCESS) { + my_error(ER_INTERNAL_ERROR, MYF(0), + "InnoDB: Insert into SYS_COLUMNS failed"); + return(true); + } + + i++; + } + + if (innodb_update_n_cols(user_table, dict_table_encode_n_col( + unsigned(user_table->n_cols) + - DATA_N_SYS_COLS, + user_table->n_v_cols) + | (user_table->flags & DICT_TF_COMPACT) << 31, + trx)) { + return true; + } + + /* If the table has been discarded then change the metadata alone + and make the index to non-instant format */ + if (!user_table->space) { + index->remove_instant(); + return false; + } + + unsigned i = unsigned(user_table->n_cols) - DATA_N_SYS_COLS; + byte trx_id[DATA_TRX_ID_LEN], roll_ptr[DATA_ROLL_PTR_LEN]; + dfield_set_data(dtuple_get_nth_field(row, i++), field_ref_zero, + DATA_ROW_ID_LEN); + dfield_set_data(dtuple_get_nth_field(row, i++), trx_id, sizeof trx_id); + dfield_set_data(dtuple_get_nth_field(row, i),roll_ptr,sizeof roll_ptr); + DBUG_ASSERT(i + 1 == user_table->n_cols); + + trx_write_trx_id(trx_id, trx->id); + /* The DB_ROLL_PTR will be assigned later, when allocating undo log. + Silence a Valgrind warning in dtuple_validate() when + row_ins_clust_index_entry_low() searches for the insert position. */ + memset(roll_ptr, 0, sizeof roll_ptr); + + dtuple_t* entry = row_build_index_entry(row, NULL, index, ctx->heap); + entry->info_bits = REC_INFO_METADATA; + + mtr_t mtr; + mtr.start(); + index->set_modified(mtr); + btr_pcur_t pcur; + btr_pcur_open_at_index_side(true, index, BTR_MODIFY_TREE, &pcur, true, + 0, &mtr); + ut_ad(btr_pcur_is_before_first_on_page(&pcur)); + btr_pcur_move_to_next_on_page(&pcur); + + buf_block_t* block = btr_pcur_get_block(&pcur); + ut_ad(page_is_leaf(block->frame)); + ut_ad(!page_has_prev(block->frame)); + ut_ad(!buf_block_get_page_zip(block)); + const rec_t* rec = btr_pcur_get_rec(&pcur); + que_thr_t* thr = pars_complete_graph_for_exec( + NULL, trx, ctx->heap, NULL); + const bool is_root = block->page.id.page_no() == index->page; + + dberr_t err; + if (rec_is_metadata(rec, index)) { + ut_ad(page_rec_is_user_rec(rec)); + if (is_root + && !page_has_next(block->frame) + && page_rec_is_last(rec, block->frame)) { + goto empty_table; + } + /* Extend the record with the instantly added columns. */ + const unsigned n = user_table->n_cols - ctx->old_n_cols; + /* Reserve room for DB_TRX_ID,DB_ROLL_PTR and any + non-updated off-page columns in case they are moved off + page as a result of the update. 
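As an orientation aid, a hedged sketch of how the hidden metadata record written here is later recognised; it reuses only names that already occur in this hunk (rec_is_metadata(), page_rec_is_user_rec()):

	/* Sketch: the metadata record is the first user record of the
	clustered index and has the REC_INFO_METADATA info bit set. */
	if (page_rec_is_user_rec(rec) && rec_is_metadata(rec, index)) {
		/* rec carries the default values of the instantly
		added columns; ordinary rows follow it. */
	}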
*/ + upd_t* update = upd_create(index->n_fields, ctx->heap); + update->n_fields = n; + update->info_bits = REC_INFO_METADATA; + /* Add the default values for instantly added columns */ + for (unsigned i = 0; i < n; i++) { + upd_field_t* uf = upd_get_nth_field(update, i); + unsigned f = index->n_fields - n + i; + uf->field_no = f; + uf->new_val = entry->fields[f]; + } + rec_offs* offsets = NULL; + mem_heap_t* offsets_heap = NULL; + big_rec_t* big_rec; + err = btr_cur_pessimistic_update( + BTR_NO_LOCKING_FLAG | BTR_KEEP_POS_FLAG, + btr_pcur_get_btr_cur(&pcur), + &offsets, &offsets_heap, ctx->heap, + &big_rec, update, UPD_NODE_NO_ORD_CHANGE, + thr, trx->id, &mtr); + if (big_rec) { + if (err == DB_SUCCESS) { + err = btr_store_big_rec_extern_fields( + &pcur, offsets, big_rec, &mtr, + BTR_STORE_UPDATE); + } + + dtuple_big_rec_free(big_rec); + } + if (offsets_heap) { + mem_heap_free(offsets_heap); + } + btr_pcur_close(&pcur); + goto func_exit; + } else if (is_root && page_rec_is_supremum(rec)) { +empty_table: + /* The table is empty. */ + ut_ad(fil_page_index_page_check(block->frame)); + ut_ad(!page_has_siblings(block->frame)); + ut_ad(block->page.id.page_no() == index->page); + btr_page_empty(block, NULL, index, 0, &mtr); + index->remove_instant(); + err = DB_SUCCESS; + goto func_exit; + } + + /* Convert the table to the instant ADD COLUMN format. */ + ut_ad(user_table->is_instant()); + mtr.commit(); + mtr.start(); + index->set_modified(mtr); + if (page_t* root = btr_root_get(index, &mtr)) { + if (fil_page_get_type(root) != FIL_PAGE_INDEX) { + DBUG_ASSERT(!"wrong page type"); + goto err_exit; + } + + DBUG_ASSERT(!page_is_comp(root) || !page_get_instant(root)); + mlog_write_ulint(root + FIL_PAGE_TYPE, + FIL_PAGE_TYPE_INSTANT, MLOG_2BYTES, + &mtr); + page_set_instant(root, index->n_core_fields, &mtr); + mtr.commit(); + mtr.start(); + index->set_modified(mtr); + err = row_ins_clust_index_entry_low( + BTR_NO_LOCKING_FLAG, BTR_MODIFY_TREE, index, + index->n_uniq, entry, 0, thr); + } else { +err_exit: + err = DB_CORRUPTION; + } + +func_exit: + mtr.commit(); if (err != DB_SUCCESS) { - my_error(ER_INTERNAL_ERROR, MYF(0), - "InnoDB: ADD COLUMN...VIRTUAL"); - return(true); + my_error_innodb(err, table->s->table_name.str, + user_table->flags); + return true; } - return(false); + return false; } /** Update INNODB SYS_COLUMNS on new virtual column's position @@ -4108,11 +4723,11 @@ innobase_drop_one_virtual_sys_columns( for (ulint i = v_col->v_pos + 1; i < table->n_v_cols; i++) { dict_v_col_t* t_col = dict_table_get_nth_v_col(table, i); ulint old_p = dict_create_v_col_pos( - t_col->v_pos - n_prev_dropped, - t_col->m_col.ind - n_prev_dropped); + t_col->v_pos - n_prev_dropped, + t_col->m_col.ind - n_prev_dropped); ulint new_p = dict_create_v_col_pos( - t_col->v_pos - 1 - n_prev_dropped, - t_col->m_col.ind - 1 - n_prev_dropped); + t_col->v_pos - 1 - n_prev_dropped, + ulint(t_col->m_col.ind) - 1 - n_prev_dropped); error = innobase_update_v_pos_sys_columns( table, old_p, new_p, trx); @@ -4161,8 +4776,6 @@ innobase_drop_one_virtual_sys_virtual( /** Update system table for dropping virtual column(s) @param[in] ha_alter_info Data used during in-place alter -@param[in] altered_table MySQL table that is being altered -@param[in] table MySQL table as it is before the ALTER operation @param[in] user_table InnoDB table @param[in] trx transaction @retval true Failure @@ -4171,8 +4784,6 @@ static bool innobase_drop_virtual_try( Alter_inplace_info* ha_alter_info, - const TABLE* altered_table, - const TABLE* table, const 
dict_table_t* user_table, trx_t* trx) { @@ -4208,24 +4819,13 @@ innobase_drop_virtual_try( } - ulint n_col = user_table->n_cols; - ulint n_v_col = user_table->n_v_cols; - - n_v_col -= ctx->num_to_drop_vcol; - - n_col -= DATA_N_SYS_COLS; - + ulint n_col = unsigned(user_table->n_cols) - DATA_N_SYS_COLS; + ulint n_v_col = unsigned(user_table->n_v_cols) + - ctx->num_to_drop_vcol; ulint new_n = dict_table_encode_n_col(n_col, n_v_col) - + ((user_table->flags & DICT_TF_COMPACT) << 31); - - err = innobase_update_n_virtual(user_table, new_n, trx); - - if (err != DB_SUCCESS) { - my_error(ER_INTERNAL_ERROR, MYF(0), - "InnoDB: DROP COLUMN...VIRTUAL"); - } + | ((user_table->flags & DICT_TF_COMPACT) << 31); - return(false); + return innodb_update_n_cols(user_table, new_n, trx); } /** Adjust the create index column number from "New table" to @@ -4309,6 +4909,40 @@ innodb_v_adjust_idx_col( } } +/** Create index metadata in the data dictionary. +@param[in,out] trx dictionary transaction +@param[in,out] index index being created +@param[in] add_v virtual columns that are being added, or NULL +@return the created index */ +MY_ATTRIBUTE((nonnull(1,2), warn_unused_result)) +static +dict_index_t* +create_index_dict( + trx_t* trx, + dict_index_t* index, + const dict_add_v_col_t* add_v) +{ + DBUG_ENTER("create_index_dict"); + + mem_heap_t* heap = mem_heap_create(512); + ind_node_t* node = ind_create_graph_create( + index, index->table->name.m_name, heap, add_v); + que_thr_t* thr = pars_complete_graph_for_exec(node, trx, heap, NULL); + + que_fork_start_command( + static_cast<que_fork_t*>(que_node_get_parent(thr))); + + que_run_threads(thr); + + DBUG_ASSERT(trx->error_state != DB_SUCCESS || index != node->index); + DBUG_ASSERT(trx->error_state != DB_SUCCESS || node->index); + index = node->index; + + que_graph_free((que_t*) que_node_get_parent(thr)); + + DBUG_RETURN(index); +} + /** Update internal structures with concurrent writes blocked, while preparing ALTER TABLE. 
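A usage sketch of create_index_dict() as it is exercised later in this patch: the caller hands in an uncached index object, then inspects trx->error_state and frees any uncached indexes on failure.

	/* Sketch only; mirrors the call sites added further below. */
	dict_index_t*	index = create_index_dict(ctx->trx,
						  ctx->add_index[a], add_v);
	if (ctx->trx->error_state != DB_SUCCESS) {
		if (index) {
			dict_mem_index_free(index);
		}
		/* ...free the remaining uncached indexes and bail out. */
	} else {
		ctx->add_index[a] = index;
	}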
@@ -4344,7 +4978,7 @@ prepare_inplace_alter_table_dict( index_def_t* index_defs; /* index definitions */ dict_table_t* user_table; dict_index_t* fts_index = NULL; - ulint new_clustered = 0; + bool new_clustered = false; dberr_t error; ulint num_fts_index; dict_add_v_col_t* add_v = NULL; @@ -4362,7 +4996,7 @@ prepare_inplace_alter_table_dict( DBUG_ASSERT(!add_fts_doc_id || add_fts_doc_id_idx); DBUG_ASSERT(!add_fts_doc_id_idx || innobase_fulltext_exist(altered_table)); - DBUG_ASSERT(!ctx->add_cols); + DBUG_ASSERT(!ctx->defaults); DBUG_ASSERT(!ctx->add_index); DBUG_ASSERT(!ctx->add_key_numbers); DBUG_ASSERT(!ctx->num_to_add_index); @@ -4372,15 +5006,14 @@ prepare_inplace_alter_table_dict( trx_start_if_not_started_xa(ctx->prebuilt->trx, true); if (ha_alter_info->handler_flags - & Alter_inplace_info::DROP_VIRTUAL_COLUMN) { - if (prepare_inplace_drop_virtual( - ha_alter_info, altered_table, old_table)) { + & ALTER_DROP_VIRTUAL_COLUMN) { + if (prepare_inplace_drop_virtual(ha_alter_info, old_table)) { DBUG_RETURN(true); } } if (ha_alter_info->handler_flags - & Alter_inplace_info::ADD_VIRTUAL_COLUMN) { + & ALTER_ADD_VIRTUAL_COLUMN) { if (prepare_inplace_add_virtual( ha_alter_info, altered_table, old_table)) { DBUG_RETURN(true); @@ -4390,7 +5023,7 @@ prepare_inplace_alter_table_dict( for create index */ if (ha_alter_info->handler_flags - & Alter_inplace_info::ADD_INDEX) { + & ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX) { for (ulint i = 0; i < ctx->num_to_add_vcol; i++) { /* Set mbminmax for newly added column */ dict_col_t& col = ctx->add_vcol[i].m_col; @@ -4412,12 +5045,6 @@ prepare_inplace_alter_table_dict( here */ ut_ad(check_v_col_in_order(old_table, altered_table, ha_alter_info)); - /* Create a background transaction for the operations on - the data dictionary tables. */ - ctx->trx = innobase_trx_allocate(ctx->prebuilt->trx->mysql_thd); - - trx_start_for_ddl(ctx->trx, TRX_DICT_OP_INDEX); - /* Create table containing all indexes to be built in this ALTER TABLE ADD INDEX so that they are in the correct order in the table. */ @@ -4436,36 +5063,13 @@ prepare_inplace_alter_table_dict( fts_doc_id_col, add_fts_doc_id, add_fts_doc_id_idx, old_table); - new_clustered = DICT_CLUSTERED & index_defs[0].ind_type; + new_clustered = (DICT_CLUSTERED & index_defs[0].ind_type) != 0; create_table_info_t info(ctx->prebuilt->trx->mysql_thd, altered_table, ha_alter_info->create_info, NULL, NULL, srv_file_per_table); ut_d(bool stats_wait = false); - if (num_fts_index > 1) { - my_error(ER_INNODB_FT_LIMIT, MYF(0)); - goto error_handled; - } - - if (!ctx->online) { - /* This is not an online operation (LOCK=NONE). */ - } else if (ctx->add_autoinc == ULINT_UNDEFINED - && num_fts_index == 0 - && (!innobase_need_rebuild(ha_alter_info, old_table) - || !innobase_fulltext_exist(altered_table))) { - /* InnoDB can perform an online operation (LOCK=NONE). */ - } else { - size_t query_length; - /* This should have been blocked in - check_if_supported_inplace_alter(). 
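The run-time LOCK=NONE capability check removed in this hunk does not vanish entirely: in the restructured function the same conditions survive as debug assertions in the rebuild branch further down, roughly:

	/* Sketch of the equivalent assertions added later in this patch. */
	DBUG_ASSERT(num_fts_index <= 1);
	DBUG_ASSERT(!ctx->online || num_fts_index == 0);
	DBUG_ASSERT(!ctx->online || ctx->add_autoinc == ULINT_UNDEFINED);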
*/ - ut_ad(0); - my_error(ER_NOT_SUPPORTED_YET, MYF(0), - innobase_get_stmt_unsafe(ctx->prebuilt->trx->mysql_thd, - &query_length)); - goto error_handled; - } - /* The primary index would be rebuilt if a FTS Doc ID column is to be added, and the primary index definition is just copied from old table and stored in indexdefs[0] */ @@ -4477,17 +5081,12 @@ prepare_inplace_alter_table_dict( /* Allocate memory for dictionary index definitions */ ctx->add_index = static_cast<dict_index_t**>( - mem_heap_alloc(ctx->heap, ctx->num_to_add_index + mem_heap_zalloc(ctx->heap, ctx->num_to_add_index * sizeof *ctx->add_index)); ctx->add_key_numbers = add_key_nums = static_cast<ulint*>( mem_heap_alloc(ctx->heap, ctx->num_to_add_index * sizeof *ctx->add_key_numbers)); - /* This transaction should be dictionary operation, so that - the data dictionary will be locked during crash recovery. */ - - ut_ad(ctx->trx->dict_operation == TRX_DICT_OP_INDEX); - /* Acquire a lock on the table before creating any indexes. */ if (ctx->online) { @@ -4502,6 +5101,12 @@ prepare_inplace_alter_table_dict( } } + /* Create a background transaction for the operations on + the data dictionary tables. */ + ctx->trx = innobase_trx_allocate(ctx->prebuilt->trx->mysql_thd); + + trx_start_for_ddl(ctx->trx, TRX_DICT_OP_INDEX); + /* Latch the InnoDB data dictionary exclusively so that no deadlocks or lock waits can happen in it during an index create operation. */ @@ -4522,62 +5127,67 @@ prepare_inplace_alter_table_dict( ut_d(dict_table_check_for_dup_indexes( ctx->new_table, CHECK_ABORTED_OK)); + DBUG_EXECUTE_IF("innodb_OOM_prepare_inplace_alter", + error = DB_OUT_OF_MEMORY; + goto error_handling;); + /* If a new clustered index is defined for the table we need to rebuild the table with a temporary name. */ if (new_clustered) { - size_t dblen = ctx->old_table->name.dblen() + 1; - size_t tablen = altered_table->s->table_name.length; - const char* part = ctx->old_table->name.part(); - size_t partlen = part ? strlen(part) : 0; - char* new_table_name = static_cast<char*>( - mem_heap_alloc(ctx->heap, - dblen + tablen + partlen + 1)); - memcpy(new_table_name, ctx->old_table->name.m_name, dblen); - memcpy(new_table_name + dblen, - altered_table->s->table_name.str, tablen); - memcpy(new_table_name + dblen + tablen, - part ? 
part : "", partlen + 1); - ulint n_cols = 0; - ulint n_v_cols = 0; - dtuple_t* add_cols; - ulint space_id = 0; - ulint z = 0; - uint32_t key_id = FIL_DEFAULT_ENCRYPTION_KEY; - fil_encryption_t mode = FIL_ENCRYPTION_DEFAULT; + if (innobase_check_foreigns( + ha_alter_info, old_table, + user_table, ctx->drop_fk, ctx->num_to_drop_fk)) { +new_clustered_failed: + DBUG_ASSERT(ctx->trx != ctx->prebuilt->trx); + trx_rollback_to_savepoint(ctx->trx, NULL); - if (dict_table_is_discarded(ctx->prebuilt->table)) { - } else if (fil_space_t* space - = fil_space_acquire(ctx->prebuilt->table->space)) { - if (const fil_space_crypt_t* crypt_data - = space->crypt_data) { - key_id = crypt_data->key_id; - mode = crypt_data->encryption; - } + ut_ad(user_table->get_ref_count() == 1); - fil_space_release(space); - } + online_retry_drop_indexes_with_trx( + user_table, ctx->trx); - if (ha_alter_info->handler_flags - & Alter_inplace_info::CHANGE_CREATE_OPTION) { - const ha_table_option_struct& alt_opt= - *ha_alter_info->create_info->option_struct; - const ha_table_option_struct& opt= - *old_table->s->option_struct; - if (alt_opt.encryption != opt.encryption - || alt_opt.encryption_key_id - != opt.encryption_key_id) { - key_id = uint32_t(alt_opt.encryption_key_id); - mode = fil_encryption_t(alt_opt.encryption); + if (ctx->need_rebuild()) { + if (ctx->new_table) { + ut_ad(!ctx->new_table->cached); + dict_mem_table_free(ctx->new_table); + } + ctx->new_table = ctx->old_table; } - } - if (innobase_check_foreigns( - ha_alter_info, altered_table, old_table, - user_table, ctx->drop_fk, ctx->num_to_drop_fk)) { - goto new_clustered_failed; + while (ctx->num_to_add_index--) { + if (dict_index_t*& i = ctx->add_index[ + ctx->num_to_add_index]) { + dict_mem_index_free(i); + i = NULL; + } + } + + goto err_exit; } + size_t prefixlen= strlen(mysql_data_home); + if (mysql_data_home[prefixlen-1] != FN_LIBCHAR) + prefixlen++; + size_t tablen = altered_table->s->path.length - prefixlen; + const char* part = ctx->old_table->name.part(); + size_t partlen = part ? strlen(part) : 0; + char* new_table_name = static_cast<char*>( + mem_heap_alloc(ctx->heap, tablen + partlen + 1)); + memcpy(new_table_name, + altered_table->s->path.str + prefixlen, tablen); +#ifdef _WIN32 + { + char *sep= strchr(new_table_name, FN_LIBCHAR); + sep[0]= '/'; + } +#endif + memcpy(new_table_name + tablen, part ? part : "", partlen + 1); + ulint n_cols = 0; + ulint n_v_cols = 0; + dtuple_t* defaults; + ulint z = 0; + for (uint i = 0; i < altered_table->s->fields; i++) { const Field* field = altered_table->field[i]; @@ -4601,19 +5211,8 @@ prepare_inplace_alter_table_dict( DBUG_ASSERT(!add_fts_doc_id_idx || (flags2 & DICT_TF2_FTS)); - /* Create the table. */ - trx_set_dict_operation(ctx->trx, TRX_DICT_OP_TABLE); - - if (dict_table_get_low(new_table_name)) { - my_error(ER_TABLE_EXISTS_ERROR, MYF(0), - new_table_name); - goto new_clustered_failed; - } - - /* The initial space id 0 may be overridden later if this - table is going to be a file_per_table tablespace. 
*/ ctx->new_table = dict_mem_table_create( - new_table_name, space_id, n_cols + n_v_cols, n_v_cols, + new_table_name, NULL, n_cols + n_v_cols, n_v_cols, flags, flags2); /* The rebuilt indexed_table will use the renamed @@ -4654,14 +5253,24 @@ prepare_inplace_alter_table_dict( field_type |= DATA_UNSIGNED; } + if (altered_table->versioned()) { + if (i == altered_table->s->row_start_field) { + field_type |= DATA_VERS_START; + } else if (i == + altered_table->s->row_end_field) { + field_type |= DATA_VERS_END; + } else if (!(field->flags + & VERS_UPDATE_UNVERSIONED_FLAG)) { + field_type |= DATA_VERSIONED; + } + } + if (dtype_is_string_type(col_type)) { charset_no = (ulint) field->charset()->number; if (charset_no > MAX_CHAR_COLL_NUM) { - dict_mem_table_free( - ctx->new_table); my_error(ER_WRONG_KEY_COLUMN, MYF(0), "InnoDB", - field->field_name); + field->field_name.str); goto new_clustered_failed; } } else { @@ -4689,11 +5298,12 @@ prepare_inplace_alter_table_dict( } - if (dict_col_name_is_reserved(field->field_name)) { + if (dict_col_name_is_reserved(field->field_name.str)) { wrong_column_name: dict_mem_table_free(ctx->new_table); + ctx->new_table = ctx->old_table; my_error(ER_WRONG_COLUMN_NAME, MYF(0), - field->field_name); + field->field_name.str); goto new_clustered_failed; } @@ -4701,12 +5311,12 @@ wrong_column_name: to internal query parser. FTS_DOC_ID column must be of BIGINT NOT NULL type and it should be in all capitalized characters */ - if (!innobase_strcasecmp(field->field_name, + if (!innobase_strcasecmp(field->field_name.str, FTS_DOC_ID_COL_NAME)) { if (col_type != DATA_INT || field->real_maybe_null() || col_len != sizeof(doc_id_t) - || strcmp(field->field_name, + || strcmp(field->field_name.str, FTS_DOC_ID_COL_NAME)) { goto wrong_column_name; } @@ -4715,7 +5325,7 @@ wrong_column_name: if (is_virtual) { dict_mem_table_add_v_col( ctx->new_table, ctx->heap, - field->field_name, + field->field_name.str, col_type, dtype_form_prtype( field_type, charset_no) @@ -4724,7 +5334,7 @@ wrong_column_name: } else { dict_mem_table_add_col( ctx->new_table, ctx->heap, - field->field_name, + field->field_name.str, col_type, dtype_form_prtype( field_type, charset_no), @@ -4757,69 +5367,23 @@ wrong_column_name: ctx->new_table->fts->doc_col = fts_doc_id_col; } - error = row_create_table_for_mysql( - ctx->new_table, ctx->trx, mode, key_id); - - switch (error) { - dict_table_t* temp_table; - case DB_SUCCESS: - /* We need to bump up the table ref count and - before we can use it we need to open the - table. The new_table must be in the data - dictionary cache, because we are still holding - the dict_sys->mutex. */ - ut_ad(mutex_own(&dict_sys->mutex)); - temp_table = dict_table_open_on_name( - ctx->new_table->name.m_name, TRUE, FALSE, - DICT_ERR_IGNORE_NONE); - ut_a(ctx->new_table == temp_table); - /* n_ref_count must be 1, because purge cannot - be executing on this very table as we are - holding dict_operation_lock X-latch. 
*/ - DBUG_ASSERT(ctx->new_table->get_ref_count() == 1); - break; - case DB_TABLESPACE_EXISTS: - my_error(ER_TABLESPACE_EXISTS, MYF(0), - new_table_name); - goto new_clustered_failed; - case DB_DUPLICATE_KEY: - my_error(HA_ERR_TABLE_EXIST, MYF(0), - altered_table->s->table_name.str); - goto new_clustered_failed; - case DB_UNSUPPORTED: - my_error(ER_UNSUPPORTED_EXTENSION, MYF(0), - ctx->new_table->name.m_name); - goto new_clustered_failed; - default: - my_error_innodb(error, table_name, flags); -new_clustered_failed: - DBUG_ASSERT(ctx->trx != ctx->prebuilt->trx); - trx_rollback_to_savepoint(ctx->trx, NULL); - - ut_ad(user_table->get_ref_count() == 1); - - online_retry_drop_indexes_with_trx( - user_table, ctx->trx); - goto err_exit; - } + dict_table_add_system_columns(ctx->new_table, ctx->heap); - if (ha_alter_info->handler_flags - & Alter_inplace_info::ADD_COLUMN) { - add_cols = dtuple_create_with_vcol( + if (ha_alter_info->handler_flags & INNOBASE_DEFAULTS) { + defaults = dtuple_create_with_vcol( ctx->heap, dict_table_get_n_cols(ctx->new_table), dict_table_get_n_v_cols(ctx->new_table)); - dict_table_copy_types(add_cols, ctx->new_table); + dict_table_copy_types(defaults, ctx->new_table); } else { - add_cols = NULL; + defaults = NULL; } ctx->col_map = innobase_build_col_map( ha_alter_info, altered_table, old_table, - ctx->new_table, user_table, - add_cols, ctx->heap); - ctx->add_cols = add_cols; + ctx->new_table, user_table, defaults, ctx->heap); + ctx->defaults = defaults; } else { DBUG_ASSERT(!innobase_need_rebuild(ha_alter_info, old_table)); DBUG_ASSERT(old_table->s->primary_key @@ -4865,13 +5429,9 @@ new_clustered_failed: } } - /* Assign table_id, so that no table id of - fts_create_index_tables() will be written to the undo logs. */ - DBUG_ASSERT(ctx->new_table->id != 0); - ctx->trx->table_id = ctx->new_table->id; - - /* Create the indexes in SYS_INDEXES and load into dictionary. */ + ut_ad(new_clustered == ctx->need_rebuild()); + /* Create the index metadata. */ for (ulint a = 0; a < ctx->num_to_add_index; a++) { if (index_defs[a].ind_type & DICT_VIRTUAL && ctx->num_to_drop_vcol > 0 && !new_clustered) { @@ -4880,92 +5440,310 @@ new_clustered_failed: &index_defs[a]); } - DBUG_EXECUTE_IF( - "create_index_metadata_fail", - if (a + 1 == ctx->num_to_add_index) { - ctx->trx->error_state = DB_OUT_OF_FILE_SPACE; - ctx->add_index[a] = NULL; - goto index_created; - }); ctx->add_index[a] = row_merge_create_index( - ctx->trx, ctx->new_table, &index_defs[a], add_v); -#ifndef DBUG_OFF -index_created: -#endif + ctx->new_table, &index_defs[a], add_v); + add_key_nums[a] = index_defs[a].key_number; - if (!ctx->add_index[a]) { - error = ctx->trx->error_state; - DBUG_ASSERT(error != DB_SUCCESS); - goto error_handling; + DBUG_ASSERT(ctx->add_index[a]->is_committed() + == !!new_clustered); + } + + if (ctx->need_rebuild() && user_table->supports_instant()) { + if (!instant_alter_column_possible(ha_alter_info, old_table)) { + goto not_instant_add_column; } - /* For ALTER TABLE...FORCE or OPTIMIZE TABLE, we may - only issue warnings, because there will be no schema change. 
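To make the instant-ALTER gate above concrete: besides supports_instant() and instant_alter_column_possible(), the code below also requires ctx->col_map to be the identity over the old stored columns, i.e. every pre-existing stored column keeps its position and new columns are only appended; otherwise control falls through to not_instant_add_column and the table is rebuilt.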
*/ - if (!info.row_size_is_acceptable( - *ctx->add_index[a], - !!(ha_alter_info->handler_flags - & ~(INNOBASE_INPLACE_IGNORE - | INNOBASE_ALTER_NOVALIDATE - | Alter_inplace_info::RECREATE_TABLE)))) { - error = DB_TOO_BIG_RECORD; - goto error_handling; + for (uint i = uint(ctx->old_table->n_cols) - DATA_N_SYS_COLS; + i--; ) { + if (ctx->col_map[i] != i) { + goto not_instant_add_column; + } } - DBUG_ASSERT(ctx->add_index[a]->is_committed() - == !!new_clustered); + DBUG_ASSERT(ctx->new_table->n_cols > ctx->old_table->n_cols); - if (ctx->add_index[a]->type & DICT_FTS) { - DBUG_ASSERT(num_fts_index); - DBUG_ASSERT(!fts_index); - DBUG_ASSERT(ctx->add_index[a]->type == DICT_FTS); - fts_index = ctx->add_index[a]; - } - - /* If only online ALTER TABLE operations have been - requested, allocate a modification log. If the table - will be locked anyway, the modification - log is unnecessary. When rebuilding the table - (new_clustered), we will allocate the log for the - clustered index of the old table, later. */ - if (new_clustered - || !ctx->online - || !user_table->is_readable() - || dict_table_is_discarded(user_table)) { - /* No need to allocate a modification log. */ - ut_ad(!ctx->add_index[a]->online_log); - } else if (ctx->add_index[a]->type & DICT_FTS) { - /* Fulltext indexes are not covered - by a modification log. */ - } else { - DBUG_EXECUTE_IF("innodb_OOM_prepare_inplace_alter", - error = DB_OUT_OF_MEMORY; - goto error_handling;); - rw_lock_x_lock(&ctx->add_index[a]->lock); + for (uint a = 0; a < ctx->num_to_add_index; a++) { + ctx->add_index[a]->table = ctx->new_table; + error = dict_index_add_to_cache( + ctx->add_index[a], FIL_NULL, add_v); + ut_a(error == DB_SUCCESS); + } + DBUG_ASSERT(ha_alter_info->key_count + /* hidden GEN_CLUST_INDEX in InnoDB */ + + dict_index_is_auto_gen_clust( + dict_table_get_first_index(ctx->new_table)) + /* hidden FTS_DOC_ID_INDEX in InnoDB */ + + (ctx->old_table->fts_doc_id_index + && innobase_fts_check_doc_id_index_in_def( + altered_table->s->keys, + altered_table->key_info) + != FTS_EXIST_DOC_ID_INDEX) + == ctx->num_to_add_index); + ctx->num_to_add_index = 0; + ctx->add_index = NULL; - bool ok = row_log_allocate(ctx->add_index[a], - NULL, true, NULL, NULL, - path); - rw_lock_x_unlock(&ctx->add_index[a]->lock); + uint i = 0; // index of stored columns ctx->new_table->cols[] + Field **af = altered_table->field; - if (!ok) { - error = DB_OUT_OF_MEMORY; - goto error_handling; + List_iterator_fast<Create_field> cf_it( + ha_alter_info->alter_info->create_list); + + while (const Create_field* new_field = cf_it++) { + DBUG_ASSERT(!new_field->field + || std::find(old_table->field, + old_table->field + + old_table->s->fields, + new_field->field) != + old_table->field + old_table->s->fields); + DBUG_ASSERT(new_field->field + || !strcmp(new_field->field_name.str, + (*af)->field_name.str)); + + if (!(*af)->stored_in_db()) { + af++; + continue; } + + dict_col_t* col = dict_table_get_nth_col( + ctx->new_table, i); + DBUG_ASSERT(!strcmp((*af)->field_name.str, + dict_table_get_col_name(ctx->new_table, + i))); + DBUG_ASSERT(!col->is_instant()); + + if (new_field->field) { + ut_d(const dict_col_t* old_col + = dict_table_get_nth_col(user_table, i)); + ut_d(const dict_index_t* index + = user_table->indexes.start); + DBUG_SLOW_ASSERT(col->mtype == old_col->mtype); + ut_ad(col->prtype == old_col->prtype + || col->prtype + == (old_col->prtype & ~DATA_VERSIONED)); + DBUG_SLOW_ASSERT(col->mbminlen + == old_col->mbminlen); + DBUG_SLOW_ASSERT(col->mbmaxlen + == old_col->mbmaxlen); + 
DBUG_SLOW_ASSERT(col->len >= old_col->len); + DBUG_SLOW_ASSERT(old_col->is_instant() + == (dict_col_get_clust_pos( + old_col, index) + >= index->n_core_fields)); + } else if ((*af)->is_real_null()) { + /* DEFAULT NULL */ + col->def_val.len = UNIV_SQL_NULL; + } else { + switch ((*af)->type()) { + case MYSQL_TYPE_VARCHAR: + col->def_val.len = reinterpret_cast + <const Field_varstring*> + ((*af))->get_length(); + col->def_val.data = reinterpret_cast + <const Field_varstring*> + ((*af))->get_data(); + break; + case MYSQL_TYPE_GEOMETRY: + case MYSQL_TYPE_TINY_BLOB: + case MYSQL_TYPE_MEDIUM_BLOB: + case MYSQL_TYPE_BLOB: + case MYSQL_TYPE_LONG_BLOB: + col->def_val.len = reinterpret_cast + <const Field_blob*> + ((*af))->get_length(); + col->def_val.data = reinterpret_cast + <const Field_blob*> + ((*af))->get_ptr(); + break; + default: + dfield_t d; + dict_col_copy_type(col, &d.type); + ulint len = (*af)->pack_length(); + DBUG_ASSERT(len <= 8 + || d.type.mtype + != DATA_INT); + row_mysql_store_col_in_innobase_format( + &d, + d.type.mtype == DATA_INT + ? static_cast<byte*>( + mem_heap_alloc( + ctx->heap, + len)) + : NULL, + true, (*af)->ptr, len, + dict_table_is_comp( + user_table)); + col->def_val.len = d.len; + col->def_val.data = d.data; + } + } + + i++; + af++; } + + DBUG_ASSERT(af == altered_table->field + + altered_table->s->fields); + /* There might exist a hidden FTS_DOC_ID column for + FULLTEXT INDEX. If it exists, the columns should have + been implicitly added by ADD FULLTEXT INDEX together + with instant ADD COLUMN. (If a hidden FTS_DOC_ID pre-existed, + then the ctx->col_map[] check should have prevented + adding visible user columns after that.) */ + DBUG_ASSERT(DATA_N_SYS_COLS + i == ctx->new_table->n_cols + || (1 + DATA_N_SYS_COLS + i + == ctx->new_table->n_cols + && !strcmp(dict_table_get_col_name( + ctx->new_table, i), + FTS_DOC_ID_COL_NAME))); + + ctx->prepare_instant(); } - ut_ad(new_clustered == ctx->need_rebuild()); + if (ctx->need_rebuild()) { +not_instant_add_column: + DBUG_ASSERT(ctx->need_rebuild()); + DBUG_ASSERT(!ctx->is_instant()); + DBUG_ASSERT(num_fts_index <= 1); + DBUG_ASSERT(!ctx->online || num_fts_index == 0); + DBUG_ASSERT(!ctx->online + || ctx->add_autoinc == ULINT_UNDEFINED); + DBUG_ASSERT(!ctx->online + || !innobase_need_rebuild(ha_alter_info, old_table) + || !innobase_fulltext_exist(altered_table)); - DBUG_EXECUTE_IF("innodb_OOM_prepare_inplace_alter", - error = DB_OUT_OF_MEMORY; - goto error_handling;); + uint32_t key_id = FIL_DEFAULT_ENCRYPTION_KEY; + fil_encryption_t mode = FIL_ENCRYPTION_DEFAULT; + + if (fil_space_t* s = user_table->space) { + if (const fil_space_crypt_t* c = s->crypt_data) { + key_id = c->key_id; + mode = c->encryption; + } + } + + if (ha_alter_info->handler_flags & ALTER_OPTIONS) { + const ha_table_option_struct& alt_opt= + *ha_alter_info->create_info->option_struct; + const ha_table_option_struct& opt= + *old_table->s->option_struct; + if (alt_opt.encryption != opt.encryption + || alt_opt.encryption_key_id + != opt.encryption_key_id) { + key_id = uint32_t(alt_opt.encryption_key_id); + mode = fil_encryption_t(alt_opt.encryption); + } + } + + if (dict_table_get_low(ctx->new_table->name.m_name)) { + my_error(ER_TABLE_EXISTS_ERROR, MYF(0), + ctx->new_table->name.m_name); + goto new_clustered_failed; + } + + /* Create the table. 
*/ + trx_set_dict_operation(ctx->trx, TRX_DICT_OP_TABLE); + + error = row_create_table_for_mysql( + ctx->new_table, ctx->trx, mode, key_id); + + switch (error) { + dict_table_t* temp_table; + case DB_SUCCESS: + /* We need to bump up the table ref count and + before we can use it we need to open the + table. The new_table must be in the data + dictionary cache, because we are still holding + the dict_sys->mutex. */ + ut_ad(mutex_own(&dict_sys->mutex)); + temp_table = dict_table_open_on_name( + ctx->new_table->name.m_name, TRUE, FALSE, + DICT_ERR_IGNORE_NONE); + ut_a(ctx->new_table == temp_table); + /* n_ref_count must be 1, because purge cannot + be executing on this very table as we are + holding dict_operation_lock X-latch. */ + DBUG_ASSERT(ctx->new_table->get_ref_count() == 1); + DBUG_ASSERT(ctx->new_table->id != 0); + DBUG_ASSERT(ctx->new_table->id == ctx->trx->table_id); + break; + case DB_TABLESPACE_EXISTS: + my_error(ER_TABLESPACE_EXISTS, MYF(0), + altered_table->s->table_name.str); + goto new_table_failed; + case DB_DUPLICATE_KEY: + my_error(HA_ERR_TABLE_EXIST, MYF(0), + altered_table->s->table_name.str); + goto new_table_failed; + case DB_UNSUPPORTED: + my_error(ER_UNSUPPORTED_EXTENSION, MYF(0), + altered_table->s->table_name.str); + goto new_table_failed; + default: + my_error_innodb(error, table_name, flags); +new_table_failed: + DBUG_ASSERT(ctx->trx != ctx->prebuilt->trx); + ctx->new_table = NULL; + goto new_clustered_failed; + } + + for (ulint a = 0; a < ctx->num_to_add_index; a++) { + dict_index_t* index = ctx->add_index[a]; + const ulint n_v_col = index->get_new_n_vcol(); + index = create_index_dict(ctx->trx, index, add_v); + error = ctx->trx->error_state; + if (error != DB_SUCCESS) { + if (index) { + dict_mem_index_free(index); + } +error_handling_drop_uncached_1: + while (++a < ctx->num_to_add_index) { + dict_mem_index_free(ctx->add_index[a]); + } + goto error_handling; + } else { + DBUG_ASSERT(index != ctx->add_index[a]); + } + + ctx->add_index[a] = index; + /* For ALTER TABLE...FORCE or OPTIMIZE TABLE, + we may only issue warnings, because there will + be no schema change from the user perspective. */ + if (!info.row_size_is_acceptable( + *index, + !!(ha_alter_info->handler_flags + & ~(INNOBASE_INPLACE_IGNORE + | INNOBASE_ALTER_NOVALIDATE + | ALTER_RECREATE_TABLE)))) { + error = DB_TOO_BIG_RECORD; + goto error_handling_drop_uncached_1; + } + index->parser = index_defs[a].parser; + if (n_v_col) { + index->assign_new_v_col(n_v_col); + } + /* Note the id of the transaction that created this + index, we use it to restrict readers from accessing + this index, to ensure read consistency. */ + ut_ad(index->trx_id == ctx->trx->id); + + if (index->type & DICT_FTS) { + DBUG_ASSERT(num_fts_index == 1); + DBUG_ASSERT(!fts_index); + DBUG_ASSERT(index->type == DICT_FTS); + fts_index = ctx->add_index[a]; + } + } - if (new_clustered) { dict_index_t* clust_index = dict_table_get_first_index( user_table); dict_index_t* new_clust_index = dict_table_get_first_index( ctx->new_table); + ut_ad(!new_clust_index->is_instant()); + /* row_merge_build_index() depends on the correct value */ + ut_ad(new_clust_index->n_core_null_bytes + == UT_BITS_IN_BYTES(new_clust_index->n_nullable)); + DBUG_ASSERT(!ctx->new_table->persistent_autoinc); if (const Field* ai = altered_table->found_next_number_field) { const unsigned col_no = innodb_col_no(ai); @@ -4977,7 +5755,7 @@ index_created: /* Initialize the AUTO_INCREMENT sequence to the rebuilt table from the old one. 
*/ if (!old_table->found_next_number_field - || dict_table_is_discarded(user_table)) { + || !user_table->space) { } else if (ib_uint64_t autoinc = btr_read_autoinc(clust_index)) { btr_write_autoinc(new_clust_index, autoinc); @@ -4994,10 +5772,13 @@ index_created: /* Allocate a log for online table rebuild. */ rw_lock_x_lock(&clust_index->lock); bool ok = row_log_allocate( + ctx->prebuilt->trx, clust_index, ctx->new_table, !(ha_alter_info->handler_flags - & Alter_inplace_info::ADD_PK_INDEX), - ctx->add_cols, ctx->col_map, path); + & ALTER_ADD_PK_INDEX), + ctx->defaults, ctx->col_map, path, + old_table, + ctx->allow_not_null); rw_lock_x_unlock(&clust_index->lock); if (!ok) { @@ -5005,12 +5786,103 @@ index_created: goto error_handling; } } + } else if (ctx->num_to_add_index) { + ut_ad(!ctx->is_instant()); + ctx->trx->table_id = user_table->id; + + for (ulint a = 0; a < ctx->num_to_add_index; a++) { + dict_index_t* index = ctx->add_index[a]; + const ulint n_v_col = index->get_new_n_vcol(); + DBUG_EXECUTE_IF( + "create_index_metadata_fail", + if (a + 1 == ctx->num_to_add_index) { + ctx->trx->error_state = + DB_OUT_OF_FILE_SPACE; + goto index_created; + }); + index = create_index_dict(ctx->trx, index, add_v); +#ifndef DBUG_OFF +index_created: +#endif + error = ctx->trx->error_state; + if (error != DB_SUCCESS) { + if (index) { + dict_mem_index_free(index); + } +error_handling_drop_uncached: + while (++a < ctx->num_to_add_index) { + dict_mem_index_free(ctx->add_index[a]); + } + goto error_handling; + } else { + DBUG_ASSERT(index != ctx->add_index[a]); + } + ctx->add_index[a]= index; + if (!info.row_size_is_acceptable(*index, true)) { + error = DB_TOO_BIG_RECORD; + goto error_handling_drop_uncached; + } + + index->parser = index_defs[a].parser; + if (n_v_col) { + index->assign_new_v_col(n_v_col); + } + /* Note the id of the transaction that created this + index, we use it to restrict readers from accessing + this index, to ensure read consistency. */ + ut_ad(index->trx_id == ctx->trx->id); + + /* If ADD INDEX with LOCK=NONE has been + requested, allocate a modification log. */ + if (index->type & DICT_FTS) { + DBUG_ASSERT(num_fts_index == 1); + DBUG_ASSERT(!fts_index); + DBUG_ASSERT(index->type == DICT_FTS); + fts_index = ctx->add_index[a]; + /* Fulltext indexes are not covered + by a modification log. */ + } else if (!ctx->online + || !user_table->is_readable() + || !user_table->space) { + /* No need to allocate a modification log. */ + DBUG_ASSERT(!index->online_log); + } else { + rw_lock_x_lock(&ctx->add_index[a]->lock); + + bool ok = row_log_allocate( + ctx->prebuilt->trx, + index, + NULL, true, NULL, NULL, + path, old_table, + ctx->allow_not_null); + + rw_lock_x_unlock(&index->lock); + + DBUG_EXECUTE_IF( + "innodb_OOM_prepare_add_index", + if (ok && a == 1) { + row_log_free( + index->online_log); + index->online_log = NULL; + ok = false; + }); + + if (!ok) { + error = DB_OUT_OF_MEMORY; + goto error_handling_drop_uncached; + } + } + } + } else if (ctx->is_instant() + && !info.row_size_is_acceptable(*user_table, true)) { + error = DB_TOO_BIG_RECORD; + goto error_handling; } - if (ctx->online) { + if (ctx->online && ctx->num_to_add_index) { /* Assign a consistent read view for row_merge_read_clustered_index(). 
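A condensed sketch (not part of the patch) of when this function allocates an online modification log for a newly created secondary index, based on the conditions visible in the hunk above:

	/* Sketch: a row log is only needed for genuine LOCK=NONE work. */
	const bool need_row_log = ctx->online
		&& user_table->is_readable()
		&& user_table->space		/* tablespace not discarded */
		&& !(index->type & DICT_FTS);	/* FTS has no online log */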
*/ - trx_assign_read_view(ctx->prebuilt->trx); + ctx->prebuilt->trx->read_view.open(ctx->prebuilt->trx); } if (fts_index) { @@ -5034,8 +5906,8 @@ op_ok: ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_X)); DICT_TF2_FLAG_SET(ctx->new_table, DICT_TF2_FTS); - if (new_clustered) { - /* For !new_clustered, this will be set at + if (ctx->need_rebuild()) { + /* For !ctx->need_rebuild(), this will be set at commit_cache_norebuild(). */ ctx->new_table->fts_doc_id_index = dict_table_get_index_on_name( @@ -5043,10 +5915,8 @@ op_ok: DBUG_ASSERT(ctx->new_table->fts_doc_id_index != NULL); } - /* This function will commit the transaction and reset - the trx_t::dict_operation flag on success. */ - - error = fts_create_index_tables(ctx->trx, fts_index); + error = fts_create_index_tables(ctx->trx, fts_index, + ctx->new_table->id); DBUG_EXECUTE_IF("innodb_test_fail_after_fts_index_table", error = DB_LOCK_WAIT_TIMEOUT; @@ -5056,13 +5926,13 @@ op_ok: goto error_handling; } + trx_commit(ctx->trx); trx_start_for_ddl(ctx->trx, op); if (!ctx->new_table->fts || ib_vector_size(ctx->new_table->fts->indexes) == 0) { error = fts_create_common_tables( - ctx->trx, ctx->new_table, - user_table->name.m_name, TRUE); + ctx->trx, ctx->new_table, true); DBUG_EXECUTE_IF( "innodb_test_fail_after_fts_common_table", @@ -5132,6 +6002,11 @@ error_handling: error_handled: ctx->prebuilt->trx->error_info = NULL; + + if (!ctx->trx) { + goto err_exit; + } + ctx->trx->error_state = DB_SUCCESS; if (!dict_locked) { @@ -5192,9 +6067,11 @@ err_exit: ctx->drop_index[i]->to_be_dropped = 0; } - row_mysql_unlock_data_dictionary(ctx->trx); + if (ctx->trx) { + row_mysql_unlock_data_dictionary(ctx->trx); - trx_free_for_mysql(ctx->trx); + ctx->trx->free(); + } trx_commit_for_mysql(ctx->prebuilt->trx); for (uint i = 0; i < ctx->num_to_add_fk; i++) { @@ -5448,9 +6325,9 @@ ha_innobase::prepare_inplace_alter_table( NULL, srv_file_per_table); - info.set_tablespace_type(indexed_table->space != TRX_SYS_SPACE); + info.set_tablespace_type(indexed_table->space != fil_system.sys_space); - if (ha_alter_info->handler_flags & Alter_inplace_info::ADD_INDEX) { + if (ha_alter_info->handler_flags & ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX) { if (info.gcols_in_fulltext_or_spatial()) { goto err_exit_no_heap; } @@ -5461,9 +6338,7 @@ ha_innobase::prepare_inplace_alter_table( if (indexed_table->corrupted) { /* Handled below */ } else { - FilSpace space(indexed_table->space, true); - - if (space()) { + if (const fil_space_t* space = indexed_table->space) { String str; const char* engine= table_type(); @@ -5475,7 +6350,7 @@ ha_innobase::prepare_inplace_alter_table( " used key_id is not available. " " Can't continue reading table.", table_share->table_name.str, - space()->chain.start->name); + space->chain.start->name); my_error(ER_GET_ERRMSG, MYF(0), HA_ERR_DECRYPTION_FAILED, str.c_ptr(), engine); DBUG_RETURN(true); @@ -5533,7 +6408,7 @@ err_exit_no_heap: /* Prohibit renaming a column to something that the table already contains. */ if (ha_alter_info->handler_flags - & Alter_inplace_info::ALTER_COLUMN_NAME) { + & ALTER_COLUMN_NAME) { List_iterator_fast<Create_field> cf_it( ha_alter_info->alter_info->create_list); @@ -5547,7 +6422,7 @@ err_exit_no_heap: cf_it.rewind(); while (Create_field* cf = cf_it++) { if (cf->field == *fp) { - name = cf->field_name; + name = cf->field_name.str; goto check_if_ok_to_rename; } } @@ -5557,7 +6432,7 @@ check_if_ok_to_rename: /* Prohibit renaming a column from FTS_DOC_ID if full-text indexes exist. 
*/ if (!my_strcasecmp(system_charset_info, - (*fp)->field_name, + (*fp)->field_name.str, FTS_DOC_ID_COL_NAME) && innobase_fulltext_exist(altered_table)) { my_error(ER_INNODB_FT_WRONG_DOCID_COLUMN, @@ -5679,11 +6554,12 @@ check_if_ok_to_rename: n_drop_fk = 0; if (ha_alter_info->handler_flags - & (INNOBASE_ALTER_NOREBUILD | INNOBASE_ALTER_REBUILD)) { + & (INNOBASE_ALTER_NOREBUILD | INNOBASE_ALTER_REBUILD + | INNOBASE_ALTER_INSTANT)) { heap = mem_heap_create(1024); if (ha_alter_info->handler_flags - & Alter_inplace_info::ALTER_COLUMN_NAME) { + & ALTER_COLUMN_NAME) { col_names = innobase_get_col_names( ha_alter_info, altered_table, table, indexed_table, heap); @@ -5696,7 +6572,7 @@ check_if_ok_to_rename: } if (ha_alter_info->handler_flags - & Alter_inplace_info::DROP_FOREIGN_KEY) { + & ALTER_DROP_FOREIGN_KEY) { DBUG_ASSERT(ha_alter_info->alter_info->drop_list.elements > 0); drop_fk = static_cast<dict_foreign_t**>( @@ -5761,9 +6637,9 @@ dup_fk: dict_index_t* drop_primary = NULL; DBUG_ASSERT(ha_alter_info->handler_flags - & (Alter_inplace_info::DROP_INDEX - | Alter_inplace_info::DROP_UNIQUE_INDEX - | Alter_inplace_info::DROP_PK_INDEX)); + & (ALTER_DROP_NON_UNIQUE_NON_PRIM_INDEX + | ALTER_DROP_UNIQUE_INDEX + | ALTER_DROP_PK_INDEX)); /* Check which indexes to drop. */ drop_index = static_cast<dict_index_t**>( mem_heap_alloc( @@ -5775,7 +6651,7 @@ dup_fk: = ha_alter_info->index_drop_buffer[i]; dict_index_t* index = dict_table_get_index_on_name( - indexed_table, key->name); + indexed_table, key->name.str); if (!index) { push_warning_printf( @@ -5783,7 +6659,7 @@ dup_fk: Sql_condition::WARN_LEVEL_WARN, HA_ERR_WRONG_INDEX, "InnoDB could not find key" - " with name %s", key->name); + " with name %s", key->name.str); } else { ut_ad(!index->to_be_dropped); if (!index->is_primary()) { @@ -5817,7 +6693,7 @@ dup_fk: if (!my_strcasecmp( system_charset_info, FTS_DOC_ID_INDEX_NAME, - table->key_info[i].name)) { + table->key_info[i].name.str)) { /* The index exists in the MySQL data dictionary. Do not drop it, even though it is no longer needed @@ -5883,7 +6759,7 @@ check_if_can_drop_indexes: /* Check if any of the existing indexes are marked as corruption and if they are, refuse adding more indexes. 
*/ - if (ha_alter_info->handler_flags & Alter_inplace_info::ADD_INDEX) { + if (ha_alter_info->handler_flags & ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX) { for (dict_index_t* index = dict_table_get_first_index(indexed_table); index != NULL; index = dict_table_get_next_index(index)) { @@ -5901,7 +6777,7 @@ check_if_can_drop_indexes: n_add_fk = 0; if (ha_alter_info->handler_flags - & Alter_inplace_info::ADD_FOREIGN_KEY) { + & ALTER_ADD_FOREIGN_KEY) { ut_ad(!m_prebuilt->trx->check_foreigns); alter_fill_stored_column(altered_table, m_prebuilt->table, @@ -5951,10 +6827,15 @@ err_exit: } } + const ha_table_option_struct& alt_opt= + *ha_alter_info->create_info->option_struct; + if (!(ha_alter_info->handler_flags & INNOBASE_ALTER_DATA) - || ((ha_alter_info->handler_flags & ~INNOBASE_INPLACE_IGNORE) - == Alter_inplace_info::CHANGE_CREATE_OPTION - && !innobase_need_rebuild(ha_alter_info, table))) { + || ((ha_alter_info->handler_flags & ~(INNOBASE_INPLACE_IGNORE + | INNOBASE_ALTER_NOCREATE + | INNOBASE_ALTER_INSTANT)) + == ALTER_OPTIONS + && !alter_options_need_rebuild(ha_alter_info, table))) { if (heap) { ha_alter_info->handler_ctx @@ -5966,11 +6847,15 @@ err_exit: add_fk, n_add_fk, ha_alter_info->online, heap, indexed_table, - col_names, ULINT_UNDEFINED, 0, 0, 0); + col_names, ULINT_UNDEFINED, 0, 0, + (ha_alter_info->ignore + || !thd_is_strict_mode(m_user_thd)), + alt_opt.page_compressed, + alt_opt.page_compression_level); } DBUG_ASSERT(m_prebuilt->trx->dict_operation_lock_mode == 0); - if (ha_alter_info->handler_flags & ~INNOBASE_INPLACE_IGNORE) { + if (ha_alter_info->handler_flags & ~(INNOBASE_INPLACE_IGNORE)) { online_retry_drop_indexes( m_prebuilt->table, m_user_thd); @@ -5978,14 +6863,13 @@ err_exit: } if ((ha_alter_info->handler_flags - & Alter_inplace_info::DROP_VIRTUAL_COLUMN) - && prepare_inplace_drop_virtual( - ha_alter_info, altered_table, table)) { + & ALTER_DROP_VIRTUAL_COLUMN) + && prepare_inplace_drop_virtual(ha_alter_info, table)) { DBUG_RETURN(true); } if ((ha_alter_info->handler_flags - & Alter_inplace_info::ADD_VIRTUAL_COLUMN) + & ALTER_ADD_VIRTUAL_COLUMN) && prepare_inplace_add_virtual( ha_alter_info, altered_table, table)) { DBUG_RETURN(true); @@ -6009,12 +6893,6 @@ err_exit: add_fts_doc_id = true; add_fts_doc_id_idx = true; - push_warning_printf( - m_user_thd, - Sql_condition::WARN_LEVEL_WARN, - HA_ERR_WRONG_INDEX, - "InnoDB rebuilding table to add" - " column " FTS_DOC_ID_COL_NAME); } else if (fts_doc_col_no == ULINT_UNDEFINED) { goto err_exit; } @@ -6034,9 +6912,9 @@ err_exit: doc_col_no == fts_doc_col_no || doc_col_no == ULINT_UNDEFINED || (ha_alter_info->handler_flags - & (Alter_inplace_info::ALTER_STORED_COLUMN_ORDER - | Alter_inplace_info::DROP_STORED_COLUMN - | Alter_inplace_info::ADD_STORED_BASE_COLUMN))); + & (ALTER_STORED_COLUMN_ORDER + | ALTER_DROP_STORED_COLUMN + | ALTER_ADD_STORED_BASE_COLUMN))); } } @@ -6059,7 +6937,7 @@ err_exit: /* This is an added column. 
*/ DBUG_ASSERT(!new_field->field); DBUG_ASSERT(ha_alter_info->handler_flags - & Alter_inplace_info::ADD_COLUMN); + & ALTER_ADD_COLUMN); field = altered_table->field[i]; @@ -6098,7 +6976,9 @@ found_col: heap, m_prebuilt->table, col_names, add_autoinc_col_no, ha_alter_info->create_info->auto_increment_value, - autoinc_col_max_value, 0); + autoinc_col_max_value, + ha_alter_info->ignore || !thd_is_strict_mode(m_user_thd), + alt_opt.page_compressed, alt_opt.page_compression_level); DBUG_RETURN(prepare_inplace_alter_table_dict( ha_alter_info, altered_table, table, @@ -6190,7 +7070,7 @@ get_error_key_name( } else if (ha_alter_info->key_count == 0) { return(dict_table_get_first_index(table)->name); } else { - return(ha_alter_info->key_info_buffer[error_key_num].name); + return(ha_alter_info->key_info_buffer[error_key_num].name.str); } } @@ -6234,9 +7114,11 @@ ok_exit: DBUG_RETURN(false); } - if ((ha_alter_info->handler_flags & ~INNOBASE_INPLACE_IGNORE) - == Alter_inplace_info::CHANGE_CREATE_OPTION - && !innobase_need_rebuild(ha_alter_info, table)) { + if ((ha_alter_info->handler_flags & ~(INNOBASE_INPLACE_IGNORE + | INNOBASE_ALTER_NOCREATE + | INNOBASE_ALTER_INSTANT)) + == ALTER_OPTIONS + && !alter_options_need_rebuild(ha_alter_info, table)) { goto ok_exit; } @@ -6248,6 +7130,8 @@ ok_exit: DBUG_ASSERT(ctx->trx); DBUG_ASSERT(ctx->prebuilt == m_prebuilt); + if (ctx->is_instant()) goto ok_exit; + dict_index_t* pk = dict_table_get_first_index(m_prebuilt->table); ut_ad(pk != NULL); @@ -6257,8 +7141,7 @@ ok_exit: ctx->m_stage = UT_NEW_NOKEY(ut_stage_alter_t(pk)); - if (!m_prebuilt->table->is_readable() - || dict_table_is_discarded(m_prebuilt->table)) { + if (!m_prebuilt->table->is_readable()) { goto all_done; } @@ -6273,7 +7156,7 @@ ok_exit: rebuild_templ = ctx->need_rebuild() || ((ha_alter_info->handler_flags - & Alter_inplace_info::ALTER_COLUMN_EQUAL_PACK_LENGTH) + & ALTER_COLUMN_EQUAL_PACK_LENGTH) && alter_templ_needs_rebuild( altered_table, ha_alter_info, ctx->new_table)); @@ -6327,9 +7210,9 @@ ok_exit: m_prebuilt->table, ctx->new_table, ctx->online, ctx->add_index, ctx->add_key_numbers, ctx->num_to_add_index, - altered_table, ctx->add_cols, ctx->col_map, + altered_table, ctx->defaults, ctx->col_map, ctx->add_autoinc, ctx->sequence, ctx->skip_pk_sort, - ctx->m_stage, add_v, eval_table); + ctx->m_stage, add_v, eval_table, ctx->allow_not_null); #ifndef DBUG_OFF oom: @@ -6338,7 +7221,7 @@ oom: DEBUG_SYNC_C("row_log_table_apply1_before"); error = row_log_table_apply( ctx->thr, m_prebuilt->table, altered_table, - ctx->m_stage); + ctx->m_stage, ctx->new_table); } /* Init online ddl status variables */ @@ -6489,7 +7372,7 @@ check_col_exists_in_indexes( const dict_col_t* idx_col = dict_index_get_nth_col(index, i); - if (is_v && dict_col_is_virtual(idx_col)) { + if (is_v && idx_col->is_virtual()) { const dict_v_col_t* v_col = reinterpret_cast< const dict_v_col_t*>(idx_col); if (v_col->v_pos == col_no) { @@ -6497,7 +7380,7 @@ check_col_exists_in_indexes( } } - if (!is_v && !dict_col_is_virtual(idx_col) + if (!is_v && !idx_col->is_virtual() && dict_col_get_no(idx_col) == col_no) { return(true); } @@ -6571,7 +7454,8 @@ rollback_inplace_alter_table( goto func_exit; } - trx_start_for_ddl(ctx->trx, TRX_DICT_OP_INDEX); + trx_start_for_ddl(ctx->trx, ctx->need_rebuild() + ? 
TRX_DICT_OP_TABLE : TRX_DICT_OP_INDEX); row_mysql_lock_data_dictionary(ctx->trx); if (ctx->need_rebuild()) { @@ -6614,7 +7498,7 @@ rollback_inplace_alter_table( } } else { DBUG_ASSERT(!(ha_alter_info->handler_flags - & Alter_inplace_info::ADD_PK_INDEX)); + & ALTER_ADD_PK_INDEX)); DBUG_ASSERT(ctx->new_table == prebuilt->table); innobase_rollback_sec_index( @@ -6628,7 +7512,8 @@ rollback_inplace_alter_table( trx_commit_for_mysql(ctx->trx); row_mysql_unlock_data_dictionary(ctx->trx); - trx_free_for_mysql(ctx->trx); + ctx->trx->free(); + ctx->trx = NULL; func_exit: #ifndef DBUG_OFF @@ -6748,27 +7633,23 @@ innobase_drop_foreign_try( } /** Rename a column in the data dictionary tables. -@param[in] user_table InnoDB table that was being altered -@param[in] trx data dictionary transaction -@param[in] table_name Table name in MySQL -@param[in] nth_col 0-based index of the column -@param[in] from old column name -@param[in] to new column name -@param[in] new_clustered whether the table has been rebuilt -@param[in] is_virtual whether it is a virtual column +@param[in] ctx ALTER TABLE context +@param[in,out] trx Data dictionary transaction +@param[in] table_name Table name in MySQL +@param[in] nth_col 0-based index of the column +@param[in] from old column name +@param[in] to new column name @retval true Failure @retval false Success */ static MY_ATTRIBUTE((nonnull, warn_unused_result)) bool innobase_rename_column_try( - const dict_table_t* user_table, - trx_t* trx, - const char* table_name, - ulint nth_col, - const char* from, - const char* to, - bool new_clustered, - bool is_virtual) + const ha_innobase_inplace_ctx& ctx, + trx_t* trx, + const char* table_name, + ulint nth_col, + const char* from, + const char* to) { pars_info_t* info; dberr_t error; @@ -6781,13 +7662,13 @@ innobase_rename_column_try( ut_ad(mutex_own(&dict_sys->mutex)); ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_X)); - if (new_clustered) { + if (ctx.need_rebuild()) { goto rename_foreign; } info = pars_info_create(); - pars_info_add_ull_literal(info, "tableid", user_table->id); + pars_info_add_ull_literal(info, "tableid", ctx.old_table->id); pars_info_add_int4_literal(info, "nth", nth_col); pars_info_add_str_literal(info, "new", to); @@ -6817,7 +7698,7 @@ err_exit: trx->op_info = "renaming column in SYS_FIELDS"; for (const dict_index_t* index = dict_table_get_first_index( - user_table); + ctx.old_table); index != NULL; index = dict_table_get_next_index(index)) { @@ -6892,7 +7773,7 @@ err_exit: } } - if (index == dict_table_get_first_index(user_table)) { + if (index == dict_table_get_first_index(ctx.old_table)) { clust_has_prefixes = has_prefixes; } } @@ -6903,8 +7784,8 @@ rename_foreign: std::set<dict_foreign_t*> fk_evict; bool foreign_modified; - for (dict_foreign_set::const_iterator it = user_table->foreign_set.begin(); - it != user_table->foreign_set.end(); + for (dict_foreign_set::const_iterator it = ctx.old_table->foreign_set.begin(); + it != ctx.old_table->foreign_set.end(); ++it) { dict_foreign_t* foreign = *it; @@ -6917,6 +7798,14 @@ rename_foreign: continue; } + /* Ignore the foreign key rename if fk info + is being dropped. 
*/ + if (innobase_dropping_foreign( + foreign, ctx.drop_fk, + ctx.num_to_drop_fk)) { + continue; + } + info = pars_info_create(); pars_info_add_str_literal(info, "id", foreign->id); @@ -6945,8 +7834,8 @@ rename_foreign: } for (dict_foreign_set::const_iterator it - = user_table->referenced_set.begin(); - it != user_table->referenced_set.end(); + = ctx.old_table->referenced_set.begin(); + it != ctx.old_table->referenced_set.end(); ++it) { foreign_modified = false; @@ -6986,7 +7875,8 @@ rename_foreign: } } - if (new_clustered) { + /* Reload the foreign key info for instant table too. */ + if (ctx.need_rebuild() || ctx.is_instant()) { std::for_each(fk_evict.begin(), fk_evict.end(), dict_foreign_remove_from_cache); } @@ -7019,7 +7909,7 @@ innobase_rename_columns_try( ulint num_v = 0; DBUG_ASSERT(ha_alter_info->handler_flags - & Alter_inplace_info::ALTER_COLUMN_NAME); + & ALTER_COLUMN_NAME); for (Field** fp = table->field; *fp; fp++, i++) { const bool is_virtual = !(*fp)->stored_in_db(); @@ -7037,12 +7927,10 @@ innobase_rename_columns_try( : i - num_v; if (innobase_rename_column_try( - ctx->old_table, trx, table_name, + *ctx, trx, table_name, col_n, - cf->field->field_name, - cf->field_name, - ctx->need_rebuild(), - is_virtual)) { + cf->field->field_name.str, + cf->field_name.str)) { return(true); } goto processed_field; @@ -7219,8 +8107,8 @@ innobase_rename_or_enlarge_columns_cache( dict_table_t* user_table) { if (!(ha_alter_info->handler_flags - & (Alter_inplace_info::ALTER_COLUMN_EQUAL_PACK_LENGTH - | Alter_inplace_info::ALTER_COLUMN_NAME))) { + & (ALTER_COLUMN_EQUAL_PACK_LENGTH + | ALTER_COLUMN_NAME))) { return; } @@ -7255,8 +8143,8 @@ innobase_rename_or_enlarge_columns_cache( if ((*fp)->flags & FIELD_IS_RENAMED) { dict_mem_table_col_rename( user_table, col_n, - cf->field->field_name, - cf->field_name, is_virtual); + cf->field->field_name.str, + cf->field_name.str, is_virtual); } break; @@ -7300,11 +8188,11 @@ commit_set_autoinc( btr_write_autoinc(dict_table_get_first_index(ctx->new_table), autoinc - 1, true); } else if ((ha_alter_info->handler_flags - & Alter_inplace_info::CHANGE_CREATE_OPTION) + & ALTER_CHANGE_CREATE_OPTION) && (ha_alter_info->create_info->used_fields & HA_CREATE_USED_AUTO)) { - if (dict_table_is_discarded(ctx->old_table)) { + if (!ctx->old_table->space) { my_error(ER_TABLESPACE_DISCARDED, MYF(0), old_table->s->table_name.str); DBUG_RETURN(true); @@ -7315,7 +8203,7 @@ commit_set_autoinc( const Field* ai = old_table->found_next_number_field; ut_ad(!strcmp(dict_table_get_col_name(ctx->old_table, innodb_col_no(ai)), - ai->field_name)); + ai->field_name.str)); ib_uint64_t autoinc = ha_alter_info->create_info->auto_increment_value; @@ -7584,6 +8472,152 @@ innobase_update_foreign_cache( DBUG_RETURN(err); } +/** Changes SYS_COLUMNS.PRTYPE for one column. 
+@param[in,out] trx transaction +@param[in] table_name table name +@param[in] tableid table ID as in SYS_TABLES +@param[in] pos column position +@param[in] prtype new precise type +@return boolean flag +@retval true on failure +@retval false on success */ +static +bool +vers_change_field_try( + trx_t* trx, + const char* table_name, + const table_id_t tableid, + const ulint pos, + const ulint prtype) +{ + DBUG_ENTER("vers_change_field_try"); + + pars_info_t* info = pars_info_create(); + + pars_info_add_int4_literal(info, "prtype", prtype); + pars_info_add_ull_literal(info,"tableid", tableid); + pars_info_add_int4_literal(info, "pos", pos); + + dberr_t error = que_eval_sql(info, + "PROCEDURE CHANGE_COLUMN_MTYPE () IS\n" + "BEGIN\n" + "UPDATE SYS_COLUMNS SET PRTYPE=:prtype\n" + "WHERE TABLE_ID=:tableid AND POS=:pos;\n" + "END;\n", + false, trx); + + if (error != DB_SUCCESS) { + my_error_innodb(error, table_name, 0); + trx->error_state = DB_SUCCESS; + trx->op_info = ""; + DBUG_RETURN(true); + } + + DBUG_RETURN(false); +} + +/** Changes fields WITH/WITHOUT SYSTEM VERSIONING property in SYS_COLUMNS. +@param[in] ha_alter_info alter info +@param[in] ctx alter inplace context +@param[in] trx transaction +@param[in] table old table +@return boolean flag +@retval true on failure +@retval false on success */ +static +bool +vers_change_fields_try( + const Alter_inplace_info* ha_alter_info, + const ha_innobase_inplace_ctx* ctx, + trx_t* trx, + const TABLE* table) +{ + DBUG_ENTER("vers_change_fields_try"); + + DBUG_ASSERT(ha_alter_info); + DBUG_ASSERT(ctx); + + List_iterator_fast<Create_field> it( + ha_alter_info->alter_info->create_list); + + while (const Create_field* create_field = it++) { + if (!create_field->field) { + continue; + } + if (create_field->versioning + == Column_definition::VERSIONING_NOT_SET) { + continue; + } + + const dict_table_t* new_table = ctx->new_table; + const uint pos = innodb_col_no(create_field->field); + const dict_col_t* col = dict_table_get_nth_col(new_table, pos); + + DBUG_ASSERT(!col->vers_sys_start()); + DBUG_ASSERT(!col->vers_sys_end()); + + ulint new_prtype + = create_field->versioning + == Column_definition::WITHOUT_VERSIONING + ? col->prtype & ~DATA_VERSIONED + : col->prtype | DATA_VERSIONED; + + if (vers_change_field_try(trx, table->s->table_name.str, + new_table->id, pos, + new_prtype)) { + DBUG_RETURN(true); + } + } + + DBUG_RETURN(false); +} + +/** Changes WITH/WITHOUT SYSTEM VERSIONING for fields +in the data dictionary cache. 
+@param ha_alter_info Data used during in-place alter +@param ctx In-place ALTER TABLE context +@param table MySQL table as it is before the ALTER operation */ +static +void +vers_change_fields_cache( + Alter_inplace_info* ha_alter_info, + const ha_innobase_inplace_ctx* ctx, + const TABLE* table) +{ + DBUG_ENTER("vers_change_fields_cache"); + + DBUG_ASSERT(ha_alter_info); + DBUG_ASSERT(ctx); + DBUG_ASSERT(ha_alter_info->handler_flags & ALTER_COLUMN_UNVERSIONED); + + List_iterator_fast<Create_field> it( + ha_alter_info->alter_info->create_list); + + while (const Create_field* create_field = it++) { + if (!create_field->field || create_field->field->vcol_info) { + continue; + } + dict_col_t* col = dict_table_get_nth_col( + ctx->new_table, innodb_col_no(create_field->field)); + + if (create_field->versioning + == Column_definition::WITHOUT_VERSIONING) { + + DBUG_ASSERT(!col->vers_sys_start()); + DBUG_ASSERT(!col->vers_sys_end()); + col->prtype &= ~DATA_VERSIONED; + } else if (create_field->versioning + == Column_definition::WITH_VERSIONING) { + + DBUG_ASSERT(!col->vers_sys_start()); + DBUG_ASSERT(!col->vers_sys_end()); + col->prtype |= DATA_VERSIONED; + } + } + + DBUG_VOID_RETURN; +} + /** Commit the changes made during prepare_inplace_alter_table() and inplace_alter_table() inside the data dictionary tables, when rebuilding the table. @@ -7614,7 +8648,7 @@ commit_try_rebuild( DBUG_ASSERT(ctx->need_rebuild()); DBUG_ASSERT(trx->dict_operation_lock_mode == RW_X_LATCH); DBUG_ASSERT(!(ha_alter_info->handler_flags - & Alter_inplace_info::DROP_FOREIGN_KEY) + & ALTER_DROP_FOREIGN_KEY) || ctx->num_to_drop_fk > 0); DBUG_ASSERT(ctx->num_to_drop_fk <= ha_alter_info->alter_info->drop_list.elements); @@ -7648,7 +8682,7 @@ commit_try_rebuild( } if ((ha_alter_info->handler_flags - & Alter_inplace_info::ALTER_COLUMN_NAME) + & ALTER_COLUMN_NAME) && innobase_rename_columns_try(ha_alter_info, ctx, old_table, trx, table_name)) { DBUG_RETURN(true); @@ -7658,7 +8692,7 @@ commit_try_rebuild( /* The new table must inherit the flag from the "parent" table. */ - if (dict_table_is_discarded(user_table)) { + if (!user_table->space) { rebuilt_table->file_unreadable = true; rebuilt_table->flags2 |= DICT_TF2_DISCARDED; } @@ -7710,8 +8744,7 @@ commit_cache_rebuild( DBUG_ENTER("commit_cache_rebuild"); DEBUG_SYNC_C("commit_cache_rebuild"); DBUG_ASSERT(ctx->need_rebuild()); - DBUG_ASSERT(dict_table_is_discarded(ctx->old_table) - == dict_table_is_discarded(ctx->new_table)); + DBUG_ASSERT(!ctx->old_table->space == !ctx->new_table->space); const char* old_name = mem_heap_strdup( ctx->heap, ctx->old_table->name.m_name); @@ -7755,7 +8788,7 @@ get_col_list_to_be_dropped( const dict_col_t* idx_col = dict_index_get_nth_col(index, col); - if (dict_col_is_virtual(idx_col)) { + if (idx_col->is_virtual()) { const dict_v_col_t* v_col = reinterpret_cast< const dict_v_col_t*>(idx_col); @@ -7769,6 +8802,96 @@ get_col_list_to_be_dropped( } } +/** Change PAGE_COMPRESSED to ON or change the PAGE_COMPRESSION_LEVEL. 
+@param[in] level PAGE_COMPRESSION_LEVEL +@param[in] table table before the change +@param[in,out] trx data dictionary transaction +@param[in] table_name table name in MariaDB +@return whether the operation succeeded */ +MY_ATTRIBUTE((nonnull, warn_unused_result)) +static +bool +innobase_page_compression_try( + uint level, + const dict_table_t* table, + trx_t* trx, + const char* table_name) +{ + DBUG_ENTER("innobase_page_compression_try"); + DBUG_ASSERT(level >= 1); + DBUG_ASSERT(level <= 9); + + unsigned flags = table->flags + & ~(0xFU << DICT_TF_POS_PAGE_COMPRESSION_LEVEL); + flags |= 1U << DICT_TF_POS_PAGE_COMPRESSION + | level << DICT_TF_POS_PAGE_COMPRESSION_LEVEL; + + if (table->flags == flags) { + DBUG_RETURN(false); + } + + pars_info_t* info = pars_info_create(); + + pars_info_add_ull_literal(info, "id", table->id); + pars_info_add_int4_literal(info, "type", + dict_tf_to_sys_tables_type(flags)); + + dberr_t error = que_eval_sql(info, + "PROCEDURE CHANGE_COMPRESSION () IS\n" + "BEGIN\n" + "UPDATE SYS_TABLES SET TYPE=:type\n" + "WHERE ID=:id;\n" + "END;\n", + false, trx); + + if (error != DB_SUCCESS) { + my_error_innodb(error, table_name, 0); + trx->error_state = DB_SUCCESS; + trx->op_info = ""; + DBUG_RETURN(true); + } + + DBUG_RETURN(false); +} + +static +void +dict_stats_try_drop_table(THD *thd, const table_name_t &name, + const LEX_CSTRING &table_name) +{ + char errstr[1024]; + if (dict_stats_drop_table(name.m_name, errstr, sizeof(errstr)) != DB_SUCCESS) + { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, ER_ALTER_INFO, + "Deleting persistent statistics" + " for table '%s' in InnoDB failed: %s", + table_name.str, + errstr); + } +} + +/** Evict the table from cache and reopen it. Drop outdated statistics. + @param thd mariadb THD entity + @param table innodb table + @param maria_table_name user-friendly table name for errors + @return newly opened table */ +static +dict_table_t* +innobase_reload_table(THD *thd, dict_table_t *table, + const LEX_CSTRING &table_name) +{ + char *tb_name= strdup(table->name.m_name); + dict_table_close(table, true, false); + dict_table_remove_from_cache(table); + table= dict_table_open_on_name(tb_name, TRUE, TRUE, + DICT_ERR_IGNORE_FK_NOKEY); + + /* Drop outdated table stats. */ + dict_stats_try_drop_table(thd, table->name, table_name); + free(tb_name); + return table; +} + /** Commit the changes made during prepare_inplace_alter_table() and inplace_alter_table() inside the data dictionary tables, when not rebuilding the table. @@ -7795,13 +8918,20 @@ commit_try_norebuild( DBUG_ASSERT(!ctx->need_rebuild()); DBUG_ASSERT(trx->dict_operation_lock_mode == RW_X_LATCH); DBUG_ASSERT(!(ha_alter_info->handler_flags - & Alter_inplace_info::DROP_FOREIGN_KEY) + & ALTER_DROP_FOREIGN_KEY) || ctx->num_to_drop_fk > 0); DBUG_ASSERT(ctx->num_to_drop_fk <= ha_alter_info->alter_info->drop_list.elements || ctx->num_to_drop_vcol == ha_alter_info->alter_info->drop_list.elements); + if (ctx->page_compression_level + && innobase_page_compression_try(ctx->page_compression_level, + ctx->new_table, trx, + table_name)) { + DBUG_RETURN(true); + } + for (ulint i = 0; i < ctx->num_to_add_index; i++) { dict_index_t* index = ctx->add_index[i]; DBUG_ASSERT(dict_index_get_online_status(index) @@ -7829,6 +8959,11 @@ commit_try_norebuild( DBUG_RETURN(true); } + if ((ha_alter_info->handler_flags & ALTER_COLUMN_UNVERSIONED) + && vers_change_fields_try(ha_alter_info, ctx, trx, old_table)) { + DBUG_RETURN(true); + } + dberr_t error; /* We altered the table in place. 
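
A self-contained sketch of the flag packing that innobase_page_compression_try() performs before updating SYS_TABLES.TYPE: clear the old 4-bit level, then set the PAGE_COMPRESSED bit and store the new level. The two bit positions below stand in for DICT_TF_POS_PAGE_COMPRESSION and DICT_TF_POS_PAGE_COMPRESSION_LEVEL and are assumptions, not the real values.

// Sketch only: bit positions are placeholders for the DICT_TF_POS_* constants.
#include <cassert>

static constexpr unsigned POS_PAGE_COMPRESSION       = 10;  // assumed position
static constexpr unsigned POS_PAGE_COMPRESSION_LEVEL = 11;  // assumed, 4-bit field

// Mirror of the flags computation shown in the hunk above.
static unsigned set_page_compressed(unsigned flags, unsigned level)
{
    assert(level >= 1 && level <= 9);          // PAGE_COMPRESSION_LEVEL range
    flags &= ~(0xFU << POS_PAGE_COMPRESSION_LEVEL);      // drop any old level
    flags |= 1U << POS_PAGE_COMPRESSION                  // PAGE_COMPRESSED=ON
          |  level << POS_PAGE_COMPRESSION_LEVEL;        // new level
    return flags;
}

int main()
{
    const unsigned flags = set_page_compressed(0, 6);
    assert(((flags >> POS_PAGE_COMPRESSION) & 1U) == 1U);
    assert(((flags >> POS_PAGE_COMPRESSION_LEVEL) & 0xFU) == 6U);
    return 0;
}
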
Mark the indexes as committed. */ @@ -7885,32 +9020,32 @@ commit_try_norebuild( } if ((ha_alter_info->handler_flags - & Alter_inplace_info::ALTER_COLUMN_NAME) + & ALTER_COLUMN_NAME) && innobase_rename_columns_try(ha_alter_info, ctx, old_table, trx, table_name)) { DBUG_RETURN(true); } if ((ha_alter_info->handler_flags - & Alter_inplace_info::ALTER_COLUMN_EQUAL_PACK_LENGTH) + & ALTER_COLUMN_EQUAL_PACK_LENGTH) && innobase_enlarge_columns_try(ha_alter_info, old_table, ctx->old_table, trx, table_name)) { DBUG_RETURN(true); } if ((ha_alter_info->handler_flags - & Alter_inplace_info::DROP_VIRTUAL_COLUMN) - && innobase_drop_virtual_try( - ha_alter_info, altered_table, old_table, - ctx->old_table, trx)) { + & ALTER_DROP_VIRTUAL_COLUMN) + && innobase_drop_virtual_try(ha_alter_info, ctx->old_table, trx)) { DBUG_RETURN(true); } if ((ha_alter_info->handler_flags - & Alter_inplace_info::ADD_VIRTUAL_COLUMN) - && innobase_add_virtual_try( - ha_alter_info, altered_table, old_table, - ctx->old_table, trx)) { + & ALTER_ADD_VIRTUAL_COLUMN) + && innobase_add_virtual_try(ha_alter_info, ctx->old_table, trx)) { + DBUG_RETURN(true); + } + + if (innobase_add_instant_try(ctx, altered_table, old_table, trx)) { DBUG_RETURN(true); } @@ -7919,24 +9054,76 @@ commit_try_norebuild( /** Commit the changes to the data dictionary cache after a successful commit_try_norebuild() call. -@param ctx In-place ALTER TABLE context +@param ha_alter_info algorithm=inplace context +@param ctx In-place ALTER TABLE context for the current partition @param table the TABLE before the ALTER -@param trx Data dictionary transaction object -(will be started and committed) +@param trx Data dictionary transaction +(will be started and committed, for DROP INDEX) @return whether all replacements were found for dropped indexes */ -inline MY_ATTRIBUTE((nonnull, warn_unused_result)) +inline MY_ATTRIBUTE((nonnull)) bool commit_cache_norebuild( /*===================*/ + Alter_inplace_info* ha_alter_info, ha_innobase_inplace_ctx*ctx, const TABLE* table, trx_t* trx) { DBUG_ENTER("commit_cache_norebuild"); - - bool found = true; - DBUG_ASSERT(!ctx->need_rebuild()); + DBUG_ASSERT(ctx->new_table->space != fil_system.temp_space); + DBUG_ASSERT(!ctx->new_table->is_temporary()); + + bool found = true; + + if (ctx->page_compression_level) { + DBUG_ASSERT(ctx->new_table->space != fil_system.sys_space); + ctx->new_table->flags &= + ~(0xFU << DICT_TF_POS_PAGE_COMPRESSION_LEVEL); + ctx->new_table->flags |= 1 << DICT_TF_POS_PAGE_COMPRESSION + | (ctx->page_compression_level + << DICT_TF_POS_PAGE_COMPRESSION_LEVEL); + + if (fil_space_t* space = ctx->new_table->space) { + bool update = !(space->flags + & FSP_FLAGS_MASK_PAGE_COMPRESSION); + mutex_enter(&fil_system.mutex); + space->flags = (~FSP_FLAGS_MASK_MEM_COMPRESSION_LEVEL + & (space->flags + | FSP_FLAGS_MASK_PAGE_COMPRESSION)) + | ctx->page_compression_level + << FSP_FLAGS_MEM_COMPRESSION_LEVEL; + mutex_exit(&fil_system.mutex); + + if (update) { + /* Maybe we should introduce an undo + log record for updating tablespace + flags, and perform the update already + in innobase_page_compression_try(). + + If the server is killed before the + following mini-transaction commit + becomes durable, fsp_flags_try_adjust() + will perform the equivalent adjustment + and warn "adjusting FSP_SPACE_FLAGS". 
*/ + mtr_t mtr; + mtr.start(); + if (buf_block_t* b = buf_page_get( + page_id_t(space->id, 0), + page_size_t(space->flags), + RW_X_LATCH, &mtr)) { + mtr.set_named_space(space); + mlog_write_ulint( + FSP_HEADER_OFFSET + + FSP_SPACE_FLAGS + b->frame, + space->flags + & ~FSP_FLAGS_MEM_MASK, + MLOG_4BYTES, &mtr); + } + mtr.commit(); + } + } + } col_set drop_list; col_set v_drop_list; @@ -8025,6 +9212,15 @@ commit_cache_norebuild( trx_commit_for_mysql(trx); } + if (!ctx->is_instant()) { + innobase_rename_or_enlarge_columns_cache( + ha_alter_info, table, ctx->new_table); + } + + if (ha_alter_info->handler_flags & ALTER_COLUMN_UNVERSIONED) { + vers_change_fields_cache(ha_alter_info, ctx, table); + } + ctx->new_table->fts_doc_id_index = ctx->new_table->fts ? dict_table_get_index_on_name( @@ -8032,7 +9228,6 @@ commit_cache_norebuild( : NULL; DBUG_ASSERT((ctx->new_table->fts == NULL) == (ctx->new_table->fts_doc_id_index == NULL)); - DBUG_RETURN(found); } @@ -8041,8 +9236,6 @@ Remove statistics for dropped indexes, add statistics for created indexes and rename statistics for renamed indexes. @param ha_alter_info Data used during in-place alter @param ctx In-place ALTER TABLE context -@param altered_table MySQL table that is being altered -@param table_name Table name in MySQL @param thd MySQL connection */ static @@ -8051,8 +9244,6 @@ alter_stats_norebuild( /*==================*/ Alter_inplace_info* ha_alter_info, ha_innobase_inplace_ctx* ctx, - TABLE* altered_table, - const char* table_name, THD* thd) { ulint i; @@ -8091,7 +9282,7 @@ alter_stats_norebuild( char errstr[1024]; if (dict_stats_drop_index( - ctx->new_table->name.m_name, key->name, + ctx->new_table->name.m_name, key->name.str, errstr, sizeof errstr) != DB_SUCCESS) { push_warning(thd, Sql_condition::WARN_LEVEL_WARN, @@ -8129,7 +9320,7 @@ alter_stats_rebuild( { DBUG_ENTER("alter_stats_rebuild"); - if (dict_table_is_discarded(table) + if (!table->space || !dict_stats_is_persistent_enabled(table)) { DBUG_VOID_RETURN; } @@ -8213,7 +9404,8 @@ static bool alter_rebuild_apply_log( dberr_t error = row_log_table_apply( ctx->thr, user_table, altered_table, static_cast<ha_innobase_inplace_ctx*>( - ha_alter_info->handler_ctx)->m_stage); + ha_alter_info->handler_ctx)->m_stage, + ctx->new_table); if (s_templ) { ut_ad(ctx->need_rebuild()); @@ -8350,9 +9542,9 @@ ha_innobase::commit_inplace_alter_table( /* If decryption failed for old table or new table fail here. */ if ((!ctx->old_table->is_readable() - && fil_space_get(ctx->old_table->space)) + && ctx->old_table->space) || (!ctx->new_table->is_readable() - && fil_space_get(ctx->new_table->space))) { + && ctx->new_table->space)) { String str; const char* engine= table_type(); get_error_message(HA_ERR_DECRYPTION_FAILED, &str); @@ -8506,8 +9698,7 @@ ha_innobase::commit_inplace_alter_table( = static_cast<ha_innobase_inplace_ctx*>(*pctx); DBUG_ASSERT(new_clustered == ctx->need_rebuild()); - if (ctx->need_rebuild() - && dict_table_is_discarded(ctx->old_table)) { + if (ctx->need_rebuild() && !ctx->old_table->space) { my_error(ER_TABLESPACE_DISCARDED, MYF(0), table->s->table_name.str); fail = true; @@ -8551,9 +9742,16 @@ ha_innobase::commit_inplace_alter_table( } /* Commit or roll back the changes to the data dictionary. 
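
The write above persists space->flags & ~FSP_FLAGS_MEM_MASK, so memory-only bits such as the in-memory compression level never reach FSP_SPACE_FLAGS on page 0. A tiny sketch of that split, with placeholder mask values rather than the real FSP_FLAGS_* definitions:

// Sketch only: mask and shift are placeholders for FSP_FLAGS_MEM_MASK and
// FSP_FLAGS_MEM_COMPRESSION_LEVEL.
#include <cassert>
#include <cstdint>

static constexpr unsigned      MEM_LEVEL_SHIFT = 26;                     // assumed
static constexpr std::uint32_t MEM_MASK        = 0xFU << MEM_LEVEL_SHIFT; // memory-only bits

// What would actually be written to the FSP_SPACE_FLAGS field on page 0.
static std::uint32_t persistent_flags(std::uint32_t mem_flags)
{
    return mem_flags & ~MEM_MASK;
}

int main()
{
    std::uint32_t flags = 0x21;                     // arbitrary persistent bits
    flags |= 6U << MEM_LEVEL_SHIFT;                 // in-memory compression level
    assert(persistent_flags(flags) == 0x21);        // the level is stripped on disk
    return 0;
}
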
*/ + DEBUG_SYNC(m_user_thd, "innodb_alter_inplace_before_commit"); if (fail) { trx_rollback_for_mysql(trx); + for (inplace_alter_handler_ctx** pctx = ctx_array; + *pctx; pctx++) { + ha_innobase_inplace_ctx* ctx + = static_cast<ha_innobase_inplace_ctx*>(*pctx); + ctx->rollback_instant(); + } } else if (!new_clustered) { trx_commit_for_mysql(trx); } else { @@ -8722,12 +9920,11 @@ foreign_fail: bool fk_fail = innobase_update_foreign_cache( ctx, m_user_thd) != DB_SUCCESS; - if (!commit_cache_norebuild(ctx, table, trx)) { + if (!commit_cache_norebuild(ha_alter_info, ctx, table, + trx)) { fk_fail = true; } - innobase_rename_or_enlarge_columns_cache( - ha_alter_info, table, ctx->new_table); if (fk_fail && m_prebuilt->trx->check_foreigns) { goto foreign_fail; } @@ -8770,7 +9967,7 @@ foreign_fail: row_mysql_unlock_data_dictionary(trx); if (trx != ctx0->trx) { - trx_free_for_mysql(trx); + trx->free(); } DBUG_RETURN(true); } @@ -8789,44 +9986,24 @@ foreign_fail: = static_cast<ha_innobase_inplace_ctx*>(*pctx); if (ctx->trx) { - trx_free_for_mysql(ctx->trx); + ctx->trx->free(); ctx->trx = NULL; } } - if (ctx0->num_to_drop_vcol || ctx0->num_to_add_vcol) { - /* FIXME: this workaround does not seem to work with - partitioned tables */ + if (ctx0->num_to_drop_vcol || ctx0->num_to_add_vcol + || (ctx0->new_table->n_v_cols && !new_clustered + && (ha_alter_info->alter_info->drop_list.elements + || ha_alter_info->alter_info->create_list.elements))) { DBUG_ASSERT(ctx0->old_table->get_ref_count() == 1); - trx_commit_for_mysql(m_prebuilt->trx); - char tb_name[NAME_LEN * 2 + 1 + 1]; - strcpy(tb_name, m_prebuilt->table->name.m_name); - dict_table_close(m_prebuilt->table, true, false); - dict_table_remove_from_cache(m_prebuilt->table); - m_prebuilt->table = dict_table_open_on_name( - tb_name, TRUE, TRUE, DICT_ERR_IGNORE_FK_NOKEY); - - /* Drop outdated table stats. */ - char errstr[1024]; - if (dict_stats_drop_table( - m_prebuilt->table->name.m_name, - errstr, sizeof(errstr)) - != DB_SUCCESS) { - push_warning_printf( - m_user_thd, - Sql_condition::WARN_LEVEL_WARN, - ER_ALTER_INFO, - "Deleting persistent statistics" - " for table '%s' in" - " InnoDB failed: %s", - table->s->table_name.str, - errstr); - } + m_prebuilt->table = innobase_reload_table(m_user_thd, + m_prebuilt->table, + table->s->table_name); row_mysql_unlock_data_dictionary(trx); - trx_free_for_mysql(trx); + trx->free(); MONITOR_ATOMIC_DEC(MONITOR_PENDING_ALTER_TABLE); DBUG_RETURN(false); } @@ -8883,8 +10060,6 @@ foreign_fail: old copy of the table (which was renamed to ctx->tmp_name). 
*/ - char errstr[1024]; - DBUG_ASSERT(0 == strcmp(ctx->old_table->name.m_name, ctx->tmp_name)); @@ -8893,20 +10068,9 @@ foreign_fail: DBUG_SET("+d,innodb_report_deadlock"); ); - if (dict_stats_drop_table( - ctx->new_table->name.m_name, - errstr, sizeof(errstr)) - != DB_SUCCESS) { - push_warning_printf( - m_user_thd, - Sql_condition::WARN_LEVEL_WARN, - ER_ALTER_INFO, - "Deleting persistent statistics" - " for rebuilt table '%s' in" - " InnoDB failed: %s", - table->s->table_name.str, - errstr); - } + dict_stats_try_drop_table(m_user_thd, + ctx->new_table->name, + table->s->table_name); DBUG_EXECUTE_IF( "ib_rename_index_fail3", @@ -8957,7 +10121,7 @@ foreign_fail: } row_mysql_unlock_data_dictionary(trx); - trx_free_for_mysql(trx); + trx->free(); /* TODO: The following code could be executed while allowing concurrent access to the table @@ -8985,9 +10149,7 @@ foreign_fail: (*pctx); DBUG_ASSERT(!ctx->need_rebuild()); - alter_stats_norebuild( - ha_alter_info, ctx, altered_table, - table->s->table_name.str, m_user_thd); + alter_stats_norebuild(ha_alter_info, ctx, m_user_thd); DBUG_INJECT_CRASH("ib_commit_inplace_crash", crash_inject_count++); } diff --git a/storage/innobase/handler/i_s.cc b/storage/innobase/handler/i_s.cc index 003760f80b1..d8cb0b67ab5 100644 --- a/storage/innobase/handler/i_s.cc +++ b/storage/innobase/handler/i_s.cc @@ -87,11 +87,6 @@ in i_s_page_type[] array */ #define I_S_PAGE_TYPE_BITS 4 -/* Check if we can hold all page types */ -#if I_S_PAGE_TYPE_LAST >= 1 << I_S_PAGE_TYPE_BITS -# error i_s_page_type[] is too large -#endif - /** Name string for File Page Types */ static buf_page_desc_t i_s_page_type[] = { {"ALLOCATED", FIL_PAGE_TYPE_ALLOCATED}, @@ -284,7 +279,7 @@ field_store_ulint( if (n != ULINT_UNDEFINED) { - ret = field->store(n, true); + ret = field->store(longlong(n), true); field->set_notnull(); } else { @@ -484,18 +479,7 @@ static ST_FIELD_INFO innodb_trx_fields_info[] = STRUCT_FLD(old_name, ""), STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, -#ifdef BTR_CUR_HASH_ADAPT -#define IDX_TRX_ADAPTIVE_HASH_LATCHED 20 - {STRUCT_FLD(field_name, "trx_adaptive_hash_latched"), - STRUCT_FLD(field_length, 1), - STRUCT_FLD(field_type, MYSQL_TYPE_LONG), - STRUCT_FLD(value, 0), - STRUCT_FLD(field_flags, 0), - STRUCT_FLD(old_name, ""), - STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, -#endif /* BTR_CUR_HASH_ADAPT */ - -#define IDX_TRX_READ_ONLY 20 + I_S_AHI +#define IDX_TRX_READ_ONLY 20 {STRUCT_FLD(field_name, "trx_is_read_only"), STRUCT_FLD(field_length, 1), STRUCT_FLD(field_type, MYSQL_TYPE_LONG), @@ -504,7 +488,7 @@ static ST_FIELD_INFO innodb_trx_fields_info[] = STRUCT_FLD(old_name, ""), STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, -#define IDX_TRX_AUTOCOMMIT_NON_LOCKING 21 + I_S_AHI +#define IDX_TRX_AUTOCOMMIT_NON_LOCKING 21 {STRUCT_FLD(field_name, "trx_autocommit_non_locking"), STRUCT_FLD(field_length, 1), STRUCT_FLD(field_type, MYSQL_TYPE_LONG), @@ -651,11 +635,6 @@ fill_innodb_trx_from_cache( OK(field_store_string(fields[IDX_TRX_LAST_FOREIGN_KEY_ERROR], row->trx_foreign_key_error)); -#ifdef BTR_CUR_HASH_ADAPT - /* trx_adaptive_hash_latched */ - OK(fields[IDX_TRX_ADAPTIVE_HASH_LATCHED]->store(0, true)); -#endif /* BTR_CUR_HASH_ADAPT */ - /* trx_is_read_only*/ OK(fields[IDX_TRX_READ_ONLY]->store( row->trx_is_read_only, true)); @@ -1198,7 +1177,7 @@ trx_i_s_common_fill_table( TABLE_LIST* tables, /*!< in/out: tables to fill */ Item* ) /*!< in: condition (not used) */ { - const char* table_name; + LEX_CSTRING table_name; int ret; trx_i_s_cache_t* cache; @@ -1218,7 +1197,7 @@ 
trx_i_s_common_fill_table( table_name = tables->schema_table_name; /* or table_name = tables->schema_table->table_name; */ - RETURN_IF_INNODB_NOT_STARTED(table_name); + RETURN_IF_INNODB_NOT_STARTED(table_name.str); /* update the cache */ trx_i_s_cache_start_write(cache); @@ -1227,7 +1206,7 @@ trx_i_s_common_fill_table( if (trx_i_s_cache_is_truncated(cache)) { - ib::warn() << "Data in " << table_name << " truncated due to" + ib::warn() << "Data in " << table_name.str << " truncated due to" " memory limit of " << TRX_I_S_MEM_LIMIT << " bytes"; } @@ -1235,7 +1214,7 @@ trx_i_s_common_fill_table( trx_i_s_cache_start_read(cache); - if (innobase_strcasecmp(table_name, "innodb_trx") == 0) { + if (innobase_strcasecmp(table_name.str, "innodb_trx") == 0) { if (fill_innodb_trx_from_cache( cache, thd, tables->table) != 0) { @@ -1243,7 +1222,7 @@ trx_i_s_common_fill_table( ret = 1; } - } else if (innobase_strcasecmp(table_name, "innodb_locks") == 0) { + } else if (innobase_strcasecmp(table_name.str, "innodb_locks") == 0) { if (fill_innodb_locks_from_cache( cache, thd, tables->table) != 0) { @@ -1251,7 +1230,7 @@ trx_i_s_common_fill_table( ret = 1; } - } else if (innobase_strcasecmp(table_name, "innodb_lock_waits") == 0) { + } else if (innobase_strcasecmp(table_name.str, "innodb_lock_waits") == 0) { if (fill_innodb_lock_waits_from_cache( cache, thd, tables->table) != 0) { @@ -1261,7 +1240,7 @@ trx_i_s_common_fill_table( } else { ib::error() << "trx_i_s_common_fill_table() was" - " called to fill unknown table: " << table_name << "." + " called to fill unknown table: " << table_name.str << "." " This function only knows how to fill" " innodb_trx, innodb_locks and" " innodb_lock_waits tables."; @@ -1365,7 +1344,7 @@ i_s_cmp_fill_low( DBUG_RETURN(0); } - RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name); + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); for (uint i = 0; i < PAGE_ZIP_SSIZE_MAX; i++) { page_zip_stat_t* zip_stat = &page_zip_stat[i]; @@ -1668,7 +1647,7 @@ i_s_cmp_per_index_fill_low( DBUG_RETURN(0); } - RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name); + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); /* Create a snapshot of the stats so we do not bump into lock order violations with dict_sys->mutex below. */ @@ -1689,7 +1668,7 @@ i_s_cmp_per_index_fill_low( char db_utf8[MAX_DB_UTF8_LEN]; char table_utf8[MAX_TABLE_UTF8_LEN]; - dict_fs2utf8(index->table_name, + dict_fs2utf8(index->table->name.m_name, db_utf8, sizeof(db_utf8), table_utf8, sizeof(table_utf8)); @@ -1991,7 +1970,7 @@ i_s_cmpmem_fill_low( DBUG_RETURN(0); } - RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name); + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); for (ulint i = 0; i < srv_buf_pool_instances; i++) { buf_pool_t* buf_pool; @@ -2889,7 +2868,7 @@ i_s_fts_deleted_generic_fill( DBUG_RETURN(0); } - RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name); + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); /* Prevent DROP of the internal tables for fulltext indexes. FIXME: acquire DDL-blocking MDL on the user table name! 
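
The table name now arrives as a LEX_CSTRING instead of a bare const char*, which is why the comparisons and log messages above gain a .str. A rough standalone stand-in for that pointer-plus-length pair; the real definition lives in the server headers, and the real code compares with innobase_strcasecmp rather than plain strcmp.

// Sketch only: lex_cstring_sketch mimics a pointer-plus-length string value.
#include <cassert>
#include <cstddef>
#include <cstring>

struct lex_cstring_sketch {
    const char* str;      // character data; the I_S code above only uses .str
    std::size_t length;   // length in bytes
};

// Counterpart of the innobase_strcasecmp(table_name.str, "innodb_trx") checks.
static bool is_innodb_trx(const lex_cstring_sketch& table_name)
{
    return std::strcmp(table_name.str, "innodb_trx") == 0;
}

int main()
{
    const lex_cstring_sketch name = { "innodb_trx", 10 };
    assert(is_innodb_trx(name));
    assert(name.length == std::strlen(name.str));
    return 0;
}
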
*/ @@ -2909,7 +2888,7 @@ i_s_fts_deleted_generic_fill( deleted = fts_doc_ids_create(); - trx = trx_allocate_for_background(); + trx = trx_create(); trx->op_info = "Select for FTS DELETE TABLE"; FTS_INIT_FTS_TABLE(&fts_table, @@ -2922,7 +2901,7 @@ i_s_fts_deleted_generic_fill( rw_lock_s_unlock(&dict_operation_lock); - trx_free_for_background(trx); + trx->free(); fields = table->field; @@ -3299,7 +3278,7 @@ i_s_fts_index_cache_fill( DBUG_RETURN(0); } - RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name); + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); /* Prevent DROP of the internal tables for fulltext indexes. FIXME: acquire DDL-blocking MDL on the user table name! */ @@ -3445,7 +3424,7 @@ i_s_fts_index_table_fill_selected( fts_result_cache_limit = 8192; ); - trx = trx_allocate_for_background(); + trx = trx_create(); trx->op_info = "fetching FTS index nodes"; @@ -3502,7 +3481,7 @@ i_s_fts_index_table_fill_selected( que_graph_free(graph); mutex_exit(&dict_sys->mutex); - trx_free_for_background(trx); + trx->free(); if (fetch.total_memory >= fts_result_cache_limit) { error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT; @@ -3749,7 +3728,7 @@ i_s_fts_index_table_fill( DBUG_RETURN(0); } - RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name); + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); /* Prevent DROP of the internal tables for fulltext indexes. FIXME: acquire DDL-blocking MDL on the user table name! */ @@ -3912,7 +3891,7 @@ i_s_fts_config_fill( DBUG_RETURN(0); } - RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name); + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); /* Prevent DROP of the internal tables for fulltext indexes. FIXME: acquire DDL-blocking MDL on the user table name! */ @@ -3934,7 +3913,7 @@ no_fts: fields = table->field; - trx = trx_allocate_for_background(); + trx = trx_create(); trx->op_info = "Select for FTS CONFIG TABLE"; FTS_INIT_FTS_TABLE(&fts_table, "CONFIG", FTS_COMMON_TABLE, user_table); @@ -3989,7 +3968,7 @@ no_fts: rw_lock_s_unlock(&dict_operation_lock); - trx_free_for_background(trx); + trx->free(); DBUG_RETURN(ret); } @@ -4506,7 +4485,7 @@ i_s_innodb_buffer_stats_fill_table( buf_pool_info_t* pool_info; DBUG_ENTER("i_s_innodb_buffer_fill_general"); - RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name); + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); /* Only allow the PROCESS privilege holder to access the stats */ if (check_global_access(thd, PROCESS_ACL)) { @@ -4814,6 +4793,8 @@ i_s_innodb_buffer_page_fill( TABLE* table; Field** fields; + compile_time_assert(I_S_PAGE_TYPE_LAST < 1 << I_S_PAGE_TYPE_BITS); + DBUG_ENTER("i_s_innodb_buffer_page_fill"); table = tables->table; @@ -4886,8 +4867,8 @@ i_s_innodb_buffer_page_fill( if (index) { table_name_end = innobase_convert_name( table_name, sizeof(table_name), - index->table_name, - strlen(index->table_name), + index->table->name.m_name, + strlen(index->table->name.m_name), thd); ret = fields[IDX_BUFFER_PAGE_TABLE_NAME] @@ -4924,10 +4905,7 @@ i_s_innodb_buffer_page_fill( page_info->zip_ssize ? 
(UNIV_ZIP_SIZE_MIN >> 1) << page_info->zip_ssize : 0, true)); - -#if BUF_PAGE_STATE_BITS > 3 -# error "BUF_PAGE_STATE_BITS > 3, please ensure that all 1<<BUF_PAGE_STATE_BITS values are checked for" -#endif + compile_time_assert(BUF_PAGE_STATE_BITS == 3); state = static_cast<enum buf_page_state>(page_info->page_state); switch (state) { @@ -5011,13 +4989,15 @@ i_s_innodb_set_page_type( in the i_s_page_type[] array is I_S_PAGE_TYPE_INDEX (1) for index pages or I_S_PAGE_TYPE_IBUF for change buffer index pages */ - if (page_info->index_id - == static_cast<index_id_t>(DICT_IBUF_ID_MIN - + IBUF_SPACE_ID)) { - page_info->page_type = I_S_PAGE_TYPE_IBUF; - } else if (page_type == FIL_PAGE_RTREE) { + if (page_type == FIL_PAGE_RTREE) { page_info->page_type = I_S_PAGE_TYPE_RTREE; + } else if (page_info->index_id + == static_cast<index_id_t>(DICT_IBUF_ID_MIN + + IBUF_SPACE_ID)) { + page_info->page_type = I_S_PAGE_TYPE_IBUF; } else { + ut_ad(page_type == FIL_PAGE_INDEX + || page_type == FIL_PAGE_TYPE_INSTANT); page_info->page_type = I_S_PAGE_TYPE_INDEX; } @@ -5241,7 +5221,7 @@ i_s_innodb_buffer_page_fill_table( DBUG_ENTER("i_s_innodb_buffer_page_fill_table"); - RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name); + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); /* deny access to user without PROCESS privilege */ if (check_global_access(thd, PROCESS_ACL)) { @@ -5610,8 +5590,8 @@ i_s_innodb_buf_page_lru_fill( if (index) { table_name_end = innobase_convert_name( table_name, sizeof(table_name), - index->table_name, - strlen(index->table_name), + index->table->name.m_name, + strlen(index->table->name.m_name), thd); ret = fields[IDX_BUF_LRU_PAGE_TABLE_NAME] @@ -5792,7 +5772,7 @@ i_s_innodb_buf_page_lru_fill_table( DBUG_ENTER("i_s_innodb_buf_page_lru_fill_table"); - RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name); + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); /* deny access to any users that do not hold PROCESS_ACL */ if (check_global_access(thd, PROCESS_ACL)) { @@ -5890,12 +5870,8 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_buffer_page_lru = /*******************************************************************//** Unbind a dynamic INFORMATION_SCHEMA table. 
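
Several hunks above replace detached #if ... #error range checks with compile_time_assert() placed next to the code that depends on the invariant. The same idea in portable C++ using static_assert; the two constant values below are assumed for illustration only.

// Sketch only: standard static_assert stands in for the server's
// compile_time_assert() macro.
#include <cstdio>

static constexpr int PAGE_TYPE_BITS = 4;   // mirrors I_S_PAGE_TYPE_BITS
static constexpr int PAGE_TYPE_LAST = 14;  // assumed number of page type names

static int fill_page_type_field(int page_type)
{
    // The invariant is checked where it matters, next to the code using it.
    static_assert(PAGE_TYPE_LAST < (1 << PAGE_TYPE_BITS),
                  "page type does not fit in the bit field");
    return page_type & ((1 << PAGE_TYPE_BITS) - 1);
}

int main()
{
    std::printf("%d\n", fill_page_type_field(3));
    return 0;
}
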
-@return 0 on success */ -static -int -i_s_common_deinit( -/*==============*/ - void* p) /*!< in/out: table schema object */ +@return 0 */ +static int i_s_common_deinit(void*) { DBUG_ENTER("i_s_common_deinit"); @@ -5953,16 +5929,7 @@ static ST_FIELD_INFO innodb_sys_tables_fields_info[] = STRUCT_FLD(old_name, ""), STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, -#define SYS_TABLES_FILE_FORMAT 5 - {STRUCT_FLD(field_name, "FILE_FORMAT"), - STRUCT_FLD(field_length, 10), - STRUCT_FLD(field_type, MYSQL_TYPE_STRING), - STRUCT_FLD(value, 0), - STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL), - STRUCT_FLD(old_name, ""), - STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, - -#define SYS_TABLES_ROW_FORMAT 6 +#define SYS_TABLES_ROW_FORMAT 5 {STRUCT_FLD(field_name, "ROW_FORMAT"), STRUCT_FLD(field_length, 12), STRUCT_FLD(field_type, MYSQL_TYPE_STRING), @@ -5971,7 +5938,7 @@ static ST_FIELD_INFO innodb_sys_tables_fields_info[] = STRUCT_FLD(old_name, ""), STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, -#define SYS_TABLES_ZIP_PAGE_SIZE 7 +#define SYS_TABLES_ZIP_PAGE_SIZE 6 {STRUCT_FLD(field_name, "ZIP_PAGE_SIZE"), STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), STRUCT_FLD(field_type, MYSQL_TYPE_LONG), @@ -5980,7 +5947,7 @@ static ST_FIELD_INFO innodb_sys_tables_fields_info[] = STRUCT_FLD(old_name, ""), STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, -#define SYS_TABLES_SPACE_TYPE 8 +#define SYS_TABLES_SPACE_TYPE 7 {STRUCT_FLD(field_name, "SPACE_TYPE"), STRUCT_FLD(field_length, 10), STRUCT_FLD(field_type, MYSQL_TYPE_STRING), @@ -6009,11 +5976,8 @@ i_s_dict_fill_sys_tables( ulint atomic_blobs = DICT_TF_HAS_ATOMIC_BLOBS( table->flags); const page_size_t& page_size = dict_tf_get_page_size(table->flags); - const char* file_format; const char* row_format; - const char* space_type; - file_format = trx_sys_file_format_id_to_name(atomic_blobs); if (!compact) { row_format = "Redundant"; } else if (!atomic_blobs) { @@ -6024,12 +5988,6 @@ i_s_dict_fill_sys_tables( row_format = "Dynamic"; } - if (is_system_tablespace(table->space)) { - space_type = "System"; - } else { - space_type = "Single"; - } - DBUG_ENTER("i_s_dict_fill_sys_tables"); fields = table_to_fill->field; @@ -6042,9 +6000,7 @@ i_s_dict_fill_sys_tables( OK(fields[SYS_TABLES_NUM_COLUMN]->store(table->n_cols)); - OK(fields[SYS_TABLES_SPACE]->store(table->space)); - - OK(field_store_string(fields[SYS_TABLES_FILE_FORMAT], file_format)); + OK(fields[SYS_TABLES_SPACE]->store(table->space_id, true)); OK(field_store_string(fields[SYS_TABLES_ROW_FORMAT], row_format)); @@ -6053,7 +6009,8 @@ i_s_dict_fill_sys_tables( ? page_size.physical() : 0, true)); - OK(field_store_string(fields[SYS_TABLES_SPACE_TYPE], space_type)); + OK(field_store_string(fields[SYS_TABLES_SPACE_TYPE], + table->space_id ? 
"Single" : "System")); OK(schema_table_store_record(thd, table_to_fill)); @@ -6077,7 +6034,7 @@ i_s_sys_tables_fill_table( mtr_t mtr; DBUG_ENTER("i_s_sys_tables_fill_table"); - RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name); + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); /* deny access to user without PROCESS_ACL privilege */ if (check_global_access(thd, PROCESS_ACL)) { @@ -6097,23 +6054,19 @@ i_s_sys_tables_fill_table( /* Create and populate a dict_table_t structure with information from SYS_TABLES row */ err_msg = dict_process_sys_tables_rec_and_mtr_commit( - heap, rec, &table_rec, - DICT_TABLE_LOAD_FROM_RECORD, &mtr); + heap, rec, &table_rec, false, &mtr); mutex_exit(&dict_sys->mutex); if (!err_msg) { - i_s_dict_fill_sys_tables(thd, table_rec, tables->table); + i_s_dict_fill_sys_tables(thd, table_rec, + tables->table); } else { push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, ER_CANT_FIND_SYSTEM_REC, "%s", err_msg); } - /* Since dict_process_sys_tables_rec_and_mtr_commit() - is called with DICT_TABLE_LOAD_FROM_RECORD, the table_rec - is created in dict_process_sys_tables_rec(), we will - need to free it */ if (table_rec) { dict_mem_table_free(table_rec); } @@ -6382,7 +6335,7 @@ i_s_sys_tables_fill_table_stats( mtr_t mtr; DBUG_ENTER("i_s_sys_tables_fill_table_stats"); - RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name); + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); /* deny access to user without PROCESS_ACL privilege */ if (check_global_access(thd, PROCESS_ACL)) { @@ -6403,8 +6356,7 @@ i_s_sys_tables_fill_table_stats( /* Fetch the dict_table_t structure corresponding to this SYS_TABLES record */ err_msg = dict_process_sys_tables_rec_and_mtr_commit( - heap, rec, &table_rec, - DICT_TABLE_LOAD_FROM_CACHE, &mtr); + heap, rec, &table_rec, true, &mtr); ulint ref_count = table_rec ? 
table_rec->get_ref_count() : 0; mutex_exit(&dict_sys->mutex); @@ -6603,6 +6555,7 @@ i_s_dict_fill_sys_indexes( /*======================*/ THD* thd, /*!< in: thread */ table_id_t table_id, /*!< in: table id */ + ulint space_id, /*!< in: tablespace id */ dict_index_t* index, /*!< in: populated dict_index_t struct with index info */ TABLE* table_to_fill) /*!< in/out: fill this table */ @@ -6627,20 +6580,25 @@ i_s_dict_fill_sys_indexes( OK(fields[SYS_INDEX_TABLE_ID]->store(longlong(table_id), true)); - OK(fields[SYS_INDEX_TYPE]->store(index->type)); + OK(fields[SYS_INDEX_TYPE]->store(index->type, true)); OK(fields[SYS_INDEX_NUM_FIELDS]->store(index->n_fields)); /* FIL_NULL is ULINT32_UNDEFINED */ if (index->page == FIL_NULL) { - OK(fields[SYS_INDEX_PAGE_NO]->store(-1)); + fields[SYS_INDEX_PAGE_NO]->set_null(); } else { - OK(fields[SYS_INDEX_PAGE_NO]->store(index->page)); + OK(fields[SYS_INDEX_PAGE_NO]->store(index->page, true)); } - OK(fields[SYS_INDEX_SPACE]->store(index->space)); + if (space_id == ULINT_UNDEFINED) { + fields[SYS_INDEX_SPACE]->set_null(); + } else { + OK(fields[SYS_INDEX_SPACE]->store(space_id, true)); + } - OK(fields[SYS_INDEX_MERGE_THRESHOLD]->store(index->merge_threshold)); + OK(fields[SYS_INDEX_MERGE_THRESHOLD]->store(index->merge_threshold, + true)); OK(schema_table_store_record(thd, table_to_fill)); @@ -6664,7 +6622,7 @@ i_s_sys_indexes_fill_table( mtr_t mtr; DBUG_ENTER("i_s_sys_indexes_fill_table"); - RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name); + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); /* deny access to user without PROCESS_ACL privilege */ if (check_global_access(thd, PROCESS_ACL)) { @@ -6682,19 +6640,27 @@ i_s_sys_indexes_fill_table( while (rec) { const char* err_msg; table_id_t table_id; + ulint space_id; dict_index_t index_rec; /* Populate a dict_index_t structure with information from a SYS_INDEXES row */ err_msg = dict_process_sys_indexes_rec(heap, rec, &index_rec, &table_id); - + const byte* field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_INDEXES__SPACE, &space_id); + space_id = space_id == 4 ? 
mach_read_from_4(field) + : ULINT_UNDEFINED; mtr_commit(&mtr); mutex_exit(&dict_sys->mutex); if (!err_msg) { - i_s_dict_fill_sys_indexes(thd, table_id, &index_rec, - tables->table); + if (int err = i_s_dict_fill_sys_indexes( + thd, table_id, space_id, &index_rec, + tables->table)) { + mem_heap_free(heap); + DBUG_RETURN(err); + } } else { push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, ER_CANT_FIND_SYSTEM_REC, "%s", @@ -6873,7 +6839,7 @@ i_s_dict_fill_sys_columns( OK(field_store_string(fields[SYS_COLUMN_NAME], col_name)); - if (dict_col_is_virtual(column)) { + if (column->is_virtual()) { ulint pos = dict_create_v_col_pos(nth_v_col, column->ind); OK(fields[SYS_COLUMN_POSITION]->store(pos, true)); } else { @@ -6909,7 +6875,7 @@ i_s_sys_columns_fill_table( mtr_t mtr; DBUG_ENTER("i_s_sys_columns_fill_table"); - RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name); + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); /* deny access to user without PROCESS_ACL privilege */ if (check_global_access(thd, PROCESS_ACL)) { @@ -7116,18 +7082,16 @@ i_s_sys_virtual_fill_table( const rec_t* rec; ulint pos; ulint base_pos; - mem_heap_t* heap; mtr_t mtr; DBUG_ENTER("i_s_sys_virtual_fill_table"); - RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name); + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); /* deny access to user without PROCESS_ACL privilege */ if (check_global_access(thd, PROCESS_ACL)) { DBUG_RETURN(0); } - heap = mem_heap_create(1000); mutex_enter(&dict_sys->mutex); mtr_start(&mtr); @@ -7139,7 +7103,7 @@ i_s_sys_virtual_fill_table( /* populate a dict_col_t structure with information from a SYS_VIRTUAL row */ - err_msg = dict_process_sys_virtual_rec(heap, rec, + err_msg = dict_process_sys_virtual_rec(rec, &table_id, &pos, &base_pos); @@ -7155,8 +7119,6 @@ i_s_sys_virtual_fill_table( err_msg); } - mem_heap_empty(heap); - /* Get the next record */ mutex_enter(&dict_sys->mutex); mtr_start(&mtr); @@ -7165,7 +7127,6 @@ i_s_sys_virtual_fill_table( mtr_commit(&mtr); mutex_exit(&dict_sys->mutex); - mem_heap_free(heap); DBUG_RETURN(0); } @@ -7322,7 +7283,7 @@ i_s_sys_fields_fill_table( mtr_t mtr; DBUG_ENTER("i_s_sys_fields_fill_table"); - RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name); + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); /* deny access to user without PROCESS_ACL privilege */ if (check_global_access(thd, PROCESS_ACL)) { @@ -7554,7 +7515,7 @@ i_s_sys_foreign_fill_table( mtr_t mtr; DBUG_ENTER("i_s_sys_foreign_fill_table"); - RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name); + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); /* deny access to user without PROCESS_ACL privilege */ if (check_global_access(thd, PROCESS_ACL)) { @@ -7769,7 +7730,7 @@ i_s_sys_foreign_cols_fill_table( mtr_t mtr; DBUG_ENTER("i_s_sys_foreign_cols_fill_table"); - RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name); + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); /* deny access to user without PROCESS_ACL privilege */ if (check_global_access(thd, PROCESS_ACL)) { @@ -7921,16 +7882,7 @@ static ST_FIELD_INFO innodb_sys_tablespaces_fields_info[] = STRUCT_FLD(old_name, ""), STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, -#define SYS_TABLESPACES_FILE_FORMAT 3 - {STRUCT_FLD(field_name, "FILE_FORMAT"), - STRUCT_FLD(field_length, 10), - STRUCT_FLD(field_type, MYSQL_TYPE_STRING), - STRUCT_FLD(value, 0), - STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL), - STRUCT_FLD(old_name, ""), - STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, - -#define 
SYS_TABLESPACES_ROW_FORMAT 4 +#define SYS_TABLESPACES_ROW_FORMAT 3 {STRUCT_FLD(field_name, "ROW_FORMAT"), STRUCT_FLD(field_length, 22), STRUCT_FLD(field_type, MYSQL_TYPE_STRING), @@ -7939,7 +7891,7 @@ static ST_FIELD_INFO innodb_sys_tablespaces_fields_info[] = STRUCT_FLD(old_name, ""), STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, -#define SYS_TABLESPACES_PAGE_SIZE 5 +#define SYS_TABLESPACES_PAGE_SIZE 4 {STRUCT_FLD(field_name, "PAGE_SIZE"), STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), STRUCT_FLD(field_type, MYSQL_TYPE_LONG), @@ -7948,7 +7900,7 @@ static ST_FIELD_INFO innodb_sys_tablespaces_fields_info[] = STRUCT_FLD(old_name, ""), STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, -#define SYS_TABLESPACES_ZIP_PAGE_SIZE 6 +#define SYS_TABLESPACES_ZIP_PAGE_SIZE 5 {STRUCT_FLD(field_name, "ZIP_PAGE_SIZE"), STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), STRUCT_FLD(field_type, MYSQL_TYPE_LONG), @@ -7957,7 +7909,7 @@ static ST_FIELD_INFO innodb_sys_tablespaces_fields_info[] = STRUCT_FLD(old_name, ""), STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, -#define SYS_TABLESPACES_SPACE_TYPE 7 +#define SYS_TABLESPACES_SPACE_TYPE 6 {STRUCT_FLD(field_name, "SPACE_TYPE"), STRUCT_FLD(field_length, 10), STRUCT_FLD(field_type, MYSQL_TYPE_STRING), @@ -7966,7 +7918,7 @@ static ST_FIELD_INFO innodb_sys_tablespaces_fields_info[] = STRUCT_FLD(old_name, ""), STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, -#define SYS_TABLESPACES_FS_BLOCK_SIZE 8 +#define SYS_TABLESPACES_FS_BLOCK_SIZE 7 {STRUCT_FLD(field_name, "FS_BLOCK_SIZE"), STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), STRUCT_FLD(field_type, MYSQL_TYPE_LONG), @@ -7975,7 +7927,7 @@ static ST_FIELD_INFO innodb_sys_tablespaces_fields_info[] = STRUCT_FLD(old_name, ""), STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, -#define SYS_TABLESPACES_FILE_SIZE 9 +#define SYS_TABLESPACES_FILE_SIZE 8 {STRUCT_FLD(field_name, "FILE_SIZE"), STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), @@ -7984,7 +7936,7 @@ static ST_FIELD_INFO innodb_sys_tablespaces_fields_info[] = STRUCT_FLD(old_name, ""), STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, -#define SYS_TABLESPACES_ALLOC_SIZE 10 +#define SYS_TABLESPACES_ALLOC_SIZE 9 {STRUCT_FLD(field_name, "ALLOCATED_SIZE"), STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), @@ -8013,12 +7965,10 @@ i_s_dict_fill_sys_tablespaces( { Field** fields; ulint atomic_blobs = FSP_FLAGS_HAS_ATOMIC_BLOBS(flags); - const char* file_format; const char* row_format; DBUG_ENTER("i_s_dict_fill_sys_tablespaces"); - file_format = trx_sys_file_format_id_to_name(atomic_blobs); if (is_system_tablespace(space)) { row_format = "Compact, Redundant or Dynamic"; } else if (FSP_FLAGS_GET_ZIP_SSIZE(flags)) { @@ -8037,9 +7987,6 @@ i_s_dict_fill_sys_tablespaces( OK(fields[SYS_TABLESPACES_FLAGS]->store(flags, true)); - OK(field_store_string(fields[SYS_TABLESPACES_FILE_FORMAT], - file_format)); - OK(field_store_string(fields[SYS_TABLESPACES_ROW_FORMAT], row_format)); OK(field_store_string(fields[SYS_TABLESPACES_SPACE_TYPE], @@ -8100,7 +8047,7 @@ i_s_dict_fill_sys_tablespaces( } file_done: - fil_space_release(s); + s->release(); } if (file.m_total_size == static_cast<os_offset_t>(~0)) { @@ -8139,7 +8086,7 @@ i_s_sys_tablespaces_fill_table( mtr_t mtr; DBUG_ENTER("i_s_sys_tablespaces_fill_table"); - RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name); + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); /* deny access to user without PROCESS_ACL privilege */ if 
(check_global_access(thd, PROCESS_ACL)) { @@ -8330,7 +8277,7 @@ i_s_sys_datafiles_fill_table( mtr_t mtr; DBUG_ENTER("i_s_sys_datafiles_fill_table"); - RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name); + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); /* deny access to user without PROCESS_ACL privilege */ if (check_global_access(thd, PROCESS_ACL)) { @@ -8625,31 +8572,31 @@ i_s_tablespaces_encryption_fill_table( Item* ) /*!< in: condition (not used) */ { DBUG_ENTER("i_s_tablespaces_encryption_fill_table"); - RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name); + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); /* deny access to user without PROCESS_ACL privilege */ if (check_global_access(thd, PROCESS_ACL)) { DBUG_RETURN(0); } - mutex_enter(&fil_system->mutex); + mutex_enter(&fil_system.mutex); - for (fil_space_t* space = UT_LIST_GET_FIRST(fil_system->space_list); + for (fil_space_t* space = UT_LIST_GET_FIRST(fil_system.space_list); space; space = UT_LIST_GET_NEXT(space_list, space)) { - if (space->purpose == FIL_TYPE_TABLESPACE) { - space->n_pending_ops++; - mutex_exit(&fil_system->mutex); + if (space->purpose == FIL_TYPE_TABLESPACE + && space->acquire()) { + mutex_exit(&fil_system.mutex); if (int err = i_s_dict_fill_tablespaces_encryption( thd, space, tables->table)) { - fil_space_release(space); + space->release(); DBUG_RETURN(err); } - mutex_enter(&fil_system->mutex); - space->n_pending_ops--; + mutex_enter(&fil_system.mutex); + space->release(); } } - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); DBUG_RETURN(0); } /*******************************************************************//** @@ -8887,31 +8834,31 @@ i_s_tablespaces_scrubbing_fill_table( Item* ) /*!< in: condition (not used) */ { DBUG_ENTER("i_s_tablespaces_scrubbing_fill_table"); - RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name); + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); /* deny access to user without SUPER_ACL privilege */ if (check_global_access(thd, SUPER_ACL)) { DBUG_RETURN(0); } - mutex_enter(&fil_system->mutex); + mutex_enter(&fil_system.mutex); - for (fil_space_t* space = UT_LIST_GET_FIRST(fil_system->space_list); + for (fil_space_t* space = UT_LIST_GET_FIRST(fil_system.space_list); space; space = UT_LIST_GET_NEXT(space_list, space)) { - if (space->purpose == FIL_TYPE_TABLESPACE) { - space->n_pending_ops++; - mutex_exit(&fil_system->mutex); + if (space->purpose == FIL_TYPE_TABLESPACE + && space->acquire()) { + mutex_exit(&fil_system.mutex); if (int err = i_s_dict_fill_tablespaces_scrubbing( thd, space, tables->table)) { - fil_space_release(space); + space->release(); DBUG_RETURN(err); } - mutex_enter(&fil_system->mutex); - space->n_pending_ops--; + mutex_enter(&fil_system.mutex); + space->release(); } } - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); DBUG_RETURN(0); } /*******************************************************************//** @@ -9043,7 +8990,7 @@ i_s_innodb_mutexes_fill_table( Field** fields = tables->table->field; DBUG_ENTER("i_s_innodb_mutexes_fill_table"); - RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name); + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); /* deny access to user without PROCESS_ACL privilege */ if (check_global_access(thd, PROCESS_ACL)) { @@ -9385,25 +9332,7 @@ static ST_FIELD_INFO innodb_sys_semaphore_waits_fields_info[] = STRUCT_FLD(old_name, ""), STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, - // SYS_SEMAPHORE_WAITS_LAST_READER_FILE 17 - {STRUCT_FLD(field_name, 
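
In the tablespace iteration hunks above, the manual space->n_pending_ops++/-- bookkeeping under fil_system.mutex is replaced by space->acquire() and space->release(), where acquire() can fail (note the && space->acquire() condition). A simplified, self-contained sketch of that reference-counting pattern; the real fil_space_t carries much more state than this toy object.

// Sketch only: a toy tablespace with an acquire()/release() pair.
#include <atomic>
#include <cassert>

struct space_sketch {
    std::atomic<unsigned> pending{0};   // like n_pending_ops, hidden behind methods
    bool stopping = false;              // stands in for "tablespace is going away"

    // Take a reference, or refuse if the tablespace is being removed.
    bool acquire()
    {
        if (stopping) {
            return false;
        }
        pending.fetch_add(1, std::memory_order_relaxed);
        return true;
    }

    // Drop the reference taken by acquire().
    void release()
    {
        const unsigned prev = pending.fetch_sub(1, std::memory_order_relaxed);
        assert(prev > 0);
        (void)prev;     // keep -DNDEBUG builds warning-free
    }
};

int main()
{
    space_sketch s;
    if (s.acquire()) {  // mirrors: if (purpose == FIL_TYPE_TABLESPACE && space->acquire())
        /* ... fill one INFORMATION_SCHEMA row without the global mutex ... */
        s.release();
    }
    assert(s.pending.load() == 0);
    return 0;
}
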
"LAST_READER_FILE"), - STRUCT_FLD(field_length, OS_FILE_MAX_PATH), - STRUCT_FLD(field_type, MYSQL_TYPE_STRING), - STRUCT_FLD(value, 0), - STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL), - STRUCT_FLD(old_name, ""), - STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, - - // SYS_SEMAPHORE_WAITS_LAST_READER_LINE 18 - {STRUCT_FLD(field_name, "LAST_READER_LINE"), - STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), - STRUCT_FLD(field_type, MYSQL_TYPE_LONG), - STRUCT_FLD(value, 0), - STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), - STRUCT_FLD(old_name, ""), - STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, - - // SYS_SEMAPHORE_WAITS_LAST_WRITER_FILE 19 + // SYS_SEMAPHORE_WAITS_LAST_WRITER_FILE 17 {STRUCT_FLD(field_name, "LAST_WRITER_FILE"), STRUCT_FLD(field_length, OS_FILE_MAX_PATH), STRUCT_FLD(field_type, MYSQL_TYPE_STRING), @@ -9412,7 +9341,7 @@ static ST_FIELD_INFO innodb_sys_semaphore_waits_fields_info[] = STRUCT_FLD(old_name, ""), STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, - // SYS_SEMAPHORE_WAITS_LAST_WRITER_LINE 20 + // SYS_SEMAPHORE_WAITS_LAST_WRITER_LINE 18 {STRUCT_FLD(field_name, "LAST_WRITER_LINE"), STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), STRUCT_FLD(field_type, MYSQL_TYPE_LONG), @@ -9421,7 +9350,7 @@ static ST_FIELD_INFO innodb_sys_semaphore_waits_fields_info[] = STRUCT_FLD(old_name, ""), STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, - // SYS_SEMAPHORE_WAITS_OS_WAIT_COUNT 21 + // SYS_SEMAPHORE_WAITS_OS_WAIT_COUNT 19 {STRUCT_FLD(field_name, "OS_WAIT_COUNT"), STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), STRUCT_FLD(field_type, MYSQL_TYPE_LONG), diff --git a/storage/innobase/handler/i_s.h b/storage/innobase/handler/i_s.h index ed4165bdeeb..66404dc32b4 100644 --- a/storage/innobase/handler/i_s.h +++ b/storage/innobase/handler/i_s.h @@ -131,11 +131,9 @@ HPUX aCC: HP ANSI C++ B3910B A.03.65) can't handle it. */ #define SYS_SEMAPHORE_WAITS_READERS 14 #define SYS_SEMAPHORE_WAITS_WAITERS_FLAG 15 #define SYS_SEMAPHORE_WAITS_LOCK_WORD 16 -#define SYS_SEMAPHORE_WAITS_LAST_READER_FILE 17 -#define SYS_SEMAPHORE_WAITS_LAST_READER_LINE 18 -#define SYS_SEMAPHORE_WAITS_LAST_WRITER_FILE 19 -#define SYS_SEMAPHORE_WAITS_LAST_WRITER_LINE 20 -#define SYS_SEMAPHORE_WAITS_OS_WAIT_COUNT 21 +#define SYS_SEMAPHORE_WAITS_LAST_WRITER_FILE 17 +#define SYS_SEMAPHORE_WAITS_LAST_WRITER_LINE 18 +#define SYS_SEMAPHORE_WAITS_OS_WAIT_COUNT 19 /*******************************************************************//** Auxiliary function to store ulint value in MYSQL_TYPE_LONGLONG field. diff --git a/storage/innobase/ibuf/ibuf0ibuf.cc b/storage/innobase/ibuf/ibuf0ibuf.cc index 46649187ca0..4bebb7d1257 100644 --- a/storage/innobase/ibuf/ibuf0ibuf.cc +++ b/storage/innobase/ibuf/ibuf0ibuf.cc @@ -36,9 +36,6 @@ my_bool srv_ibuf_disable_background_merge; /** Number of bits describing a single page */ #define IBUF_BITS_PER_PAGE 4 -#if IBUF_BITS_PER_PAGE % 2 -# error "IBUF_BITS_PER_PAGE must be an even number!" -#endif /** The start address for an insert buffer bitmap page bitmap */ #define IBUF_BITMAP PAGE_DATA @@ -187,7 +184,7 @@ it uses synchronous aio, it can access any pages, as long as it obeys the access order rules. */ /** Operations that can currently be buffered. */ -ibuf_use_t ibuf_use = IBUF_USE_ALL; +ulong innodb_change_buffering; #if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG /** Dump the change buffer at startup */ @@ -230,9 +227,6 @@ type, counter, and some flags. 
*/ /* @{ */ #define IBUF_REC_INFO_SIZE 4 /*!< Combined size of info fields at the beginning of the fourth field */ -#if IBUF_REC_INFO_SIZE >= DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE -# error "IBUF_REC_INFO_SIZE >= DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE" -#endif /* Offsets for the fields at the beginning of the fourth field */ #define IBUF_REC_OFFSET_COUNTER 0 /*!< Operation counter */ @@ -369,7 +363,7 @@ ibuf_tree_root_get( ut_ad(ibuf_inside(mtr)); ut_ad(mutex_own(&ibuf_mutex)); - mtr_sx_lock(dict_index_get_lock(ibuf->index), mtr); + mtr_sx_lock_index(ibuf->index, mtr); /* only segment list access is exclusive each other */ block = buf_page_get( @@ -393,6 +387,10 @@ void ibuf_close(void) /*============*/ { + if (ibuf == NULL) { + return; + } + mutex_free(&ibuf_pessimistic_insert_mutex); mutex_free(&ibuf_mutex); @@ -422,7 +420,7 @@ ibuf_size_update( ibuf->free_list_len = flst_get_len(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST); - ibuf->height = 1 + btr_page_get_level_low(root); + ibuf->height = 1 + btr_page_get_level(root); /* the '1 +' is the ibuf header page */ ibuf->size = ibuf->seg_size - (1 + ibuf->free_list_len); @@ -449,7 +447,7 @@ ibuf_init_at_db_start(void) buffer pool size. Once ibuf struct is initialized this value is updated with the user supplied size by calling ibuf_max_size_update(). */ - ibuf->max_size = ((buf_pool_get_curr_size() / UNIV_PAGE_SIZE) + ibuf->max_size = ((buf_pool_get_curr_size() >> srv_page_size_shift) * CHANGE_BUFFER_DEFAULT_SIZE) / 100; mutex_create(LATCH_ID_IBUF, &ibuf_mutex); @@ -461,7 +459,9 @@ ibuf_init_at_db_start(void) mtr_start(&mtr); - mtr_x_lock_space(IBUF_SPACE_ID, &mtr); + compile_time_assert(IBUF_SPACE_ID == TRX_SYS_SPACE); + compile_time_assert(IBUF_SPACE_ID == 0); + mtr_x_lock_space(fil_system.sys_space, &mtr); mutex_enter(&ibuf_mutex); @@ -497,11 +497,11 @@ ibuf_init_at_db_start(void) mtr.commit(); ibuf->index = dict_mem_index_create( - "innodb_change_buffer", "CLUST_IND", - IBUF_SPACE_ID, DICT_CLUSTERED | DICT_IBUF, 1); + dict_mem_table_create("innodb_change_buffer", + fil_system.sys_space, 1, 0, 0, 0), + "CLUST_IND", + DICT_CLUSTERED | DICT_IBUF, 1); ibuf->index->id = DICT_IBUF_ID_MIN + IBUF_SPACE_ID; - ibuf->index->table = dict_mem_table_create( - "innodb_change_buffer", IBUF_SPACE_ID, 1, 0, 0, 0); ibuf->index->n_uniq = REC_MAX_N_FIELDS; rw_lock_create(index_tree_rw_lock_key, &ibuf->index->lock, SYNC_IBUF_INDEX_TREE); @@ -540,7 +540,7 @@ ibuf_max_size_update( ulint new_val) /*!< in: new value in terms of percentage of the buffer pool size */ { - ulint new_size = ((buf_pool_get_curr_size() / UNIV_PAGE_SIZE) + ulint new_size = ((buf_pool_get_curr_size() >> srv_page_size_shift) * new_val) / 100; mutex_enter(&ibuf_mutex); ibuf->max_size = new_size; @@ -563,6 +563,7 @@ ibuf_bitmap_page_init( fil_page_set_type(page, FIL_PAGE_IBUF_BITMAP); /* Write all zeros to the bitmap */ + compile_time_assert(!(IBUF_BITS_PER_PAGE % 2)); byte_offset = UT_BITS_IN_BYTES(block->page.size.physical() * IBUF_BITS_PER_PAGE); @@ -646,9 +647,7 @@ ibuf_bitmap_page_get_bits_low( ulint value; ut_ad(bit < IBUF_BITS_PER_PAGE); -#if IBUF_BITS_PER_PAGE % 2 -# error "IBUF_BITS_PER_PAGE % 2 != 0" -#endif + compile_time_assert(!(IBUF_BITS_PER_PAGE % 2)); ut_ad(mtr_memo_contains_page(mtr, page, latch_type)); bit_offset = (page_id.page_no() % page_size.physical()) @@ -657,7 +656,7 @@ ibuf_bitmap_page_get_bits_low( byte_offset = bit_offset / 8; bit_offset = bit_offset % 8; - ut_ad(byte_offset + IBUF_BITMAP < UNIV_PAGE_SIZE); + ut_ad(byte_offset + IBUF_BITMAP < srv_page_size); map_byte = 
mach_read_from_1(page + IBUF_BITMAP + byte_offset); @@ -694,9 +693,7 @@ ibuf_bitmap_page_set_bits( ulint map_byte; ut_ad(bit < IBUF_BITS_PER_PAGE); -#if IBUF_BITS_PER_PAGE % 2 -# error "IBUF_BITS_PER_PAGE % 2 != 0" -#endif + compile_time_assert(!(IBUF_BITS_PER_PAGE % 2)); ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX)); ut_ad(mtr->is_named_space(page_id.space())); @@ -706,7 +703,7 @@ ibuf_bitmap_page_set_bits( byte_offset = bit_offset / 8; bit_offset = bit_offset % 8; - ut_ad(byte_offset + IBUF_BITMAP < UNIV_PAGE_SIZE); + ut_ad(byte_offset + IBUF_BITMAP < srv_page_size); map_byte = mach_read_from_1(page + IBUF_BITMAP + byte_offset); @@ -862,7 +859,8 @@ ibuf_set_free_bits_func( } mtr_start(&mtr); - const fil_space_t* space = mtr.set_named_space(block->page.id.space()); + const fil_space_t* space = mtr.set_named_space_id( + block->page.id.space()); bitmap_page = ibuf_bitmap_get_map_page(block->page.id, block->page.size, &mtr); @@ -1105,7 +1103,8 @@ ibuf_page_low( return(FALSE); } - ut_ad(fil_space_get_type(IBUF_SPACE_ID) == FIL_TYPE_TABLESPACE); + compile_time_assert(IBUF_SPACE_ID == 0); + ut_ad(fil_system.sys_space->purpose == FIL_TYPE_TABLESPACE); #ifdef UNIV_DEBUG if (!x_latch) { @@ -1276,6 +1275,8 @@ ibuf_rec_get_info_func( types = rec_get_nth_field_old(rec, IBUF_REC_FIELD_METADATA, &len); info_len_local = len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE; + compile_time_assert(IBUF_REC_INFO_SIZE + < DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); switch (info_len_local) { case 0: @@ -1445,14 +1446,10 @@ ibuf_dummy_index_create( dict_table_t* table; dict_index_t* index; - table = dict_mem_table_create("IBUF_DUMMY", - DICT_HDR_SPACE, n, 0, + table = dict_mem_table_create("IBUF_DUMMY", NULL, n, 0, comp ? DICT_TF_COMPACT : 0, 0); - index = dict_mem_index_create("IBUF_DUMMY", "IBUF_DUMMY", - DICT_HDR_SPACE, 0, n); - - index->table = table; + index = dict_mem_index_create(table, "IBUF_DUMMY", 0, n); /* avoid ut_ad(index->cached) in dict_index_get_n_unique_in_tree */ index->cached = TRUE; @@ -1579,6 +1576,9 @@ ibuf_build_entry_from_ibuf_rec_func( ibuf_dummy_index_add_col(index, dfield_get_type(field), len); } + index->n_core_null_bytes + = UT_BITS_IN_BYTES(unsigned(index->n_nullable)); + /* Prevent an ut_ad() failure in page_zip_write_rec() by adding system columns to the dummy table pointed to by the dummy secondary index. The insert buffer is only used for @@ -1867,7 +1867,7 @@ ibuf_entry_build( field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_METADATA); - dfield_set_data(field, type_info, ti - type_info); + dfield_set_data(field, type_info, ulint(ti - type_info)); /* Set all the types in the new tuple binary */ @@ -1932,11 +1932,8 @@ ibuf_search_tuple_build( /*********************************************************************//** Checks if there are enough pages in the free list of the ibuf tree that we dare to start a pessimistic insert to the insert buffer. 
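
The bitmap accessors above locate the IBUF_BITS_PER_PAGE (4) bits that describe one page within a change-buffer bitmap page. A standalone sketch of that byte/bit addressing; the bitmap offset and the physical page size below are placeholders, and the bit order within a byte is chosen only for the sketch's own consistency.

// Sketch only: mirrors the offset arithmetic of the bitmap hunks above.
#include <cassert>
#include <cstdint>
#include <vector>

static constexpr unsigned BITS_PER_PAGE  = 4;      // IBUF_BITS_PER_PAGE from the patch
static constexpr unsigned BITMAP_OFFSET  = 38;     // placeholder for IBUF_BITMAP
static constexpr unsigned PHYS_PAGE_SIZE = 16384;  // placeholder physical page size

// Read n_bits change-buffer bits describing page_no from a bitmap page.
static unsigned bitmap_get_bits(const std::vector<std::uint8_t>& bitmap_page,
                                unsigned page_no, unsigned bit, unsigned n_bits)
{
    unsigned bit_offset  = (page_no % PHYS_PAGE_SIZE) * BITS_PER_PAGE + bit;
    unsigned byte_offset = bit_offset / 8;
    bit_offset %= 8;
    assert(bit_offset + n_bits <= 8);               // a page's bits fit in one byte
    assert(BITMAP_OFFSET + byte_offset < PHYS_PAGE_SIZE);
    const unsigned map_byte = bitmap_page[BITMAP_OFFSET + byte_offset];
    return (map_byte >> bit_offset) & ((1U << n_bits) - 1U);
}

int main()
{
    std::vector<std::uint8_t> bitmap_page(PHYS_PAGE_SIZE, 0);
    // Set bit 2 of the 4-bit group belonging to page 5.
    const unsigned pos = 5 * BITS_PER_PAGE + 2;
    bitmap_page[BITMAP_OFFSET + pos / 8] |= std::uint8_t(1U << (pos % 8));
    assert(bitmap_get_bits(bitmap_page, 5, 2, 1) == 1);
    assert(bitmap_get_bits(bitmap_page, 6, 2, 1) == 0);
    return 0;
}
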
-@return TRUE if enough free pages in list */ -UNIV_INLINE -ibool -ibuf_data_enough_free_for_insert(void) -/*==================================*/ +@return whether enough free pages in list */ +static inline bool ibuf_data_enough_free_for_insert() { ut_ad(mutex_own(&ibuf_mutex)); @@ -1980,11 +1977,9 @@ ibuf_add_free_page(void) page_t* bitmap_page; mtr_start(&mtr); - fil_space_t* space = mtr.set_sys_modified(); - /* Acquire the fsp latch before the ibuf header, obeying the latching order */ - mtr.x_lock_space(space, __FILE__, __LINE__); + mtr_x_lock_space(fil_system.sys_space, &mtr); header_page = ibuf_header_page_get(&mtr); /* Allocate a new page: NOTE that if the page has been a part of a @@ -2030,13 +2025,11 @@ ibuf_add_free_page(void) (level 2 page) */ const page_id_t page_id(IBUF_SPACE_ID, block->page.id.page_no()); - const page_size_t page_size(space->flags); - - bitmap_page = ibuf_bitmap_get_map_page(page_id, page_size, &mtr); + bitmap_page = ibuf_bitmap_get_map_page(page_id, univ_page_size, &mtr); mutex_exit(&ibuf_mutex); - ibuf_bitmap_page_set_bits(bitmap_page, page_id, page_size, + ibuf_bitmap_page_set_bits(bitmap_page, page_id, univ_page_size, IBUF_BITMAP_IBUF, TRUE, &mtr); ibuf_mtr_commit(&mtr); @@ -2062,13 +2055,10 @@ ibuf_remove_free_page(void) log_free_check(); mtr_start(&mtr); - fil_space_t* space = mtr.set_sys_modified(); - const page_size_t page_size(space->flags); - /* Acquire the fsp latch before the ibuf header, obeying the latching order */ - mtr.x_lock_space(space, __FILE__, __LINE__); + mtr_x_lock_space(fil_system.sys_space, &mtr); header_page = ibuf_header_page_get(&mtr); /* Prevent pessimistic inserts to insert buffer trees for a while */ @@ -2108,8 +2098,9 @@ ibuf_remove_free_page(void) the free list was so long that they cannot have taken the last page from it. */ + compile_time_assert(IBUF_SPACE_ID == 0); fseg_free_page(header_page + IBUF_HEADER + IBUF_TREE_SEG_HEADER, - IBUF_SPACE_ID, page_no, &mtr); + fil_system.sys_space, page_no, &mtr); const page_id_t page_id(IBUF_SPACE_ID, page_no); @@ -2147,12 +2138,12 @@ ibuf_remove_free_page(void) /* Set the bit indicating that this page is no more an ibuf tree page (level 2 page) */ - bitmap_page = ibuf_bitmap_get_map_page(page_id, page_size, &mtr); + bitmap_page = ibuf_bitmap_get_map_page(page_id, univ_page_size, &mtr); mutex_exit(&ibuf_mutex); ibuf_bitmap_page_set_bits( - bitmap_page, page_id, page_size, IBUF_BITMAP_IBUF, FALSE, + bitmap_page, page_id, univ_page_size, IBUF_BITMAP_IBUF, FALSE, &mtr); ut_d(buf_page_set_file_page_was_freed(page_id)); @@ -2342,7 +2333,7 @@ ibuf_get_merge_page_nos_func( && prev_space_id == first_space_id) || (volume_for_page > ((IBUF_MERGE_THRESHOLD - 1) - * 4 * UNIV_PAGE_SIZE + * 4U << srv_page_size_shift / IBUF_PAGE_SIZE_PER_FREE_SPACE) / IBUF_MERGE_THRESHOLD)) { @@ -2825,7 +2816,7 @@ ibuf_get_volume_buffered_count_func( types = rec_get_nth_field_old(rec, IBUF_REC_FIELD_METADATA, &len); - switch (UNIV_EXPECT(len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE, + switch (UNIV_EXPECT(int(len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE), IBUF_REC_INFO_SIZE)) { default: ut_error; @@ -2908,7 +2899,7 @@ get_volume_comp: Gets an upper limit for the combined size of entries buffered in the insert buffer for a given page. 
@return upper limit for the volume of buffered inserts for the index -page, in bytes; UNIV_PAGE_SIZE, if the entries for the index page span +page, in bytes; srv_page_size, if the entries for the index page span several pages in the insert buffer */ static ulint @@ -3009,7 +3000,7 @@ ibuf_get_volume_buffered( do not have the x-latch on it, and cannot acquire one because of the latching order: we have to give up */ - return(UNIV_PAGE_SIZE); + return(srv_page_size); } if (page_no != ibuf_rec_get_page_no(mtr, rec) @@ -3079,7 +3070,7 @@ count_later: /* We give up */ - return(UNIV_PAGE_SIZE); + return(srv_page_size); } if (page_no != ibuf_rec_get_page_no(mtr, rec) @@ -3323,6 +3314,7 @@ ibuf_insert_low( ut_ad(!dict_index_is_spatial(index)); ut_ad(dtuple_check_typed(entry)); ut_ad(!no_counter || op == IBUF_OP_INSERT); + ut_ad(page_id.space() == index->table->space_id); ut_a(op < IBUF_OP_COUNT); do_merge = FALSE; @@ -3442,7 +3434,7 @@ fail_exit: and done mtr_commit(&mtr) to release the latch. */ ibuf_mtr_start(&bitmap_mtr); - bitmap_mtr.set_named_space(page_id.space()); + index->set_modified(bitmap_mtr); bitmap_page = ibuf_bitmap_get_map_page(page_id, page_size, &bitmap_mtr); @@ -3626,9 +3618,9 @@ ibuf_insert( dberr_t err; ulint entry_size; ibool no_counter; - /* Read the settable global variable ibuf_use only once in + /* Read the settable global variable only once in this function, so that we will have a consistent view of it. */ - ibuf_use_t use = ibuf_use; + ibuf_use_t use = ibuf_use_t(innodb_change_buffering); DBUG_ENTER("ibuf_insert"); DBUG_PRINT("ibuf", ("op: %d, space: " UINT32PF ", page_no: " UINT32PF, @@ -3638,7 +3630,7 @@ ibuf_insert( ut_ad(page_id.space() != SRV_TMP_SPACE_ID); ut_a(!dict_index_is_clust(index)); - ut_ad(!dict_table_is_temporary(index->table)); + ut_ad(!index->table->is_temporary()); no_counter = use <= IBUF_USE_INSERT; @@ -3653,8 +3645,6 @@ ibuf_insert( case IBUF_USE_INSERT_DELETE_MARK: case IBUF_USE_ALL: goto check_watch; - case IBUF_USE_COUNT: - break; } break; case IBUF_OP_DELETE_MARK: @@ -3668,8 +3658,6 @@ ibuf_insert( case IBUF_USE_ALL: ut_ad(!no_counter); goto check_watch; - case IBUF_USE_COUNT: - break; } break; case IBUF_OP_DELETE: @@ -3683,8 +3671,6 @@ ibuf_insert( case IBUF_USE_ALL: ut_ad(!no_counter); goto skip_watch; - case IBUF_USE_COUNT: - break; } break; case IBUF_OP_COUNT: @@ -3920,7 +3906,7 @@ dump: row_ins_sec_index_entry_by_modify(BTR_MODIFY_LEAF). 
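The ibuf_insert() hunk above now snapshots the settable global innodb_change_buffering once per call so that the whole decision sees one consistent value. A minimal sketch of that pattern with simplified types; the std::atomic wrapper and the enumerator set here are assumptions, not the server's actual declarations:

#include <atomic>
#include <cstdio>

enum ibuf_use_t { IBUF_USE_NONE, IBUF_USE_INSERT, IBUF_USE_ALL };

/* Stand-in for the settable server global; other threads may change it. */
static std::atomic<unsigned> innodb_change_buffering(IBUF_USE_ALL);

static bool buffering_allowed_for_insert()
{
    /* Take one snapshot so every decision below sees the same value,
       even if the setting changes concurrently. */
    const ibuf_use_t use = ibuf_use_t(innodb_change_buffering.load());

    switch (use) {
    case IBUF_USE_NONE:
        return false;
    case IBUF_USE_INSERT:
    case IBUF_USE_ALL:
        return true;
    }
    return false;
}

int main()
{
    std::printf("change buffering for INSERT: %d\n",
                buffering_allowed_for_insert());
    return 0;
}
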
*/ ut_ad(rec_get_deleted_flag(rec, page_is_comp(page))); - offsets = rec_get_offsets(rec, index, NULL, true, + offsets = rec_get_offsets(rec, index, NULL, index->n_fields, ULINT_UNDEFINED, &heap); update = row_upd_build_sec_rec_difference_binary( rec, index, offsets, entry, heap); @@ -4093,7 +4079,8 @@ ibuf_delete( ut_ad(ibuf_inside(mtr)); ut_ad(dtuple_check_typed(entry)); - ut_ad(!dict_index_is_spatial(index)); + ut_ad(!index->is_spatial()); + ut_ad(!index->is_clust()); low_match = page_cur_search(block, index, entry, &page_cur); @@ -4112,8 +4099,8 @@ ibuf_delete( rec_offs_init(offsets_); - offsets = rec_get_offsets( - rec, index, offsets, true, ULINT_UNDEFINED, &heap); + offsets = rec_get_offsets(rec, index, offsets, index->n_fields, + ULINT_UNDEFINED, &heap); if (page_get_n_recs(page) <= 1 || !(REC_INFO_DELETED_FLAG @@ -4198,7 +4185,7 @@ ibuf_restore_pos( " ibuf record inserted to page " << space << ":" << page_no << " in file " << s->chain.start->name; - fil_space_release(s); + s->release(); ib::error() << BUG_REPORT_MSG; @@ -4333,7 +4320,7 @@ loop: &pcur, &mtr); if (!btr_pcur_is_on_user_rec(&pcur)) { - ut_ad(btr_pcur_is_after_last_on_page(&pcur)); + ut_ad(btr_pcur_is_after_last_in_tree(&pcur)); goto func_exit; } @@ -4442,8 +4429,7 @@ ibuf_merge_or_delete_for_page( if (!bitmap_bits) { /* No changes are buffered for this page. */ - - fil_space_release(space); + space->release(); if (UNIV_UNLIKELY(srv_shutdown_state) && !srv_fast_shutdown && (!block @@ -4503,12 +4489,8 @@ loop: ibuf->index, search_tuple, PAGE_CUR_GE, BTR_MODIFY_LEAF, &pcur, &mtr); - if (block != NULL) { - ibool success; - - mtr.set_named_space(space); - - success = buf_page_get_known_nowait( + if (block) { + ibool success = buf_page_get_known_nowait( RW_X_LATCH, block, BUF_KEEP_OLD, __FILE__, __LINE__, &mtr); @@ -4521,7 +4503,9 @@ loop: the block is io-fixed. Other threads must not try to latch an io-fixed block. */ buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE); - } else if (space) { + } + + if (space) { mtr.set_named_space(space); } @@ -4572,6 +4556,9 @@ loop: entry = ibuf_build_entry_from_ibuf_rec( &mtr, rec, heap, &dummy_index); + ut_ad(!dummy_index->table->space); + dummy_index->table->space = space; + dummy_index->table->space_id = space->id; ut_ad(page_validate(block->frame, dummy_index)); @@ -4584,8 +4571,8 @@ loop: volume += page_dir_calc_reserved_space(1); - ut_a(volume <= 4 * UNIV_PAGE_SIZE - / IBUF_PAGE_SIZE_PER_FREE_SPACE); + ut_a(volume <= (4U << srv_page_size_shift) + / IBUF_PAGE_SIZE_PER_FREE_SPACE); #endif ibuf_insert_to_index_page( entry, block, dummy_index, &mtr); @@ -4707,7 +4694,7 @@ reset_bit: ibuf_mtr_commit(&mtr); if (space) { - fil_space_release(space); + space->release(); } btr_pcur_close(&pcur); @@ -4852,25 +4839,15 @@ ibuf_print( mutex_exit(&ibuf_mutex); } -/******************************************************************//** -Checks the insert buffer bitmaps on IMPORT TABLESPACE. +/** Check the insert buffer bitmaps on IMPORT TABLESPACE. 
+@param[in] trx transaction +@param[in,out] space tablespace being imported @return DB_SUCCESS or error code */ -dberr_t -ibuf_check_bitmap_on_import( -/*========================*/ - const trx_t* trx, /*!< in: transaction */ - ulint space_id) /*!< in: tablespace identifier */ +dberr_t ibuf_check_bitmap_on_import(const trx_t* trx, fil_space_t* space) { ulint page_no; - - ut_ad(space_id); ut_ad(trx->mysql_thd); - - FilSpace space(space_id); - if (!space()) { - return(DB_TABLE_NOT_FOUND); - } - + ut_ad(space->purpose == FIL_TYPE_IMPORT); const page_size_t page_size(space->flags); /* fil_space_t::size and fil_space_t::free_limit would still be 0 at this point. So, we will have to read page 0. */ @@ -4880,7 +4857,7 @@ ibuf_check_bitmap_on_import( mtr_t mtr; ulint size; mtr.start(); - if (buf_block_t* sp = buf_page_get(page_id_t(space_id, 0), page_size, + if (buf_block_t* sp = buf_page_get(page_id_t(space->id, 0), page_size, RW_S_LATCH, &mtr)) { size = std::min( mach_read_from_4(FSP_HEADER_OFFSET + FSP_FREE_LIMIT @@ -4920,7 +4897,7 @@ ibuf_check_bitmap_on_import( ibuf_enter(&mtr); bitmap_page = ibuf_bitmap_get_map_page( - page_id_t(space_id, page_no), page_size, &mtr); + page_id_t(space->id, page_no), page_size, &mtr); if (!bitmap_page) { mutex_exit(&ibuf_mutex); @@ -4939,9 +4916,8 @@ ibuf_check_bitmap_on_import( curr_page < page_size.physical(); curr_page++) { buf_block_t* block = buf_page_get( - page_id_t(space_id, curr_page), - page_size, - RW_S_LATCH, &mtr); + page_id_t(space->id, curr_page), + page_size, RW_S_LATCH, &mtr); page_t* page = buf_block_get_frame(block); ut_ad(buf_is_zeroes(span<const byte>( page, page_size.physical()))); @@ -4958,7 +4934,7 @@ ibuf_check_bitmap_on_import( const ulint offset = page_no + i; - const page_id_t cur_page_id(space_id, offset); + const page_id_t cur_page_id(space->id, offset); if (ibuf_bitmap_page_get_bits( bitmap_page, cur_page_id, page_size, @@ -4971,12 +4947,10 @@ ibuf_check_bitmap_on_import( ib_errf(trx->mysql_thd, IB_LOG_LEVEL_ERROR, ER_INNODB_INDEX_CORRUPT, - "Space %u page %u" + "File %s page " ULINTPF " is wrongly flagged to belong to the" " insert buffer", - (unsigned) space_id, - (unsigned) offset); - + space->chain.start->name, offset); return(DB_CORRUPTION); } @@ -4988,9 +4962,9 @@ ibuf_check_bitmap_on_import( IB_LOG_LEVEL_WARN, ER_INNODB_INDEX_CORRUPT, "Buffered changes" - " for space %u page %u are lost", - (unsigned) space_id, - (unsigned) offset); + " for file %s page " ULINTPF + " are lost", + space->chain.start->name, offset); /* Tolerate this error, so that slightly corrupted tables can be @@ -5026,7 +5000,7 @@ ibuf_set_bitmap_for_bulk_load( free_val = ibuf_index_page_calc_free(block); mtr_start(&mtr); - mtr.set_named_space(block->page.id.space()); + mtr.set_named_space_id(block->page.id.space()); bitmap_page = ibuf_bitmap_get_map_page(block->page.id, block->page.size, &mtr); diff --git a/storage/innobase/include/btr0btr.h b/storage/innobase/include/btr0btr.h index 29ece955702..07ec5357822 100644 --- a/storage/innobase/include/btr0btr.h +++ b/storage/innobase/include/btr0btr.h @@ -35,9 +35,15 @@ Created 6/2/1994 Heikki Tuuri #include "btr0types.h" #include "gis0type.h" +#define BTR_MAX_NODE_LEVEL 50 /*!< Maximum B-tree page level + (not really a hard limit). 
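ibuf_check_bitmap_on_import() above now takes the already-resolved fil_space_t* instead of a tablespace id, so the callee no longer repeats the lookup or reports DB_TABLE_NOT_FOUND. A small standalone sketch of that interface change; Tablespace and the map are simplified stand-ins, not InnoDB types:

#include <cstdint>
#include <cstdio>
#include <map>
#include <string>

struct Tablespace {                 /* simplified stand-in for fil_space_t */
    uint32_t    id;
    std::string name;
};

static std::map<uint32_t, Tablespace> spaces;

/* Old shape: the callee repeats the id lookup and must handle
   "not found" itself (DB_TABLE_NOT_FOUND in the original). */
static bool check_bitmap_by_id(uint32_t space_id)
{
    std::map<uint32_t, Tablespace>::const_iterator it = spaces.find(space_id);
    if (it == spaces.end()) {
        return false;
    }
    std::printf("checking %s\n", it->second.name.c_str());
    return true;
}

/* New shape: the caller passes the tablespace it already resolved and
   pinned, so the lookup and its error path disappear here. */
static bool check_bitmap(const Tablespace& space)
{
    std::printf("checking %s\n", space.name.c_str());
    return true;
}

int main()
{
    Tablespace t;
    t.id   = 5;
    t.name = "test/t1.ibd";
    spaces[t.id] = t;

    check_bitmap_by_id(t.id);
    check_bitmap(spaces[t.id]);
    return 0;
}
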
+ Used in debug assertions + in btr_page_set_level and + btr_page_get_level */ + /** Maximum record size which can be stored on a page, without using the special big record storage structure */ -#define BTR_PAGE_MAX_REC_SIZE (UNIV_PAGE_SIZE / 2 - 200) +#define BTR_PAGE_MAX_REC_SIZE (srv_page_size / 2 - 200) /** @brief Maximum depth of a B-tree in InnoDB. @@ -149,23 +155,23 @@ free the pages of externally stored fields. */ record is in spatial index */ #define BTR_RTREE_DELETE_MARK 524288U -#define BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode) \ - ((latch_mode) & btr_latch_mode(~(BTR_INSERT \ - | BTR_DELETE_MARK \ - | BTR_RTREE_UNDO_INS \ - | BTR_RTREE_DELETE_MARK \ - | BTR_DELETE \ - | BTR_ESTIMATE \ - | BTR_IGNORE_SEC_UNIQUE \ - | BTR_ALREADY_S_LATCHED \ - | BTR_LATCH_FOR_INSERT \ - | BTR_LATCH_FOR_DELETE \ - | BTR_MODIFY_EXTERNAL))) - -#define BTR_LATCH_MODE_WITHOUT_INTENTION(latch_mode) \ - ((latch_mode) & btr_latch_mode(~(BTR_LATCH_FOR_INSERT \ - | BTR_LATCH_FOR_DELETE \ - | BTR_MODIFY_EXTERNAL))) +#define BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode) \ + ((latch_mode) & ulint(~(BTR_INSERT \ + | BTR_DELETE_MARK \ + | BTR_RTREE_UNDO_INS \ + | BTR_RTREE_DELETE_MARK \ + | BTR_DELETE \ + | BTR_ESTIMATE \ + | BTR_IGNORE_SEC_UNIQUE \ + | BTR_ALREADY_S_LATCHED \ + | BTR_LATCH_FOR_INSERT \ + | BTR_LATCH_FOR_DELETE \ + | BTR_MODIFY_EXTERNAL))) + +#define BTR_LATCH_MODE_WITHOUT_INTENTION(latch_mode) \ + ((latch_mode) & ulint(~(BTR_LATCH_FOR_INSERT \ + | BTR_LATCH_FOR_DELETE \ + | BTR_MODIFY_EXTERNAL))) /**************************************************************//** Report that an index page is corrupted. */ @@ -258,14 +264,22 @@ btr_page_get_index_id( MY_ATTRIBUTE((warn_unused_result)); /********************************************************//** Gets the node level field in an index page. +@param[in] page index page @return level, leaf level == 0 */ UNIV_INLINE ulint -btr_page_get_level_low( -/*===================*/ - const page_t* page) /*!< in: index page */ - MY_ATTRIBUTE((warn_unused_result)); -#define btr_page_get_level(page, mtr) btr_page_get_level_low(page) +btr_page_get_level(const page_t* page) +{ + ulint level; + + ut_ad(page); + + level = mach_read_from_2(page + PAGE_HEADER + PAGE_LEVEL); + + ut_ad(level <= BTR_MAX_NODE_LEVEL); + + return(level); +} MY_ATTRIBUTE((warn_unused_result)) /** Read FIL_PAGE_NEXT. @param page buffer pool page @@ -311,8 +325,7 @@ btr_node_ptr_get_child_page_no( /** Create the root node for a new index tree. @param[in] type type of the index -@param[in] space space where created -@param[in] page_size page size +@param[in,out] space tablespace where created @param[in] index_id index id @param[in] index index, or NULL when applying TRUNCATE log record during recovery @@ -323,8 +336,7 @@ record during recovery ulint btr_create( ulint type, - ulint space, - const page_size_t& page_size, + fil_space_t* space, index_id_t index_id, dict_index_t* index, const btr_create_t* btr_redo_create_info, @@ -627,6 +639,20 @@ btr_page_alloc( for x-latching and initializing the page */ MY_ATTRIBUTE((warn_unused_result)); +/** Empty an index page (possibly the root page). @see btr_page_create(). 
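btr_page_get_level() above becomes a plain inline read of the 16-bit PAGE_LEVEL field, with no mini-transaction argument. A self-contained sketch of that read; the offsets and the page size used here are assumptions for illustration:

#include <cassert>
#include <cstdint>

static const unsigned PAGE_HEADER        = 38;  /* assumed header offset */
static const unsigned PAGE_LEVEL         = 26;  /* assumed field offset */
static const unsigned BTR_MAX_NODE_LEVEL = 50;

static uint16_t mach_read_from_2(const unsigned char* b)
{
    return uint16_t((b[0] << 8) | b[1]);        /* big-endian on-disk format */
}

static unsigned page_get_level(const unsigned char* page)
{
    unsigned level = mach_read_from_2(page + PAGE_HEADER + PAGE_LEVEL);
    assert(level <= BTR_MAX_NODE_LEVEL);        /* debug check, as in the diff */
    return level;
}

int main()
{
    static unsigned char page[16384];           /* zero-filled "index page" */
    page[PAGE_HEADER + PAGE_LEVEL + 1] = 3;     /* store level 3; leaf == 0 */
    assert(page_get_level(page) == 3);
    return 0;
}
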
+@param[in,out] block page to be emptied +@param[in,out] page_zip compressed page frame, or NULL +@param[in] index index of the page +@param[in] level B-tree level of the page (0=leaf) +@param[in,out] mtr mini-transaction */ +void +btr_page_empty( + buf_block_t* block, + page_zip_des_t* page_zip, + dict_index_t* index, + ulint level, + mtr_t* mtr) + MY_ATTRIBUTE((nonnull(1, 3, 5))); /**************************************************************//** Creates a new index page (not the root, and also not used in page reorganization). @see btr_page_empty(). */ diff --git a/storage/innobase/include/btr0btr.ic b/storage/innobase/include/btr0btr.ic index 878e414a039..49567979c98 100644 --- a/storage/innobase/include/btr0btr.ic +++ b/storage/innobase/include/btr0btr.ic @@ -29,12 +29,6 @@ Created 6/2/1994 Heikki Tuuri #include "mtr0log.h" #include "page0zip.h" -#define BTR_MAX_NODE_LEVEL 50 /*!< Maximum B-tree page level - (not really a hard limit). - Used in debug assertions - in btr_page_set_level and - btr_page_get_level_low */ - /** Gets a buffer page and declares its latching order level. @param[in] page_id page id @param[in] mode latch mode @@ -114,26 +108,6 @@ btr_page_get_index_id( } /********************************************************//** -Gets the node level field in an index page. -@return level, leaf level == 0 */ -UNIV_INLINE -ulint -btr_page_get_level_low( -/*===================*/ - const page_t* page) /*!< in: index page */ -{ - ulint level; - - ut_ad(page); - - level = mach_read_from_2(page + PAGE_HEADER + PAGE_LEVEL); - - ut_ad(level <= BTR_MAX_NODE_LEVEL); - - return(level); -} - -/********************************************************//** Sets the node level field in an index page. */ UNIV_INLINE void diff --git a/storage/innobase/include/btr0bulk.h b/storage/innobase/include/btr0bulk.h index 911a2726422..7b2b5c7a5d7 100644 --- a/storage/innobase/include/btr0bulk.h +++ b/storage/innobase/include/btr0bulk.h @@ -34,7 +34,7 @@ Created 03/11/2014 Shaohua Wang #include <vector> /** Innodb B-tree index fill factor for bulk load. 
*/ -extern long innobase_fill_factor; +extern uint innobase_fill_factor; /** whether to reduce redo logging during ALTER TABLE */ extern my_bool innodb_log_optimize_ddl; @@ -87,7 +87,7 @@ public: m_err(DB_SUCCESS) { ut_ad(!dict_index_is_spatial(m_index)); - ut_ad(!dict_table_is_temporary(m_index->table)); + ut_ad(!m_index->table->is_temporary()); } /** Deconstructor */ @@ -292,7 +292,8 @@ public: ut_ad(!dict_index_is_spatial(index)); #ifdef UNIV_DEBUG if (m_flush_observer) - fil_space_inc_redo_skipped_count(m_index->space); + my_atomic_addlint(&m_index->table->space->redo_skipped_count, + 1); #endif /* UNIV_DEBUG */ } @@ -301,7 +302,8 @@ public: { #ifdef UNIV_DEBUG if (m_flush_observer) - fil_space_dec_redo_skipped_count(m_index->space); + my_atomic_addlint(&m_index->table->space->redo_skipped_count, + ulint(-1)); #endif /* UNIV_DEBUG */ } diff --git a/storage/innobase/include/btr0cur.h b/storage/innobase/include/btr0cur.h index 2f49ac6d12f..c80b4063221 100644 --- a/storage/innobase/include/btr0cur.h +++ b/storage/innobase/include/btr0cur.h @@ -32,6 +32,7 @@ Created 10/16/1994 Heikki Tuuri #include "btr0types.h" #include "rem0types.h" #include "gis0type.h" +#include "my_base.h" /** Mode flags for btr_cur operations; these can be ORed */ enum { @@ -42,6 +43,11 @@ enum { /** sys fields will be found in the update vector or inserted entry */ BTR_KEEP_SYS_FLAG = 4, + + /** no rollback */ + BTR_NO_ROLLBACK = BTR_NO_UNDO_LOG_FLAG + | BTR_NO_LOCKING_FLAG | BTR_KEEP_SYS_FLAG, + /** btr_cur_pessimistic_update() must keep cursor position when moving columns to big_rec */ BTR_KEEP_POS_FLAG = 8, @@ -127,6 +133,24 @@ btr_cur_position( buf_block_t* block, /*!< in: buffer block of rec */ btr_cur_t* cursor);/*!< in: cursor */ +/** Load the instant ALTER TABLE metadata from the clustered index +when loading a table definition. +@param[in,out] table table definition from the data dictionary +@return error code +@retval DB_SUCCESS if no error occurred */ +dberr_t +btr_cur_instant_init(dict_table_t* table) + ATTRIBUTE_COLD __attribute__((nonnull, warn_unused_result)); + +/** Initialize the n_core_null_bytes on first access to a clustered +index root page. +@param[in] index clustered index that is on its first access +@param[in] page clustered index root page +@return whether the page is corrupted */ +bool +btr_cur_instant_root_init(dict_index_t* index, const page_t* page) + ATTRIBUTE_COLD __attribute__((nonnull, warn_unused_result)); + /** Optimistically latches the leaf page or pages requested. @param[in] block guessed buffer block @param[in] modify_clock modify clock value @@ -173,30 +197,28 @@ btr_cur_search_to_nth_level_func( cursor->left_block is used to store a pointer to the left neighbor page, in the cases BTR_SEARCH_PREV and BTR_MODIFY_PREV; - NOTE that if has_search_latch - is != 0, we maybe do not have a latch set - on the cursor page, we assume - the caller uses his search latch - to protect the record! */ + NOTE that if ahi_latch, we might not have a + cursor page latch, we assume that ahi_latch + protects the record! */ btr_cur_t* cursor, /*!< in/out: tree cursor; the cursor page is s- or x-latched, but see also above! 
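The PageBulk hunks above switch the debug-only redo_skipped_count bookkeeping to my_atomic_addlint(), decrementing by adding ulint(-1). A minimal sketch of that idiom using std::atomic; the names below are simplified assumptions:

#include <atomic>
#include <cassert>

/* Stand-in for fil_space_t::redo_skipped_count. */
static std::atomic<unsigned long> redo_skipped_count(0);

static void begin_skipping_redo()
{
    redo_skipped_count.fetch_add(1);        /* my_atomic_addlint(&c, 1) */
}

static void end_skipping_redo()
{
    /* my_atomic_addlint(&c, ulint(-1)): adding the all-ones value wraps
       around, which is an atomic decrement. */
    redo_skipped_count.fetch_add(static_cast<unsigned long>(-1));
}

int main()
{
    begin_skipping_redo();
    begin_skipping_redo();
    end_skipping_redo();
    assert(redo_skipped_count.load() == 1);
    return 0;
}
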
*/ #ifdef BTR_CUR_HASH_ADAPT - ulint has_search_latch, - /*!< in: latch mode the caller - currently has on search system: - RW_S_LATCH, or 0 */ + rw_lock_t* ahi_latch, + /*!< in: currently held btr_search_latch + (in RW_S_LATCH mode), or NULL */ #endif /* BTR_CUR_HASH_ADAPT */ const char* file, /*!< in: file name */ unsigned line, /*!< in: line where called */ mtr_t* mtr, /*!< in/out: mini-transaction */ - ib_uint64_t autoinc); + ib_uint64_t autoinc = 0); /*!< in: PAGE_ROOT_AUTO_INC to be written (0 if none) */ #ifdef BTR_CUR_HASH_ADAPT -# define btr_cur_search_to_nth_level btr_cur_search_to_nth_level_func +# define btr_cur_search_to_nth_level(i,l,t,m,lm,c,a,fi,li,mtr) \ + btr_cur_search_to_nth_level_func(i,l,t,m,lm,c,a,fi,li,mtr) #else /* BTR_CUR_HASH_ADAPT */ -# define btr_cur_search_to_nth_level(ix,lv,t,md,l,cur,has,file,line,m,ai) \ - btr_cur_search_to_nth_level_func(ix,lv,t,md,l,cur,file,line,m,ai) +# define btr_cur_search_to_nth_level(i,l,t,m,lm,c,a,fi,li,mtr) \ + btr_cur_search_to_nth_level_func(i,l,t,m,lm,c,fi,li,mtr) #endif /* BTR_CUR_HASH_ADAPT */ /*****************************************************************//** @@ -584,7 +606,7 @@ btr_cur_parse_del_mark_set_sec_rec( @param[in] tuple2 range end, may also be empty tuple @param[in] mode2 search mode for range end @return estimated number of rows */ -int64_t +ha_rows btr_estimate_n_rows_in_range( dict_index_t* index, const dtuple_t* tuple1, @@ -821,7 +843,7 @@ btr_cur_latch_leaves( /** In the pessimistic delete, if the page data size drops below this limit, merging it to a neighbor is tried */ #define BTR_CUR_PAGE_COMPRESS_LIMIT(index) \ - ((UNIV_PAGE_SIZE * (ulint)((index)->merge_threshold)) / 100) + ((srv_page_size * (ulint)((index)->merge_threshold)) / 100) /** A slot in the path array. We store here info on a search path down the tree. Each slot contains data on a single level of the tree. */ @@ -989,11 +1011,11 @@ We store locally a long enough prefix of each column so that we can determine the ordering parts of each index record without looking into the externally stored part. */ /*-------------------------------------- @{ */ -#define BTR_EXTERN_SPACE_ID 0 /*!< space id where stored */ -#define BTR_EXTERN_PAGE_NO 4 /*!< page no where stored */ -#define BTR_EXTERN_OFFSET 8 /*!< offset of BLOB header +#define BTR_EXTERN_SPACE_ID 0U /*!< space id where stored */ +#define BTR_EXTERN_PAGE_NO 4U /*!< page no where stored */ +#define BTR_EXTERN_OFFSET 8U /*!< offset of BLOB header on that page */ -#define BTR_EXTERN_LEN 12 /*!< 8 bytes containing the +#define BTR_EXTERN_LEN 12U /*!< 8 bytes containing the length of the externally stored part of the BLOB. 
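The BTR_EXTERN_* constants above describe the layout of an externally stored (BLOB) field reference. A small sketch that packs and reads such a reference at those offsets; the 20-byte total size and the helper functions are assumptions, not the InnoDB routines:

#include <cassert>
#include <cstdint>

static const unsigned BTR_EXTERN_SPACE_ID = 0U;   /* offsets from the diff */
static const unsigned BTR_EXTERN_PAGE_NO  = 4U;
static const unsigned BTR_EXTERN_OFFSET   = 8U;
static const unsigned BTR_EXTERN_LEN      = 12U;  /* 8 bytes of length */
static const unsigned FIELD_REF_SIZE      = 20U;  /* assumed total size */

static void write4(unsigned char* p, uint32_t v)
{
    p[0] = uint8_t(v >> 24);
    p[1] = uint8_t(v >> 16);
    p[2] = uint8_t(v >> 8);
    p[3] = uint8_t(v);
}

static uint32_t read4(const unsigned char* p)
{
    return uint32_t(p[0]) << 24 | uint32_t(p[1]) << 16
         | uint32_t(p[2]) << 8 | uint32_t(p[3]);
}

int main()
{
    unsigned char ref[FIELD_REF_SIZE] = {0};
    write4(ref + BTR_EXTERN_SPACE_ID, 7);    /* tablespace of the BLOB */
    write4(ref + BTR_EXTERN_PAGE_NO, 42);    /* first BLOB page */
    write4(ref + BTR_EXTERN_OFFSET, 120);    /* BLOB header offset there */
    (void) BTR_EXTERN_LEN;                   /* length bytes not shown */
    assert(read4(ref + BTR_EXTERN_PAGE_NO) == 42);
    return 0;
}
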
The 2 highest bits are diff --git a/storage/innobase/include/btr0cur.ic b/storage/innobase/include/btr0cur.ic index 7cf6c5982fa..be6ac28129e 100644 --- a/storage/innobase/include/btr0cur.ic +++ b/storage/innobase/include/btr0cur.ic @@ -29,7 +29,7 @@ Created 10/16/1994 Heikki Tuuri #ifdef UNIV_DEBUG # define LIMIT_OPTIMISTIC_INSERT_DEBUG(NREC, CODE)\ if (btr_cur_limit_optimistic_insert_debug > 1\ - && (NREC) >= (ulint)btr_cur_limit_optimistic_insert_debug) {\ + && (NREC) >= btr_cur_limit_optimistic_insert_debug) {\ CODE;\ } #else @@ -129,9 +129,8 @@ btr_cur_compress_recommendation( { const page_t* page; - ut_ad(mtr_is_block_fix( - mtr, btr_cur_get_block(cursor), - MTR_MEMO_PAGE_X_FIX, cursor->index->table)); + ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor), + MTR_MEMO_PAGE_X_FIX)); page = btr_cur_get_page(cursor); diff --git a/storage/innobase/include/btr0pcur.h b/storage/innobase/include/btr0pcur.h index 9e3b4fc20a6..b0b61a4d1ff 100644 --- a/storage/innobase/include/btr0pcur.h +++ b/storage/innobase/include/btr0pcur.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2020, MariaDB Corporation. +Copyright (c) 2017, 2021, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -132,25 +132,23 @@ btr_pcur_open_with_no_init_func( may end up on the previous page of the record! */ ulint latch_mode,/*!< in: BTR_SEARCH_LEAF, ...; - NOTE that if has_search_latch != 0 then - we maybe do not acquire a latch on the cursor - page, but assume that the caller uses his - btr search latch to protect the record! */ + NOTE that if ahi_latch then we might not + acquire a cursor page latch, but assume + that the ahi_latch protects the record! */ btr_pcur_t* cursor, /*!< in: memory buffer for persistent cursor */ #ifdef BTR_CUR_HASH_ADAPT - ulint has_search_latch, - /*!< in: latch mode the caller - currently has on search system: - RW_S_LATCH, or 0 */ + rw_lock_t* ahi_latch, + /*!< in: adaptive hash index latch held + by the caller, or NULL if none */ #endif /* BTR_CUR_HASH_ADAPT */ const char* file, /*!< in: file name */ unsigned line, /*!< in: line where called */ mtr_t* mtr); /*!< in: mtr */ #ifdef BTR_CUR_HASH_ADAPT -# define btr_pcur_open_with_no_init(ix,t,md,l,cur,has,m) \ - btr_pcur_open_with_no_init_func(ix,t,md,l,cur,has,__FILE__,__LINE__,m) +# define btr_pcur_open_with_no_init(ix,t,md,l,cur,ahi,m) \ + btr_pcur_open_with_no_init_func(ix,t,md,l,cur,ahi,__FILE__,__LINE__,m) #else /* BTR_CUR_HASH_ADAPT */ -# define btr_pcur_open_with_no_init(ix,t,md,l,cur,has,m) \ +# define btr_pcur_open_with_no_init(ix,t,md,l,cur,ahi,m) \ btr_pcur_open_with_no_init_func(ix,t,md,l,cur,__FILE__,__LINE__,m) #endif /* BTR_CUR_HASH_ADAPT */ @@ -431,21 +429,11 @@ btr_pcur_is_before_first_on_page( /*********************************************************//** Checks if the persistent cursor is before the first user record in the index tree. */ -UNIV_INLINE -ibool -btr_pcur_is_before_first_in_tree( -/*=============================*/ - btr_pcur_t* cursor, /*!< in: persistent cursor */ - mtr_t* mtr); /*!< in: mtr */ +static inline bool btr_pcur_is_before_first_in_tree(btr_pcur_t* cursor); /*********************************************************//** Checks if the persistent cursor is after the last user record in the index tree. 
*/ -UNIV_INLINE -ibool -btr_pcur_is_after_last_in_tree( -/*===========================*/ - btr_pcur_t* cursor, /*!< in: persistent cursor */ - mtr_t* mtr); /*!< in: mtr */ +static inline bool btr_pcur_is_after_last_in_tree(btr_pcur_t* cursor); /*********************************************************//** Moves the persistent cursor to the next record on the same page. */ UNIV_INLINE @@ -509,8 +497,10 @@ struct btr_pcur_t{ /** if cursor position is stored, contains an initial segment of the latest record cursor was positioned either on, before or after */ rec_t* old_rec; + /** btr_cur.index->n_core_fields when old_rec was copied */ + uint16 old_n_core_fields; /** number of fields in old_rec */ - ulint old_n_fields; + uint16 old_n_fields; /** BTR_PCUR_ON, BTR_PCUR_BEFORE, or BTR_PCUR_AFTER, depending on whether cursor was on, before, or after the old_rec record */ enum btr_pcur_pos_t rel_pos; diff --git a/storage/innobase/include/btr0pcur.ic b/storage/innobase/include/btr0pcur.ic index 6bc5b356dab..d93da475a1f 100644 --- a/storage/innobase/include/btr0pcur.ic +++ b/storage/innobase/include/btr0pcur.ic @@ -209,12 +209,7 @@ btr_pcur_is_on_user_rec( /*********************************************************//** Checks if the persistent cursor is before the first user record in the index tree. */ -UNIV_INLINE -ibool -btr_pcur_is_before_first_in_tree( -/*=============================*/ - btr_pcur_t* cursor, /*!< in: persistent cursor */ - mtr_t* mtr) /*!< in: mtr */ +static inline bool btr_pcur_is_before_first_in_tree(btr_pcur_t* cursor) { ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); ut_ad(cursor->latch_mode != BTR_NO_LATCHES); @@ -226,12 +221,7 @@ btr_pcur_is_before_first_in_tree( /*********************************************************//** Checks if the persistent cursor is after the last user record in the index tree. 
*/ -UNIV_INLINE -ibool -btr_pcur_is_after_last_in_tree( -/*===========================*/ - btr_pcur_t* cursor, /*!< in: persistent cursor */ - mtr_t* mtr) /*!< in: mtr */ +static inline bool btr_pcur_is_after_last_in_tree(btr_pcur_t* cursor) { ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); ut_ad(cursor->latch_mode != BTR_NO_LATCHES); @@ -289,9 +279,7 @@ btr_pcur_move_to_next_user_rec( cursor->old_stored = false; loop: if (btr_pcur_is_after_last_on_page(cursor)) { - - if (btr_pcur_is_after_last_in_tree(cursor, mtr)) { - + if (btr_pcur_is_after_last_in_tree(cursor)) { return(FALSE); } @@ -326,19 +314,15 @@ btr_pcur_move_to_next( cursor->old_stored = false; if (btr_pcur_is_after_last_on_page(cursor)) { - - if (btr_pcur_is_after_last_in_tree(cursor, mtr)) { - + if (btr_pcur_is_after_last_in_tree(cursor)) { return(FALSE); } btr_pcur_move_to_next_page(cursor, mtr); - return(TRUE); } btr_pcur_move_to_next_on_page(cursor); - return(TRUE); } @@ -454,18 +438,21 @@ btr_pcur_open_low( ut_ad(!dict_index_is_spatial(index)); - err = btr_cur_search_to_nth_level( - index, level, tuple, mode, latch_mode, - btr_cursor, 0, file, line, mtr, autoinc); + err = btr_cur_search_to_nth_level_func( + index, level, tuple, mode, latch_mode, btr_cursor, +#ifdef BTR_CUR_HASH_ADAPT + NULL, +#endif /* BTR_CUR_HASH_ADAPT */ + file, line, mtr, autoinc); if (UNIV_UNLIKELY(err != DB_SUCCESS)) { - ib::warn() << " Error code: " << err - << " btr_pcur_open_low " + ib::warn() << "btr_pcur_open_low" << " level: " << level << " called from file: " << file << " line: " << line << " table: " << index->table->name - << " index: " << index->name; + << " index: " << index->name + << " error: " << err; } cursor->pos_state = BTR_PCUR_IS_POSITIONED; @@ -491,16 +478,14 @@ btr_pcur_open_with_no_init_func( may end up on the previous page of the record! */ ulint latch_mode,/*!< in: BTR_SEARCH_LEAF, ...; - NOTE that if has_search_latch != 0 then - we maybe do not acquire a latch on the cursor - page, but assume that the caller uses his - btr search latch to protect the record! */ + NOTE that if ahi_latch then we might not + acquire a cursor page latch, but assume + that the ahi_latch protects the record! */ btr_pcur_t* cursor, /*!< in: memory buffer for persistent cursor */ #ifdef BTR_CUR_HASH_ADAPT - ulint has_search_latch, - /*!< in: latch mode the caller - currently has on search system: - RW_S_LATCH, or 0 */ + rw_lock_t* ahi_latch, + /*!< in: adaptive hash index latch held + by the caller, or NULL if none */ #endif /* BTR_CUR_HASH_ADAPT */ const char* file, /*!< in: file name */ unsigned line, /*!< in: line where called */ @@ -516,9 +501,12 @@ btr_pcur_open_with_no_init_func( btr_cursor = btr_pcur_get_btr_cur(cursor); - err = btr_cur_search_to_nth_level( + err = btr_cur_search_to_nth_level_func( index, 0, tuple, mode, latch_mode, btr_cursor, - has_search_latch, file, line, mtr, 0); +#ifdef BTR_CUR_HASH_ADAPT + ahi_latch, +#endif /* BTR_CUR_HASH_ADAPT */ + file, line, mtr); cursor->pos_state = BTR_PCUR_IS_POSITIONED; diff --git a/storage/innobase/include/btr0sea.h b/storage/innobase/include/btr0sea.h index a9781c65491..adb14a7c16f 100644 --- a/storage/innobase/include/btr0sea.h +++ b/storage/innobase/include/btr0sea.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2018, 2020, MariaDB Corporation. +Copyright (c) 2017, 2020, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -33,20 +33,17 @@ Created 2/17/1996 Heikki Tuuri /** Creates and initializes the adaptive search system at a database start. @param[in] hash_size hash table size. */ -void -btr_search_sys_create(ulint hash_size); +void btr_search_sys_create(ulint hash_size); /** Frees the adaptive search system at a database shutdown. */ -void -btr_search_sys_free(); +void btr_search_sys_free(); /** Disable the adaptive hash search system and empty the index. */ void btr_search_disable(); /** Enable the adaptive hash search system. -@param[in] resize Flag to indicate call during buf_pool_resize() */ -void -btr_search_enable(bool resize=false); +@param resize whether buf_pool_resize() is the caller */ +void btr_search_enable(bool resize= false); /*********************************************************************//** Updates the search info. */ @@ -71,12 +68,11 @@ both have sensible values. we assume the caller uses his search latch to protect the record! @param[out] cursor tree cursor -@param[in] has_search_latch - latch mode the caller currently has on - search system: RW_S/X_LATCH or 0 +@param[in] ahi_latch the adaptive hash index latch being held, + or NULL @param[in] mtr mini transaction -@return TRUE if succeeded */ -ibool +@return whether the search succeeded */ +bool btr_search_guess_on_hash( dict_index_t* index, btr_search_t* info, @@ -84,22 +80,19 @@ btr_search_guess_on_hash( ulint mode, ulint latch_mode, btr_cur_t* cursor, - ulint has_search_latch, + rw_lock_t* ahi_latch, mtr_t* mtr); -/** Moves or deletes hash entries for moved records. If new_page is already -hashed, then the hash index for page, if any, is dropped. If new_page is not -hashed, and page is hashed, then a new hash index is built to new_page with the -same parameters as page (this often happens when a page is split). -@param[in,out] new_block records are copied to this page. -@param[in,out] block index page from which record are copied, and the - copied records will be deleted from this page. -@param[in,out] index record descriptor */ +/** Move or delete hash entries for moved records, usually in a page split. +If new_block is already hashed, then any hash index for block is dropped. +If new_block is not hashed, and block is hashed, then a new hash index is +built to new_block with the same parameters as block. +@param[in,out] new_block destination page +@param[in,out] block source page (subject to deletion later) */ void btr_search_move_or_delete_hash_entries( buf_block_t* new_block, - buf_block_t* block, - dict_index_t* index); + buf_block_t* block); /** Drop any adaptive hash index entries that point to an index page. @param[in,out] block block containing index page, s- or x-latched, or an @@ -107,8 +100,7 @@ btr_search_move_or_delete_hash_entries( block->buf_fix_count == 0 or it is an index page which has already been removed from the buf_pool->page_hash i.e.: it is in state BUF_BLOCK_REMOVE_HASH */ -void -btr_search_drop_page_hash_index(buf_block_t* block); +void btr_search_drop_page_hash_index(buf_block_t* block); /** Drop possible adaptive hash index entries when a page is evicted from the buffer pool or freed in a file, or the index is being dropped. @@ -118,90 +110,78 @@ void btr_search_drop_page_hash_when_freed(const page_id_t page_id); /** Updates the page hash index when a single record is inserted on a page. 
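btr_search_guess_on_hash() above now receives the held adaptive hash index latch as rw_lock_t* ahi_latch (or NULL) instead of a has_search_latch flag, so the callee knows exactly which lock is held. A simplified standalone sketch of that calling convention; the shared-mutex type is a stand-in, not InnoDB's rw_lock_t:

#include <cstddef>
#include <shared_mutex>

typedef std::shared_timed_mutex rw_lock_t;   /* stand-in for InnoDB's latch */

static rw_lock_t btr_search_latch;

/* NULL means "the caller holds no adaptive hash index latch". */
static bool guess_on_hash(const int* /*tuple*/, rw_lock_t* ahi_latch)
{
    if (ahi_latch) {
        /* The caller already holds *ahi_latch in S mode, so the hash
           table may be read without taking the latch again. */
        return true;
    }
    /* Otherwise take and release the latch locally. */
    std::shared_lock<rw_lock_t> s(btr_search_latch);
    return true;
}

int main()
{
    int tuple = 0;
    guess_on_hash(&tuple, NULL);                      /* no latch held */

    std::shared_lock<rw_lock_t> held(btr_search_latch);
    guess_on_hash(&tuple, &btr_search_latch);         /* latch already held */
    return 0;
}
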
@param[in] cursor cursor which was positioned to the place to insert using btr_cur_search_, and the new record has been - inserted next to the cursor. */ + inserted next to the cursor. +@param[in] ahi_latch the adaptive hash index latch */ void -btr_search_update_hash_node_on_insert(btr_cur_t* cursor); +btr_search_update_hash_node_on_insert(btr_cur_t* cursor, rw_lock_t* ahi_latch); /** Updates the page hash index when a single record is inserted on a page. -@param[in] cursor cursor which was positioned to the +@param[in,out] cursor cursor which was positioned to the place to insert using btr_cur_search_..., and the new record has been inserted next - to the cursor */ + to the cursor +@param[in] ahi_latch the adaptive hash index latch */ void -btr_search_update_hash_on_insert(btr_cur_t* cursor); +btr_search_update_hash_on_insert(btr_cur_t* cursor, rw_lock_t* ahi_latch); /** Updates the page hash index when a single record is deleted from a page. @param[in] cursor cursor which was positioned on the record to delete using btr_cur_search_, the record is not yet deleted.*/ -void -btr_search_update_hash_on_delete(btr_cur_t* cursor); +void btr_search_update_hash_on_delete(btr_cur_t* cursor); /** Validates the search system. @return true if ok */ -bool -btr_search_validate(); +bool btr_search_validate(); /** Lock all search latches in exclusive mode. */ -UNIV_INLINE -void -btr_search_x_lock_all(); +static inline void btr_search_x_lock_all(); /** Unlock all search latches from exclusive mode. */ -UNIV_INLINE -void -btr_search_x_unlock_all(); +static inline void btr_search_x_unlock_all(); /** Lock all search latches in shared mode. */ -UNIV_INLINE -void -btr_search_s_lock_all(); +static inline void btr_search_s_lock_all(); #ifdef UNIV_DEBUG /** Check if thread owns all the search latches. @param[in] mode lock mode check @retval true if owns all of them @retval false if does not own some of them */ -UNIV_INLINE -bool -btr_search_own_all(ulint mode); +static inline bool btr_search_own_all(ulint mode); /** Check if thread owns any of the search latches. @param[in] mode lock mode check @retval true if owns any of them @retval false if owns no search latch */ -UNIV_INLINE -bool -btr_search_own_any(ulint mode); +static inline bool btr_search_own_any(ulint mode); + +/** @return whether this thread holds any of the search latches */ +static inline bool btr_search_own_any(); #endif /* UNIV_DEBUG */ /** Unlock all search latches from shared mode. */ -UNIV_INLINE -void -btr_search_s_unlock_all(); +static inline void btr_search_s_unlock_all(); /** Get the latch based on index attributes. A latch is selected from an array of latches using pair of index-id, space-id. @param[in] index index handler @return latch */ -UNIV_INLINE -rw_lock_t* -btr_get_search_latch(const dict_index_t* index); +static inline rw_lock_t* btr_get_search_latch(const dict_index_t* index); /** Get the hash-table based on index attributes. A table is selected from an array of tables using pair of index-id, space-id. 
@param[in] index index handler @return hash table */ -UNIV_INLINE -hash_table_t* -btr_get_search_table(const dict_index_t* index); +static inline hash_table_t* btr_get_search_table(const dict_index_t* index); #else /* BTR_CUR_HASH_ADAPT */ # define btr_search_sys_create(size) +# define btr_search_sys_free() # define btr_search_drop_page_hash_index(block) # define btr_search_s_lock_all(index) # define btr_search_s_unlock_all(index) # define btr_search_info_update(index, cursor) -# define btr_search_move_or_delete_hash_entries(new_block, block, index) -# define btr_search_update_hash_on_insert(cursor) +# define btr_search_move_or_delete_hash_entries(new_block, block) +# define btr_search_update_hash_on_insert(cursor, ahi_latch) # define btr_search_update_hash_on_delete(cursor) #endif /* BTR_CUR_HASH_ADAPT */ @@ -209,15 +189,11 @@ btr_get_search_table(const dict_index_t* index); /** Create and initialize search info. @param[in,out] heap heap where created @return own: search info struct */ -UNIV_INLINE -btr_search_t* -btr_search_info_create(mem_heap_t* heap) +static inline btr_search_t* btr_search_info_create(mem_heap_t* heap) MY_ATTRIBUTE((nonnull, warn_unused_result)); /** @return the search info of an index */ -UNIV_INLINE -btr_search_t* -btr_search_get_info(dict_index_t* index) +static inline btr_search_t* btr_search_get_info(dict_index_t* index) { return(index->search_info); } @@ -259,7 +235,7 @@ struct btr_search_t{ ulint n_bytes; /*!< recommended prefix: number of bytes in an incomplete field @see BTR_PAGE_MAX_REC_SIZE */ - ibool left_side; /*!< TRUE or FALSE, depending on whether + bool left_side; /*!< true or false, depending on whether the leftmost record of several records with the same prefix should be indexed in the hash index */ diff --git a/storage/innobase/include/btr0sea.ic b/storage/innobase/include/btr0sea.ic index 90877d23192..9db0084ce59 100644 --- a/storage/innobase/include/btr0sea.ic +++ b/storage/innobase/include/btr0sea.ic @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2020, MariaDB Corporation. +Copyright (c) 2018, 2020, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -31,9 +31,7 @@ Created 2/17/1996 Heikki Tuuri /** Create and initialize search info. @param[in,out] heap heap where created @return own: search info struct */ -UNIV_INLINE -btr_search_t* -btr_search_info_create(mem_heap_t* heap) +static inline btr_search_t* btr_search_info_create(mem_heap_t* heap) { btr_search_t* info = static_cast<btr_search_t*>( mem_heap_zalloc(heap, sizeof(btr_search_t))); @@ -46,25 +44,23 @@ btr_search_info_create(mem_heap_t* heap) } #ifdef BTR_CUR_HASH_ADAPT -/*********************************************************************//** -Updates the search info. */ +/** Updates the search info. +@param[in,out] info search info +@param[in,out] cursor cursor which was just positioned */ void -btr_search_info_update_slow( -/*========================*/ - btr_search_t* info, /*!< in/out: search info */ - btr_cur_t* cursor);/*!< in: cursor which was just positioned */ +btr_search_info_update_slow(btr_search_t* info, btr_cur_t* cursor); /*********************************************************************//** Updates the search info. 
*/ -UNIV_INLINE +static inline void btr_search_info_update( /*===================*/ dict_index_t* index, /*!< in: index of the cursor */ btr_cur_t* cursor) /*!< in: cursor which was just positioned */ { - ut_ad(!rw_lock_own_flagged(btr_get_search_latch(index), - RW_LOCK_FLAG_X | RW_LOCK_FLAG_S)); + ut_ad(!btr_search_own_any(RW_LOCK_S)); + ut_ad(!btr_search_own_any(RW_LOCK_X)); if (dict_index_is_spatial(index) || !btr_search_enabled) { return; @@ -89,9 +85,7 @@ btr_search_info_update( } /** Lock all search latches in exclusive mode. */ -UNIV_INLINE -void -btr_search_x_lock_all() +static inline void btr_search_x_lock_all() { for (ulint i = 0; i < btr_ahi_parts; ++i) { rw_lock_x_lock(btr_search_latches[i]); @@ -99,9 +93,7 @@ btr_search_x_lock_all() } /** Unlock all search latches from exclusive mode. */ -UNIV_INLINE -void -btr_search_x_unlock_all() +static inline void btr_search_x_unlock_all() { for (ulint i = 0; i < btr_ahi_parts; ++i) { rw_lock_x_unlock(btr_search_latches[i]); @@ -109,9 +101,7 @@ btr_search_x_unlock_all() } /** Lock all search latches in shared mode. */ -UNIV_INLINE -void -btr_search_s_lock_all() +static inline void btr_search_s_lock_all() { for (ulint i = 0; i < btr_ahi_parts; ++i) { rw_lock_s_lock(btr_search_latches[i]); @@ -119,9 +109,7 @@ btr_search_s_lock_all() } /** Unlock all search latches from shared mode. */ -UNIV_INLINE -void -btr_search_s_unlock_all() +static inline void btr_search_s_unlock_all() { for (ulint i = 0; i < btr_ahi_parts; ++i) { rw_lock_s_unlock(btr_search_latches[i]); @@ -133,9 +121,7 @@ btr_search_s_unlock_all() @param[in] mode lock mode check @retval true if owns all of them @retval false if does not own some of them */ -UNIV_INLINE -bool -btr_search_own_all(ulint mode) +static inline bool btr_search_own_all(ulint mode) { for (ulint i = 0; i < btr_ahi_parts; ++i) { if (!rw_lock_own(btr_search_latches[i], mode)) { @@ -149,9 +135,7 @@ btr_search_own_all(ulint mode) @param[in] mode lock mode check @retval true if owns any of them @retval false if owns no search latch */ -UNIV_INLINE -bool -btr_search_own_any(ulint mode) +static inline bool btr_search_own_any(ulint mode) { for (ulint i = 0; i < btr_ahi_parts; ++i) { if (rw_lock_own(btr_search_latches[i], mode)) { @@ -160,19 +144,31 @@ btr_search_own_any(ulint mode) } return(false); } + +/** @return whether this thread holds any of the search latches */ +static inline bool btr_search_own_any() +{ + for (ulint i = btr_ahi_parts; i--; ) { + if (rw_lock_own_flagged(btr_search_latches[i], + RW_LOCK_FLAG_X | RW_LOCK_FLAG_S)) { + return true; + } + } + return false; +} #endif /* UNIV_DEBUG */ /** Get the adaptive hash search index latch for a b-tree. @param[in] index b-tree index @return latch */ -UNIV_INLINE -rw_lock_t* -btr_get_search_latch(const dict_index_t* index) +static inline rw_lock_t* btr_get_search_latch(const dict_index_t* index) { ut_ad(index != NULL); + ut_ad(!index->table->space + || index->table->space->id == index->table->space_id); - ulint ifold = ut_fold_ulint_pair(static_cast<ulint>(index->id), - static_cast<ulint>(index->space)); + ulint ifold = ut_fold_ulint_pair(ulint(index->id), + index->table->space_id); return(btr_search_latches[ifold % btr_ahi_parts]); } @@ -181,14 +177,13 @@ btr_get_search_latch(const dict_index_t* index) A table is selected from an array of tables using pair of index-id, space-id. 
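btr_get_search_latch() above picks one of btr_ahi_parts latches by folding the index id with the tablespace id. A minimal sketch of that partitioning; the fold function and the partition count are assumptions, not ut_fold_ulint_pair() or the server default:

#include <cstdio>
#include <mutex>

static const unsigned long btr_ahi_parts = 8;     /* assumed partition count */
static std::mutex latches[btr_ahi_parts];

/* Illustrative mix only, not ut_fold_ulint_pair(). */
static unsigned long fold_pair(unsigned long a, unsigned long b)
{
    return a * 31 + b;
}

static std::mutex& get_search_latch(unsigned long index_id,
                                    unsigned long space_id)
{
    return latches[fold_pair(index_id, space_id) % btr_ahi_parts];
}

int main()
{
    std::lock_guard<std::mutex> g(get_search_latch(17, 5));
    std::puts("holding one adaptive hash index partition latch");
    return 0;
}
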
@param[in] index index handler @return hash table */ -UNIV_INLINE -hash_table_t* -btr_get_search_table(const dict_index_t* index) +static inline hash_table_t* btr_get_search_table(const dict_index_t* index) { ut_ad(index != NULL); + ut_ad(index->table->space->id == index->table->space_id); - ulint ifold = ut_fold_ulint_pair(static_cast<ulint>(index->id), - static_cast<ulint>(index->space)); + ulint ifold = ut_fold_ulint_pair(ulint(index->id), + index->table->space_id); return(btr_search_sys->hash_tables[ifold % btr_ahi_parts]); } diff --git a/storage/innobase/include/buf0buddy.h b/storage/innobase/include/buf0buddy.h index 1697c8649c0..5b1aefb4d69 100644 --- a/storage/innobase/include/buf0buddy.h +++ b/storage/innobase/include/buf0buddy.h @@ -47,9 +47,9 @@ buf_buddy_alloc( the page resides */ ulint size, /*!< in: compressed page size (between UNIV_ZIP_SIZE_MIN and - UNIV_PAGE_SIZE) */ - ibool* lru) /*!< in: pointer to a variable - that will be assigned TRUE if + srv_page_size) */ + bool* lru) /*!< in: pointer to a variable + that will be assigned true if storage was allocated from the LRU list and buf_pool->mutex was temporarily released */ @@ -66,14 +66,14 @@ buf_buddy_free( void* buf, /*!< in: block to be freed, must not be pointed to by the buffer pool */ ulint size) /*!< in: block size, - up to UNIV_PAGE_SIZE */ + up to srv_page_size */ MY_ATTRIBUTE((nonnull)); /** Reallocate a block. @param[in] buf_pool buffer pool instance @param[in] buf block to be reallocated, must be pointed to by the buffer pool -@param[in] size block size, up to UNIV_PAGE_SIZE +@param[in] size block size, up to srv_page_size @retval false if failed because of no free blocks. */ bool buf_buddy_realloc( diff --git a/storage/innobase/include/buf0buddy.ic b/storage/innobase/include/buf0buddy.ic index 4afa795e762..dad9cb668dd 100644 --- a/storage/innobase/include/buf0buddy.ic +++ b/storage/innobase/include/buf0buddy.ic @@ -42,8 +42,8 @@ buf_buddy_alloc_low( buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */ ulint i, /*!< in: index of buf_pool->zip_free[], or BUF_BUDDY_SIZES */ - ibool* lru) /*!< in: pointer to a variable that - will be assigned TRUE if storage was + bool* lru) /*!< in: pointer to a variable that + will be assigned true if storage was allocated from the LRU list and buf_pool->mutex was temporarily released */ @@ -96,9 +96,9 @@ buf_buddy_alloc( the page resides */ ulint size, /*!< in: compressed page size (between UNIV_ZIP_SIZE_MIN and - UNIV_PAGE_SIZE) */ - ibool* lru) /*!< in: pointer to a variable - that will be assigned TRUE if + srv_page_size) */ + bool* lru) /*!< in: pointer to a variable + that will be assigned true if storage was allocated from the LRU list and buf_pool->mutex was temporarily released */ @@ -106,7 +106,7 @@ buf_buddy_alloc( ut_ad(buf_pool_mutex_own(buf_pool)); ut_ad(ut_is_2pow(size)); ut_ad(size >= UNIV_ZIP_SIZE_MIN); - ut_ad(size <= UNIV_PAGE_SIZE); + ut_ad(size <= srv_page_size); return((byte*) buf_buddy_alloc_low(buf_pool, buf_buddy_get_slot(size), lru)); @@ -123,12 +123,12 @@ buf_buddy_free( void* buf, /*!< in: block to be freed, must not be pointed to by the buffer pool */ ulint size) /*!< in: block size, - up to UNIV_PAGE_SIZE */ + up to srv_page_size */ { ut_ad(buf_pool_mutex_own(buf_pool)); ut_ad(ut_is_2pow(size)); ut_ad(size >= UNIV_ZIP_SIZE_MIN); - ut_ad(size <= UNIV_PAGE_SIZE); + ut_ad(size <= srv_page_size); buf_buddy_free_low(buf_pool, buf, buf_buddy_get_slot(size)); } diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h 
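The buf0buddy hunks above keep block sizes as powers of two between the buddy minimum and srv_page_size. A self-contained sketch of how a compressed page size maps to a zip_free[] slot under that scheme; the shift constants are assumptions, and the loop is an illustration rather than buf_buddy_get_slot():

#include <cassert>

static const unsigned BUF_BUDDY_LOW_SHIFT = 7;    /* assumed: 128-byte minimum */
static const unsigned srv_page_size_shift = 14;   /* assumed: 16KiB pages */
static const unsigned BUF_BUDDY_LOW       = 1U << BUF_BUDDY_LOW_SHIFT;
static const unsigned BUF_BUDDY_SIZES     = srv_page_size_shift
                                            - BUF_BUDDY_LOW_SHIFT;

/* Index of the zip_free[] list that serves an allocation of "size" bytes. */
static unsigned buddy_get_slot(unsigned size)
{
    unsigned i = 0;
    while ((BUF_BUDDY_LOW << i) < size) {
        i++;
    }
    assert(i <= BUF_BUDDY_SIZES);
    return i;
}

int main()
{
    assert(buddy_get_slot(128) == 0);
    assert(buddy_get_slot(1024) == 3);
    assert(buddy_get_slot(16384) == BUF_BUDDY_SIZES);
    return 0;
}
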
index 40d2e6b2023..1c9383aff80 100644 --- a/storage/innobase/include/buf0buf.h +++ b/storage/innobase/include/buf0buf.h @@ -1493,7 +1493,7 @@ public: buf_pool->watch */ ulint real_size; /*!< Real size of the page - Normal pages == UNIV_PAGE_SIZE + Normal pages == srv_page_size page compressed pages, payload size alligned to sector boundary. */ @@ -1625,9 +1625,9 @@ struct buf_block_t{ buf_pool->page_hash can point to buf_page_t or buf_block_t */ byte* frame; /*!< pointer to buffer frame which - is of size UNIV_PAGE_SIZE, and + is of size srv_page_size, and aligned to an address divisible by - UNIV_PAGE_SIZE */ + srv_page_size */ BPageLock lock; /*!< read-write lock of the buffer frame */ UT_LIST_NODE_T(buf_block_t) unzip_LRU; @@ -1641,7 +1641,7 @@ struct buf_block_t{ used in debugging */ ibool in_withdraw_list; #endif /* UNIV_DEBUG */ - unsigned lock_hash_val:32;/*!< hashed value of the page address + uint32_t lock_hash_val; /*!< hashed value of the page address in the record lock hash table; protected by buf_block_t::lock (or buf_block_t::mutex, buf_pool->mutex @@ -1784,7 +1784,7 @@ struct buf_block_t{ /**********************************************************************//** Compute the hash fold value for blocks in buf_pool->zip_hash. */ /* @{ */ -#define BUF_POOL_ZIP_FOLD_PTR(ptr) ((ulint) (ptr) / UNIV_PAGE_SIZE) +#define BUF_POOL_ZIP_FOLD_PTR(ptr) (ulint(ptr) >> srv_page_size_shift) #define BUF_POOL_ZIP_FOLD(b) BUF_POOL_ZIP_FOLD_PTR((b)->frame) #define BUF_POOL_ZIP_FOLD_BPAGE(b) BUF_POOL_ZIP_FOLD((buf_block_t*) (b)) /* @} */ @@ -2256,8 +2256,12 @@ Use these instead of accessing buf_pool->mutex directly. */ /** Get appropriate page_hash_lock. */ -# define buf_page_hash_lock_get(buf_pool, page_id) \ - hash_get_lock((buf_pool)->page_hash, (page_id).fold()) +UNIV_INLINE +rw_lock_t* +buf_page_hash_lock_get(const buf_pool_t* buf_pool, const page_id_t& page_id) +{ + return hash_get_lock(buf_pool->page_hash, page_id.fold()); +} /** If not appropriate page_hash_lock, relock until appropriate. */ # define buf_page_hash_lock_s_confirm(hash_lock, buf_pool, page_id)\ diff --git a/storage/innobase/include/buf0buf.ic b/storage/innobase/include/buf0buf.ic index 49b741ab5c8..3f8ad25bdfe 100644 --- a/storage/innobase/include/buf0buf.ic +++ b/storage/innobase/include/buf0buf.ic @@ -2,7 +2,7 @@ Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, Google Inc. -Copyright (c) 2014, 2019, MariaDB Corporation. +Copyright (c) 2014, 2020, MariaDB Corporation. Portions of this file contain modifications contributed and copyrighted by Google, Inc. 
Those modifications are gratefully acknowledged and are described @@ -788,7 +788,7 @@ buf_frame_align( ut_ad(ptr); - frame = (buf_frame_t*) ut_align_down(ptr, UNIV_PAGE_SIZE); + frame = (buf_frame_t*) ut_align_down(ptr, srv_page_size); return(frame); } @@ -805,11 +805,11 @@ buf_ptr_get_fsp_addr( fil_addr_t* addr) /*!< out: page offset and byte offset */ { const page_t* page = (const page_t*) ut_align_down(ptr, - UNIV_PAGE_SIZE); + srv_page_size); *space = mach_read_from_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); addr->page = mach_read_from_4(page + FIL_PAGE_OFFSET); - addr->boffset = ut_align_offset(ptr, UNIV_PAGE_SIZE); + addr->boffset = ut_align_offset(ptr, srv_page_size); } /**********************************************************************//** @@ -894,7 +894,7 @@ buf_frame_copy( { ut_ad(buf && frame); - ut_memcpy(buf, frame, UNIV_PAGE_SIZE); + ut_memcpy(buf, frame, srv_page_size); return(buf); } diff --git a/storage/innobase/include/buf0dblwr.h b/storage/innobase/include/buf0dblwr.h index 07ffd626956..c34c1077d97 100644 --- a/storage/innobase/include/buf0dblwr.h +++ b/storage/innobase/include/buf0dblwr.h @@ -130,7 +130,7 @@ struct buf_dblwr_t{ doublewrite block (64 pages) */ ulint block2; /*!< page number of the second block */ ulint first_free;/*!< first free position in write_buf - measured in units of UNIV_PAGE_SIZE */ + measured in units of srv_page_size */ ulint b_reserved;/*!< number of slots currently reserved for batch flush. */ os_event_t b_event;/*!< event where threads wait for a @@ -149,7 +149,7 @@ struct buf_dblwr_t{ buffer. */ byte* write_buf;/*!< write buffer used in writing to the doublewrite buffer, aligned to an - address divisible by UNIV_PAGE_SIZE + address divisible by srv_page_size (which is required by Windows aio) */ byte* write_buf_unaligned;/*!< pointer to write_buf, but unaligned */ diff --git a/storage/innobase/include/buf0flu.h b/storage/innobase/include/buf0flu.h index c7f5d410099..a0122d1c3f8 100644 --- a/storage/innobase/include/buf0flu.h +++ b/storage/innobase/include/buf0flu.h @@ -206,16 +206,10 @@ buf_flush_ready_for_replace( #ifdef UNIV_DEBUG /** Disables page cleaner threads (coordinator and workers). It's used by: SET GLOBAL innodb_page_cleaner_disabled_debug = 1 (0). -@param[in] thd thread handle -@param[in] var pointer to system variable -@param[out] var_ptr where the formal string goes @param[in] save immediate result from check function */ -void -buf_flush_page_cleaner_disabled_debug_update( - THD* thd, - struct st_mysql_sys_var* var, - void* var_ptr, - const void* save); +void buf_flush_page_cleaner_disabled_debug_update(THD*, + st_mysql_sys_var*, void*, + const void* save); #endif /* UNIV_DEBUG */ /******************************************************************//** @@ -228,6 +222,12 @@ DECLARE_THREAD(buf_flush_page_cleaner_coordinator)( /*===============================================*/ void* arg); /*!< in: a dummy parameter required by os_thread_create */ + +/** Adjust thread count for page cleaner workers. +@param[in] new_cnt Number of threads to be used */ +void +buf_flush_set_page_cleaner_thread_cnt(ulong new_cnt); + /******************************************************************//** Worker thread of page_cleaner. @return a dummy parameter */ @@ -328,12 +328,12 @@ flushed to disk before any redo logged operations go to the index. 
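buf_frame_align() above rounds a pointer down to the start of its srv_page_size-aligned frame. A small sketch of that alignment, assuming a power-of-two page size; the align_down() helper is a stand-in for ut_align_down(), which is not shown in this hunk:

#include <cassert>
#include <cstdint>
#include <vector>

static const std::uintptr_t srv_page_size = std::uintptr_t(1) << 14; /* assumed */

/* Stand-in for ut_align_down(); requires a power-of-two alignment. */
static unsigned char* align_down(unsigned char* ptr, std::uintptr_t align)
{
    return reinterpret_cast<unsigned char*>(
        reinterpret_cast<std::uintptr_t>(ptr) & ~(align - 1));
}

int main()
{
    /* Carve a fully aligned page frame out of an oversized buffer. */
    std::vector<unsigned char> buf(2 * srv_page_size);
    unsigned char* frame = align_down(buf.data() + srv_page_size - 1,
                                      srv_page_size);
    /* Any pointer into the frame maps back to the frame start. */
    assert(align_down(frame + 1234, srv_page_size) == frame);
    return 0;
}
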
*/ class FlushObserver { public: /** Constructor - @param[in] space_id table space id + @param[in,out] space tablespace @param[in] trx trx instance @param[in] stage performance schema accounting object, used by ALTER TABLE. It is passed to log_preflush_pool_modified_pages() for accounting. */ - FlushObserver(ulint space_id, trx_t* trx, ut_stage_alter_t* stage); + FlushObserver(fil_space_t* space, trx_t* trx, ut_stage_alter_t* stage); /** Deconstructor */ ~FlushObserver(); @@ -379,8 +379,8 @@ public: buf_pool_t* buf_pool, buf_page_t* bpage); private: - /** Table space id */ - const ulint m_space_id; + /** Tablespace */ + fil_space_t* m_space; /** Trx instance */ const trx_t* const m_trx; @@ -402,57 +402,6 @@ private: bool m_interrupted; }; -/******************************************************************//** -Start a buffer flush batch for LRU or flush list */ -ibool -buf_flush_start( -/*============*/ - buf_pool_t* buf_pool, /*!< buffer pool instance */ - buf_flush_t flush_type); /*!< in: BUF_FLUSH_LRU - or BUF_FLUSH_LIST */ -/******************************************************************//** -End a buffer flush batch for LRU or flush list */ -void -buf_flush_end( -/*==========*/ - buf_pool_t* buf_pool, /*!< buffer pool instance */ - buf_flush_t flush_type); /*!< in: BUF_FLUSH_LRU - or BUF_FLUSH_LIST */ -/******************************************************************//** -Gather the aggregated stats for both flush list and LRU list flushing */ -void -buf_flush_common( -/*=============*/ - buf_flush_t flush_type, /*!< in: type of flush */ - ulint page_count); /*!< in: number of pages flushed */ - -/*******************************************************************//** -This utility flushes dirty blocks from the end of the LRU list or flush_list. -NOTE 1: in the case of an LRU flush the calling thread may own latches to -pages: to avoid deadlocks, this function must be written so that it cannot -end up waiting for these latches! NOTE 2: in the case of a flush list flush, -the calling thread is not allowed to own any latches on pages! */ -__attribute__((nonnull)) -void -buf_flush_batch( -/*============*/ - buf_pool_t* buf_pool, /*!< in: buffer pool instance */ - buf_flush_t flush_type, /*!< in: BUF_FLUSH_LRU or - BUF_FLUSH_LIST; if BUF_FLUSH_LIST, - then the caller must not own any - latches on pages */ - ulint min_n, /*!< in: wished minimum mumber of blocks - flushed (it is not guaranteed that the - actual number is that big, though) */ - lsn_t lsn_limit, /*!< in: in the case of BUF_FLUSH_LIST - all blocks whose oldest_modification is - smaller than this should be flushed - (if their number does not exceed - min_n), otherwise ignored */ - flush_counters_t* n); /*!< out: flushed/evicted page - counts */ - - #include "buf0flu.ic" #endif diff --git a/storage/innobase/include/buf0lru.h b/storage/innobase/include/buf0lru.h index a72d98395af..e7707ffd6dc 100644 --- a/storage/innobase/include/buf0lru.h +++ b/storage/innobase/include/buf0lru.h @@ -32,6 +32,7 @@ Created 11/5/1995 Heikki Tuuri // Forward declaration struct trx_t; +struct fil_space_t; /******************************************************************//** Returns TRUE if less than 25 % of the buffer pool is available. 
This can be diff --git a/storage/innobase/include/buf0mtflu.h b/storage/innobase/include/buf0mtflu.h deleted file mode 100644 index 5f11de572ad..00000000000 --- a/storage/innobase/include/buf0mtflu.h +++ /dev/null @@ -1,95 +0,0 @@ -/***************************************************************************** - -Copyright (C) 2014 SkySQL Ab. All Rights Reserved. -Copyright (C) 2014 Fusion-io. All Rights Reserved. - -This program is free software; you can redistribute it and/or modify it under -the terms of the GNU General Public License as published by the Free Software -Foundation; version 2 of the License. - -This program is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - -You should have received a copy of the GNU General Public License along with -this program; if not, write to the Free Software Foundation, Inc., -51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA - -*****************************************************************************/ - -/******************************************************************//** -@file include/buf0mtflu.h -Multi-threadef flush method interface function prototypes - -Created 06/02/2014 Jan Lindström jan.lindstrom@skysql.com - Dhananjoy Das DDas@fusionio.com -***********************************************************************/ - -#ifndef buf0mtflu_h -#define buf0mtflu_h - -/******************************************************************//** -Add exit work item to work queue to signal multi-threded flush -threads that they should exit. -*/ -void -buf_mtflu_io_thread_exit(void); -/*===========================*/ - -/******************************************************************//** -Initialize multi-threaded flush thread syncronization data. -@return Initialized multi-threaded flush thread syncroniztion data. */ -void* -buf_mtflu_handler_init( -/*===================*/ - ulint n_threads, /*!< in: Number of threads to create */ - ulint wrk_cnt); /*!< in: Number of work items */ - -/******************************************************************//** -Return true if multi-threaded flush is initialized -@return true if initialized, false if not */ -bool -buf_mtflu_init_done(void); -/*======================*/ - -/*********************************************************************//** -Clears up tail of the LRU lists: -* Put replaceable pages at the tail of LRU to the free list -* Flush dirty pages at the tail of LRU to the disk -The depth to which we scan each buffer pool is controlled by dynamic -config parameter innodb_LRU_scan_depth. -@return total pages flushed */ -UNIV_INTERN -ulint -buf_mtflu_flush_LRU_tail(void); -/*===========================*/ - -/*******************************************************************//** -Multi-threaded version of buf_flush_list -*/ -bool -buf_mtflu_flush_list( -/*=================*/ - ulint min_n, /*!< in: wished minimum mumber of blocks - flushed (it is not guaranteed that the - actual number is that big, though) */ - lsn_t lsn_limit, /*!< in the case BUF_FLUSH_LIST all - blocks whose oldest_modification is - smaller than this should be flushed - (if their number does not exceed - min_n), otherwise ignored */ - ulint* n_processed); /*!< out: the number of pages - which were processed is passed - back to caller. 
Ignored if NULL */ - -/*********************************************************************//** -Set correct thread identifiers to io thread array based on -information we have. */ -void -buf_mtflu_set_thread_ids( -/*=====================*/ - ulint n_threads, /*!<in: Number of threads to fill */ - void* ctx, /*!<in: thread context */ - os_thread_id_t* thread_ids); /*!<in: thread id array */ - -#endif diff --git a/storage/innobase/include/buf0types.h b/storage/innobase/include/buf0types.h index 27ffee03d4c..bd5e26df47b 100644 --- a/storage/innobase/include/buf0types.h +++ b/storage/innobase/include/buf0types.h @@ -113,7 +113,7 @@ is_checksum_strict(ulint algo) #define BUF_BUDDY_LOW (1U << BUF_BUDDY_LOW_SHIFT) /** Actual number of buddy sizes based on current page size */ -#define BUF_BUDDY_SIZES (UNIV_PAGE_SIZE_SHIFT - BUF_BUDDY_LOW_SHIFT) +#define BUF_BUDDY_SIZES (srv_page_size_shift - BUF_BUDDY_LOW_SHIFT) /** Maximum number of buddy sizes based on the max page size */ #define BUF_BUDDY_SIZES_MAX (UNIV_PAGE_SIZE_SHIFT_MAX \ @@ -121,7 +121,7 @@ is_checksum_strict(ulint algo) /** twice the maximum block size of the buddy system; the underlying memory is aligned by this amount: -this must be equal to UNIV_PAGE_SIZE */ +this must be equal to srv_page_size */ #define BUF_BUDDY_HIGH (BUF_BUDDY_LOW << BUF_BUDDY_SIZES) /* @} */ diff --git a/storage/innobase/include/data0data.h b/storage/innobase/include/data0data.h index fdf1a14feee..002332852b8 100644 --- a/storage/innobase/include/data0data.h +++ b/storage/innobase/include/data0data.h @@ -492,6 +492,22 @@ struct dfield_t{ @param[in,out] heap memory heap in which the clone will be created @return the cloned object */ dfield_t* clone(mem_heap_t* heap) const; + + /** @return system field indicates history row */ + bool vers_history_row() const + { + ut_ad(type.vers_sys_end()); + if (type.mtype == DATA_FIXBINARY) { + ut_ad(len == sizeof timestamp_max_bytes); + return 0 != memcmp(data, timestamp_max_bytes, len); + } else { + ut_ad(type.mtype == DATA_INT); + ut_ad(len == sizeof trx_id_max_bytes); + return 0 != memcmp(data, trx_id_max_bytes, len); + } + ut_ad(0); + return false; + } }; /** Structure for an SQL data tuple of fields (logical record) */ @@ -517,6 +533,26 @@ struct dtuple_t { /** Value of dtuple_t::magic_n */ # define DATA_TUPLE_MAGIC_N 65478679 #endif /* UNIV_DEBUG */ + + /** Trim the tail of an index tuple before insert or update. + After instant ADD COLUMN, if the last fields of a clustered index tuple + match the default values that were explicitly specified or implied + during ADD COLUMN, there will be no need to store them. + NOTE: A page latch in the index must be held, so that the index + may not lose 'instantness' before the trimmed tuple has been + inserted or updated. 
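The rule documented here is that trailing clustered-index fields whose values match the instant-ADD defaults need not be stored at all. A minimal standalone sketch of that trimming rule follows; the Column/Field types and trim_trailing_defaults() are hypothetical stand-ins rather than InnoDB code, and the SQL NULL / UNIV_SQL_DEFAULT length sentinels are left out of the simplification.

// Hypothetical sketch: trim trailing fields that equal the instant-ADD defaults.
#include <cstddef>
#include <cstring>
#include <vector>

struct Column { const void* def_data; std::size_t def_len; };  // per-column instant-ADD default
struct Field  { const void* data;     std::size_t len;     };  // one field of the tuple

// Drop trailing fields (beyond the core fields that existed before instant
// ADD COLUMN) whose value is byte-identical to the column default; such
// fields would not need to be materialized in the clustered index record.
static std::size_t trim_trailing_defaults(const std::vector<Column>& cols,
                                          std::vector<Field>& fields,
                                          std::size_t n_core_fields)
{
    std::size_t n = fields.size();
    while (n > n_core_fields) {
        const Field&  f = fields[n - 1];
        const Column& c = cols[n - 1];
        if (f.len != c.def_len
            || (f.len && std::memcmp(f.data, c.def_data, f.len) != 0)) {
            break;  // value differs from the default: keep it and stop
        }
        --n;        // equal to the default: this field can be trimmed
    }
    fields.resize(n);
    return n;
}

int main()
{
    const char dflt[] = "N/A";
    std::vector<Column> cols(3, Column{dflt, sizeof dflt});
    std::vector<Field>  row(3, Field{dflt, sizeof dflt});
    return int(trim_trailing_defaults(cols, row, 1));  // 1: two trailing defaults trimmed
}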
+ @param[in] index index possibly with instantly added columns */ + void trim(const dict_index_t& index); + bool vers_history_row() const + { + for (ulint i = 0; i < n_fields; i++) { + const dfield_t* field = &fields[i]; + if (field->type.vers_sys_end()) { + return field->vers_history_row(); + } + } + return false; + } }; inline ulint dtuple_get_n_fields(const dtuple_t* tuple) @@ -534,7 +570,11 @@ inline const void* dfield_get_data(const dfield_t* field) ut_ad(field->len == UNIV_SQL_NULL || field->data != &data_error); return field->data; } -inline ulint dfield_get_len(const dfield_t* field) { return field->len; } +inline ulint dfield_get_len(const dfield_t* field) { + ut_ad(field->len == UNIV_SQL_NULL || field->data != &data_error); + ut_ad(field->len != UNIV_SQL_DEFAULT); + return field->len; +} inline bool dfield_is_null(const dfield_t* field) { return field->len == UNIV_SQL_NULL; } /** @return whether a column is to be stored off-page */ diff --git a/storage/innobase/include/data0data.ic b/storage/innobase/include/data0data.ic index 9b7a3132873..39ade7b1e09 100644 --- a/storage/innobase/include/data0data.ic +++ b/storage/innobase/include/data0data.ic @@ -50,6 +50,7 @@ dfield_set_len( dfield_t* field, /*!< in: field */ ulint len) /*!< in: length or UNIV_SQL_NULL */ { + ut_ad(len != UNIV_SQL_DEFAULT); field->ext = 0; field->len = static_cast<unsigned int>(len); } @@ -213,6 +214,7 @@ dfield_data_is_binary_equal( ulint len, /*!< in: data length or UNIV_SQL_NULL */ const byte* data) /*!< in: data */ { + ut_ad(len != UNIV_SQL_DEFAULT); return(len == dfield_get_len(field) && (!len || len == UNIV_SQL_NULL || !memcmp(dfield_get_data(field), data, len))); diff --git a/storage/innobase/include/data0type.h b/storage/innobase/include/data0type.h index f641af8a6c1..740a1b83aca 100644 --- a/storage/innobase/include/data0type.h +++ b/storage/innobase/include/data0type.h @@ -29,6 +29,12 @@ Created 1/16/1996 Heikki Tuuri #include "univ.i" +/** Special length indicating a missing instantly added column */ +#define UNIV_SQL_DEFAULT (UNIV_SQL_NULL - 1) + +/** @return whether a length is actually stored in a field */ +#define len_is_stored(len) (len != UNIV_SQL_NULL && len != UNIV_SQL_DEFAULT) + extern ulint data_mysql_default_charset_coll; #define DATA_MYSQL_BINARY_CHARSET_COLL 63 @@ -183,6 +189,12 @@ be less than 256 */ for shorter VARCHARs MySQL uses only 1 byte */ #define DATA_VIRTUAL 8192U /* Virtual column */ +/** System Versioning */ +#define DATA_VERS_START 16384U /* start system field */ +#define DATA_VERS_END 32768U /* end system field */ +/** system-versioned user data column */ +#define DATA_VERSIONED (DATA_VERS_START|DATA_VERS_END) + /** Check whether locking is disabled (never). */ #define dict_table_is_locking_disabled(table) false @@ -353,9 +365,9 @@ dtype_form_prtype(ulint old_prtype, ulint charset_coll) Determines if a MySQL string type is a subset of UTF-8. This function may return false negatives, in case further character-set collation codes are introduced in MySQL later. 
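The DATA_VERS_START and DATA_VERS_END bits introduced above combine into DATA_VERSIONED; the short standalone check below spells out how the new prtype predicates behave. It merely restates the bit arithmetic from this patch and is not part of it.

// Sketch of the system-versioning bit tests on prtype.
#include <cassert>

static const unsigned DATA_VERS_START = 16384U;                  // row_start system field
static const unsigned DATA_VERS_END   = 32768U;                  // row_end system field
static const unsigned DATA_VERSIONED  = DATA_VERS_START | DATA_VERS_END;

static bool is_versioned(unsigned prtype)   { return !(~prtype & DATA_VERSIONED); }
static bool vers_sys_start(unsigned prtype) { return (prtype & DATA_VERSIONED) == DATA_VERS_START; }
static bool vers_sys_end(unsigned prtype)   { return (prtype & DATA_VERSIONED) == DATA_VERS_END; }

int main()
{
    assert(vers_sys_start(DATA_VERS_START));  // only the start bit set
    assert(vers_sys_end(DATA_VERS_END));      // only the end bit set
    assert(is_versioned(DATA_VERSIONED));     // both bits: system-versioned user column
    assert(!is_versioned(DATA_VERS_START));   // a lone system bit is not "versioned user data"
    return 0;
}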
-@return TRUE if a subset of UTF-8 */ +@return whether a subset of UTF-8 */ UNIV_INLINE -ibool +bool dtype_is_utf8( /*==========*/ ulint prtype);/*!< in: precise data type */ @@ -529,8 +541,24 @@ struct dtype_t{ in bytes */ unsigned mbmaxlen:3; /*!< maximum length of a character, in bytes */ + + /** @return whether this is system versioned user field */ + bool is_versioned() const { return !(~prtype & DATA_VERSIONED); } + /** @return whether this is the system field start */ + bool vers_sys_start() const + { + return (prtype & DATA_VERSIONED) == DATA_VERS_START; + } + /** @return whether this is the system field end */ + bool vers_sys_end() const + { + return (prtype & DATA_VERSIONED) == DATA_VERS_END; + } }; +/** The DB_TRX_ID,DB_ROLL_PTR values for "no history is available" */ +extern const byte reset_trx_id[DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN]; + #include "data0type.ic" #endif diff --git a/storage/innobase/include/data0type.ic b/storage/innobase/include/data0type.ic index 1956016c58b..f2c499716ce 100644 --- a/storage/innobase/include/data0type.ic +++ b/storage/innobase/include/data0type.ic @@ -43,9 +43,9 @@ dtype_get_charset_coll( Determines if a MySQL string type is a subset of UTF-8. This function may return false negatives, in case further character-set collation codes are introduced in MySQL later. -@return TRUE if a subset of UTF-8 */ +@return whether a subset of UTF-8 */ UNIV_INLINE -ibool +bool dtype_is_utf8( /*==========*/ ulint prtype) /*!< in: precise data type */ @@ -58,10 +58,10 @@ dtype_is_utf8( case 33: /* utf8_general_ci */ case 83: /* utf8_bin */ case 254: /* utf8_general_cs */ - return(TRUE); + return true; } - return(FALSE); + return false; } /*********************************************************************//** @@ -235,9 +235,8 @@ dtype_new_store_for_order_and_null_size( ulint prefix_len)/*!< in: prefix length to replace type->len, or 0 */ { -#if 6 != DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE -#error "6 != DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE" -#endif + compile_time_assert(6 == DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); + ulint len; ut_ad(type); @@ -280,10 +279,7 @@ dtype_read_for_order_and_null_size( dtype_t* type, /*!< in: type struct */ const byte* buf) /*!< in: buffer for stored type order info */ { -#if 4 != DATA_ORDER_NULL_TYPE_BUF_SIZE -# error "4 != DATA_ORDER_NULL_TYPE_BUF_SIZE" -#endif - + compile_time_assert(4 == DATA_ORDER_NULL_TYPE_BUF_SIZE); type->mtype = buf[0] & 63; type->prtype = buf[1]; @@ -309,11 +305,7 @@ dtype_new_read_for_order_and_null_size( dtype_t* type, /*!< in: type struct */ const byte* buf) /*!< in: buffer for stored type order info */ { - ulint charset_coll; - -#if 6 != DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE -#error "6 != DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE" -#endif + compile_time_assert(6 == DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); type->mtype = buf[0] & 63; type->prtype = buf[1]; @@ -328,7 +320,7 @@ dtype_new_read_for_order_and_null_size( type->len = mach_read_from_2(buf + 2); - charset_coll = mach_read_from_2(buf + 4) & CHAR_COLL_MASK; + ulint charset_coll = mach_read_from_2(buf + 4) & CHAR_COLL_MASK; if (dtype_is_string_type(type->mtype)) { ut_a(charset_coll <= MAX_CHAR_COLL_NUM); diff --git a/storage/innobase/include/dict0boot.h b/storage/innobase/include/dict0boot.h index 3baefdd1132..4853d5ad73f 100644 --- a/storage/innobase/include/dict0boot.h +++ b/storage/innobase/include/dict0boot.h @@ -119,7 +119,7 @@ dict_is_sys_table( /* The ids for the basic system tables and their indexes */ #define DICT_TABLES_ID 1 #define DICT_COLUMNS_ID 2 -#define 
DICT_INDEXES_ID 3 +#define DICT_INDEXES_ID dict_index_t::DICT_INDEXES_ID /* 3 */ #define DICT_FIELDS_ID 4 /* The following is a secondary index on SYS_TABLES */ #define DICT_TABLE_IDS_ID 5 diff --git a/storage/innobase/include/dict0boot.ic b/storage/innobase/include/dict0boot.ic index c3862b5c76a..dacfcd58b53 100644 --- a/storage/innobase/include/dict0boot.ic +++ b/storage/innobase/include/dict0boot.ic @@ -58,10 +58,7 @@ dict_sys_read_row_id( /*=================*/ const byte* field) /*!< in: record field */ { -#if DATA_ROW_ID_LEN != 6 -# error "DATA_ROW_ID_LEN != 6" -#endif - + compile_time_assert(DATA_ROW_ID_LEN == 6); return(mach_read_from_6(field)); } @@ -74,10 +71,7 @@ dict_sys_write_row_id( byte* field, /*!< in: record field */ row_id_t row_id) /*!< in: row id */ { -#if DATA_ROW_ID_LEN != 6 -# error "DATA_ROW_ID_LEN != 6" -#endif - + compile_time_assert(DATA_ROW_ID_LEN == 6); mach_write_to_6(field, row_id); } diff --git a/storage/innobase/include/dict0crea.h b/storage/innobase/include/dict0crea.h index 359d9f556e5..8ab987cd39a 100644 --- a/storage/innobase/include/dict0crea.h +++ b/storage/innobase/include/dict0crea.h @@ -47,6 +47,7 @@ tab_create_graph_create( /** Creates an index create graph. @param[in] index index to create, built as a memory data structure +@param[in] table table name @param[in,out] heap heap where created @param[in] add_v new virtual columns added in the same clause with add index @@ -54,8 +55,9 @@ tab_create_graph_create( ind_node_t* ind_create_graph_create( dict_index_t* index, + const char* table, mem_heap_t* heap, - const dict_add_v_col_t* add_v); + const dict_add_v_col_t* add_v = NULL); /***********************************************************//** Creates a table. This is a high-level function used in SQL execution graphs. @@ -137,22 +139,6 @@ dict_create_index_tree_in_mem( dict_index_t* index, /*!< in/out: index */ const trx_t* trx); /*!< in: InnoDB transaction handle */ -/*******************************************************************//** -Truncates the index tree but don't update SYSTEM TABLES. -@return DB_SUCCESS or error */ -dberr_t -dict_truncate_index_tree_in_mem( -/*============================*/ - dict_index_t* index); /*!< in/out: index */ - -/*******************************************************************//** -Drops the index tree but don't update SYS_INDEXES table. */ -void -dict_drop_index_tree_in_mem( -/*========================*/ - const dict_index_t* index, /*!< in: index */ - ulint page_no);/*!< in: index page-no */ - /****************************************************************//** Creates the foreign key constraints system tables inside InnoDB at server bootstrap or server start if they are not found or are @@ -301,6 +287,7 @@ struct ind_node_t{ dict_index_t* index; /*!< index to create, built as a memory data structure with dict_mem_... functions */ + const char* table_name; /*!< table name */ ins_node_t* ind_def; /*!< child node which does the insert of the index definition; the row to be inserted is built by the parent node */ diff --git a/storage/innobase/include/dict0dict.h b/storage/innobase/include/dict0dict.h index 6cfb92a94d3..37eae6641af 100644 --- a/storage/innobase/include/dict0dict.h +++ b/storage/innobase/include/dict0dict.h @@ -2,7 +2,7 @@ Copyright (c) 1996, 2018, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. -Copyright (c) 2013, 2020, MariaDB Corporation. +Copyright (c) 2013, 2021, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -173,7 +173,7 @@ dict_col_copy_type( /**********************************************************************//** Determine bytes of column prefix to be stored in the undo log. Please -note if the table format is UNIV_FORMAT_A (< UNIV_FORMAT_B), no prefix +note that if !dict_table_has_atomic_blobs(table), no prefix needs to be stored in the undo log. @return bytes of column prefix to be stored in the undo log */ UNIV_INLINE @@ -356,15 +356,6 @@ dict_table_add_system_columns( mem_heap_t* heap) /*!< in: temporary heap */ MY_ATTRIBUTE((nonnull)); /**********************************************************************//** -Adds a table object to the dictionary cache. */ -void -dict_table_add_to_cache( -/*====================*/ - dict_table_t* table, /*!< in: table */ - bool can_be_evicted, /*!< in: whether can be evicted*/ - mem_heap_t* heap) /*!< in: temporary heap */ - MY_ATTRIBUTE((nonnull)); -/**********************************************************************//** Removes a table object from the dictionary cache. */ void dict_table_remove_from_cache( @@ -577,16 +568,6 @@ dict_foreign_find_index( happened */ MY_ATTRIBUTE((nonnull(1,3), warn_unused_result)); -/**********************************************************************//** -Returns a column's name. -@return column name. NOTE: not guaranteed to stay valid if table is -modified in any way (columns added, etc.). */ -const char* -dict_table_get_col_name( -/*====================*/ - const dict_table_t* table, /*!< in: table */ - ulint col_nr) /*!< in: column number */ - MY_ATTRIBUTE((nonnull, warn_unused_result)); /** Returns a virtual column's name. @param[in] table table object @@ -895,6 +876,18 @@ dict_table_get_sys_col( /* Get nth virtual columns */ #define dict_table_get_nth_v_col(table, pos) (&(table)->v_cols[pos]) #endif /* UNIV_DEBUG */ +/** Wrapper function. +@see dict_col_t::name() +@param[in] table table +@param[in] col_nr column number in table +@return column name */ +inline +const char* +dict_table_get_col_name(const dict_table_t* table, ulint col_nr) +{ + return(dict_table_get_nth_col(table, col_nr)->name(*table)); +} + /********************************************************************//** Gets the given system column number of a table. @return column number */ @@ -919,30 +912,21 @@ dict_index_get_min_size( Check whether the table uses the compact page format. @return TRUE if table uses the compact page format */ UNIV_INLINE -ibool +bool dict_table_is_comp( /*===============*/ const dict_table_t* table) /*!< in: table */ MY_ATTRIBUTE((nonnull, warn_unused_result)); -/********************************************************************//** -Determine the file format of a table. -@return file format version */ -UNIV_INLINE -ulint -dict_table_get_format( -/*==================*/ - const dict_table_t* table) /*!< in: table */ - MY_ATTRIBUTE((nonnull, warn_unused_result)); -/********************************************************************//** -Determine the file format from a dict_table_t::flags. -@return file format version */ -UNIV_INLINE -ulint -dict_tf_get_format( -/*===============*/ - ulint flags) /*!< in: dict_table_t::flags */ - MY_ATTRIBUTE((warn_unused_result)); +/** Determine if a table uses atomic BLOBs (no locally stored prefix). 
+@param[in] table InnoDB table +@return whether BLOBs are atomic */ +inline +bool +dict_table_has_atomic_blobs(const dict_table_t* table) +{ + return(DICT_TF_HAS_ATOMIC_BLOBS(table->flags)); +} /** Set the various values in a dict_table_t::flags pointer. @param[in,out] flags, Pointer to a 4 byte Table Flags @@ -950,8 +934,7 @@ dict_tf_get_format( @param[in] zip_ssize Zip Shift Size @param[in] use_data_dir Table uses DATA DIRECTORY @param[in] page_compressed Table uses page compression -@param[in] page_compression_level Page compression level -@param[in] not_used For future */ +@param[in] page_compression_level Page compression level */ UNIV_INLINE void dict_tf_set( @@ -960,8 +943,7 @@ dict_tf_set( ulint zip_ssize, bool use_data_dir, bool page_compressed, - ulint page_compression_level, - ulint not_used); + ulint page_compression_level); /** Convert a 32 bit integer table flags to the 32 bit FSP Flags. Fsp Flags are written into the tablespace header at the offset @@ -989,14 +971,8 @@ dict_tf_get_page_size( ulint flags) MY_ATTRIBUTE((const)); -/** Get the table page size. -@param[in] table table -@return compressed page size, or 0 if not compressed */ -UNIV_INLINE -const page_size_t -dict_table_page_size( - const dict_table_t* table) - MY_ATTRIBUTE((warn_unused_result)); +/** Get the table page size. */ +#define dict_table_page_size(table) page_size_t(table->space->flags) /*********************************************************************//** Obtain exclusive locks on all index trees of the table. This is to prevent @@ -1077,17 +1053,12 @@ dict_make_room_in_cache( ulint max_tables, /*!< in: max tables allowed in cache */ ulint pct_check); /*!< in: max percent to check */ -#define BIG_ROW_SIZE 1024 - /** Clears the virtual column's index list before index is being freed. @param[in] index Index being freed */ -void -dict_index_remove_from_v_col_list( - dict_index_t* index); +void dict_index_remove_from_v_col_list(dict_index_t* index); /** Adds an index to the dictionary cache, with possible indexing newly added column. -@param[in,out] table table on which the index is @param[in,out] index index; NOTE! The index memory object is freed in this function! @param[in] page_no root page number of the index @@ -1095,7 +1066,6 @@ added column. @return DB_SUCCESS, or DB_CORRUPTION */ dberr_t dict_index_add_to_cache( - dict_table_t* table, dict_index_t*& index, ulint page_no, const dict_add_v_col_t* add_v = NULL) @@ -1112,6 +1082,7 @@ dict_index_get_n_fields( representation of index (in the dictionary cache) */ MY_ATTRIBUTE((nonnull, warn_unused_result)); + /********************************************************************//** Gets the number of fields in the internal representation of an index that uniquely determine the position of an index entry in the index, if @@ -1238,7 +1209,7 @@ Returns TRUE if the index contains a column or a prefix of that column. @param[in] n column number @param[in] is_virtual whether it is a virtual col @return TRUE if contains the column or its prefix */ -ibool +bool dict_index_contains_col_or_prefix( /*==============================*/ const dict_index_t* index, /*!< in: index */ @@ -1377,21 +1348,6 @@ dict_index_build_node_ptr( ulint level) /*!< in: level of rec in tree: 0 means leaf level */ MY_ATTRIBUTE((nonnull, warn_unused_result)); -/**********************************************************************//** -Copies an initial segment of a physical record, long enough to specify an -index entry uniquely. 
-@return pointer to the prefix record */ -rec_t* -dict_index_copy_rec_order_prefix( -/*=============================*/ - const dict_index_t* index, /*!< in: index */ - const rec_t* rec, /*!< in: record for which to - copy prefix */ - ulint* n_fields,/*!< out: number of fields copied */ - byte** buf, /*!< in/out: memory buffer for the - copied prefix, or NULL */ - ulint* buf_size)/*!< in/out: buffer size */ - MY_ATTRIBUTE((nonnull, warn_unused_result)); /** Convert a physical record into a search tuple. @param[in] rec index record (not necessarily in an index page) @param[in] index index @@ -1400,42 +1356,15 @@ dict_index_copy_rec_order_prefix( @param[in,out] heap memory heap for allocation @return own: data tuple */ dtuple_t* -dict_index_build_data_tuple_func( +dict_index_build_data_tuple( const rec_t* rec, const dict_index_t* index, -#ifdef UNIV_DEBUG bool leaf, -#endif /* UNIV_DEBUG */ ulint n_fields, mem_heap_t* heap) MY_ATTRIBUTE((nonnull, warn_unused_result)); -#ifdef UNIV_DEBUG -# define dict_index_build_data_tuple(rec, index, leaf, n_fields, heap) \ - dict_index_build_data_tuple_func(rec, index, leaf, n_fields, heap) -#else /* UNIV_DEBUG */ -# define dict_index_build_data_tuple(rec, index, leaf, n_fields, heap) \ - dict_index_build_data_tuple_func(rec, index, n_fields, heap) -#endif /* UNIV_DEBUG */ /*********************************************************************//** -Gets the space id of the root of the index tree. -@return space id */ -UNIV_INLINE -ulint -dict_index_get_space( -/*=================*/ - const dict_index_t* index) /*!< in: index */ - MY_ATTRIBUTE((nonnull, warn_unused_result)); -/*********************************************************************//** -Sets the space id of the root of the index tree. */ -UNIV_INLINE -void -dict_index_set_space( -/*=================*/ - dict_index_t* index, /*!< in/out: index */ - ulint space) /*!< in: space id */ - MY_ATTRIBUTE((nonnull)); -/*********************************************************************//** Gets the page number of the root of the index tree. @return page number */ UNIV_INLINE @@ -1772,18 +1701,10 @@ dict_set_corrupted_index_cache_only( Flags a table with specified space_id corrupted in the table dictionary cache. @return TRUE if successful */ -ibool -dict_set_corrupted_by_space( -/*========================*/ - ulint space_id); /*!< in: space ID */ +bool dict_set_corrupted_by_space(const fil_space_t* space); -/** Flag a table with specified space_id encrypted in the data dictionary -cache -@param[in] space_id Tablespace id */ -UNIV_INTERN -void -dict_set_encrypted_by_space( - ulint space_id); +/** Flag a table encrypted in the data dictionary cache. */ +void dict_set_encrypted_by_space(const fil_space_t* space); /** Sets merge_threshold in the SYS_INDEXES @param[in,out] index index @@ -1820,18 +1741,6 @@ dict_tf2_is_valid( ulint flags, ulint flags2); -/********************************************************************//** -Check if the tablespace for the table has been discarded. -@return true if the tablespace has been discarded. */ -UNIV_INLINE -bool -dict_table_is_discarded( -/*====================*/ - const dict_table_t* table) /*!< in: table to check */ - MY_ATTRIBUTE((warn_unused_result)); - -#define dict_table_is_temporary(table) (table)->is_temporary() - /*********************************************************************//** This function should be called whenever a page is successfully compressed. Updates the compression padding information. 
*/ @@ -1865,8 +1774,6 @@ dict_tf_to_row_format_string( /*=========================*/ ulint table_flag); /*!< in: row format setting */ -#define dict_col_is_virtual(col) (col)->is_virtual() - /** encode number of columns and number of virtual columns in one 4 bytes value. We could do this because the number of columns in InnoDB is limited to 1017 diff --git a/storage/innobase/include/dict0dict.ic b/storage/innobase/include/dict0dict.ic index 554bfdd50ac..3b8808bf14c 100644 --- a/storage/innobase/include/dict0dict.ic +++ b/storage/innobase/include/dict0dict.ic @@ -288,7 +288,7 @@ dict_index_is_spatial( const dict_index_t* index) /*!< in: index */ { ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); - return(index->type & DICT_SPATIAL); + return ulint(UNIV_EXPECT(index->type & DICT_SPATIAL, 0)); } /********************************************************************//** @@ -329,7 +329,10 @@ dict_table_get_n_user_cols( const dict_table_t* table) /*!< in: table */ { ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); - return(table->n_cols - DATA_N_SYS_COLS); + /* n_cols counts stored columns only. A table may contain + virtual columns and no user-specified stored columns at all. */ + ut_ad(table->n_cols >= DATA_N_SYS_COLS); + return unsigned(table->n_cols) - DATA_N_SYS_COLS; } /********************************************************************//** @@ -459,8 +462,8 @@ dict_table_get_nth_v_col( ut_ad(table); ut_ad(pos < table->n_v_def); ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); - - return(static_cast<dict_v_col_t*>(table->v_cols) + pos); + ut_ad(!table->v_cols[pos].m_col.is_instant()); + return &table->v_cols[pos]; } /********************************************************************//** @@ -474,12 +477,8 @@ dict_table_get_sys_col( ulint sys) /*!< in: DATA_ROW_ID, ... */ { dict_col_t* col; - - ut_ad(sys < DATA_N_SYS_COLS); - ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); - - col = dict_table_get_nth_col(table, table->n_cols - + (sys - DATA_N_SYS_COLS)); + col = dict_table_get_nth_col(table, + dict_table_get_sys_col_no(table, sys)); ut_ad(col->mtype == DATA_SYS); ut_ad(col->prtype == (sys | DATA_NOT_NULL)); @@ -499,25 +498,20 @@ dict_table_get_sys_col_no( { ut_ad(sys < DATA_N_SYS_COLS); ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); - return table->n_cols + (sys - DATA_N_SYS_COLS); + return unsigned(table->n_cols) + (sys - DATA_N_SYS_COLS); } /********************************************************************//** Check whether the table uses the compact page format. @return TRUE if table uses the compact page format */ UNIV_INLINE -ibool +bool dict_table_is_comp( /*===============*/ const dict_table_t* table) /*!< in: table */ { ut_ad(table); - -#if DICT_TF_COMPACT != 1 -#error "DICT_TF_COMPACT must be 1" -#endif - - return(table->flags & DICT_TF_COMPACT); + return (table->flags & DICT_TF_COMPACT) != 0; } /************************************************************************ @@ -550,8 +544,8 @@ dict_tf_is_valid_not_redundant(ulint flags) for the uncompressed page format */ return(false); } else if (zip_ssize > PAGE_ZIP_SSIZE_MAX - || zip_ssize > UNIV_PAGE_SIZE_SHIFT - || UNIV_PAGE_SIZE_SHIFT > UNIV_ZIP_SIZE_SHIFT_MAX) { + || zip_ssize > srv_page_size_shift + || srv_page_size_shift > UNIV_ZIP_SIZE_SHIFT_MAX) { /* KEY_BLOCK_SIZE is out of bounds, or ROW_FORMAT=COMPRESSED is not supported with this innodb_page_size (only up to 16KiB) */ @@ -591,7 +585,7 @@ dict_tf_is_valid( bit. For ROW_FORMAT=REDUNDANT, only the DATA_DIR flag (which we cleared above) can be set. 
If any other flags are set, the flags are invalid. */ - return(flags == 0); + return(flags == 0 || flags == DICT_TF_MASK_NO_ROLLBACK); } return(dict_tf_is_valid_not_redundant(flags)); @@ -647,44 +641,13 @@ dict_tf_get_rec_format( return(REC_FORMAT_DYNAMIC); } -/********************************************************************//** -Determine the file format from a dict_table_t::flags. -@return file format version */ -UNIV_INLINE -ulint -dict_tf_get_format( -/*===============*/ - ulint flags) /*!< in: dict_table_t::flags */ -{ - if (DICT_TF_HAS_ATOMIC_BLOBS(flags)) { - return(UNIV_FORMAT_B); - } - - return(UNIV_FORMAT_A); -} - -/********************************************************************//** -Determine the file format of a table. -@return file format version */ -UNIV_INLINE -ulint -dict_table_get_format( -/*==================*/ - const dict_table_t* table) /*!< in: table */ -{ - ut_ad(table); - - return(dict_tf_get_format(table->flags)); -} - /** Set the various values in a dict_table_t::flags pointer. @param[in,out] flags, Pointer to a 4 byte Table Flags @param[in] format File Format @param[in] zip_ssize Zip Shift Size @param[in] use_data_dir Table uses DATA DIRECTORY @param[in] page_compressed Table uses page compression -@param[in] page_compression_level Page compression level -@param[in] not_used For future */ +@param[in] page_compression_level Page compression level */ UNIV_INLINE void dict_tf_set( @@ -694,8 +657,7 @@ dict_tf_set( ulint zip_ssize, bool use_data_dir, bool page_compressed, - ulint page_compression_level, - ulint not_used) + ulint page_compression_level) { *flags = use_data_dir ? 1 << DICT_TF_POS_DATA_DIR : 0; @@ -811,7 +773,8 @@ dict_tf_to_sys_tables_type( | DICT_TF_MASK_ATOMIC_BLOBS | DICT_TF_MASK_DATA_DIR | DICT_TF_MASK_PAGE_COMPRESSION - | DICT_TF_MASK_PAGE_COMPRESSION_LEVEL); + | DICT_TF_MASK_PAGE_COMPRESSION_LEVEL + | DICT_TF_MASK_NO_ROLLBACK); return(type); } @@ -835,21 +798,7 @@ dict_tf_get_page_size( ut_ad(zip_size <= UNIV_ZIP_SIZE_MAX); - return(page_size_t(zip_size, univ_page_size.logical(), true)); -} - -/** Get the table page size. -@param[in] table table -@return a structure containing the compressed and uncompressed -page sizes and a boolean indicating if the page is compressed */ -UNIV_INLINE -const page_size_t -dict_table_page_size( - const dict_table_t* table) -{ - ut_ad(table != NULL); - - return(dict_tf_get_page_size(table->flags)); + return(page_size_t(zip_size, srv_page_size, true)); } /*********************************************************************//** @@ -1131,36 +1080,6 @@ dict_index_get_min_size( } /*********************************************************************//** -Gets the space id of the root of the index tree. -@return space id */ -UNIV_INLINE -ulint -dict_index_get_space( -/*=================*/ - const dict_index_t* index) /*!< in: index */ -{ - ut_ad(index); - ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); - - return(index->space); -} - -/*********************************************************************//** -Sets the space id of the root of the index tree. */ -UNIV_INLINE -void -dict_index_set_space( -/*=================*/ - dict_index_t* index, /*!< in/out: index */ - ulint space) /*!< in: space id */ -{ - ut_ad(index); - ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); - - index->space = unsigned(space); -} - -/*********************************************************************//** Gets the page number of the root of the index tree. 
@return page number */ UNIV_INLINE @@ -1198,7 +1117,7 @@ ulint dict_index_get_space_reserve(void) /*==============================*/ { - return(UNIV_PAGE_SIZE / 16); + return(srv_page_size / 16); } /********************************************************************//** @@ -1328,7 +1247,7 @@ dict_table_is_fts_column( /**********************************************************************//** Determine bytes of column prefix to be stored in the undo log. Please -note if the table format is UNIV_FORMAT_A (< UNIV_FORMAT_B), no prefix +note that if !dict_table_has_atomic_blobs(table), no prefix needs to be stored in the undo log. @return bytes of column prefix to be stored in the undo log */ UNIV_INLINE @@ -1339,16 +1258,15 @@ dict_max_field_len_store_undo( const dict_col_t* col) /*!< in: column which index prefix is based on */ { - ulint prefix_len = 0; + if (!dict_table_has_atomic_blobs(table)) { + return(0); + } - if (dict_table_get_format(table) >= UNIV_FORMAT_B) - { - prefix_len = col->max_prefix - ? col->max_prefix - : DICT_MAX_FIELD_LEN_BY_FORMAT(table); + if (col->max_prefix != 0) { + return(col->max_prefix); } - return(prefix_len); + return(REC_VERSION_56_MAX_INDEX_COL_LEN); } /** Determine maximum bytes of a virtual column need to be stored @@ -1368,10 +1286,10 @@ dict_max_v_field_len_store_undo( /* This calculation conforms to the non-virtual column maximum log length calculation: - 1) for UNIV_FORMAT_A, upto REC_ANTELOPE_MAX_INDEX_COL_LEN - for UNIV_FORMAT_B, upto col->max_prefix or - 2) REC_VERSION_56_MAX_INDEX_COL_LEN, whichever is less */ - if (dict_table_get_format(table) >= UNIV_FORMAT_B) { + 1) if No atomic BLOB, upto REC_ANTELOPE_MAX_INDEX_COL_LEN + 2) if atomic BLOB, upto col->max_prefix or + REC_VERSION_56_MAX_INDEX_COL_LEN, whichever is less */ + if (dict_table_has_atomic_blobs(table)) { if (DATA_BIG_COL(col) && col->max_prefix > 0) { max_log_len = col->max_prefix; } else { @@ -1412,18 +1330,6 @@ dict_table_is_corrupted( return(table->corrupted); } -/********************************************************************//** -Check if the tablespace for the table has been discarded. -@return true if the tablespace has been discarded. */ -UNIV_INLINE -bool -dict_table_is_discarded( -/*====================*/ - const dict_table_t* table) /*!< in: table to check */ -{ - return(DICT_TF2_FLAG_IS_SET(table, DICT_TF2_DISCARDED)); -} - /** Check if the table is found is a file_per_table tablespace. This test does not use table flags2 since some REDUNDANT tables in the system tablespace may have garbage in the MIX_LEN field where flags2 is @@ -1445,7 +1351,8 @@ bool dict_table_is_file_per_table( const dict_table_t* table) /*!< in: table to check */ { - return !is_system_tablespace(table->space); + return table->space != fil_system.sys_space + && table->space != fil_system.temp_space; } /** Acquire the table handle. */ diff --git a/storage/innobase/include/dict0load.h b/storage/innobase/include/dict0load.h index b288c0b337a..aa3de6d0b17 100644 --- a/storage/innobase/include/dict0load.h +++ b/storage/innobase/include/dict0load.h @@ -55,15 +55,6 @@ enum dict_system_id_t { SYS_NUM_SYSTEM_TABLES }; -/** Status bit for dict_process_sys_tables_rec_and_mtr_commit() */ -enum dict_table_info_t { - DICT_TABLE_LOAD_FROM_RECORD = 0,/*!< Directly populate a dict_table_t - structure with information from - a SYS_TABLES record */ - DICT_TABLE_LOAD_FROM_CACHE = 1 /*!< Check first whether dict_table_t - is in the cache, if so, return it */ -}; - /** Check each tablespace found in the data dictionary. 
Look at each table defined in SYS_TABLES that has a space_id > 0. If the tablespace is not yet in the fil_system cache, look up the @@ -183,10 +174,7 @@ dict_process_sys_tables_rec_and_mtr_commit( mem_heap_t* heap, /*!< in: temporary memory heap */ const rec_t* rec, /*!< in: SYS_TABLES record */ dict_table_t** table, /*!< out: dict_table_t to fill */ - dict_table_info_t status, /*!< in: status bit controls - options such as whether we shall - look for dict_table_t from cache - first */ + bool cached, /*!< in: whether to load from cache */ mtr_t* mtr); /*!< in/out: mini-transaction, will be committed */ /********************************************************************//** @@ -227,7 +215,6 @@ information @return error message, or NULL on success */ const char* dict_process_sys_virtual_rec( - mem_heap_t* heap, const rec_t* rec, table_id_t* table_id, ulint* pos, diff --git a/storage/innobase/include/dict0mem.h b/storage/innobase/include/dict0mem.h index 7ca1ad9ecd3..40c343652fb 100644 --- a/storage/innobase/include/dict0mem.h +++ b/storage/innobase/include/dict0mem.h @@ -2,7 +2,7 @@ Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. -Copyright (c) 2013, 2020, MariaDB Corporation. +Copyright (c) 2013, 2021, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -45,7 +45,6 @@ Created 1/8/1996 Heikki Tuuri #include "buf0buf.h" #include "gis0type.h" #include "fil0fil.h" -#include <my_crypt.h> #include "fil0crypt.h" #include <set> #include <algorithm> @@ -106,7 +105,7 @@ are described in fsp0fsp.h. */ /** dict_table_t::flags bit 0 is equal to 0 if the row format = Redundant */ #define DICT_TF_REDUNDANT 0 /*!< Redundant row format. */ /** dict_table_t::flags bit 0 is equal to 1 if the row format = Compact */ -#define DICT_TF_COMPACT 1 /*!< Compact row format. */ +#define DICT_TF_COMPACT 1U /*!< Compact row format. */ /** This bitmask is used in SYS_TABLES.N_COLS to set and test whether the Compact page format is used, i.e ROW_FORMAT != REDUNDANT */ @@ -118,9 +117,10 @@ the Compact page format is used, i.e ROW_FORMAT != REDUNDANT */ /** Width of the ZIP_SSIZE flag */ #define DICT_TF_WIDTH_ZIP_SSIZE 4 -/** Width of the ATOMIC_BLOBS flag. The Antelope file formats broke up -BLOB and TEXT fields, storing the first 768 bytes in the clustered index. -Barracuda row formats store the whole blob or text field off-page atomically. +/** Width of the ATOMIC_BLOBS flag. The ROW_FORMAT=REDUNDANT and +ROW_FORMAT=COMPACT broke up BLOB and TEXT fields, storing the first 768 bytes +in the clustered index. ROW_FORMAT=DYNAMIC and ROW_FORMAT=COMPRESSED +store the whole blob or text field off-page atomically. Secondary indexes are created from this external data using row_ext_t to cache the BLOB prefixes. 
*/ #define DICT_TF_WIDTH_ATOMIC_BLOBS 1 @@ -138,10 +138,10 @@ Width of the page compression flag #define DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL 4 /** -Width of atomic writes flag -DEFAULT=0, ON = 1, OFF = 2 +The NO_ROLLBACK flag (3=yes; the values 1,2 used stand for +ATOMIC_WRITES=ON and ATOMIC_WRITES=OFF between MariaDB 10.1.0 and 10.2.3) */ -#define DICT_TF_WIDTH_ATOMIC_WRITES 2 +#define DICT_TF_WIDTH_NO_ROLLBACK 2 /** Width of all the currently known table flags */ #define DICT_TF_BITS (DICT_TF_WIDTH_COMPACT \ @@ -149,7 +149,8 @@ DEFAULT=0, ON = 1, OFF = 2 + DICT_TF_WIDTH_ATOMIC_BLOBS \ + DICT_TF_WIDTH_DATA_DIR \ + DICT_TF_WIDTH_PAGE_COMPRESSION \ - + DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL) + + DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL \ + + DICT_TF_WIDTH_NO_ROLLBACK) /** Zero relative shift position of the COMPACT field */ #define DICT_TF_POS_COMPACT 0 @@ -168,11 +169,11 @@ DEFAULT=0, ON = 1, OFF = 2 /** Zero relative shift position of the PAGE_COMPRESSION_LEVEL field */ #define DICT_TF_POS_PAGE_COMPRESSION_LEVEL (DICT_TF_POS_PAGE_COMPRESSION \ + DICT_TF_WIDTH_PAGE_COMPRESSION) -/** Zero relative shift position of the ATOMIC_WRITES field */ -#define DICT_TF_POS_ATOMIC_WRITES (DICT_TF_POS_PAGE_COMPRESSION_LEVEL \ +/** Zero relative shift position of the NO_ROLLBACK field */ +#define DICT_TF_POS_NO_ROLLBACK (DICT_TF_POS_PAGE_COMPRESSION_LEVEL \ + DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL) -#define DICT_TF_POS_UNUSED (DICT_TF_POS_ATOMIC_WRITES \ - + DICT_TF_WIDTH_ATOMIC_WRITES) +#define DICT_TF_POS_UNUSED (DICT_TF_POS_NO_ROLLBACK \ + + DICT_TF_WIDTH_NO_ROLLBACK) /** Bit mask of the COMPACT field */ #define DICT_TF_MASK_COMPACT \ @@ -198,10 +199,10 @@ DEFAULT=0, ON = 1, OFF = 2 #define DICT_TF_MASK_PAGE_COMPRESSION_LEVEL \ ((~(~0U << DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL)) \ << DICT_TF_POS_PAGE_COMPRESSION_LEVEL) -/** Bit mask of the ATOMIC_WRITES field */ -#define DICT_TF_MASK_ATOMIC_WRITES \ - ((~(~0U << DICT_TF_WIDTH_ATOMIC_WRITES)) \ - << DICT_TF_POS_ATOMIC_WRITES) +/** Bit mask of the NO_ROLLBACK field */ +#define DICT_TF_MASK_NO_ROLLBACK \ + ((~(~0U << DICT_TF_WIDTH_NO_ROLLBACK)) \ + << DICT_TF_POS_NO_ROLLBACK) /** Return the value of the COMPACT field */ #define DICT_TF_GET_COMPACT(flags) \ @@ -227,10 +228,6 @@ DEFAULT=0, ON = 1, OFF = 2 #define DICT_TF_GET_PAGE_COMPRESSION_LEVEL(flags) \ ((flags & DICT_TF_MASK_PAGE_COMPRESSION_LEVEL) \ >> DICT_TF_POS_PAGE_COMPRESSION_LEVEL) -/** Return the value of the ATOMIC_WRITES field */ -#define DICT_TF_GET_ATOMIC_WRITES(flags) \ - ((flags & DICT_TF_MASK_ATOMIC_WRITES) \ - >> DICT_TF_POS_ATOMIC_WRITES) /* @} */ @@ -298,24 +295,22 @@ parent table will fail, and user has to drop excessive foreign constraint before proceeds. */ #define FK_MAX_CASCADE_DEL 15 -/** Creates a table memory object. -@param[in] name table name -@param[in] space space where the clustered index - of the table is placed -@param[in] n_cols total number of columns including - virtual and non-virtual columns -@param[in] n_v_cols number of virtual columns -@param[in] flags table flags -@param[in] flags2 table flags2 +/** Create a table memory object. 
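To make the flag layout concrete, the worked example below recomputes the NO_ROLLBACK position and mask from the declared field widths; it only restates what the DICT_TF_* macros encode and is not part of the patch.

// Worked example of the DICT_TF width/position/mask arithmetic.
#include <cassert>

static const unsigned WIDTH_COMPACT                = 1;
static const unsigned WIDTH_ZIP_SSIZE              = 4;
static const unsigned WIDTH_ATOMIC_BLOBS           = 1;
static const unsigned WIDTH_DATA_DIR               = 1;
static const unsigned WIDTH_PAGE_COMPRESSION       = 1;
static const unsigned WIDTH_PAGE_COMPRESSION_LEVEL = 4;
static const unsigned WIDTH_NO_ROLLBACK            = 2;

// A field mask is "width low-order one-bits, shifted to the field position".
static unsigned field_mask(unsigned width, unsigned pos)
{
    return (~(~0U << width)) << pos;
}

int main()
{
    const unsigned pos_no_rollback =
        WIDTH_COMPACT + WIDTH_ZIP_SSIZE + WIDTH_ATOMIC_BLOBS + WIDTH_DATA_DIR
        + WIDTH_PAGE_COMPRESSION + WIDTH_PAGE_COMPRESSION_LEVEL;
    assert(pos_no_rollback == 12);                               // DICT_TF_POS_NO_ROLLBACK
    assert(field_mask(WIDTH_NO_ROLLBACK, pos_no_rollback) == 0x3000U);
    // 0x3000 is the DICT_TF_MASK_NO_ROLLBACK value that dict_tf_is_valid()
    // now also accepts in its ROW_FORMAT=REDUNDANT branch.
    return 0;
}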
+@param name table name +@param space tablespace +@param n_cols total number of columns (both virtual and non-virtual) +@param n_v_cols number of virtual columns +@param flags table flags +@param flags2 table flags2 @return own: table object */ dict_table_t* dict_mem_table_create( - const char* name, - ulint space, - ulint n_cols, - ulint n_v_cols, - ulint flags, - ulint flags2); + const char* name, + fil_space_t* space, + ulint n_cols, + ulint n_v_cols, + ulint flags, + ulint flags2); /****************************************************************//** Free a table memory object. */ @@ -400,11 +395,7 @@ dict_mem_fill_index_struct( /*=======================*/ dict_index_t* index, /*!< out: index to be filled */ mem_heap_t* heap, /*!< in: memory heap */ - const char* table_name, /*!< in: table name */ const char* index_name, /*!< in: index name */ - ulint space, /*!< in: space where the index tree is - placed, ignored if the index is of - the clustered type */ ulint type, /*!< in: DICT_UNIQUE, DICT_CLUSTERED, ... ORed */ ulint n_fields); /*!< in: number of fields */ @@ -414,11 +405,8 @@ Creates an index memory object. dict_index_t* dict_mem_index_create( /*==================*/ - const char* table_name, /*!< in: table name */ + dict_table_t* table, /*!< in: table */ const char* index_name, /*!< in: index name */ - ulint space, /*!< in: space where the index tree is - placed, ignored if the index is of - the clustered type */ ulint type, /*!< in: DICT_UNIQUE, DICT_CLUSTERED, ... ORed */ ulint n_fields); /*!< in: number of fields */ @@ -501,10 +489,6 @@ dict_mem_create_temporary_tablename( const char* dbtab, table_id_t id); -/** Initialize dict memory variables */ -void -dict_mem_init(void); - /** SQL identifier name wrapper for pretty-printing */ class id_name_t { @@ -548,44 +532,6 @@ private: const char* m_name; }; -/** Table name wrapper for pretty-printing */ -struct table_name_t -{ - /** The name in internal representation */ - char* m_name; - - /** Default constructor */ - table_name_t() {} - /** Constructor */ - table_name_t(char* name) : m_name(name) {} - - /** @return the end of the schema name */ - const char* dbend() const - { - const char* sep = strchr(m_name, '/'); - ut_ad(sep); - return sep; - } - - /** @return the length of the schema name, in bytes */ - size_t dblen() const { return dbend() - m_name; } - - /** Determine the filename-safe encoded table name. - @return the filename-safe encoded table name */ - const char* basename() const { return dbend() + 1; } - - /** The start of the table basename suffix for partitioned tables */ - static const char part_suffix[4]; - - /** Determine the partition or subpartition name suffix. - @return the partition name - @retval NULL if the table is not partitioned */ - const char* part() const { return strstr(basename(), part_suffix); } - - /** @return whether this is a temporary or intermediate table name */ - inline bool is_temporary() const; -}; - /** Data structure for a column in a table */ struct dict_col_t{ /*----------------------*/ @@ -627,14 +573,75 @@ struct dict_col_t{ of an index */ unsigned max_prefix:12; /*!< maximum index prefix length on this column. Our current max limit is - 3072 for Barracuda table */ - - /** @return whether this is a virtual column */ - bool is_virtual() const { return prtype & DATA_VIRTUAL; } + 3072 (REC_VERSION_56_MAX_INDEX_COL_LEN) + bytes. */ /** Detach a virtual column from an index. 
@param index being-freed index */ inline void detach(const dict_index_t &index); + + /** Data for instantly added columns */ + struct def_t + { + /** original default value of instantly added column */ + const void *data; + /** len of data, or UNIV_SQL_DEFAULT if unavailable */ + ulint len; + } def_val; + + /** Retrieve the column name. + @param[in] table the table of this column */ + const char *name(const dict_table_t &table) const; + + /** @return whether this is a virtual column */ + bool is_virtual() const { return prtype & DATA_VIRTUAL; } + /** @return whether NULL is an allowed value for this column */ + bool is_nullable() const { return !(prtype & DATA_NOT_NULL); } + + /** @return whether table of this system field is TRX_ID-based */ + bool vers_native() const + { + ut_ad(vers_sys_start() || vers_sys_end()); + ut_ad(mtype == DATA_INT || mtype == DATA_FIXBINARY); + return mtype == DATA_INT; + } + /** @return whether this is system versioned */ + bool is_versioned() const { return !(~prtype & DATA_VERSIONED); } + /** @return whether this is the system version start */ + bool vers_sys_start() const + { + return (prtype & DATA_VERSIONED) == DATA_VERS_START; + } + /** @return whether this is the system version end */ + bool vers_sys_end() const + { + return (prtype & DATA_VERSIONED) == DATA_VERS_END; + } + + /** @return whether this is an instantly-added column */ + bool is_instant() const + { + DBUG_ASSERT(def_val.len != UNIV_SQL_DEFAULT || !def_val.data); + return def_val.len != UNIV_SQL_DEFAULT; + } + /** Get the default value of an instantly-added column. + @param[out] len value length (in bytes), or UNIV_SQL_NULL + @return default value + @retval NULL if the default value is SQL NULL (len=UNIV_SQL_NULL) */ + const byte *instant_value(ulint *len) const + { + DBUG_ASSERT(is_instant()); + *len= def_val.len; + return static_cast<const byte*>(def_val.data); + } + + /** Remove the 'instant ADD' status of the column */ + void remove_instant() + { + DBUG_ASSERT(is_instant()); + def_val.len= UNIV_SQL_DEFAULT; + def_val.data= NULL; + } }; /** Index information put in a list of virtual column structure. Index @@ -646,6 +653,9 @@ struct dict_v_idx_t { /** position in this index */ ulint nth_field; + + dict_v_idx_t(dict_index_t* index, ulint nth_field) + : index(index), nth_field(nth_field) {} }; /** Index list to put in dict_v_col_t */ @@ -743,17 +753,17 @@ files would be at risk! */ /** Find out maximum indexed column length by its table format. For ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT, the maximum field length is REC_ANTELOPE_MAX_INDEX_COL_LEN - 1 (767). For -Barracuda row formats COMPRESSED and DYNAMIC, the length could +ROW_FORMAT=COMPRESSED and ROW_FORMAT=DYNAMIC, the length could be REC_VERSION_56_MAX_INDEX_COL_LEN (3072) bytes */ -#define DICT_MAX_FIELD_LEN_BY_FORMAT(table) \ - ((dict_table_get_format(table) < UNIV_FORMAT_B) \ - ? (REC_ANTELOPE_MAX_INDEX_COL_LEN - 1) \ - : REC_VERSION_56_MAX_INDEX_COL_LEN) +#define DICT_MAX_FIELD_LEN_BY_FORMAT(table) \ + (dict_table_has_atomic_blobs(table) \ + ? REC_VERSION_56_MAX_INDEX_COL_LEN \ + : REC_ANTELOPE_MAX_INDEX_COL_LEN - 1) -#define DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG(flags) \ - ((DICT_TF_HAS_ATOMIC_BLOBS(flags) < UNIV_FORMAT_B) \ - ? (REC_ANTELOPE_MAX_INDEX_COL_LEN - 1) \ - : REC_VERSION_56_MAX_INDEX_COL_LEN) +#define DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG(flags) \ + (DICT_TF_HAS_ATOMIC_BLOBS(flags) \ + ? 
REC_VERSION_56_MAX_INDEX_COL_LEN \ + : REC_ANTELOPE_MAX_INDEX_COL_LEN - 1) /** Defines the maximum fixed length column size */ #define DICT_MAX_FIXED_COL_LEN DICT_ANTELOPE_MAX_INDEX_COL_LEN @@ -780,6 +790,15 @@ struct dict_field_t{ /** Zero-initialize all fields */ dict_field_t() : col(NULL), name(NULL), prefix_len(0), fixed_len(0) {} + + /** Check whether two index fields are equivalent. + @param[in] old the other index field + @return whether the index fields are equivalent */ + bool same(const dict_field_t& other) const + { + return(prefix_len == other.prefix_len + && fixed_len == other.fixed_len); + } }; /**********************************************************************//** @@ -848,14 +867,11 @@ struct dict_index_t{ index_id_t id; /*!< id of the index */ mem_heap_t* heap; /*!< memory heap */ id_name_t name; /*!< index name */ - const char* table_name;/*!< table name */ dict_table_t* table; /*!< back pointer to table */ - unsigned space:32; - /*!< space where the index tree is placed */ /** root page number, or FIL_NULL if the index has been detached from storage (DISCARD TABLESPACE or similar), or 1 if the index is in table->freed_indexes */ - unsigned page:32;/*!< index tree root page number */ + unsigned page:32; unsigned merge_threshold:6; /*!< In the pessimistic delete, if the page data size drops below this limit in percent, @@ -870,8 +886,8 @@ struct dict_index_t{ in a clustered index record, if the fields before it are known to be of a fixed size, 0 otherwise */ -#if (1<<MAX_KEY_LENGTH_BITS) < MAX_KEY_LENGTH -# error (1<<MAX_KEY_LENGTH_BITS) < MAX_KEY_LENGTH +#if (1<<MAX_KEY_LENGTH_BITS) < HA_MAX_KEY_LENGTH +# error (1<<MAX_KEY_LENGTH_BITS) < HA_MAX_KEY_LENGTH #endif unsigned n_user_defined_cols:10; /*!< number of columns the user defined to @@ -894,6 +910,17 @@ struct dict_index_t{ unsigned n_def:10;/*!< number of fields defined so far */ unsigned n_fields:10;/*!< number of fields in the index */ unsigned n_nullable:10;/*!< number of nullable fields */ + unsigned n_core_fields:10;/*!< number of fields in the index + (before the first time of instant add columns) */ + /** number of bytes of null bits in ROW_FORMAT!=REDUNDANT node pointer + records; usually equal to UT_BITS_IN_BYTES(n_nullable), but + can be less in clustered indexes with instant ADD COLUMN */ + unsigned n_core_null_bytes:8; + /** magic value signalling that n_core_null_bytes was not + initialized yet */ + static const unsigned NO_CORE_NULL_BYTES = 0xff; + /** The clustered index ID of the hard-coded SYS_INDEXES table. */ + static const unsigned DICT_INDEXES_ID = 3; unsigned cached:1;/*!< TRUE if the index object is in the dictionary cache */ unsigned to_be_dropped:1; @@ -1022,6 +1049,10 @@ struct dict_index_t{ uncommitted = !committed; } + /** Notify that the index pages are going to be modified. 
+ @param[in,out] mtr mini-transaction */ + inline void set_modified(mtr_t& mtr) const; + /** @return whether this index is readable @retval true normally @retval false if this is a single-table tablespace @@ -1029,6 +1060,9 @@ struct dict_index_t{ page cannot be read or decrypted */ inline bool is_readable() const; + /** @return whether instant ADD COLUMN is in effect */ + inline bool is_instant() const; + /** @return whether the index is the primary key index (not the clustered index of the change buffer) */ bool is_primary() const @@ -1036,6 +1070,21 @@ struct dict_index_t{ return DICT_CLUSTERED == (type & (DICT_CLUSTERED | DICT_IBUF)); } + /** @return whether this is a generated clustered index */ + bool is_gen_clust() const { return type == DICT_CLUSTERED; } + + /** @return whether this is a clustered index */ + bool is_clust() const { return type & DICT_CLUSTERED; } + + /** @return whether this is a unique index */ + bool is_unique() const { return type & DICT_UNIQUE; } + + /** @return whether this is a spatial index */ + bool is_spatial() const { return UNIV_UNLIKELY(type & DICT_SPATIAL); } + + /** @return whether this is the change buffer */ + bool is_ibuf() const { return UNIV_UNLIKELY(type & DICT_IBUF); } + /** @return whether the index includes virtual columns */ bool has_virtual() const { return type & DICT_VIRTUAL; } @@ -1075,8 +1124,68 @@ struct dict_index_t{ } } - /** @return whether this is the change buffer */ - bool is_ibuf() const { return UNIV_UNLIKELY(type & DICT_IBUF); } + /** Determine how many fields of a given prefix can be set NULL. + @param[in] n_prefix number of fields in the prefix + @return number of fields 0..n_prefix-1 that can be set NULL */ + unsigned get_n_nullable(ulint n_prefix) const + { + DBUG_ASSERT(n_prefix > 0); + DBUG_ASSERT(n_prefix <= n_fields); + unsigned n = n_nullable; + for (; n_prefix < n_fields; n_prefix++) { + const dict_col_t* col = fields[n_prefix].col; + DBUG_ASSERT(!col->is_virtual()); + n -= col->is_nullable(); + } + DBUG_ASSERT(n < n_def); + return n; + } + + /** Get the default value of an instantly-added clustered index field. + @param[in] n instantly added field position + @param[out] len value length (in bytes), or UNIV_SQL_NULL + @return default value + @retval NULL if the default value is SQL NULL (len=UNIV_SQL_NULL) */ + const byte* instant_field_value(ulint n, ulint* len) const + { + DBUG_ASSERT(is_instant() || id == DICT_INDEXES_ID); + DBUG_ASSERT(n + (id == DICT_INDEXES_ID) >= n_core_fields); + DBUG_ASSERT(n < n_fields); + return fields[n].col->instant_value(len); + } + + /** Adjust clustered index metadata for instant ADD COLUMN. + @param[in] clustered index definition after instant ADD COLUMN */ + void instant_add_field(const dict_index_t& instant); + + /** Remove the 'instant ADD' status of a clustered index. + Protected by index root page x-latch or table X-lock. */ + void remove_instant() + { + DBUG_ASSERT(is_primary()); + if (!is_instant()) { + return; + } + for (unsigned i = n_core_fields; i < n_fields; i++) { + fields[i].col->remove_instant(); + } + n_core_fields = n_fields; + n_core_null_bytes = UT_BITS_IN_BYTES(unsigned(n_nullable)); + } + + /** Check if record in clustered index is historical row. + @param[in] rec clustered row + @param[in] offsets offsets + @return true if row is historical */ + bool + vers_history_row(const rec_t* rec, const rec_offs* offsets); + + /** Check if record in secondary index is historical row. 
+ @param[in] rec record in a secondary index + @param[out] history_row true if row is historical + @return true on error */ + bool + vers_history_row(const rec_t* rec, bool &history_row); /** Assign the number of new column to be added as a part of the index @@ -1544,6 +1653,11 @@ struct dict_table_t { @return whether the last handle was released */ inline bool release(); + /** @return whether the table supports transactions */ + bool no_rollback() const + { + return !(~unsigned(flags) & DICT_TF_MASK_NO_ROLLBACK); + } /** @return whether this is a temporary table */ bool is_temporary() const { @@ -1557,6 +1671,7 @@ struct dict_table_t { page cannot be read or decrypted */ bool is_readable() const { + ut_ad(file_unreadable || space); return(UNIV_LIKELY(!file_unreadable)); } @@ -1567,6 +1682,66 @@ struct dict_table_t { return strstr(name, "/" TEMP_FILE_PREFIX) != NULL; } + /** @return whether instant ADD COLUMN is in effect */ + bool is_instant() const + { + return(UT_LIST_GET_FIRST(indexes)->is_instant()); + } + + /** @return whether the table supports instant ADD COLUMN */ + bool supports_instant() const + { + return(!(flags & DICT_TF_MASK_ZIP_SSIZE)); + } + + /** Adjust metadata for instant ADD COLUMN. + @param[in] table table definition after instant ADD COLUMN */ + void instant_add_column(const dict_table_t& table); + + /** Roll back instant_add_column(). + @param[in] old_n_cols original n_cols + @param[in] old_cols original cols + @param[in] old_col_names original col_names */ + void rollback_instant( + unsigned old_n_cols, + dict_col_t* old_cols, + const char* old_col_names); + + /** Trim the instantly added columns when an insert into SYS_COLUMNS + is rolled back during ALTER TABLE or recovery. + @param[in] n number of surviving non-system columns */ + void rollback_instant(unsigned n); + + /** Add the table definition to the data dictionary cache */ + void add_to_cache(); + + /** @return whether the table is versioned. + It is assumed that both vers_start and vers_end set to 0 + iff table is not versioned. In any other case, + these fields correspond to actual positions in cols[]. */ + bool versioned() const { return vers_start || vers_end; } + bool versioned_by_id() const + { + return versioned() && cols[vers_start].mtype == DATA_INT; + } + + void inc_fk_checks() + { +#ifdef UNIV_DEBUG + lint fk_checks= (lint) +#endif + my_atomic_addlint(&n_foreign_key_checks_running, 1); + ut_ad(fk_checks >= 0); + } + void dec_fk_checks() + { +#ifdef UNIV_DEBUG + lint fk_checks= (lint) +#endif + my_atomic_addlint(&n_foreign_key_checks_running, ulint(-1)); + ut_ad(fk_checks > 0); + } + /** For overflow fields returns potential max length stored inline */ size_t get_overflow_field_local_len() const; @@ -1585,8 +1760,10 @@ struct dict_table_t { /** NULL or the directory path specified by DATA DIRECTORY. */ char* data_dir_path; - /** Space where the clustered index of the table is placed. 
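The inc_fk_checks()/dec_fk_checks() pattern above relies on an atomic fetch-and-add. A standalone sketch using std::atomic in place of my_atomic_addlint, assuming the latter returns the value as it was before the addition:

// Sketch of the foreign-key-check counter with std::atomic as a stand-in.
#include <atomic>
#include <cassert>

static std::atomic<long> n_foreign_key_checks_running{0};

static void inc_fk_checks()
{
    long before = n_foreign_key_checks_running.fetch_add(1);
    assert(before >= 0);            // debug-only sanity check in the original
}

static void dec_fk_checks()
{
    long before = n_foreign_key_checks_running.fetch_sub(1);
    assert(before > 0);             // the counter must not underflow
}

int main()
{
    inc_fk_checks();
    dec_fk_checks();
    return int(n_foreign_key_checks_running.load());  // 0
}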
*/ - uint32_t space; + /** The tablespace of the table */ + fil_space_t* space; + /** Tablespace ID */ + ulint space_id; /** Stores information about: 1 row format (redundant or compact), @@ -1685,7 +1862,10 @@ struct dict_table_t { /** Virtual column names */ const char* v_col_names; - + unsigned vers_start:10; + /*!< System Versioning: row start col index */ + unsigned vers_end:10; + /*!< System Versioning: row end col index */ bool is_system_db; /*!< True if the table belongs to a system database (mysql, information_schema or @@ -1883,7 +2063,7 @@ struct dict_table_t { ulong n_waiting_or_granted_auto_inc_locks; /** The transaction that currently holds the the AUTOINC lock on this - table. Protected by lock_sys->mutex. */ + table. Protected by lock_sys.mutex. */ const trx_t* autoinc_trx; /* @} */ @@ -1898,7 +2078,7 @@ struct dict_table_t { /** Count of the number of record locks on this table. We use this to determine whether we can evict the table from the dictionary cache. - It is protected by lock_sys->mutex. */ + It is protected by lock_sys.mutex. */ ulint n_rec_locks; private: @@ -1908,7 +2088,7 @@ private: int32 n_ref_count; public: - /** List of locks on the table. Protected by lock_sys->mutex. */ + /** List of locks on the table. Protected by lock_sys.mutex. */ table_lock_list_t locks; /** Timestamp of the last modification of this table. */ @@ -1937,14 +2117,27 @@ public: } }; +inline void dict_index_t::set_modified(mtr_t& mtr) const +{ + mtr.set_named_space(table->space); +} + inline bool table_name_t::is_temporary() const { return dict_table_t::is_temporary_name(m_name); } -inline bool dict_index_t::is_readable() const +inline bool dict_index_t::is_readable() const { return table->is_readable(); } + +inline bool dict_index_t::is_instant() const { - return(UNIV_LIKELY(!table->file_unreadable)); + ut_ad(n_core_fields > 0); + ut_ad(n_core_fields <= n_fields); + ut_ad(n_core_fields == n_fields + || (type & ~(DICT_UNIQUE | DICT_CORRUPT)) == DICT_CLUSTERED); + ut_ad(n_core_fields == n_fields || table->supports_instant()); + ut_ad(n_core_fields == n_fields || !table->is_temporary()); + return(n_core_fields != n_fields); } inline bool dict_index_t::is_corrupted() const diff --git a/storage/innobase/include/dict0mem.ic b/storage/innobase/include/dict0mem.ic index d63f15ef7f3..090ec73278b 100644 --- a/storage/innobase/include/dict0mem.ic +++ b/storage/innobase/include/dict0mem.ic @@ -37,11 +37,7 @@ dict_mem_fill_index_struct( /*=======================*/ dict_index_t* index, /*!< out: index to be filled */ mem_heap_t* heap, /*!< in: memory heap */ - const char* table_name, /*!< in: table name */ const char* index_name, /*!< in: index name */ - ulint space, /*!< in: space where the index tree is - placed, ignored if the index is of - the clustered type */ ulint type, /*!< in: DICT_UNIQUE, DICT_CLUSTERED, ... ORed */ ulint n_fields) /*!< in: number of fields */ @@ -61,11 +57,10 @@ dict_mem_fill_index_struct( /* Assign a ulint to a 4-bit-mapped field. Only the low-order 4 bits are assigned. 
*/ index->type = unsigned(type); - index->space = (unsigned int) space; index->page = FIL_NULL; index->merge_threshold = DICT_INDEX_MERGE_THRESHOLD_DEFAULT; - index->table_name = table_name; index->n_fields = (unsigned int) n_fields; + index->n_core_fields = (unsigned int) n_fields; /* The '1 +' above prevents allocation of an empty mem block */ index->nulls_equal = false; diff --git a/storage/innobase/include/dict0stats.ic b/storage/innobase/include/dict0stats.ic index c467ee1f3ac..b1c0a421dd5 100644 --- a/storage/innobase/include/dict0stats.ic +++ b/storage/innobase/include/dict0stats.ic @@ -184,7 +184,7 @@ dict_stats_deinit( table->stat_initialized = FALSE; -#ifdef HAVE_valgrind_or_MSAN +#ifdef HAVE_valgrind MEM_UNDEFINED(&table->stat_n_rows, sizeof table->stat_n_rows); MEM_UNDEFINED(&table->stat_clustered_index_size, sizeof table->stat_clustered_index_size); @@ -217,5 +217,5 @@ dict_stats_deinit( &index->stat_n_leaf_pages, sizeof(index->stat_n_leaf_pages)); } -#endif /* HAVE_valgrind_or_MSAN */ +#endif /* HAVE_valgrind */ } diff --git a/storage/innobase/include/dict0stats_bg.h b/storage/innobase/include/dict0stats_bg.h index 9c611640fdf..66b98629033 100644 --- a/storage/innobase/include/dict0stats_bg.h +++ b/storage/innobase/include/dict0stats_bg.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 2012, 2017, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, MariaDB Corporation. +Copyright (c) 2017, 2018, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -114,16 +114,9 @@ dict_stats_thread_deinit(); #ifdef UNIV_DEBUG /** Disables dict stats thread. It's used by: SET GLOBAL innodb_dict_stats_disabled_debug = 1 (0). -@param[in] thd thread handle -@param[in] var pointer to system variable -@param[out] var_ptr where the formal string goes @param[in] save immediate result from check function */ -void -dict_stats_disabled_debug_update( - THD* thd, - struct st_mysql_sys_var* var, - void* var_ptr, - const void* save); +void dict_stats_disabled_debug_update(THD*, st_mysql_sys_var*, void*, + const void* save); #endif /* UNIV_DEBUG */ /*****************************************************************//** diff --git a/storage/innobase/include/dict0types.h b/storage/innobase/include/dict0types.h index bea08f398de..1e16e501a48 100644 --- a/storage/innobase/include/dict0types.h +++ b/storage/innobase/include/dict0types.h @@ -28,6 +28,7 @@ Created 1/8/1996 Heikki Tuuri #define dict0types_h #include <ut0mutex.h> +#include <rem0types.h> struct dict_sys_t; struct dict_col_t; @@ -52,6 +53,13 @@ DICT_IBUF_ID_MIN plus the space id */ typedef ib_id_t table_id_t; typedef ib_id_t index_id_t; +/** Maximum transaction identifier */ +#define TRX_ID_MAX IB_ID_MAX + +/** The bit pattern corresponding to TRX_ID_MAX */ +extern const byte trx_id_max_bytes[8]; +extern const byte timestamp_max_bytes[7]; + /** Error to ignore when we load table dictionary into memory. However, the table and index will be marked as "corrupted", and caller will be responsible to deal with corrupted table or index. 
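As an aside on the trx_id_max_bytes[8] and timestamp_max_bytes[7] constants declared in the dict0types.h hunk above: they are byte patterns for maximal stored values. The standalone sketch below only illustrates the general idea of such a pattern for the usual 6-byte DB_TRX_ID column width; it is not part of this patch, and the helper name fill_max_bytes is hypothetical and does not exist in InnoDB.

    #include <cassert>
    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    /* Fill buf with the big-endian encoding of the largest unsigned
    value that fits in len bytes, i.e. all bits set. */
    static void fill_max_bytes(unsigned char* buf, size_t len)
    {
            std::memset(buf, 0xff, len);
    }

    int main()
    {
            unsigned char id[6];    /* InnoDB stores DB_TRX_ID in 6 bytes */
            fill_max_bytes(id, sizeof id);

            /* Decoding the pattern back yields 2^48 - 1, the largest
            transaction identifier that a 6-byte column can hold. */
            uint64_t v = 0;
            for (size_t i = 0; i < sizeof id; i++) {
                    v = (v << 8) | id[i];
            }
            assert(v == (uint64_t(1) << 48) - 1);
            return 0;
    }
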
@@ -95,6 +103,44 @@ typedef ib_mutex_t DictSysMutex; #define TEMP_TABLE_PREFIX "#sql" #define TEMP_TABLE_PATH_PREFIX "/" TEMP_TABLE_PREFIX +/** Table name wrapper for pretty-printing */ +struct table_name_t +{ + /** The name in internal representation */ + char* m_name; + + /** Default constructor */ + table_name_t() {} + /** Constructor */ + table_name_t(char* name) : m_name(name) {} + + /** @return the end of the schema name */ + const char* dbend() const + { + const char* sep = strchr(m_name, '/'); + ut_ad(sep); + return sep; + } + + /** @return the length of the schema name, in bytes */ + size_t dblen() const { return size_t(dbend() - m_name); } + + /** Determine the filename-safe encoded table name. + @return the filename-safe encoded table name */ + const char* basename() const { return dbend() + 1; } + + /** The start of the table basename suffix for partitioned tables */ + static const char part_suffix[4]; + + /** Determine the partition or subpartition name suffix. + @return the partition name + @retval NULL if the table is not partitioned */ + const char* part() const { return strstr(basename(), part_suffix); } + + /** @return whether this is a temporary or intermediate table name */ + inline bool is_temporary() const; +}; + #if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG /** Dump the change buffer at startup */ extern my_bool ibuf_dump; diff --git a/storage/innobase/include/dyn0buf.h b/storage/innobase/include/dyn0buf.h index da8d4b7de26..bd883eb796c 100644 --- a/storage/innobase/include/dyn0buf.h +++ b/storage/innobase/include/dyn0buf.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2019, 2020, MariaDB Corporation. +Copyright (c) 2018, 2020, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -33,27 +33,27 @@ Created 2013-03-16 Sunny Bains /** Class that manages dynamic buffers. It uses a UT_LIST of -dyn_buf_t::block_t instances. We don't use STL containers in +mtr_buf_t::block_t instances. We don't use STL containers in order to avoid the overhead of heap calls. Using a custom memory allocator doesn't solve the problem either because we have to get the memory from somewhere. We can't use the block_t::m_data as the backend for the custom allocator because we would like the data in the blocks to be contiguous. */ -template <size_t SIZE = DYN_ARRAY_DATA_SIZE> -class dyn_buf_t { +class mtr_buf_t { public: + /** SIZE - sizeof(m_node) + sizeof(m_used) */ + enum { MAX_DATA_SIZE = DYN_ARRAY_DATA_SIZE + - sizeof(ilist_node<>) + sizeof(uint32_t) }; class block_t : public ilist_node<> { public: block_t() { - ut_ad(MAX_DATA_SIZE <= (2 << 15)); + compile_time_assert(MAX_DATA_SIZE <= (2 << 15)); init(); } - ~block_t() { } - /** Gets the number of used bytes in a block. 
@return number of bytes used */ @@ -108,12 +108,12 @@ public: /** @return pointer to start of reserved space */ template <typename Type> - Type push(ib_uint32_t size) + Type push(uint32_t size) { Type ptr = reinterpret_cast<Type>(end()); m_used += size; - ut_ad(m_used <= static_cast<ib_uint32_t>(MAX_DATA_SIZE)); + ut_ad(m_used <= uint32_t(MAX_DATA_SIZE)); return(ptr); } @@ -127,7 +127,7 @@ public: ut_ad(ptr <= begin() + m_buf_end); /* We have done the boundary check above */ - m_used = static_cast<ib_uint32_t>(ptr - begin()); + m_used = uint32_t(ptr - begin()); ut_ad(m_used <= MAX_DATA_SIZE); ut_d(m_buf_end = 0); @@ -150,29 +150,20 @@ public: ulint m_magic_n; #endif /* UNIV_DEBUG */ - /** SIZE - sizeof(m_node) + sizeof(m_used) */ - enum { - MAX_DATA_SIZE = SIZE - - sizeof(ilist_node<>) - + sizeof(ib_uint32_t) - }; - /** Storage */ byte m_data[MAX_DATA_SIZE]; /** number of data bytes used in this block; DYN_BLOCK_FULL_FLAG is set when the block becomes full */ - ib_uint32_t m_used; + uint32_t m_used; - friend class dyn_buf_t; + friend class mtr_buf_t; }; typedef sized_ilist<block_t> list_t; - enum { MAX_DATA_SIZE = block_t::MAX_DATA_SIZE}; - /** Default constructor */ - dyn_buf_t() + mtr_buf_t() : m_heap(), m_size() @@ -181,7 +172,7 @@ public: } /** Destructor */ - ~dyn_buf_t() + ~mtr_buf_t() { erase(); } @@ -246,7 +237,7 @@ public: @param size in bytes of the element @return pointer to the element */ template <typename Type> - Type push(ib_uint32_t size) + Type push(uint32_t size) { ut_ad(size > 0); ut_ad(size <= MAX_DATA_SIZE); @@ -266,17 +257,11 @@ public: Pushes n bytes. @param str string to write @param len string length */ - void push(const byte* ptr, ib_uint32_t len) + void push(const byte* ptr, uint32_t len) { while (len > 0) { - ib_uint32_t n_copied; - - if (len >= MAX_DATA_SIZE) { - n_copied = MAX_DATA_SIZE; - } else { - n_copied = len; - } - + uint32_t n_copied = std::min(len, + uint32_t(MAX_DATA_SIZE)); ::memmove(push<byte*>(n_copied), ptr, n_copied); ptr += n_copied; @@ -292,7 +277,7 @@ public: const Type at(ulint pos) const { block_t* block = const_cast<block_t*>( - const_cast<dyn_buf_t*>(this)->find(pos)); + const_cast<mtr_buf_t*>(this)->find(pos)); return(reinterpret_cast<Type>(block->begin() + pos)); } @@ -318,8 +303,7 @@ public: #ifdef UNIV_DEBUG ulint total_size = 0; - for (typename list_t::iterator it = m_list.begin(), - end = m_list.end(); + for (list_t::iterator it = m_list.begin(), end = m_list.end(); it != end; ++it) { total_size += it->used(); } @@ -335,8 +319,7 @@ public: template <typename Functor> bool for_each_block(Functor& functor) const { - for (typename list_t::iterator it = m_list.begin(), - end = m_list.end(); + for (list_t::iterator it = m_list.begin(), end = m_list.end(); it != end; ++it) { if (!functor(&*it)) { @@ -371,8 +354,8 @@ public: template <typename Functor> bool for_each_block_in_reverse(Functor& functor) const { - for (typename list_t::reverse_iterator it = m_list.rbegin(), - end = m_list.rend(); + for (list_t::reverse_iterator it = m_list.rbegin(), + end = m_list.rend(); it != end; ++it) { if (!functor(&*it)) { @@ -389,8 +372,8 @@ public: template <typename Functor> bool for_each_block_in_reverse(const Functor& functor) const { - for (typename list_t::reverse_iterator it = m_list.rbegin(), - end = m_list.rend(); + for (list_t::reverse_iterator it = m_list.rbegin(), + end = m_list.rend(); it != end; ++it) { if (!functor(&*it)) { @@ -419,8 +402,8 @@ public: private: // Disable copying - dyn_buf_t(const dyn_buf_t&); - dyn_buf_t& operator=(const 
dyn_buf_t&); + mtr_buf_t(const mtr_buf_t&); + mtr_buf_t& operator=(const mtr_buf_t&); /** Add the block to the end of the list*/ @@ -431,9 +414,9 @@ private: } /** @return the last block in the list */ - block_t* back() + block_t* back() const { - return &m_list.back(); + return &const_cast<block_t&>(m_list.back()); } /* @@ -458,8 +441,7 @@ private: { ut_ad(!m_list.empty()); - for (typename list_t::iterator it = m_list.begin(), - end = m_list.end(); + for (list_t::iterator it = m_list.begin(), end = m_list.end(); it != end; ++it) { if (pos < it->used()) { @@ -508,8 +490,6 @@ private: block_t m_first_block; }; -typedef dyn_buf_t<DYN_ARRAY_DATA_SIZE> mtr_buf_t; - /** mtr_buf_t copier */ struct mtr_buf_copy_t { /** The copied buffer */ diff --git a/storage/innobase/include/fil0crypt.h b/storage/innobase/include/fil0crypt.h index 3c56315ee9a..870858b4ccd 100644 --- a/storage/innobase/include/fil0crypt.h +++ b/storage/innobase/include/fil0crypt.h @@ -27,9 +27,9 @@ Created 04/01/2015 Jan Lindström #define fil0crypt_h #ifndef UNIV_INNOCHECKSUM - #include "os0event.h" #include "my_crypt.h" +#include "fil0fil.h" #endif /*! UNIV_INNOCHECKSUM */ /** @@ -302,7 +302,6 @@ fil_space_destroy_crypt_data( Parse a MLOG_FILE_WRITE_CRYPT_DATA log entry @param[in] ptr Log entry start @param[in] end_ptr Log entry end -@param[in] block buffer block @param[out] err DB_SUCCESS or DB_DECRYPTION_FAILED @return position on log buffer */ UNIV_INTERN @@ -310,7 +309,6 @@ byte* fil_parse_write_crypt_data( byte* ptr, const byte* end_ptr, - const buf_block_t* block, dberr_t* err) MY_ATTRIBUTE((warn_unused_result)); diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h index 65457b414ca..6a184fe6f94 100644 --- a/storage/innobase/include/fil0fil.h +++ b/storage/innobase/include/fil0fil.h @@ -32,23 +32,18 @@ Created 10/25/1995 Heikki Tuuri #include "log0recv.h" #include "dict0types.h" #include "page0size.h" -#include "ibuf0types.h" #include "ilist.h" -#include <list> - struct unflushed_spaces_tag_t; struct rotation_list_tag_t; // Forward declaration -extern ibool srv_use_doublewrite_buf; +extern my_bool srv_use_doublewrite_buf; extern struct buf_dblwr_t* buf_dblwr; class page_id_t; struct trx_t; class truncate_t; -typedef std::list<char*, ut_allocator<char*> > space_name_list_t; - /** Structure containing encryption specification */ struct fil_space_crypt_t; @@ -88,36 +83,24 @@ struct fil_space_t : ilist_node<unflushed_spaces_tag_t>, ulint id; /*!< space id */ hash_node_t hash; /*!< hash chain node */ char* name; /*!< Tablespace name */ - hash_node_t name_hash;/*!< hash chain the name_hash table */ lsn_t max_lsn; /*!< LSN of the most recent fil_names_write_if_was_clean(). Reset to 0 by fil_names_clear(). - Protected by log_sys->mutex. + Protected by log_sys.mutex. If and only if this is nonzero, the tablespace will be in named_spaces. */ /** Log sequence number of the latest MLOG_INDEX_LOAD record that was found while parsing the redo log */ lsn_t enable_lsn; - bool stop_new_ops; - /*!< we set this true when we start - deleting a single-table tablespace. - When this is set following new ops - are not allowed: - * read IO request - * ibuf merge - * file flush - Note that we can still possibly have - new write operations because we don't - check this flag when doing flush - batches. 
*/ /** whether undo tablespace truncation is in progress */ bool is_being_truncated; #ifdef UNIV_DEBUG ulint redo_skipped_count; /*!< reference count for operations who want to skip redo log in the file space in order - to make modify_check() pass. */ + to make modify_check() pass. + Uses my_atomic_loadlint() and friends. */ #endif fil_type_t purpose;/*!< purpose */ UT_LIST_BASE_NODE_T(fil_node_t) chain; @@ -147,20 +130,30 @@ struct fil_space_t : ilist_node<unflushed_spaces_tag_t>, ulint n_pending_flushes; /*!< this is positive when flushing the tablespace to disk; dropping of the tablespace is forbidden if this is positive */ - /** Number of pending buffer pool operations accessing the tablespace - without holding a table lock or dict_operation_lock S-latch - that would prevent the table (and tablespace) from being - dropped. An example is change buffer merge. - The tablespace cannot be dropped while this is nonzero, - or while fil_node_t::n_pending is nonzero. - Protected by fil_system->mutex. */ - ulint n_pending_ops; +private: + /** Number of pending buffer pool operations accessing the + tablespace without holding a table lock or dict_operation_lock + S-latch that would prevent the table (and tablespace) from being + dropped. An example is change buffer merge. + + The tablespace cannot be dropped while this is nonzero, or while + fil_node_t::n_pending is nonzero. + + The most significant bit contains the STOP_NEW_OPS flag. + + Protected by my_atomic. */ + int32 n_pending_ops; + + /** Flag in n_pending_ops that indicates that the tablespace is being + deleted, and no further operations should be performed */ + static const int32 STOP_NEW_OPS= 1 << 31; +public: /** Number of pending block read or write operations (when a write is imminent or a read has recently completed). The tablespace object cannot be freed while this is nonzero, but it can be detached from fil_system. Note that fil_node_t::n_pending tracks actual pending I/O requests. - Protected by fil_system->mutex. */ + Protected by fil_system.mutex and my_atomic_loadlint() and friends. */ ulint n_pending_ios; rw_lock_t latch; /*!< latch protecting the file space storage allocation */ @@ -182,19 +175,12 @@ struct fil_space_t : ilist_node<unflushed_spaces_tag_t>, /** True if the device this filespace is on supports atomic writes */ bool atomic_write_supported; - /** Release the reserved free extents. - @param[in] n_reserved number of reserved extents */ - void release_free_extents(ulint n_reserved); - /** True if file system storing this tablespace supports punch hole */ bool punch_hole; ulint magic_n;/*!< FIL_SPACE_MAGIC_N */ - /** @return whether the tablespace is about to be dropped */ - bool is_stopping() const { return stop_new_ops; } - /** Clamp a page number for batched I/O, such as read-ahead. @param offset page number limit @return offset clamped to the tablespace size */ @@ -229,6 +215,125 @@ struct fil_space_t : ilist_node<unflushed_spaces_tag_t>, @param[in] mtr mini-transaction */ void modify_check(const mtr_t& mtr) const; #endif /* UNIV_DEBUG */ + + /** Try to reserve free extents. 
+ @param[in] n_free_now current number of free extents + @param[in] n_to_reserve number of extents to reserve + @return whether the reservation succeeded */ + bool reserve_free_extents(ulint n_free_now, ulint n_to_reserve) + { + ut_ad(rw_lock_own(&latch, RW_LOCK_X)); + if (n_reserved_extents + n_to_reserve > n_free_now) { + return false; + } + + n_reserved_extents += n_to_reserve; + return true; + } + + /** Release the reserved free extents. + @param[in] n_reserved number of reserved extents */ + void release_free_extents(ulint n_reserved) + { + if (!n_reserved) return; + ut_ad(rw_lock_own(&latch, RW_LOCK_X)); + ut_a(n_reserved_extents >= n_reserved); + n_reserved_extents -= n_reserved; + } + + /** Rename a file. + @param[in] name table name after renaming + @param[in] path tablespace file name after renaming + @param[in] log whether to write redo log + @param[in] replace whether to ignore the existence of path + @return error code + @retval DB_SUCCESS on success */ + dberr_t rename(const char* name, const char* path, bool log, + bool replace = false); + + /** Note that the tablespace has been imported. + Initially, purpose=FIL_TYPE_IMPORT so that no redo log is + written while the space ID is being updated in each page. */ + void set_imported(); + + /** Open each file. Only invoked on fil_system.temp_space. + @return whether all files were opened */ + bool open(); + /** Close each file. Only invoked on fil_system.temp_space. */ + void close(); + + /** @return whether the tablespace is about to be dropped or is referenced */ + int32 is_stopping_or_referenced() + { + return my_atomic_load32(&n_pending_ops); + } + + /** @return whether the tablespace is about to be dropped or is referenced */ + int32 is_stopping_or_referenced() const + { + return const_cast<fil_space_t*>(this)->is_stopping_or_referenced(); + } + + /** @return whether the tablespace is about to be dropped */ + bool is_stopping() const + { + return is_stopping_or_referenced() & STOP_NEW_OPS; + } + + /** @return number of references being held */ + int32 referenced() const + { + return is_stopping_or_referenced() & ~STOP_NEW_OPS; + } + + /** Note that operations on the tablespace must stop or can resume */ + void set_stopping(bool stopping) + { + /* Note: starting with 10.4 this should be std::atomic::fetch_xor() */ + int32 n= stopping ? 0 : STOP_NEW_OPS; + while (!my_atomic_cas32_strong_explicit(&n_pending_ops, &n, + n ^ STOP_NEW_OPS, + MY_MEMORY_ORDER_ACQUIRE, + MY_MEMORY_ORDER_RELAXED)) + ut_ad(!(n & STOP_NEW_OPS) == stopping); + } + + MY_ATTRIBUTE((warn_unused_result)) + /** @return whether a tablespace reference was successfully acquired */ + bool acquire() + { + int32 n= 0; + while (!my_atomic_cas32_strong_explicit(&n_pending_ops, &n, n + 1, + MY_MEMORY_ORDER_ACQUIRE, + MY_MEMORY_ORDER_RELAXED)) + if (UNIV_UNLIKELY(n & STOP_NEW_OPS)) + return false; + return true; + } + /** Release a tablespace reference. + @return whether this was the last reference */ + bool release() + { + int32 n= my_atomic_add32(&n_pending_ops, -1); + ut_ad(n & ~STOP_NEW_OPS); + return (n & ~STOP_NEW_OPS) == 1; + } + + /** Acquire a tablespace reference for I/O. */ + void acquire_for_io() { my_atomic_addlint(&n_pending_ios, 1); } + /** Release a tablespace reference for I/O. 
*/ + void release_for_io() + { + ut_d(ulint n=) my_atomic_addlint(&n_pending_ios, ulint(-1)); + ut_ad(n); + } + /** @return whether I/O is pending */ + bool pending_io() { return my_atomic_loadlint(&n_pending_ios); } + /** @return whether I/O is pending */ + bool pending_io() const + { + return const_cast<fil_space_t*>(this)->pending_io(); + } }; /** Value of fil_space_t::magic_n */ @@ -238,7 +343,7 @@ struct fil_space_t : ilist_node<unflushed_spaces_tag_t>, struct fil_node_t { /** tablespace containing this file */ fil_space_t* space; - /** file name; protected by fil_system->mutex and log_sys->mutex. */ + /** file name; protected by fil_system.mutex and log_sys.mutex. */ char* name; /** file handle (valid if is_open) */ pfs_os_file_t handle; @@ -263,7 +368,7 @@ struct fil_node_t { bool needs_flush; /** link to other files in this tablespace */ UT_LIST_NODE_T(fil_node_t) chain; - /** link to the fil_system->LRU list (keeping track of open files) */ + /** link to the fil_system.LRU list (keeping track of open files) */ UT_LIST_NODE_T(fil_node_t) LRU; /** whether this file could use atomic write (data file) */ @@ -285,12 +390,15 @@ struct fil_node_t { @param[in] first whether this is the very first read @return whether the page was found valid */ bool read_page0(bool first); + + /** Close the file handle. */ + void close(); }; /** Value of fil_node_t::magic_n */ #define FIL_NODE_MAGIC_N 89389 -/** Common InnoDB file extentions */ +/** Common InnoDB file extensions */ enum ib_extention { NO_EXT = 0, IBD = 1, @@ -313,18 +421,20 @@ of the address is FIL_NULL, the address is considered undefined. */ typedef byte fil_faddr_t; /*!< 'type' definition in C: an address stored in a file page is a string of bytes */ +#else +# include "univ.i" #endif /* !UNIV_INNOCHECKSUM */ /** Initial size of a single-table tablespace in pages */ -#define FIL_IBD_FILE_INITIAL_SIZE 4 +#define FIL_IBD_FILE_INITIAL_SIZE 4U /** 'null' (undefined) page offset in the context of file spaces */ #define FIL_NULL ULINT32_UNDEFINED -#define FIL_ADDR_PAGE 0 /* first in address is the page offset */ -#define FIL_ADDR_BYTE 4 /* then comes 2-byte byte offset within page*/ -#define FIL_ADDR_SIZE 6 /* address size is 6 bytes */ +#define FIL_ADDR_PAGE 0U /* first in address is the page offset */ +#define FIL_ADDR_BYTE 4U /* then comes 2-byte byte offset within page*/ +#define FIL_ADDR_SIZE 6U /* address size is 6 bytes */ #ifndef UNIV_INNOCHECKSUM @@ -335,7 +445,7 @@ struct fil_addr_t { }; /** The null file address */ -extern fil_addr_t fil_addr_null; +extern const fil_addr_t fil_addr_null; #endif /* !UNIV_INNOCHECKSUM */ @@ -344,15 +454,15 @@ extern fil_addr_t fil_addr_null; page belongs to (== 0) but in later versions the 'new' checksum of the page */ -#define FIL_PAGE_OFFSET 4 /*!< page offset inside space */ -#define FIL_PAGE_PREV 8 /*!< if there is a 'natural' +#define FIL_PAGE_OFFSET 4U /*!< page offset inside space */ +#define FIL_PAGE_PREV 8U /*!< if there is a 'natural' predecessor of the page, its offset. Otherwise FIL_NULL. This field is not set on BLOB pages, which are stored as a singly-linked list. See also FIL_PAGE_NEXT. */ -#define FIL_PAGE_NEXT 12 /*!< if there is a 'natural' successor +#define FIL_PAGE_NEXT 12U /*!< if there is a 'natural' successor of the page, its offset. Otherwise FIL_NULL. B-tree index pages @@ -362,9 +472,9 @@ extern fil_addr_t fil_addr_null; FIL_PAGE_PREV and FIL_PAGE_NEXT in the collation order of the smallest user record on each page. 
*/ -#define FIL_PAGE_LSN 16 /*!< lsn of the end of the newest +#define FIL_PAGE_LSN 16U /*!< lsn of the end of the newest modification log record to the page */ -#define FIL_PAGE_TYPE 24 /*!< file page type: FIL_PAGE_INDEX,..., +#define FIL_PAGE_TYPE 24U /*!< file page type: FIL_PAGE_INDEX,..., 2 bytes. The contents of this field can only @@ -379,7 +489,7 @@ extern fil_addr_t fil_addr_null; MySQL/InnoDB 5.1.7 or later, the contents of this field is valid for all uncompressed pages. */ -#define FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION 26 /*!< for the first page +#define FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION 26U /*!< for the first page in a system tablespace data file (ibdata*, not *.ibd): the file has been flushed to disk at least up @@ -393,7 +503,7 @@ extern fil_addr_t fil_addr_null; #define FIL_RTREE_SPLIT_SEQ_NUM FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION /** starting from 4.1.x this contains the space id of the page */ -#define FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID 34 +#define FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID 34U #define FIL_PAGE_SPACE_ID FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID @@ -420,7 +530,7 @@ extern fil_addr_t fil_addr_null; then encrypted */ #define FIL_PAGE_PAGE_COMPRESSED 34354 /*!< page compressed page */ #define FIL_PAGE_INDEX 17855 /*!< B-tree node */ -#define FIL_PAGE_RTREE 17854 /*!< B-tree node */ +#define FIL_PAGE_RTREE 17854 /*!< R-tree node (SPATIAL INDEX) */ #define FIL_PAGE_UNDO_LOG 2 /*!< Undo log page */ #define FIL_PAGE_INODE 3 /*!< Index node */ #define FIL_PAGE_IBUF_FREE_LIST 4 /*!< Insert buffer free list */ @@ -443,15 +553,26 @@ extern fil_addr_t fil_addr_null; //#define FIL_PAGE_ENCRYPTED 15 //#define FIL_PAGE_COMPRESSED_AND_ENCRYPTED 16 //#define FIL_PAGE_ENCRYPTED_RTREE 17 +/** Clustered index root page after instant ADD COLUMN */ +#define FIL_PAGE_TYPE_INSTANT 18 -/** Used by i_s.cc to index into the text description. */ +/** Used by i_s.cc to index into the text description. +Note: FIL_PAGE_TYPE_INSTANT maps to the same as FIL_PAGE_INDEX. */ #define FIL_PAGE_TYPE_LAST FIL_PAGE_TYPE_UNKNOWN /*!< Last page type */ /* @} */ -/** macro to check whether the page type is index (Btree or Rtree) type */ -#define fil_page_type_is_index(page_type) \ - (page_type == FIL_PAGE_INDEX || page_type == FIL_PAGE_RTREE) +/** @return whether the page type is B-tree or R-tree index */ +inline bool fil_page_type_is_index(ulint page_type) +{ + switch (page_type) { + case FIL_PAGE_TYPE_INSTANT: + case FIL_PAGE_INDEX: + case FIL_PAGE_RTREE: + return(true); + } + return(false); +} /** Check whether the page is index page (either regular Btree index or Rtree index */ @@ -483,7 +604,7 @@ The caller should hold an InnoDB table lock or a MDL that prevents the tablespace from being dropped during the operation, or the caller should be in single-threaded crash recovery mode (no user connections that could drop tablespaces). -If this is not the case, fil_space_acquire() and fil_space_release() +If this is not the case, fil_space_acquire() and fil_space_t::release() should be used instead. @param[in] id tablespace ID @return tablespace, or NULL if not found */ @@ -496,17 +617,40 @@ fil_space_get( data space) is stored here; below we talk about tablespaces, but also the ib_logfiles form a 'space' and it is handled here */ struct fil_system_t { - fil_system_t() - : n_open(0), max_assigned_id(0), space_id_reuse_warned(false) - { - } + /** + Constructor. + + Some members may require late initialisation, thus we just mark object as + uninitialised. Real initialisation happens in create(). 
+ */ + fil_system_t(): m_initialised(false) + { + UT_LIST_INIT(LRU, &fil_node_t::LRU); + UT_LIST_INIT(space_list, &fil_space_t::space_list); + UT_LIST_INIT(named_spaces, &fil_space_t::named_spaces); + } + + bool is_initialised() const { return m_initialised; } + + /** + Create the file system interface at database start. + + @param[in] hash_size hash table size + */ + void create(ulint hash_size); + + /** Close the file system interface at shutdown */ + void close(); +private: + bool m_initialised; +public: ib_mutex_t mutex; /*!< The mutex protecting the cache */ + fil_space_t* sys_space; /*!< The innodb_system tablespace */ + fil_space_t* temp_space; /*!< The innodb_temporary tablespace */ hash_table_t* spaces; /*!< The hash table of spaces in the system; they are hashed on the space id */ - hash_table_t* name_hash; /*!< hash table based on the space - name */ UT_LIST_BASE_NODE_T(fil_node_t) LRU; /*!< base node for the LRU list of the most recently used open files with no @@ -525,8 +669,6 @@ struct fil_system_t { at least one file node where needs_flush == true */ ulint n_open; /*!< number of files currently open */ - ulint max_n_open; /*!< n_open is not allowed to exceed - this */ ulint max_assigned_id;/*!< maximum space id in the existing tables, or assigned during the time mysqld has been up; at an InnoDB @@ -540,13 +682,13 @@ struct fil_system_t { for which a MLOG_FILE_NAME record has been written since the latest redo log checkpoint. - Protected only by log_sys->mutex. */ + Protected only by log_sys.mutex. */ ilist<fil_space_t, rotation_list_tag_t> rotation_list; /*!< list of all file spaces needing key rotation.*/ bool space_id_reuse_warned; - /* !< TRUE if fil_space_create() + /*!< whether fil_space_create() has issued a warning about potential space_id reuse */ @@ -567,29 +709,11 @@ struct fil_system_t { bool encrypt); }; -/** The tablespace memory cache. This variable is NULL before the module is -initialized. */ -extern fil_system_t* fil_system; +/** The tablespace memory cache. */ +extern fil_system_t fil_system; #include "fil0crypt.h" -/** Gets the type of a file space. -@param[in] id tablespace identifier -@return file type */ -fil_type_t -fil_space_get_type( - ulint id); - -/** Note that a tablespace has been imported. -It is initially marked as FIL_TYPE_IMPORT so that no logging is -done during the import process when the space ID is stamped to each page. -Now we change it to FIL_SPACE_TABLESPACE to start redo and undo logging. -NOTE: temporary tablespaces are never imported. -@param[in] id tablespace identifier */ -void -fil_space_set_imported( - ulint id); - /** Create a space memory object and put it to the fil_system hash table. Error messages are issued to the server log. @param[in] name tablespace name @@ -631,16 +755,6 @@ fil_space_free( ulint id, bool x_latched); -/** Returns the path from the first fil_node_t found with this space ID. -The caller is responsible for freeing the memory allocated here for the -value returned. -@param[in] id Tablespace ID -@return own: A copy of fil_node_t::path, NULL if space ID is zero -or not found. */ -char* -fil_space_get_first_path( - ulint id); - /** Set the recovered size of a tablespace in pages. @param id tablespace ID @param size recovered size in pages */ @@ -664,19 +778,6 @@ fil_space_get_flags( /*================*/ ulint id); /*!< in: space id */ -/** Open each fil_node_t of a named fil_space_t if not already open. -@param[in] name Tablespace name -@return true if all file nodes are opened. 
*/ -bool -fil_space_open( - const char* name); - -/** Close each fil_node_t of a named fil_space_t if open. -@param[in] name Tablespace name */ -void -fil_space_close( - const char* name); - /** Returns the page size of the space and whether it is compressed or not. The tablespace must be cached in the memory cache. @param[in] id space id @@ -687,18 +788,6 @@ fil_space_get_page_size( ulint id, bool* found); -/****************************************************************//** -Initializes the tablespace memory cache. */ -void -fil_init( -/*=====*/ - ulint hash_size, /*!< in: hash table size */ - ulint max_n_open); /*!< in: max number of open files */ -/*******************************************************************//** -Initializes the tablespace memory cache. */ -void -fil_close(void); -/*===========*/ /*******************************************************************//** Opens all log files and system tablespace data files. They stay open until the database server shutdown. This should be called at a server startup after the @@ -776,11 +865,6 @@ fil_space_acquire_silent(ulint id) return (fil_space_acquire_low(id, true)); } -/** Release a tablespace acquired with fil_space_acquire(). -@param[in,out] space tablespace to release */ -void -fil_space_release(fil_space_t* space); - /** Acquire a tablespace for reading or writing a block, when it could be dropped concurrently. @param[in] id tablespace ID @@ -789,73 +873,6 @@ when it could be dropped concurrently. fil_space_t* fil_space_acquire_for_io(ulint id); -/** Release a tablespace acquired with fil_space_acquire_for_io(). -@param[in,out] space tablespace to release */ -void -fil_space_release_for_io(fil_space_t* space); - -/** Wrapper with reference-counting for a fil_space_t. */ -class FilSpace -{ -public: - /** Default constructor: Use this when reference counting - is done outside this wrapper. */ - FilSpace() : m_space(NULL) {} - - /** Constructor: Look up the tablespace and increment the - reference count if found. - @param[in] space_id tablespace ID - @param[in] silent whether not to display errors */ - explicit FilSpace(ulint space_id, bool silent = false) - : m_space(fil_space_acquire_low(space_id, silent)) {} - - /** Assignment operator: This assumes that fil_space_acquire() - has already been done for the fil_space_t. The caller must - assign NULL if it calls fil_space_release(). - @param[in] space tablespace to assign */ - class FilSpace& operator=(fil_space_t* space) - { - /* fil_space_acquire() must have been invoked. */ - ut_ad(space == NULL || space->n_pending_ops > 0); - m_space = space; - return(*this); - } - - /** Destructor - Decrement the reference count if a fil_space_t - is still assigned. */ - ~FilSpace() - { - if (m_space != NULL) { - fil_space_release(m_space); - } - } - - /** Implicit type conversion - @return the wrapped object */ - operator const fil_space_t*() const - { - return(m_space); - } - - /** Member accessor - @return the wrapped object */ - const fil_space_t* operator->() const - { - return(m_space); - } - - /** Explicit type conversion - @return the wrapped object */ - const fil_space_t* operator()() const - { - return(m_space); - } - -private: - /** The wrapped pointer */ - fil_space_t* m_space; -}; - /********************************************************//** Creates the database directory for a table if it does not exist yet. 
*/ void @@ -863,43 +880,6 @@ fil_create_directory_for_tablename( /*===============================*/ const char* name); /*!< in: name in the standard 'databasename/tablename' format */ -/** Write redo log for renaming a file. -@param[in] space_id tablespace id -@param[in] old_name tablespace file name -@param[in] new_name tablespace file name after renaming */ -void -fil_name_write_rename( - ulint space_id, - const char* old_name, - const char* new_name); -/********************************************************//** -Recreates table indexes by applying -TRUNCATE log record during recovery. -@return DB_SUCCESS or error code */ -dberr_t -fil_recreate_table( -/*===============*/ - ulint space_id, /*!< in: space id */ - ulint format_flags, /*!< in: page format */ - ulint flags, /*!< in: tablespace flags */ - const char* name, /*!< in: table name */ - truncate_t& truncate); /*!< in/out: The information of - TRUNCATE log record */ -/********************************************************//** -Recreates the tablespace and table indexes by applying -TRUNCATE log record during recovery. -@return DB_SUCCESS or error code */ -dberr_t -fil_recreate_tablespace( -/*====================*/ - ulint space_id, /*!< in: space id */ - ulint format_flags, /*!< in: page format */ - ulint flags, /*!< in: tablespace flags */ - const char* name, /*!< in: table name */ - truncate_t& truncate, /*!< in/out: The information of - TRUNCATE log record */ - lsn_t recv_lsn); /*!< in: the end LSN of - the log record */ /** Replay a file rename operation if possible. @param[in] space_id tablespace identifier @param[in] first_page_no first page number in the file @@ -943,37 +923,6 @@ fil_space_t* fil_truncate_prepare(ulint space_id); void fil_truncate_log(fil_space_t* space, ulint size, mtr_t* mtr) MY_ATTRIBUTE((nonnull)); -/** Truncate the tablespace to needed size. -@param[in] space_id id of tablespace to truncate -@param[in] size_in_pages truncate size. -@return true if truncate was successful. */ -bool -fil_truncate_tablespace( - ulint space_id, - ulint size_in_pages); - -/*******************************************************************//** -Prepare for truncating a single-table tablespace. The tablespace -must be cached in the memory cache. -1) Check pending operations on a tablespace; -2) Remove all insert buffer entries for the tablespace; -@return DB_SUCCESS or error */ -dberr_t -fil_prepare_for_truncate( -/*=====================*/ - ulint id); /*!< in: space id */ - -/** Reinitialize the original tablespace header with the same space id -for single tablespace -@param[in] table table belongs to the tablespace -@param[in] size size in blocks -@param[in] trx Transaction covering truncate */ -void -fil_reinit_space_header_for_table( - dict_table_t* table, - ulint size, - trx_t* trx); - /*******************************************************************//** Closes a single-table tablespace. The tablespace must be cached in the memory cache. Free all pages used by the tablespace. @@ -985,58 +934,6 @@ fil_close_tablespace( ulint id); /*!< in: space id */ /*******************************************************************//** -Discards a single-table tablespace. The tablespace must be cached in the -memory cache. Discarding is like deleting a tablespace, but - - 1. We do not drop the table from the data dictionary; - - 2. We remove all insert buffer entries for the tablespace immediately; - in DROP TABLE they are only removed gradually in the background; - - 3. 
When the user does IMPORT TABLESPACE, the tablespace will have the - same id as it originally had. - - 4. Free all the pages in use by the tablespace if rename=true. -@return DB_SUCCESS or error */ -dberr_t -fil_discard_tablespace( -/*===================*/ - ulint id) /*!< in: space id */ - MY_ATTRIBUTE((warn_unused_result)); - -/** Test if a tablespace file can be renamed to a new filepath by checking -if that the old filepath exists and the new filepath does not exist. -@param[in] space_id tablespace id -@param[in] old_path old filepath -@param[in] new_path new filepath -@param[in] is_discarded whether the tablespace is discarded -@param[in] replace_new whether to ignore the existence of new_path -@return innodb error code */ -dberr_t -fil_rename_tablespace_check( - ulint space_id, - const char* old_path, - const char* new_path, - bool is_discarded, - bool replace_new = false); - -/** Rename a single-table tablespace. -The tablespace must exist in the memory cache. -@param[in] id tablespace identifier -@param[in] old_path old file name -@param[in] new_name new table name in the -databasename/tablename format -@param[in] new_path_in new file name, -or NULL if it is located in the normal data directory -@return true if success */ -bool -fil_rename_tablespace( - ulint id, - const char* old_path, - const char* new_name, - const char* new_path_in); - -/*******************************************************************//** Allocates and builds a file name from a path, a table or tablespace name and a suffix. The string must be freed by caller with ut_free(). @param[in] path NULL or the direcory path or the full path and filename. @@ -1059,8 +956,10 @@ fil_make_filepath( must be >= FIL_IBD_FILE_INITIAL_SIZE @param[in] mode MariaDB encryption mode @param[in] key_id MariaDB encryption key_id -@return DB_SUCCESS or error code */ -dberr_t +@param[out] err DB_SUCCESS or error code +@return the created tablespace +@retval NULL on error */ +fil_space_t* fil_ibd_create( ulint space_id, const char* name, @@ -1068,16 +967,15 @@ fil_ibd_create( ulint flags, ulint size, fil_encryption_t mode, - uint32_t key_id) - MY_ATTRIBUTE((nonnull(2), warn_unused_result)); + uint32_t key_id, + dberr_t* err) + MY_ATTRIBUTE((nonnull(2,8), warn_unused_result)); /** Try to adjust FSP_SPACE_FLAGS if they differ from the expectations. (Typically when upgrading from MariaDB 10.1.0..10.1.20.) -@param[in] space_id tablespace ID +@param[in,out] space tablespace @param[in] flags desired tablespace flags */ -UNIV_INTERN -void -fsp_flags_try_adjust(ulint space_id, ulint flags); +void fsp_flags_try_adjust(fil_space_t* space, ulint flags); /********************************************************************//** Tries to open a single-table tablespace and optionally checks the space id is @@ -1104,19 +1002,22 @@ statement to update the dictionary tables if they are incorrect. 
@param[in] purpose FIL_TYPE_TABLESPACE or FIL_TYPE_TEMPORARY @param[in] id tablespace ID @param[in] flags expected FSP_SPACE_FLAGS -@param[in] space_name tablespace name of the datafile +@param[in] tablename table name If file-per-table, it is the table name in the databasename/tablename format @param[in] path_in expected filepath, usually read from dictionary -@return DB_SUCCESS or error code */ -dberr_t +@param[out] err DB_SUCCESS or error code +@return tablespace +@retval NULL if the tablespace could not be opened */ +fil_space_t* fil_ibd_open( - bool validate, - bool fix_dict, - fil_type_t purpose, - ulint id, - ulint flags, - const char* tablename, - const char* path_in) + bool validate, + bool fix_dict, + fil_type_t purpose, + ulint id, + ulint flags, + const table_name_t& tablename, + const char* path_in, + dberr_t* err = NULL) MY_ATTRIBUTE((warn_unused_result)); enum fil_load_status { @@ -1164,8 +1065,9 @@ startup, there may be many tablespaces which are not yet in the memory cache. @param[in] id Tablespace ID @param[in] name Tablespace name used in fil_space_create(). @param[in] table_flags table flags -@return true if a matching tablespace exists in the memory cache */ -bool +@return the tablespace +@retval NULL if no matching tablespace exists in the memory cache */ +fil_space_t* fil_space_for_table_exists_in_mem( ulint id, const char* name, @@ -1179,29 +1081,6 @@ bool fil_space_extend( fil_space_t* space, ulint size); -/*******************************************************************//** -Tries to reserve free extents in a file space. -@return true if succeed */ -bool -fil_space_reserve_free_extents( -/*===========================*/ - ulint id, /*!< in: space id */ - ulint n_free_now, /*!< in: number of free extents now */ - ulint n_to_reserve); /*!< in: how many one wants to reserve */ -/*******************************************************************//** -Releases free extents in a file space. */ -void -fil_space_release_free_extents( -/*===========================*/ - ulint id, /*!< in: space id */ - ulint n_reserved); /*!< in: how many one reserved */ -/*******************************************************************//** -Gets the number of reserved extents. If the database is silent, this number -should be zero. */ -ulint -fil_space_get_n_reserved_extents( -/*=============================*/ - ulint id); /*!< in: space id */ /** Reads or writes data. This operation could be asynchronous (aio). @@ -1298,20 +1177,6 @@ fil_page_set_type( byte* page, /*!< in/out: file page */ ulint type); /*!< in: type */ -#ifdef UNIV_DEBUG -/** Increase redo skipped of a tablespace. -@param[in] id space id */ -void -fil_space_inc_redo_skipped_count( - ulint id); - -/** Decrease redo skipped of a tablespace. -@param[in] id space id */ -void -fil_space_dec_redo_skipped_count( - ulint id); -#endif - /********************************************************************//** Delete the tablespace file and any related files like .cfg. This should not be called for temporary tables. */ @@ -1340,15 +1205,6 @@ char* fil_path_to_space_name( const char* filename); -/** Returns the space ID based on the tablespace name. -The tablespace must be found in the tablespace memory cache. -This call is made from external to this module, so the mutex is not owned. -@param[in] tablespace Tablespace name -@return space ID if tablespace found, ULINT_UNDEFINED if space not. 
*/ -ulint -fil_space_get_id_by_name( - const char* tablespace); - /** Generate redo log for swapping two .ibd files @param[in] old_table old table @param[in] new_table new table @@ -1364,9 +1220,9 @@ fil_mtr_rename_log( MY_ATTRIBUTE((nonnull, warn_unused_result)); /** Acquire the fil_system mutex. */ -#define fil_system_enter() mutex_enter(&fil_system->mutex) +#define fil_system_enter() mutex_enter(&fil_system.mutex) /** Release the fil_system mutex. */ -#define fil_system_exit() mutex_exit(&fil_system->mutex) +#define fil_system_exit() mutex_exit(&fil_system.mutex) /*******************************************************************//** Returns the table space by a given id, NULL if not found. */ @@ -1375,14 +1231,7 @@ fil_space_get_by_id( /*================*/ ulint id); /*!< in: space id */ -/** Look up a tablespace. -@param[in] name tablespace name -@return tablespace -@retval NULL if not found */ -fil_space_t* -fil_space_get_by_name(const char* name); - -/*******************************************************************//** +/** Note that a non-predefined persistent tablespace has been modified by redo log. @param[in,out] space tablespace */ void @@ -1417,8 +1266,8 @@ fil_names_write_if_was_clean( } const bool was_clean = space->max_lsn == 0; - ut_ad(space->max_lsn <= log_sys->lsn); - space->max_lsn = log_sys->lsn; + ut_ad(space->max_lsn <= log_sys.lsn); + space->max_lsn = log_sys.lsn; if (was_clean) { fil_names_dirty_and_write(space, mtr); diff --git a/storage/innobase/include/fil0fil.ic b/storage/innobase/include/fil0fil.ic index 1d1aaab61f5..31466f38546 100644 --- a/storage/innobase/include/fil0fil.ic +++ b/storage/innobase/include/fil0fil.ic @@ -39,6 +39,7 @@ fil_get_page_type_name( return "PAGE_COMPRESSED_ENRYPTED"; case FIL_PAGE_PAGE_COMPRESSED: return "PAGE_COMPRESSED"; + case FIL_PAGE_TYPE_INSTANT: case FIL_PAGE_INDEX: return "INDEX"; case FIL_PAGE_RTREE: @@ -68,7 +69,7 @@ fil_get_page_type_name( case FIL_PAGE_TYPE_ZBLOB2: return "ZBLOB2"; case FIL_PAGE_TYPE_UNKNOWN: - return "OLD UNKOWN PAGE TYPE"; + return "OLD UNKNOWN PAGE TYPE"; default: return "PAGE TYPE CORRUPTED"; } @@ -89,6 +90,7 @@ fil_page_type_validate( if (!((page_type == FIL_PAGE_PAGE_COMPRESSED || page_type == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED || page_type == FIL_PAGE_INDEX || + page_type == FIL_PAGE_TYPE_INSTANT || page_type == FIL_PAGE_RTREE || page_type == FIL_PAGE_UNDO_LOG || page_type == FIL_PAGE_INODE || diff --git a/storage/innobase/include/fsp0file.h b/storage/innobase/include/fsp0file.h index 3e5fa1369b0..8c5b24fbadb 100644 --- a/storage/innobase/include/fsp0file.h +++ b/storage/innobase/include/fsp0file.h @@ -433,7 +433,8 @@ private: /** Flags to use for opening the data file */ os_file_create_t m_open_flags; - /** size in database pages */ + /** size in megabytes or pages; converted from megabytes to + pages in SysTablespace::normalize_size() */ ulint m_size; /** ordinal position of this datafile in the tablespace */ @@ -496,7 +497,7 @@ public: /* No op - base constructor is called. */ } - RemoteDatafile(const char* name, ulint size, ulint order) + RemoteDatafile(const char*, ulint, ulint) : m_link_filepath() { @@ -518,12 +519,6 @@ public: return(m_link_filepath); } - /** Set the link filepath. Use default datadir, the base name of - the path provided without its suffix, plus DOT_ISL. - @param[in] path filepath which contains a basename to use. - If NULL, use m_name as the basename. 
*/ - void set_link_filepath(const char* path); - /** Create a link filename based on the contents of m_name, open that file, and read the contents into m_filepath. @retval DB_SUCCESS if remote linked tablespace file is opened and read. diff --git a/storage/innobase/include/fsp0fsp.h b/storage/innobase/include/fsp0fsp.h index a50be6f8998..5fd98494e26 100644 --- a/storage/innobase/include/fsp0fsp.h +++ b/storage/innobase/include/fsp0fsp.h @@ -41,8 +41,8 @@ Created 12/18/1995 Heikki Tuuri /** @return the PAGE_SSIZE flags for the current innodb_page_size */ #define FSP_FLAGS_PAGE_SSIZE() \ - ((UNIV_PAGE_SIZE == UNIV_PAGE_SIZE_ORIG) ? \ - 0 : (UNIV_PAGE_SIZE_SHIFT - UNIV_ZIP_SIZE_SHIFT_MIN + 1) \ + ((srv_page_size == UNIV_PAGE_SIZE_ORIG) ? \ + 0U : (srv_page_size_shift - UNIV_ZIP_SIZE_SHIFT_MIN + 1) \ << FSP_FLAGS_POS_PAGE_SSIZE) /* @defgroup Compatibility macros for MariaDB 10.1.0 through 10.1.20; @@ -290,22 +290,6 @@ the extent are free and which contain old tuple version to clean. */ #ifndef UNIV_INNOCHECKSUM /* @} */ -/**********************************************************************//** -Initializes the file space system. */ -void -fsp_init(void); -/*==========*/ - -/**********************************************************************//** -Gets the size of the system tablespace from the tablespace header. If -we do not have an auto-extending data file, this should be equal to -the size of the data files. If there is an auto-extending data file, -this can be smaller. -@return size in pages */ -ulint -fsp_header_get_tablespace_size(void); -/*================================*/ - /** Calculate the number of pages to extend a datafile. We extend single-table tablespaces first one extent at a time, but 4 at a time for bigger tablespaces. It is not enough to extend always @@ -330,7 +314,7 @@ UNIV_INLINE ulint fsp_get_extent_size_in_pages(const page_size_t& page_size) { - return(FSP_EXTENT_SIZE * UNIV_PAGE_SIZE / page_size.physical()); + return (FSP_EXTENT_SIZE << srv_page_size_shift) / page_size.physical(); } /**********************************************************************//** @@ -393,65 +377,24 @@ fsp_header_init_fields( ulint flags); /*!< in: tablespace flags (FSP_SPACE_FLAGS): 0, or table->flags if newer than COMPACT */ /** Initialize a tablespace header. -@param[in] space_id space id -@param[in] size current size in blocks -@param[in,out] mtr mini-transaction */ -void -fsp_header_init(ulint space_id, ulint size, mtr_t* mtr); - -/**********************************************************************//** -Increases the space size field of a space. */ -void -fsp_header_inc_size( -/*================*/ - ulint space_id, /*!< in: space id */ - ulint size_inc, /*!< in: size increment in pages */ - mtr_t* mtr); /*!< in/out: mini-transaction */ - -/** Creates a new segment. -@param[in] space space id -@param[in] byte_offset byte offset of the created segment header - on the page -@param[in,out] mtr mini-transaction -@param[in,out] block block where segment header is placed; - If it is null then new page will be - allocated and it will belong to - the created segment -@return the block where the segment header is placed, x-latched, NULL -if could not create segment because of lack of space */ -buf_block_t* -fseg_create( - ulint space, - ulint byte_offset, - mtr_t* mtr, - buf_block_t* block=NULL); - -/** Creates a new segment. 
-@param[in] space_id space_id -@param[in] byte_offset byte offset of the created segment - header on the page -@param[in] has_done_reservation TRUE if the caller has already - done the reservation for the pages - with fsp_reserve_free_externts - (at least 2 extents: one for - the inode and the other for the - segment) then there is no need to do - the check for this individual - operation -@param[in,out] mtr mini-transaction -@param[in] block block where the segment header is - placed. If it is null then new page - will be allocated and it will belong - to the created segment -@return the block where the segment header is placed, x-latched, NULL -if could not create segment because of lack of space */ +@param[in,out] space tablespace +@param[in] size current size in blocks +@param[in,out] mtr mini-transaction */ +void fsp_header_init(fil_space_t* space, ulint size, mtr_t* mtr) + MY_ATTRIBUTE((nonnull)); + +/** Create a new segment. +@param space tablespace +@param byte_offset byte offset of the created segment header +@param mtr mini-transaction +@param has_done_reservation whether fsp_reserve_free_extents() was invoked +@param block block where segment header is placed, + or NULL to allocate an additional page for that +@return the block where the segment header is placed, x-latched +@retval NULL if could not create segment because of lack of space */ buf_block_t* -fseg_create_general( - ulint space_id, - ulint byte_offset, - ibool has_done_reservation, - mtr_t* mtr, - buf_block_t* block); +fseg_create(fil_space_t *space, ulint byte_offset, mtr_t *mtr, + bool has_done_reservation= false, buf_block_t *block= NULL); /**********************************************************************//** Calculates the number of pages reserved by a segment, and how many pages are @@ -514,7 +457,7 @@ fseg_alloc_free_page_general( use several pages from the tablespace should call this function beforehand and reserve enough free extents so that they certainly will be able to do their operation, like a B-tree page split, fully. Reservations -must be released with function fil_space_release_free_extents! +must be released with function fil_space_t::release_free_extents()! The alloc_type below has the following meaning: FSP_NORMAL means an operation which will probably result in more space usage, like an @@ -540,7 +483,7 @@ free pages available. return true and the tablespace size is < FSP_EXTENT_SIZE pages, then this can be 0, otherwise it is n_ext -@param[in] space_id tablespace identifier +@param[in,out] space tablespace @param[in] n_ext number of extents to reserve @param[in] alloc_type page reservation type (FSP_BLOB, etc) @param[in,out] mtr the mini transaction @@ -551,38 +494,23 @@ free pages available. bool fsp_reserve_free_extents( ulint* n_reserved, - ulint space_id, + fil_space_t* space, ulint n_ext, fsp_reserve_t alloc_type, mtr_t* mtr, ulint n_pages = 2); -/** Calculate how many KiB of new data we will be able to insert to the -tablespace without running out of space. -@param[in] space_id tablespace ID -@return available space in KiB -@retval UINTMAX_MAX if unknown */ -uintmax_t -fsp_get_available_space_in_free_extents( - ulint space_id); - -/** Calculate how many KiB of new data we will be able to insert to the -tablespace without running out of space. 
Start with a space object that has -been acquired by the caller who holds it for the calculation, -@param[in] space tablespace object from fil_space_acquire() -@return available space in KiB */ -uintmax_t -fsp_get_available_space_in_free_extents( - const fil_space_t* space); - -/**********************************************************************//** -Frees a single page of a segment. */ +/** Free a page in a file segment. +@param[in,out] seg_header file segment header +@param[in,out] space tablespace +@param[in] offset page number +@param[in,out] mtr mini-transaction */ void fseg_free_page( - fseg_header_t* seg_header, /*!< in: segment header */ - ulint space_id, /*!< in: space id */ - ulint page, /*!< in: page offset */ - mtr_t* mtr); /*!< in/out: mini-transaction */ + fseg_header_t* seg_header, + fil_space_t* space, + ulint offset, + mtr_t* mtr); /** Determine whether a page is free. @param[in,out] space tablespace @param[in] page page number diff --git a/storage/innobase/include/fsp0fsp.ic b/storage/innobase/include/fsp0fsp.ic index d0f7fba4047..9f28aacaff5 100644 --- a/storage/innobase/include/fsp0fsp.ic +++ b/storage/innobase/include/fsp0fsp.ic @@ -92,21 +92,15 @@ xdes_calc_descriptor_page( const page_size_t& page_size, ulint offset) { -#ifndef DOXYGEN /* Doxygen gets confused by these */ -# if UNIV_PAGE_SIZE_MAX <= XDES_ARR_OFFSET \ - + (UNIV_PAGE_SIZE_MAX / FSP_EXTENT_SIZE_MAX) \ - * XDES_SIZE_MAX -# error -# endif -# if UNIV_ZIP_SIZE_MIN <= XDES_ARR_OFFSET \ - + (UNIV_ZIP_SIZE_MIN / FSP_EXTENT_SIZE_MIN) \ - * XDES_SIZE_MIN -# error -# endif -#endif /* !DOXYGEN */ - - ut_ad(UNIV_PAGE_SIZE > XDES_ARR_OFFSET - + (UNIV_PAGE_SIZE / FSP_EXTENT_SIZE) + compile_time_assert(UNIV_PAGE_SIZE_MAX > XDES_ARR_OFFSET + + (UNIV_PAGE_SIZE_MAX / FSP_EXTENT_SIZE_MAX) + * XDES_SIZE_MAX); + compile_time_assert(UNIV_PAGE_SIZE_MIN > XDES_ARR_OFFSET + + (UNIV_PAGE_SIZE_MIN / FSP_EXTENT_SIZE_MIN) + * XDES_SIZE_MIN); + + ut_ad(srv_page_size > XDES_ARR_OFFSET + + (srv_page_size / FSP_EXTENT_SIZE) * XDES_SIZE); ut_ad(UNIV_ZIP_SIZE_MIN > XDES_ARR_OFFSET + (UNIV_ZIP_SIZE_MIN / FSP_EXTENT_SIZE) diff --git a/storage/innobase/include/fsp0sysspace.h b/storage/innobase/include/fsp0sysspace.h index da19547fc36..d3a79ec23a9 100644 --- a/storage/innobase/include/fsp0sysspace.h +++ b/storage/innobase/include/fsp0sysspace.h @@ -32,14 +32,6 @@ Created 2013-7-26 by Kevin Lewis at a time. We have to make this public because it is a config variable. */ extern ulong sys_tablespace_auto_extend_increment; -#ifdef UNIV_DEBUG -/** Control if extra debug checks need to be done for temporary tablespace. -Default = true that is disable such checks. -This variable is not exposed to end-user but still kept as variable for -developer to enable it during debug. */ -extern bool srv_skip_temp_table_checks_debug; -#endif /* UNIV_DEBUG */ - /** Data structure that contains the information about shared tablespaces. Currently this can be the system tablespace or a temporary table tablespace */ class SysTablespace : public Tablespace @@ -110,7 +102,7 @@ public: void shutdown(); /** Normalize the file size, convert to extents. */ - void normalize(); + void normalize_size(); /** @return true if a new raw device was created. */ @@ -146,8 +138,8 @@ public: @return the autoextend increment in pages. 
*/ ulint get_autoextend_increment() const { - return(sys_tablespace_auto_extend_increment - * ((1024 * 1024) / UNIV_PAGE_SIZE)); + return sys_tablespace_auto_extend_increment + << (20 - srv_page_size_shift); } /** diff --git a/storage/innobase/include/fsp0types.h b/storage/innobase/include/fsp0types.h index c0150262242..5c77b62723a 100644 --- a/storage/innobase/include/fsp0types.h +++ b/storage/innobase/include/fsp0types.h @@ -60,11 +60,8 @@ page size | file space extent size 32 KiB | 64 pages = 2 MiB 64 KiB | 64 pages = 4 MiB */ -#define FSP_EXTENT_SIZE ((UNIV_PAGE_SIZE <= (16384) ? \ - (1048576 / UNIV_PAGE_SIZE) : \ - ((UNIV_PAGE_SIZE <= (32768)) ? \ - (2097152 / UNIV_PAGE_SIZE) : \ - (4194304 / UNIV_PAGE_SIZE)))) +#define FSP_EXTENT_SIZE (srv_page_size_shift < 14 ? \ + (1048576U >> srv_page_size_shift) : 64U) /** File space extent size (four megabyte) in pages for MAX page size */ #define FSP_EXTENT_SIZE_MAX (4194304 / UNIV_PAGE_SIZE_MAX) @@ -152,38 +149,38 @@ enum fsp_reserve_t { /* Number of pages described in a single descriptor page: currently each page description takes less than 1 byte; a descriptor page is repeated every this many file pages */ -/* #define XDES_DESCRIBED_PER_PAGE UNIV_PAGE_SIZE */ -/* This has been replaced with either UNIV_PAGE_SIZE or page_zip->size. */ +/* #define XDES_DESCRIBED_PER_PAGE srv_page_size */ +/* This has been replaced with either srv_page_size or page_zip->size. */ /** @name The space low address page map The pages at FSP_XDES_OFFSET and FSP_IBUF_BITMAP_OFFSET are repeated every XDES_DESCRIBED_PER_PAGE pages in every tablespace. */ /* @{ */ /*--------------------------------------*/ -#define FSP_XDES_OFFSET 0 /* !< extent descriptor */ -#define FSP_IBUF_BITMAP_OFFSET 1 /* !< insert buffer bitmap */ +#define FSP_XDES_OFFSET 0U /* !< extent descriptor */ +#define FSP_IBUF_BITMAP_OFFSET 1U /* !< insert buffer bitmap */ /* The ibuf bitmap pages are the ones whose page number is the number above plus a multiple of XDES_DESCRIBED_PER_PAGE */ -#define FSP_FIRST_INODE_PAGE_NO 2 /*!< in every tablespace */ +#define FSP_FIRST_INODE_PAGE_NO 2U /*!< in every tablespace */ /* The following pages exist in the system tablespace (space 0). */ -#define FSP_IBUF_HEADER_PAGE_NO 3 /*!< insert buffer +#define FSP_IBUF_HEADER_PAGE_NO 3U /*!< insert buffer header page, in tablespace 0 */ -#define FSP_IBUF_TREE_ROOT_PAGE_NO 4 /*!< insert buffer +#define FSP_IBUF_TREE_ROOT_PAGE_NO 4U /*!< insert buffer B-tree root page in tablespace 0 */ /* The ibuf tree root page number in tablespace 0; its fseg inode is on the page number FSP_FIRST_INODE_PAGE_NO */ -#define FSP_TRX_SYS_PAGE_NO 5 /*!< transaction +#define FSP_TRX_SYS_PAGE_NO 5U /*!< transaction system header, in tablespace 0 */ -#define FSP_FIRST_RSEG_PAGE_NO 6 /*!< first rollback segment +#define FSP_FIRST_RSEG_PAGE_NO 6U /*!< first rollback segment page, in tablespace 0 */ -#define FSP_DICT_HDR_PAGE_NO 7 /*!< data dictionary header +#define FSP_DICT_HDR_PAGE_NO 7U /*!< data dictionary header page, in tablespace 0 */ /*--------------------------------------*/ /* @} */ @@ -197,17 +194,6 @@ fsp_is_system_temporary(ulint space_id) { return(space_id == SRV_TMP_SPACE_ID); } - -#ifdef UNIV_DEBUG -/** Skip some of the sanity checks that are time consuming even in debug mode -and can affect frequent verification runs that are done to ensure stability of -the product. -@return true if check should be skipped for given space. 
*/ -bool -fsp_skip_sanity_check( - ulint space_id); -#endif /* UNIV_DEBUG */ - #endif /* !UNIV_INNOCHECKSUM */ /* @defgroup fsp_flags InnoDB Tablespace Flag Constants @{ */ @@ -218,7 +204,7 @@ fsp_skip_sanity_check( #define FSP_FLAGS_WIDTH_ZIP_SSIZE 4 /** Width of the ATOMIC_BLOBS flag. The ability to break up a long column into an in-record prefix and an externally stored part is available -to the two Barracuda row formats COMPRESSED and DYNAMIC. */ +to ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT. */ #define FSP_FLAGS_WIDTH_ATOMIC_BLOBS 1 /** Number of flag bits used to indicate the tablespace page size */ #define FSP_FLAGS_WIDTH_PAGE_SSIZE 4 diff --git a/storage/innobase/include/fts0fts.h b/storage/innobase/include/fts0fts.h index 84d8ccd26ef..9f5b83e7d2e 100644 --- a/storage/innobase/include/fts0fts.h +++ b/storage/innobase/include/fts0fts.h @@ -450,47 +450,49 @@ fts_trx_free( /*=========*/ fts_trx_t* fts_trx); /*!< in, own: FTS trx */ -/******************************************************************//** -Creates the common ancillary tables needed for supporting an FTS index -on the given table. row_mysql_lock_data_dictionary must have been -called before this. -@return DB_SUCCESS or error code */ +/** Creates the common auxiliary tables needed for supporting an FTS index +on the given table. row_mysql_lock_data_dictionary must have been called +before this. +The following tables are created. +CREATE TABLE $FTS_PREFIX_DELETED + (doc_id BIGINT UNSIGNED, UNIQUE CLUSTERED INDEX on doc_id) +CREATE TABLE $FTS_PREFIX_DELETED_CACHE + (doc_id BIGINT UNSIGNED, UNIQUE CLUSTERED INDEX on doc_id) +CREATE TABLE $FTS_PREFIX_BEING_DELETED + (doc_id BIGINT UNSIGNED, UNIQUE CLUSTERED INDEX on doc_id) +CREATE TABLE $FTS_PREFIX_BEING_DELETED_CACHE + (doc_id BIGINT UNSIGNED, UNIQUE CLUSTERED INDEX on doc_id) +CREATE TABLE $FTS_PREFIX_CONFIG + (key CHAR(50), value CHAR(200), UNIQUE CLUSTERED INDEX on key) +@param[in,out] trx transaction +@param[in] table table with FTS index +@param[in] skip_doc_id_index Skip index on doc id +@return DB_SUCCESS if succeed */ dberr_t fts_create_common_tables( -/*=====================*/ - trx_t* trx, /*!< in: transaction handle */ - const dict_table_t* - table, /*!< in: table with one FTS - index */ - const char* name, /*!< in: table name */ - bool skip_doc_id_index) /*!< in: Skip index on doc id */ - MY_ATTRIBUTE((warn_unused_result)); -/******************************************************************//** -Wrapper function of fts_create_index_tables_low(), create auxiliary -tables for an FTS index -@return DB_SUCCESS or error code */ -dberr_t -fts_create_index_tables( -/*====================*/ - trx_t* trx, /*!< in: transaction handle */ - const dict_index_t* index) /*!< in: the FTS index - instance */ - MY_ATTRIBUTE((warn_unused_result)); -/******************************************************************//** -Creates the column specific ancillary tables needed for supporting an + trx_t* trx, + dict_table_t* table, + bool skip_doc_id_index) + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/** Creates the column specific ancillary tables needed for supporting an FTS index on the given table. row_mysql_lock_data_dictionary must have been called before this. + +All FTS AUX Index tables have the following schema. 
+CREAT TABLE $FTS_PREFIX_INDEX_[1-6]( + word VARCHAR(FTS_MAX_WORD_LEN), + first_doc_id INT NOT NULL, + last_doc_id UNSIGNED NOT NULL, + doc_count UNSIGNED INT NOT NULL, + ilist VARBINARY NOT NULL, + UNIQUE CLUSTERED INDEX ON (word, first_doc_id)) +@param[in,out] trx dictionary transaction +@param[in] index fulltext index +@param[in] id table id @return DB_SUCCESS or error code */ dberr_t -fts_create_index_tables_low( -/*========================*/ - trx_t* trx, /*!< in: transaction handle */ - const dict_index_t* - index, /*!< in: the FTS index - instance */ - const char* table_name, /*!< in: the table name */ - table_id_t table_id) /*!< in: the table id */ - MY_ATTRIBUTE((warn_unused_result)); +fts_create_index_tables(trx_t* trx, const dict_index_t* index, table_id_t id) + MY_ATTRIBUTE((nonnull, warn_unused_result)); /******************************************************************//** Add the FTS document id hidden column. */ void @@ -521,7 +523,7 @@ fts_commit( MY_ATTRIBUTE((warn_unused_result)); /** FTS Query entry point. -@param[in] trx transaction +@param[in,out] trx transaction @param[in] index fts index to search @param[in] flags FTS search mode @param[in] query_str FTS query @@ -680,7 +682,6 @@ Take a FTS savepoint. */ void fts_savepoint_take( /*===============*/ - trx_t* trx, /*!< in: transaction */ fts_trx_t* fts_trx, /*!< in: fts transaction */ const char* name); /*!< in: savepoint name */ diff --git a/storage/innobase/include/fts0priv.h b/storage/innobase/include/fts0priv.h index 2b7f31dfe54..fc888a0c1e6 100644 --- a/storage/innobase/include/fts0priv.h +++ b/storage/innobase/include/fts0priv.h @@ -315,10 +315,9 @@ the dict mutex que_t* fts_parse_sql_no_dict_lock( /*=======================*/ - fts_table_t* fts_table, /*!< in: table with FTS index */ pars_info_t* info, /*!< in: parser info */ const char* sql) /*!< in: SQL string to evaluate */ - MY_ATTRIBUTE((nonnull(3), malloc, warn_unused_result)); + MY_ATTRIBUTE((nonnull(2), malloc, warn_unused_result)); /******************************************************************//** Get value from config table. The caller must ensure that enough space is allocated for value to hold the column contents diff --git a/storage/innobase/include/fts0tokenize.h b/storage/innobase/include/fts0tokenize.h index 2c4b2418ecb..ddfb5bbabfd 100644 --- a/storage/innobase/include/fts0tokenize.h +++ b/storage/innobase/include/fts0tokenize.h @@ -144,7 +144,7 @@ fts_get_word( } } - info->prev = *doc; + info->prev = char(*doc); info->yesno = (FTB_YES == ' ') ? 1 : (info->quot != 0); info->weight_adjust = info->wasign = 0; } diff --git a/storage/innobase/include/fts0types.ic b/storage/innobase/include/fts0types.ic index cf5cf35c48e..ed61726ff80 100644 --- a/storage/innobase/include/fts0types.ic +++ b/storage/innobase/include/fts0types.ic @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 2007, 2015, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, MariaDB Corporation. +Copyright (c) 2017, 2018, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -170,7 +170,6 @@ fts_select_index_by_hash( const byte* str, ulint len) { - int char_len; ulong nr1 = 1; ulong nr2 = 4; @@ -185,9 +184,9 @@ fts_select_index_by_hash( char_len = my_mbcharlen_ptr(cs, reinterpret_cast<const char*>(str), reinterpret_cast<const char*>(str + len)); */ - char_len = cs->cset->charlen(cs, str, str+len); + size_t char_len = size_t(cs->cset->charlen(cs, str, str + len)); - ut_ad(static_cast<ulint>(char_len) <= len); + ut_ad(char_len <= len); /* Get collation hash code */ cs->coll->hash_sort(cs, str, char_len, &nr1, &nr2); diff --git a/storage/innobase/include/fut0fut.ic b/storage/innobase/include/fut0fut.ic index bba84d0d80e..b5c1e15e059 100644 --- a/storage/innobase/include/fut0fut.ic +++ b/storage/innobase/include/fut0fut.ic @@ -48,7 +48,7 @@ fut_get_ptr( buf_block_t* block; byte* ptr = NULL; - ut_ad(addr.boffset < UNIV_PAGE_SIZE); + ut_ad(addr.boffset < srv_page_size); ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH) || (rw_latch == RW_SX_LATCH)); diff --git a/storage/innobase/include/fut0lst.ic b/storage/innobase/include/fut0lst.ic index fae7fa078bf..00bb3fe8e9c 100644 --- a/storage/innobase/include/fut0lst.ic +++ b/storage/innobase/include/fut0lst.ic @@ -58,7 +58,7 @@ flst_write_addr( MTR_MEMO_PAGE_X_FIX | MTR_MEMO_PAGE_SX_FIX)); ut_a(addr.page == FIL_NULL || addr.boffset >= FIL_PAGE_DATA); - ut_a(ut_align_offset(faddr, UNIV_PAGE_SIZE) >= FIL_PAGE_DATA); + ut_a(ut_align_offset(faddr, srv_page_size) >= FIL_PAGE_DATA); mlog_write_ulint(faddr + FIL_ADDR_PAGE, addr.page, MLOG_4BYTES, mtr); mlog_write_ulint(faddr + FIL_ADDR_BYTE, addr.boffset, @@ -83,7 +83,7 @@ flst_read_addr( addr.boffset = mtr_read_ulint(faddr + FIL_ADDR_BYTE, MLOG_2BYTES, mtr); ut_a(addr.page == FIL_NULL || addr.boffset >= FIL_PAGE_DATA); - ut_a(ut_align_offset(faddr, UNIV_PAGE_SIZE) >= FIL_PAGE_DATA); + ut_a(ut_align_offset(faddr, srv_page_size) >= FIL_PAGE_DATA); return(addr); } diff --git a/storage/innobase/include/gis0rtree.h b/storage/innobase/include/gis0rtree.h index 5e812d10451..e189b6a7f28 100644 --- a/storage/innobase/include/gis0rtree.h +++ b/storage/innobase/include/gis0rtree.h @@ -71,10 +71,8 @@ rtr_index_build_node_ptr( pointer */ ulint page_no,/*!< in: page number to put in node pointer */ - mem_heap_t* heap, /*!< in: memory heap where pointer + mem_heap_t* heap); /*!< in: memory heap where pointer created */ - ulint level); /*!< in: level of rec in tree: - 0 means leaf level */ /*************************************************************//** Splits an R-tree index page to halves and inserts the tuple. It is assumed @@ -163,7 +161,6 @@ dberr_t rtr_ins_enlarge_mbr( /*=================*/ btr_cur_t* cursor, /*!< in: btr cursor */ - que_thr_t* thr, /*!< in: query thread */ mtr_t* mtr); /*!< in: mtr */ /**************************************************************//** @@ -406,9 +403,6 @@ rtr_merge_and_update_mbr( rec_offs* offsets, /*!< in: rec offsets */ rec_offs* offsets2, /*!< in: rec offsets */ page_t* child_page, /*!< in: the child page. */ - buf_block_t* merge_block, /*!< in: page to merge */ - buf_block_t* block, /*!< in: page be merged */ - dict_index_t* index, /*!< in: index */ mtr_t* mtr); /*!< in: mtr */ /*************************************************************//** @@ -416,10 +410,8 @@ Deletes on the upper level the node pointer to a page. 
*/ void rtr_node_ptr_delete( /*================*/ - dict_index_t* index, /*!< in: index tree */ - btr_cur_t* sea_cur,/*!< in: search cursor, contains information + btr_cur_t* cursor, /*!< in: search cursor, contains information about parent nodes in search */ - buf_block_t* block, /*!< in: page whose node pointer is deleted */ mtr_t* mtr); /*!< in: mtr */ /****************************************************************//** @@ -431,10 +423,7 @@ rtr_merge_mbr_changed( btr_cur_t* cursor2, /*!< in: the other cursor */ rec_offs* offsets, /*!< in: rec offsets */ rec_offs* offsets2, /*!< in: rec offsets */ - rtr_mbr_t* new_mbr, /*!< out: MBR to update */ - buf_block_t* merge_block, /*!< in: page to merge */ - buf_block_t* block, /*!< in: page be merged */ - dict_index_t* index); /*!< in: index */ + rtr_mbr_t* new_mbr); /*!< out: MBR to update */ /**************************************************************//** @@ -511,7 +500,7 @@ rtr_info_reinit_in_cursor( @param[in] tuple range tuple containing mbr, may also be empty tuple @param[in] mode search mode @return estimated number of rows */ -int64_t +ha_rows rtr_estimate_n_rows_in_range( dict_index_t* index, const dtuple_t* tuple, diff --git a/storage/innobase/include/gis0rtree.ic b/storage/innobase/include/gis0rtree.ic index 696aa1e2f5f..89676fb2386 100644 --- a/storage/innobase/include/gis0rtree.ic +++ b/storage/innobase/include/gis0rtree.ic @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 2014, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2020, MariaDB Corporation. +Copyright (c) 2017, 2021, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -38,7 +38,7 @@ rtr_page_cal_mbr( { page_t* page; rec_t* rec; - byte* field; + const byte* field; ulint len; rec_offs* offsets = NULL; double bmin, bmax; @@ -57,7 +57,8 @@ rtr_page_cal_mbr( page = buf_block_get_frame(block); rec = page_rec_get_next(page_get_infimum_rec(page)); - offsets = rec_get_offsets(rec, index, offsets, page_is_leaf(page), + offsets = rec_get_offsets(rec, index, offsets, page_is_leaf(page) + ? index->n_fields : 0, ULINT_UNDEFINED, &heap); do { diff --git a/storage/innobase/include/ha_prototypes.h b/storage/innobase/include/ha_prototypes.h index 3eab2135969..f116cad32e2 100644 --- a/storage/innobase/include/ha_prototypes.h +++ b/storage/innobase/include/ha_prototypes.h @@ -69,13 +69,11 @@ innobase_invalidate_query_cache( /*============================*/ trx_t* trx, /*!< in: transaction which modifies the table */ - const char* full_name, /*!< in: concatenation of + const char* full_name); /*!< in: concatenation of database name, path separator, table name, null char NUL; NOTE that in Windows this is always in LOWER CASE! */ - ulint full_name_len); /*!< in: full name length where - also the null chars count */ /** Quote a standard SQL identifier like tablespace, index or column name. @param[in] file output stream @@ -157,7 +155,6 @@ UNIV_INTERN void innobase_mysql_log_notify( /*======================*/ - ib_uint64_t write_lsn, /*!< in: LSN written to log file */ ib_uint64_t flush_lsn); /*!< in: LSN flushed to disk */ /** Converts a MySQL type to an InnoDB type. 
Note that this function returns @@ -239,7 +236,7 @@ wsrep_innobase_kill_one_trx(MYSQL_THD const thd_ptr, const trx_t * const bf_trx, trx_t *victim_trx, ibool signal); -int wsrep_innobase_mysql_sort(int mysql_type, uint charset_number, +ulint wsrep_innobase_mysql_sort(int mysql_type, uint charset_number, unsigned char* str, unsigned int str_length, unsigned int buf_length); #endif /* WITH_WSREP */ @@ -289,14 +286,6 @@ thd_lock_wait_timeout( /*==================*/ THD* thd); /*!< in: thread handle, or NULL to query the global innodb_lock_wait_timeout */ -/******************************************************************//** -Add up the time waited for the lock for the current query. */ -void -thd_set_lock_wait_time( -/*===================*/ - THD* thd, /*!< in/out: thread handle */ - ulint value); /*!< in: time waited for the lock */ - /** Get status of innodb_tmpdir. @param[in] thd thread handle, or NULL to query the global innodb_tmpdir. @@ -424,14 +413,6 @@ const char* server_get_hostname(); /*=================*/ -/******************************************************************//** -Get the error message format string. -@return the format string or 0 if not found. */ -const char* -innobase_get_err_msg( -/*=================*/ - int error_code); /*!< in: MySQL error code */ - /*********************************************************************//** Compute the next autoinc value. @@ -504,7 +485,7 @@ UNIV_INTERN void ib_push_warning( trx_t* trx, /*!< in: trx */ - ulint error, /*!< in: error code to push as warning */ + dberr_t error, /*!< in: error code to push as warning */ const char *format,/*!< in: warning message */ ...); @@ -514,7 +495,7 @@ UNIV_INTERN void ib_push_warning( void* ithd, /*!< in: thd */ - ulint error, /*!< in: error code to push as warning */ + dberr_t error, /*!< in: error code to push as warning */ const char *format,/*!< in: warning message */ ...); @@ -543,20 +524,6 @@ innobase_index_cond( void* file) /*!< in/out: pointer to ha_innobase */ MY_ATTRIBUTE((warn_unused_result)); -/******************************************************************//** -Gets information on the durability property requested by thread. -Used when writing either a prepare or commit record to the log -buffer. -@return the durability property. */ - -#include <dur_prop.h> - -enum durability_properties -thd_requested_durability( -/*=====================*/ - const THD* thd) /*!< in: thread handle */ - MY_ATTRIBUTE((warn_unused_result)); - /** Update the system variable with the given value of the InnoDB buffer pool size. @param[in] buf_pool_size given value of buffer pool size.*/ diff --git a/storage/innobase/include/handler0alter.h b/storage/innobase/include/handler0alter.h index 35a04163f43..add983a0a9b 100644 --- a/storage/innobase/include/handler0alter.h +++ b/storage/innobase/include/handler0alter.h @@ -56,14 +56,6 @@ innobase_row_to_mysql( const dtuple_t* row) /*!< in: InnoDB row */ MY_ATTRIBUTE((nonnull)); -/*************************************************************//** -Resets table->record[0]. */ -void -innobase_rec_reset( -/*===============*/ - struct TABLE* table) /*!< in/out: MySQL table */ - MY_ATTRIBUTE((nonnull)); - /** Generate the next autoinc based on a snapshot of the session auto_increment_increment and auto_increment_offset variables. 
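
The sequence helper declared just below snapshots the session
auto_increment_increment and auto_increment_offset variables. As a reminder of
the arithmetic those two settings imply, here is a generic illustration of
stepping along the offset + k * increment progression; the function is invented
for this example and is not ib_sequence_t's actual implementation (which also
has to handle column range overflow).

#include <cassert>
#include <cstdint>

/* Smallest value of the form offset + k * increment (k >= 0) strictly above
current. Purely illustrative. */
static uint64_t next_in_progression(uint64_t current, uint64_t increment,
				    uint64_t offset)
{
	if (current < offset) {
		return offset;
	}
	const uint64_t steps = (current - offset) / increment + 1;
	return offset + steps * increment;
}

int main()
{
	assert(next_in_progression(5, 1, 1) == 6);   /* plain increment-by-one */
	assert(next_in_progression(7, 10, 5) == 15); /* values 5, 15, 25, ... */
	assert(next_in_progression(4, 10, 5) == 5);  /* nothing used yet */
	return 0;
}
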
*/ struct ib_sequence_t { diff --git a/storage/innobase/include/ib0mutex.h b/storage/innobase/include/ib0mutex.h index a7289777e00..e496c65e46a 100644 --- a/storage/innobase/include/ib0mutex.h +++ b/storage/innobase/include/ib0mutex.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 2013, 2015, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, MariaDB Corporation. All Rights Reserved. +Copyright (c) 2017, 2019, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -29,8 +29,8 @@ Created 2013-03-26 Sunny Bains. #ifndef ib0mutex_h #define ib0mutex_h -#include "ut0ut.h" -#include "ut0rnd.h" +#include "my_atomic.h" +#include "my_cpu.h" #include "os0event.h" #include "sync0arr.h" @@ -53,15 +53,8 @@ struct OSTrackMutex { ut_ad(!m_destroy_at_exit || !m_locked); } - /** Initialise the mutex. - @param[in] id Mutex ID - @param[in] filename File where mutex was created - @param[in] line Line in filename */ - void init( - latch_id_t id, - const char* filename, - uint32_t line) - UNIV_NOTHROW + /** Initialise the mutex. */ + void init(latch_id_t, const char*, uint32_t) UNIV_NOTHROW { ut_ad(m_freed); ut_ad(!m_locked); @@ -92,16 +85,8 @@ struct OSTrackMutex { m_mutex.exit(); } - /** Acquire the mutex. - @param[in] max_spins max number of spins - @param[in] max_delay max delay per spin - @param[in] filename from where called - @param[in] line within filename */ - void enter( - uint32_t max_spins, - uint32_t max_delay, - const char* filename, - uint32_t line) + /** Acquire the mutex. */ + void enter(uint32_t, uint32_t, const char*, uint32_t) UNIV_NOTHROW { ut_ad(!m_freed); @@ -186,15 +171,8 @@ struct TTASFutexMutex { } /** Called when the mutex is "created". Note: Not from the constructor - but when the mutex is initialised. - @param[in] id Mutex ID - @param[in] filename File where mutex was created - @param[in] line Line in filename */ - void init( - latch_id_t id, - const char* filename, - uint32_t line) - UNIV_NOTHROW + but when the mutex is initialised. */ + void init(latch_id_t, const char*, uint32_t) UNIV_NOTHROW { ut_a(m_lock_word == MUTEX_STATE_UNLOCKED); } @@ -208,14 +186,9 @@ struct TTASFutexMutex { /** Acquire the mutex. @param[in] max_spins max number of spins - @param[in] max_delay max delay per spin - @param[in] filename from where called - @param[in] line within filename */ - void enter( - uint32_t max_spins, - uint32_t max_delay, - const char* filename, - uint32_t line) UNIV_NOTHROW + @param[in] max_delay max delay per spin */ + void enter(uint32_t max_spins, uint32_t max_delay, + const char*, uint32_t) UNIV_NOTHROW { uint32_t n_spins, n_waits; @@ -308,15 +281,8 @@ struct TTASMutex { } /** Called when the mutex is "created". Note: Not from the constructor - but when the mutex is initialised. - @param[in] id Mutex ID - @param[in] filename File where mutex was created - @param[in] line Line in filename */ - void init( - latch_id_t id, - const char* filename, - uint32_t line) - UNIV_NOTHROW + but when the mutex is initialised. */ + void init(latch_id_t) UNIV_NOTHROW { ut_ad(m_lock_word == MUTEX_STATE_UNLOCKED); } @@ -349,14 +315,9 @@ struct TTASMutex { /** Acquire the mutex. 
@param max_spins max number of spins - @param max_delay max delay per spin - @param filename from where called - @param line within filename */ - void enter( - uint32_t max_spins, - uint32_t max_delay, - const char* filename, - uint32_t line) UNIV_NOTHROW + @param max_delay max delay per spin */ + void enter(uint32_t max_spins, uint32_t max_delay, + const char*, uint32_t) UNIV_NOTHROW { const uint32_t step = max_spins; uint32_t n_spins = 0; @@ -420,14 +381,8 @@ struct TTASEventMutex { /** Called when the mutex is "created". Note: Not from the constructor but when the mutex is initialised. - @param[in] id Mutex ID - @param[in] filename File where mutex was created - @param[in] line Line in filename */ - void init( - latch_id_t id, - const char* filename, - uint32_t line) - UNIV_NOTHROW + @param[in] id Mutex ID */ + void init(latch_id_t id, const char*, uint32_t) UNIV_NOTHROW { ut_a(m_event == 0); ut_a(m_lock_word == MUTEX_STATE_UNLOCKED); diff --git a/storage/innobase/include/ibuf0ibuf.h b/storage/innobase/include/ibuf0ibuf.h index 1a5214085de..e5892057209 100644 --- a/storage/innobase/include/ibuf0ibuf.h +++ b/storage/innobase/include/ibuf0ibuf.h @@ -47,22 +47,19 @@ typedef enum { IBUF_OP_COUNT = 3 } ibuf_op_t; -/** Combinations of operations that can be buffered. Because the enum -values are used for indexing innobase_change_buffering_values[], they -should start at 0 and there should not be any gaps. */ -typedef enum { +/** Combinations of operations that can be buffered. +@see innodb_change_buffering_names */ +enum ibuf_use_t { IBUF_USE_NONE = 0, IBUF_USE_INSERT, /* insert */ IBUF_USE_DELETE_MARK, /* delete */ IBUF_USE_INSERT_DELETE_MARK, /* insert+delete */ IBUF_USE_DELETE, /* delete+purge */ - IBUF_USE_ALL, /* insert+delete+purge */ - - IBUF_USE_COUNT /* number of entries in ibuf_use_t */ -} ibuf_use_t; + IBUF_USE_ALL /* insert+delete+purge */ +}; /** Operations that can currently be buffered. */ -extern ibuf_use_t ibuf_use; +extern ulong innodb_change_buffering; /** The insert buffer control structure */ extern ibuf_t* ibuf; @@ -414,14 +411,11 @@ void ibuf_close(void); /*============*/ -/******************************************************************//** -Checks the insert buffer bitmaps on IMPORT TABLESPACE. +/** Check the insert buffer bitmaps on IMPORT TABLESPACE. +@param[in] trx transaction +@param[in,out] space tablespace being imported @return DB_SUCCESS or error code */ -dberr_t -ibuf_check_bitmap_on_import( -/*========================*/ - const trx_t* trx, /*!< in: transaction */ - ulint space_id) /*!< in: tablespace identifier */ +dberr_t ibuf_check_bitmap_on_import(const trx_t* trx, fil_space_t* space) MY_ATTRIBUTE((nonnull, warn_unused_result)); /** Updates free bits and buffered bits for bulk loaded page. diff --git a/storage/innobase/include/ibuf0ibuf.ic b/storage/innobase/include/ibuf0ibuf.ic index 1b19d5450b7..b3e04ee1661 100644 --- a/storage/innobase/include/ibuf0ibuf.ic +++ b/storage/innobase/include/ibuf0ibuf.ic @@ -28,7 +28,7 @@ Created 7/19/1997 Heikki Tuuri #include "fsp0types.h" #include "buf0lru.h" -/** An index page must contain at least UNIV_PAGE_SIZE / +/** An index page must contain at least srv_page_size / IBUF_PAGE_SIZE_PER_FREE_SPACE bytes of free space for ibuf to try to buffer inserts to this page. If there is this much of free space, the corresponding bits are set in the ibuf bitmap. 
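
The ib0mutex.h hunks above drop the unused file/line parameter names from
init() and enter(), but the enter(max_spins, max_delay) contract is unchanged:
spin with a test-and-test-and-set loop for a bounded number of rounds, then
stop burning CPU. A deliberately simplified standalone sketch of that pattern;
it is not InnoDB's TTASEventMutex (no sync array, no os_event, no
instrumentation, and a plain yield instead of a timed delay).

#include <atomic>
#include <thread>

class spin_mutex {
	std::atomic<bool> locked{false};
public:
	void enter(unsigned max_spins)
	{
		for (;;) {
			for (unsigned i = 0; i < max_spins; i++) {
				/* test ... and only then test-and-set */
				if (!locked.load(std::memory_order_relaxed)
				    && !locked.exchange(
					    true, std::memory_order_acquire)) {
					return;
				}
			}
			/* spinning failed; give up the CPU for now */
			std::this_thread::yield();
		}
	}
	void exit() { locked.store(false, std::memory_order_release); }
};

int main()
{
	spin_mutex m;
	int counter = 0;
	std::thread t([&] {
		for (int i = 0; i < 1000; i++) {
			m.enter(50); ++counter; m.exit();
		}
	});
	for (int i = 0; i < 1000; i++) {
		m.enter(50); ++counter; m.exit();
	}
	t.join();
	return counter == 2000 ? 0 : 1;
}

In the real TTASEventMutex the fallback after spinning is to register in the
sync array and wait on the mutex's os_event rather than to yield in a loop.
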
*/ @@ -124,7 +124,7 @@ ibuf_should_try( a secondary index when we decide */ { - return(ibuf_use != IBUF_USE_NONE + return(innodb_change_buffering && ibuf->max_size != 0 && !dict_index_is_clust(index) && !dict_index_is_spatial(index) @@ -314,9 +314,7 @@ ibuf_update_free_bits_if_full( block->page.size.physical(), max_ins_size); if (max_ins_size >= increase) { -#if ULINT32_UNDEFINED <= UNIV_PAGE_SIZE_MAX -# error "ULINT32_UNDEFINED <= UNIV_PAGE_SIZE_MAX" -#endif + compile_time_assert(ULINT32_UNDEFINED > UNIV_PAGE_SIZE_MAX); after = ibuf_index_page_calc_free_bits( block->page.size.physical(), max_ins_size - increase); #ifdef UNIV_IBUF_DEBUG diff --git a/storage/innobase/include/lock0lock.h b/storage/innobase/include/lock0lock.h index edde6bf516e..53e8761971d 100644 --- a/storage/innobase/include/lock0lock.h +++ b/storage/innobase/include/lock0lock.h @@ -63,23 +63,6 @@ ulint lock_get_size(void); /*===============*/ /*********************************************************************//** -Creates the lock system at database start. */ -void -lock_sys_create( -/*============*/ - ulint n_cells); /*!< in: number of slots in lock hash table */ -/** Resize the lock hash table. -@param[in] n_cells number of slots in lock hash table */ -void -lock_sys_resize( - ulint n_cells); - -/*********************************************************************//** -Closes the lock system at database shutdown. */ -void -lock_sys_close(void); -/*================*/ -/*********************************************************************//** Gets the heap_no of the smallest user record on a page. @return heap_no of smallest user record, or PAGE_HEAP_NO_SUPREMUM */ UNIV_INLINE @@ -294,7 +277,7 @@ lock_rec_insert_check_and_lock( dict_index_t* index, /*!< in: index */ que_thr_t* thr, /*!< in: query thread */ mtr_t* mtr, /*!< in/out: mini-transaction */ - ibool* inherit)/*!< out: set to TRUE if the new + bool* inherit)/*!< out: set to true if the new inserted record maybe should inherit LOCK_GAP type locks from the successor record */ @@ -502,20 +485,7 @@ lock_rec_unlock( /** Release the explicit locks of a committing transaction, and release possible other transactions waiting because of these locks. */ -void lock_trx_release_locks(trx_t* trx); - -/*********************************************************************//** -Removes locks on a table to be dropped or discarded. -If remove_also_table_sx_locks is TRUE then table-level S and X locks are -also removed in addition to other table-level and record-level locks. -No lock, that is going to be removed, is allowed to be a wait lock. */ -void -lock_remove_all_on_table( -/*=====================*/ - dict_table_t* table, /*!< in: table to be dropped - or discarded */ - ibool remove_also_table_sx_locks);/*!< in: also removes - table S and X locks */ +void lock_release(trx_t* trx); /*********************************************************************//** Calculates the fold value of a page file address: used in inserting or @@ -560,8 +530,8 @@ lock_rec_find_set_bit( /*********************************************************************//** Checks if a lock request lock1 has to wait for request lock2. 
-@return TRUE if lock1 has to wait for lock2 to be removed */ -ibool +@return whether lock1 has to wait for lock2 to be removed */ +bool lock_has_to_wait( /*=============*/ const lock_t* lock1, /*!< in: waiting lock */ @@ -579,7 +549,7 @@ lock_report_trx_id_insanity( const rec_t* rec, /*!< in: user record */ dict_index_t* index, /*!< in: index */ const rec_offs* offsets, /*!< in: rec_get_offsets(rec, index) */ - trx_id_t max_trx_id); /*!< in: trx_sys_get_max_trx_id() */ + trx_id_t max_trx_id); /*!< in: trx_sys.get_max_trx_id() */ /*********************************************************************//** Prints info of locks for all transactions. @return FALSE if not able to obtain lock mutex and exits without @@ -610,7 +580,7 @@ lock_print_info_all_transactions( Return approximate number or record locks (bits set in the bitmap) for this transaction. Since delete-marked records may be removed, the record count will not be precise. -The caller must be holding lock_sys->mutex. */ +The caller must be holding lock_sys.mutex. */ ulint lock_number_of_rows_locked( /*=======================*/ @@ -619,7 +589,7 @@ lock_number_of_rows_locked( /*********************************************************************//** Return the number of table locks for a transaction. -The caller must be holding lock_sys->mutex. */ +The caller must be holding lock_sys.mutex. */ ulint lock_number_of_tables_locked( /*=========================*/ @@ -794,7 +764,6 @@ Set the lock system timeout event. */ void lock_set_timeout_event(); /*====================*/ -#ifdef UNIV_DEBUG /*********************************************************************//** Checks that a transaction id is sensible, i.e., not in the future. @return true if ok */ @@ -804,8 +773,8 @@ lock_check_trx_id_sanity( trx_id_t trx_id, /*!< in: trx id */ const rec_t* rec, /*!< in: user record */ dict_index_t* index, /*!< in: index */ - const rec_offs* offsets) /*!< in: rec_get_offsets(rec, index) */ - MY_ATTRIBUTE((warn_unused_result)); + const rec_offs* offsets); /*!< in: rec_get_offsets(rec, index) */ +#ifdef UNIV_DEBUG /*******************************************************************//** Check if the transaction holds any locks on the sys tables or its records. @@ -814,19 +783,21 @@ const lock_t* lock_trx_has_sys_table_locks( /*=========================*/ const trx_t* trx) /*!< in: transaction to check */ - MY_ATTRIBUTE((warn_unused_result)); + MY_ATTRIBUTE((nonnull, warn_unused_result)); -/*******************************************************************//** -Check if the transaction holds an exclusive lock on a record. -@return whether the locks are held */ +/** Check if the transaction holds an explicit exclusive lock on a record. 
+@param[in] trx transaction +@param[in] table table +@param[in] block leaf page +@param[in] heap_no heap number identifying the record +@return whether an explicit X-lock is held */ bool -lock_trx_has_rec_x_lock( -/*====================*/ +lock_trx_has_expl_x_lock( const trx_t* trx, /*!< in: transaction to check */ const dict_table_t* table, /*!< in: table to check */ const buf_block_t* block, /*!< in: buffer block of the record */ ulint heap_no)/*!< in: record heap number */ - MY_ATTRIBUTE((warn_unused_result)); + MY_ATTRIBUTE((nonnull, warn_unused_result)); #endif /* UNIV_DEBUG */ /** Lock operation struct */ @@ -838,11 +809,12 @@ struct lock_op_t{ typedef ib_mutex_t LockMutex; /** The lock system struct */ -struct lock_sys_t{ - char pad1[CACHE_LINE_SIZE]; /*!< padding to prevent other - memory update hotspots from - residing on the same memory - cache line */ +class lock_sys_t +{ + bool m_initialised; + +public: + MY_ALIGNED(CACHE_LINE_SIZE) LockMutex mutex; /*!< Mutex protecting the locks */ hash_table_t* rec_hash; /*!< hash table of the record @@ -852,13 +824,13 @@ struct lock_sys_t{ hash_table_t* prdt_page_hash; /*!< hash table of the page lock */ - char pad2[CACHE_LINE_SIZE]; /*!< Padding */ + MY_ALIGNED(CACHE_LINE_SIZE) LockMutex wait_mutex; /*!< Mutex protecting the next two fields */ srv_slot_t* waiting_threads; /*!< Array of user threads suspended while waiting for locks within InnoDB, protected - by the lock_sys->wait_mutex; + by the lock_sys.wait_mutex; os_event_set() and os_event_reset() on waiting_threads[]->event @@ -867,12 +839,7 @@ struct lock_sys_t{ srv_slot_t* last_slot; /*!< highest slot ever used in the waiting_threads array, protected by - lock_sys->wait_mutex */ - ibool rollback_complete; - /*!< TRUE if rollback of all - recovered transactions is - complete. Protected by - lock_sys->mutex */ + lock_sys.wait_mutex */ ulint n_lock_max_wait_time; /*!< Max wait time */ @@ -884,6 +851,38 @@ struct lock_sys_t{ bool timeout_thread_active; /*!< True if the timeout thread is running */ + + + /** + Constructor. + + Some members may require late initialisation, thus we just mark object as + uninitialised. Real initialisation happens in create(). + */ + lock_sys_t(): m_initialised(false) {} + + + bool is_initialised() { return m_initialised; } + + + /** + Creates the lock system at database start. + + @param[in] n_cells number of slots in lock hash table + */ + void create(ulint n_cells); + + + /** + Resize the lock hash table. + + @param[in] n_cells number of slots in lock hash table + */ + void resize(ulint n_cells); + + + /** Closes the lock system at database shutdown. */ + void close(); }; /*********************************************************************//** @@ -997,36 +996,36 @@ lock_rec_free_all_from_discard_page( const buf_block_t* block); /*!< in: page to be discarded */ /** The lock system */ -extern lock_sys_t* lock_sys; +extern lock_sys_t lock_sys; -/** Test if lock_sys->mutex can be acquired without waiting. */ +/** Test if lock_sys.mutex can be acquired without waiting. */ #define lock_mutex_enter_nowait() \ - (lock_sys->mutex.trylock(__FILE__, __LINE__)) + (lock_sys.mutex.trylock(__FILE__, __LINE__)) -/** Test if lock_sys->mutex is owned. */ -#define lock_mutex_own() (lock_sys->mutex.is_owned()) +/** Test if lock_sys.mutex is owned. */ +#define lock_mutex_own() (lock_sys.mutex.is_owned()) -/** Acquire the lock_sys->mutex. */ +/** Acquire the lock_sys.mutex. 
*/ #define lock_mutex_enter() do { \ - mutex_enter(&lock_sys->mutex); \ + mutex_enter(&lock_sys.mutex); \ } while (0) -/** Release the lock_sys->mutex. */ +/** Release the lock_sys.mutex. */ #define lock_mutex_exit() do { \ - lock_sys->mutex.exit(); \ + lock_sys.mutex.exit(); \ } while (0) -/** Test if lock_sys->wait_mutex is owned. */ -#define lock_wait_mutex_own() (lock_sys->wait_mutex.is_owned()) +/** Test if lock_sys.wait_mutex is owned. */ +#define lock_wait_mutex_own() (lock_sys.wait_mutex.is_owned()) -/** Acquire the lock_sys->wait_mutex. */ +/** Acquire the lock_sys.wait_mutex. */ #define lock_wait_mutex_enter() do { \ - mutex_enter(&lock_sys->wait_mutex); \ + mutex_enter(&lock_sys.wait_mutex); \ } while (0) -/** Release the lock_sys->wait_mutex. */ +/** Release the lock_sys.wait_mutex. */ #define lock_wait_mutex_exit() do { \ - lock_sys->wait_mutex.exit(); \ + lock_sys.wait_mutex.exit(); \ } while (0) #ifdef WITH_WSREP diff --git a/storage/innobase/include/lock0lock.ic b/storage/innobase/include/lock0lock.ic index 0ed933cba78..abe5052627b 100644 --- a/storage/innobase/include/lock0lock.ic +++ b/storage/innobase/include/lock0lock.ic @@ -54,7 +54,7 @@ lock_rec_hash( ulint page_no)/*!< in: page number */ { return(unsigned(hash_calc_hash(lock_rec_fold(space, page_no), - lock_sys->rec_hash))); + lock_sys.rec_hash))); } /*********************************************************************//** @@ -90,11 +90,11 @@ lock_hash_get( ulint mode) /*!< in: lock mode */ { if (mode & LOCK_PREDICATE) { - return(lock_sys->prdt_hash); + return(lock_sys.prdt_hash); } else if (mode & LOCK_PRDT_PAGE) { - return(lock_sys->prdt_page_hash); + return(lock_sys.prdt_page_hash); } else { - return(lock_sys->rec_hash); + return(lock_sys.rec_hash); } } diff --git a/storage/innobase/include/lock0prdt.h b/storage/innobase/include/lock0prdt.h index 6a7b88eba1f..378a5c2faca 100644 --- a/storage/innobase/include/lock0prdt.h +++ b/storage/innobase/include/lock0prdt.h @@ -50,9 +50,8 @@ lock_prdt_lock( SELECT FOR UPDATE */ ulint type_mode, /*!< in: LOCK_PREDICATE or LOCK_PRDT_PAGE */ - que_thr_t* thr, /*!< in: query thread + que_thr_t* thr); /*!< in: query thread (can be NULL if BTR_NO_LOCKING_FLAG) */ - mtr_t* mtr); /*!< in/out: mini-transaction */ /*********************************************************************//** Acquire a "Page" lock on a block @@ -106,7 +105,6 @@ Update predicate lock when page splits */ void lock_prdt_update_split( /*===================*/ - buf_block_t* block, /*!< in/out: page to be split */ buf_block_t* new_block, /*!< in/out: the new half page */ lock_prdt_t* prdt, /*!< in: MBR on the old page */ lock_prdt_t* new_prdt, /*!< in: MBR on the new page */ @@ -122,7 +120,6 @@ lock_prdt_update_parent( buf_block_t* right_block, /*!< in/out: the new half page */ lock_prdt_t* left_prdt, /*!< in: MBR on the old page */ lock_prdt_t* right_prdt, /*!< in: MBR on the new page */ - lock_prdt_t* parent_prdt, /*!< in: original parent MBR */ ulint space, /*!< in: space id */ ulint page_no); /*!< in: page number */ diff --git a/storage/innobase/include/lock0types.h b/storage/innobase/include/lock0types.h index 1ae319e6b79..cb04afdf9db 100644 --- a/storage/innobase/include/lock0types.h +++ b/storage/innobase/include/lock0types.h @@ -33,7 +33,6 @@ Created 5/7/1996 Heikki Tuuri #define lock_t ib_lock_t struct lock_t; -struct lock_sys_t; struct lock_table_t; /* Basic lock modes */ @@ -176,7 +175,7 @@ operator<<(std::ostream& out, const lock_rec_t& lock) #endif /* @} */ -/** Lock struct; protected by 
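
lock_sys changes from a pointer that was allocated at startup into a statically
allocated lock_sys_t object: the constructor only marks the object as
uninitialised, the heavy setup moves into create(), and resize()/close() cover
the rest of the lifecycle, with the lock_mutex_enter()/exit() macros now
referring to lock_sys.mutex instead of lock_sys->mutex. A minimal standalone
sketch of that lifecycle pattern; the names are local to the example, not
InnoDB's.

#include <cassert>
#include <vector>

class subsystem_t {
	bool m_initialised = false;
	std::vector<int> hash; /* stand-in for the real hash tables */
public:
	bool is_initialised() const { return m_initialised; }

	void create(size_t n_cells)
	{
		assert(!m_initialised);
		hash.assign(n_cells, 0);
		m_initialised = true;
	}

	void resize(size_t n_cells)
	{
		assert(m_initialised);
		hash.resize(n_cells);
	}

	void close()
	{
		if (!m_initialised) {
			return; /* tolerate close() during an aborted startup */
		}
		hash.clear();
		m_initialised = false;
	}
};

/* Statically allocated, like "extern lock_sys_t lock_sys": no pointer to
dereference and no dynamic allocation for the object itself. */
static subsystem_t sys;

int main()
{
	sys.create(1024);
	sys.resize(2048);
	sys.close();
	return sys.is_initialised() ? 1 : 0;
}

The same is_initialised()/create()/close() idiom appears further down for the
redo log, where log_sys becomes a statically allocated log_t.
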
lock_sys->mutex */ +/** Lock struct; protected by lock_sys.mutex */ struct ib_lock_t { trx_t* trx; /*!< transaction owning the diff --git a/storage/innobase/include/log0log.h b/storage/innobase/include/log0log.h index 612a27976e7..b201de06d17 100644 --- a/storage/innobase/include/log0log.h +++ b/storage/innobase/include/log0log.h @@ -44,8 +44,8 @@ Created 12/9/1995 Heikki Tuuri #define UINT32_MAX (4294967295U) #endif -/** Redo log group */ -struct log_group_t; +/** Maximum number of srv_n_log_files, or innodb_log_files_in_group */ +#define SRV_N_LOG_FILES_MAX 100 /** Magic value to use instead of log checksums when they are disabled */ #define LOG_NO_CHECKSUM_MAGIC 0xDEADBEEFUL @@ -53,8 +53,8 @@ struct log_group_t; /* Margin for the free space in the smallest log group, before a new query step which modifies the database, is started */ -#define LOG_CHECKPOINT_FREE_PER_THREAD (4 * UNIV_PAGE_SIZE) -#define LOG_CHECKPOINT_EXTRA_FREE (8 * UNIV_PAGE_SIZE) +#define LOG_CHECKPOINT_FREE_PER_THREAD (4U << srv_page_size_shift) +#define LOG_CHECKPOINT_EXTRA_FREE (8U << srv_page_size_shift) /** Append a string to the log. @param[in] str string @@ -79,9 +79,7 @@ log_free_check(void); /** Extends the log buffer. @param[in] len requested minimum size in bytes */ -void -log_buffer_extend( - ulint len); +void log_buffer_extend(ulong len); /** Check margin not to overwrite transaction log from the last checkpoint. If would estimate the log write to exceed the log_group_capacity, @@ -135,7 +133,7 @@ log_get_flush_lsn(void); /*=============*/ /**************************************************************** Gets the log group capacity. It is OK to read the value without -holding log_sys->mutex because it is constant. +holding log_sys.mutex because it is constant. @return log group capacity */ UNIV_INLINE lsn_t @@ -149,14 +147,7 @@ UNIV_INLINE lsn_t log_get_max_modified_age_async(void); /*================================*/ -/** Initializes the redo logging subsystem. */ -void -log_sys_init(); -/** Initialize the redo log. -@param[in] n_files number of files */ -void -log_init(ulint n_files); /** Calculate the recommended highest values for lsn - last_checkpoint_lsn and lsn - buf_get_oldest_modification(). @param[in] file_size requested innodb_log_file_size @@ -168,12 +159,6 @@ log_set_capacity(ulonglong file_size) MY_ATTRIBUTE((warn_unused_result)); /******************************************************//** -Completes an i/o to a log file. */ -void -log_io_complete( -/*============*/ - log_group_t* group); /*!< in: log group */ -/******************************************************//** This function is called, e.g., when a transaction wants to commit. It checks that the log has been written to the log file up to the last log entry written by the transaction. If there is a flush running, it waits and checks if the @@ -220,13 +205,9 @@ shutdown. This function also writes all log in log files to the log archive. */ void logs_empty_and_mark_files_at_shutdown(void); /*=======================================*/ -/** Read a log group header page to log_sys->checkpoint_buf. -@param[in] group log group -@param[in] header 0 or LOG_CHEKCPOINT_1 or LOG_CHECKPOINT2 */ -void -log_group_header_read( - const log_group_t* group, - ulint header); +/** Read a log group header page to log_sys.checkpoint_buf. +@param[in] header 0 or LOG_CHECKPOINT_1 or LOG_CHECKPOINT2 */ +void log_header_read(ulint header); /** Write checkpoint info to the log header and invoke log_mutex_exit(). 
@param[in] sync whether to wait for the write to complete @param[in] end_lsn start LSN of the MLOG_CHECKPOINT mini-transaction */ @@ -247,16 +228,6 @@ objects! */ void log_check_margins(void); -/********************************************************//** -Sets the field values in group to correspond to a given lsn. For this function -to work, the values must already be correctly initialized to correspond to -some lsn, for instance, a checkpoint lsn. */ -void -log_group_set_fields( -/*=================*/ - log_group_t* group, /*!< in/out: group */ - lsn_t lsn); /*!< in: lsn for which the values should be - set */ /************************************************************//** Gets a log block flush bit. @return TRUE if this block was the first to be written in a log flush */ @@ -373,14 +344,6 @@ Refreshes the statistics used to print per-second averages. */ void log_refresh_stats(void); /*===================*/ -/********************************************************//** -Closes all log groups. */ -void -log_group_close_all(void); -/*=====================*/ -/** Shut down the redo log subsystem. */ -void -log_shutdown(); /** Whether to require checksums on the redo log pages */ extern my_bool innodb_log_checksums; @@ -415,7 +378,7 @@ extern my_bool innodb_log_checksums; from this offset in this log block, if value not 0 */ #define LOG_BLOCK_CHECKPOINT_NO 8 /* 4 lower bytes of the value of - log_sys->next_checkpoint_no when the + log_sys.next_checkpoint_no when the log block was last written to: if the block has not yet been written full, this value is only updated before a @@ -438,7 +401,7 @@ extern my_bool innodb_log_checksums; #define LOG_CHECKPOINT_LSN 8 /** Byte offset of the log record corresponding to LOG_CHECKPOINT_LSN */ #define LOG_CHECKPOINT_OFFSET 16 -/** log_sys_t::buf_size at the time of the checkpoint (not used) */ +/** srv_log_buffer_size at the time of the checkpoint (not used) */ #define LOG_CHECKPOINT_LOG_BUF_SIZE 24 /** MariaDB 10.2.5 encrypted redo log encryption key version (32 bits)*/ #define LOG_CHECKPOINT_CRYPT_KEY 32 @@ -480,16 +443,20 @@ or the MySQL version that created the redo log file. */ IB_TO_STR(MYSQL_VERSION_MINOR) "." \ IB_TO_STR(MYSQL_VERSION_PATCH) -/** The redo log format identifier corresponding to the current format version. -Stored in LOG_HEADER_FORMAT. +/** The original (not version-tagged) InnoDB redo log format */ +#define LOG_HEADER_FORMAT_3_23 0 +/** The MySQL 5.7.9/MariaDB 10.2.2 log format */ +#define LOG_HEADER_FORMAT_10_2 1 +/** The MariaDB 10.3.2 log format. To prevent crash-downgrade to earlier 10.2 due to the inability to roll back a retroactively introduced TRX_UNDO_RENAME_TABLE undo log record, MariaDB 10.2.18 and later will use the 10.3 format, but LOG_HEADER_SUBFORMAT 1 instead of 0. MariaDB 10.3 will use subformat 0 (5.7-style TRUNCATE) or 2 (MDEV-13564 backup-friendly TRUNCATE). */ #define LOG_HEADER_FORMAT_10_3 103 -/** The old MariaDB 10.2.2..10.2.17 log format */ -#define LOG_HEADER_FORMAT_10_2 1 +/** The redo log format identifier corresponding to the current format version. +Stored in LOG_HEADER_FORMAT. */ +#define LOG_HEADER_FORMAT_CURRENT LOG_HEADER_FORMAT_10_3 /** Future MariaDB 10.4 log format */ #define LOG_HEADER_FORMAT_10_4 104 /** Encrypted MariaDB redo log */ @@ -514,98 +481,35 @@ Remove this limitation if page number is no longer used for log file io. 
*/ static const ulonglong log_group_max_size = ((ulonglong(UINT32_MAX) + 1) * UNIV_PAGE_SIZE_MIN - 1); -/** The state of a log group */ -enum log_group_state_t { - /** No corruption detected */ - LOG_GROUP_OK, - /** Corrupted */ - LOG_GROUP_CORRUPTED -}; - typedef ib_mutex_t LogSysMutex; typedef ib_mutex_t FlushOrderMutex; -/** Log group consists of a number of log files, each of the same size; a log -group is implemented as a space in the sense of the module fil0fil. -Currently, this is only protected by log_sys->mutex. However, in the case -of log_write_up_to(), we will access some members only with the protection -of log_sys->write_mutex, which should affect nothing for now. */ -struct log_group_t{ - /** number of files in the group */ - ulint n_files; - /** format of the redo log: e.g., LOG_HEADER_FORMAT_10_3 */ - uint32_t format; - /** redo log subformat: 0 with separately logged TRUNCATE, - 1 with fully redo-logged TRUNCATE */ - uint32_t subformat; - /** individual log file size in bytes, including the header */ - lsn_t file_size; - /** corruption status */ - log_group_state_t state; - /** lsn used to fix coordinates within the log group */ - lsn_t lsn; - /** the byte offset of the above lsn */ - lsn_t lsn_offset; - - /** used only in recovery: recovery scan succeeded up to this - lsn in this log group */ - lsn_t scanned_lsn; - /** unaligned checkpoint header */ - byte* checkpoint_buf_ptr; - /** buffer for writing a checkpoint header */ - byte* checkpoint_buf; - - /** @return whether the redo log is encrypted */ - bool is_encrypted() const - { - return((format & LOG_HEADER_FORMAT_ENCRYPTED) != 0); - } - - /** @return capacity in bytes */ - inline lsn_t capacity() const - { - return((file_size - LOG_FILE_HDR_SIZE) * n_files); - } -}; - /** Redo log buffer */ struct log_t{ - char pad1[CACHE_LINE_SIZE]; - /*!< Padding to prevent other memory - update hotspots from residing on the - same memory cache line */ + MY_ALIGNED(CACHE_LINE_SIZE) lsn_t lsn; /*!< log sequence number */ - ulint buf_free; /*!< first free offset within the log + ulong buf_free; /*!< first free offset within the log buffer in use */ - char pad2[CACHE_LINE_SIZE];/*!< Padding */ + MY_ALIGNED(CACHE_LINE_SIZE) LogSysMutex mutex; /*!< mutex protecting the log */ - char pad3[CACHE_LINE_SIZE]; /*!< Padding */ - LogSysMutex write_mutex; /*!< mutex protecting writing to log - file and accessing to log_group_t */ - char pad4[CACHE_LINE_SIZE];/*!< Padding */ + MY_ALIGNED(CACHE_LINE_SIZE) + LogSysMutex write_mutex; /*!< mutex protecting writing to log */ + MY_ALIGNED(CACHE_LINE_SIZE) FlushOrderMutex log_flush_order_mutex;/*!< mutex to serialize access to the flush list when we are putting dirty blocks in the list. The idea behind this mutex is to be able - to release log_sys->mutex during + to release log_sys.mutex during mtr_commit and still ensure that insertions in the flush_list happen in the LSN order. 
*/ - byte* buf_ptr; /*!< unaligned log buffer, which should - be of double of buf_size */ - byte* buf; /*!< log buffer currently in use; - this could point to either the first - half of the aligned(buf_ptr) or the - second half in turns, so that log - write/flush to disk don't block - concurrent mtrs which will write - log to this buffer */ - bool first_in_use; /*!< true if buf points to the first - half of the aligned(buf_ptr), false - if the second half */ - ulint buf_size; /*!< log buffer size of each in bytes */ - ulint max_buf_free; /*!< recommended maximum value of + /** log_buffer, append data here */ + byte* buf; + /** log_buffer, writing data to file from this buffer. + Before flushing write_buf is swapped with flush_buf */ + byte* flush_buf; + ulong max_buf_free; /*!< recommended maximum value of buf_free for the buffer in use, after which the buffer is flushed */ bool check_flush_or_checkpoint; @@ -617,12 +521,70 @@ struct log_t{ max_checkpoint_age; this flag is peeked at by log_free_check(), which does not reserve the log mutex */ - /** the redo log */ - log_group_t log; + + /** Log files. Protected by mutex or write_mutex. */ + struct files { + /** number of files */ + ulint n_files; + /** format of the redo log: e.g., LOG_HEADER_FORMAT_CURRENT */ + uint32_t format; + /** redo log subformat: 0 with separately logged TRUNCATE, + 2 with fully redo-logged TRUNCATE (1 in MariaDB 10.2) */ + uint32_t subformat; + /** individual log file size in bytes, including the header */ + lsn_t file_size; + private: + /** lsn used to fix coordinates within the log group */ + lsn_t lsn; + /** the byte offset of the above lsn */ + lsn_t lsn_offset; + public: + /** used only in recovery: recovery scan succeeded up to this + lsn in this log group */ + lsn_t scanned_lsn; + + /** @return whether the redo log is encrypted */ + bool is_encrypted() const { return format & LOG_HEADER_FORMAT_ENCRYPTED; } + /** @return capacity in bytes */ + lsn_t capacity() const{ return (file_size - LOG_FILE_HDR_SIZE) * n_files; } + /** Calculate the offset of a log sequence number. + @param[in] lsn log sequence number + @return offset within the log */ + inline lsn_t calc_lsn_offset(lsn_t lsn) const; + + /** Set the field values to correspond to a given lsn. */ + void set_fields(lsn_t lsn) + { + lsn_t c_lsn_offset = calc_lsn_offset(lsn); + set_lsn(lsn); + set_lsn_offset(c_lsn_offset); + } + + /** Read a log segment to log_sys.buf. + @param[in,out] start_lsn in: read area start, + out: the last read valid lsn + @param[in] end_lsn read area end + @return whether no invalid blocks (e.g checksum mismatch) were found */ + bool read_log_seg(lsn_t* start_lsn, lsn_t end_lsn); + + /** Initialize the redo log buffer. + @param[in] n_files number of files */ + void create(ulint n_files); + + /** Close the redo log buffer. 
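
The new log_t layout above replaces the double-sized buf_ptr allocation, whose
halves were alternated via first_in_use, with two named buffers: buf, where new
redo records are appended, and flush_buf, whose contents are being written to
the files; the two pointers are swapped before a flush. A single-threaded
standalone illustration of that swap (the real code performs it under
log_sys.mutex and write_mutex, which are omitted here).

#include <algorithm>
#include <cstdio>
#include <string>
#include <vector>

struct log_buffer {
	std::vector<char> a, b;
	std::vector<char>* buf = &a;       /* append new records here */
	std::vector<char>* flush_buf = &b; /* being written to the file */

	void append(const std::string& rec)
	{
		buf->insert(buf->end(), rec.begin(), rec.end());
	}

	void flush()
	{
		/* Writers immediately see an empty append buffer... */
		std::swap(buf, flush_buf);
		/* ...while the previous contents are written out. */
		std::printf("flushed %zu bytes\n", flush_buf->size());
		flush_buf->clear();
	}
};

int main()
{
	log_buffer lb;
	lb.append("record 1;");
	lb.append("record 2;");
	lb.flush(); /* 18 bytes; appending could continue meanwhile */
	lb.append("record 3;");
	lb.flush(); /* 9 bytes */
	return 0;
}
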
*/ + void close() + { + n_files = 0; + } + void set_lsn(lsn_t a_lsn); + lsn_t get_lsn() const { return lsn; } + void set_lsn_offset(lsn_t a_lsn); + lsn_t get_lsn_offset() const { return lsn_offset; } + } log; /** The fields involved in the log buffer flush @{ */ - ulint buf_next_to_write;/*!< first offset in the log buffer + ulong buf_next_to_write;/*!< first offset in the log buffer where the byte content may not exist written to file, e.g., the start offset of a log record catenated @@ -637,11 +599,11 @@ struct log_t{ AND flushed to disk */ ulint n_pending_flushes;/*!< number of currently pending flushes; protected by - log_sys_t::mutex */ + log_sys.mutex */ os_event_t flush_event; /*!< this event is in the reset state when a flush is running; os_event_set() and os_event_reset() - are protected by log_sys_t::mutex */ + are protected by log_sys.mutex */ ulint n_log_ios; /*!< number of log i/os initiated thus far */ ulint n_log_ios_old; /*!< number of log i/o's at the @@ -687,7 +649,7 @@ struct log_t{ /*!< extra redo log records to write during a checkpoint, or NULL if none. The pointer is protected by - log_sys->mutex, and the data must + log_sys.mutex, and the data must remain constant as long as this pointer is not NULL. */ ulint n_pending_checkpoint_writes; @@ -697,73 +659,116 @@ struct log_t{ checkpoint write is running; a thread should wait for this without owning the log mutex */ - byte* checkpoint_buf_ptr;/* unaligned checkpoint header */ - byte* checkpoint_buf; /*!< checkpoint header is read to this - buffer */ + + /** buffer for checkpoint header */ + MY_ALIGNED(OS_FILE_LOG_BLOCK_SIZE) + byte checkpoint_buf[OS_FILE_LOG_BLOCK_SIZE]; /* @} */ - /** @return whether the redo log is encrypted */ - bool is_encrypted() const - { - return(log.is_encrypted()); - } +private: + bool m_initialised; +public: + /** + Constructor. + + Some members may require late initialisation, thus we just mark object as + uninitialised. Real initialisation happens in create(). + */ + log_t(): m_initialised(false) {} + + /** @return whether the redo log is encrypted */ + bool is_encrypted() const { return(log.is_encrypted()); } + + bool is_initialised() { return m_initialised; } + + /** Complete an asynchronous checkpoint write. */ + void complete_checkpoint(); + + /** Initialise the redo log subsystem. */ + void create(); + + /** Shut down the redo log subsystem. */ + void close(); }; /** Redo log system */ -extern log_t* log_sys; +extern log_t log_sys; + +/** Calculate the offset of a log sequence number. 
+@param[in] lsn log sequence number +@return offset within the log */ +inline lsn_t log_t::files::calc_lsn_offset(lsn_t lsn) const +{ + ut_ad(this == &log_sys.log); + /* The lsn parameters are updated while holding both the mutexes + and it is ok to have either of them while reading */ + ut_ad(log_sys.mutex.is_owned() || log_sys.write_mutex.is_owned()); + const lsn_t group_size= capacity(); + lsn_t l= lsn - this->lsn; + if (longlong(l) < 0) { + l= lsn_t(-longlong(l)) % group_size; + l= group_size - l; + } + + l+= lsn_offset - LOG_FILE_HDR_SIZE * (1 + lsn_offset / file_size); + l%= group_size; + return l + LOG_FILE_HDR_SIZE * (1 + l / (file_size - LOG_FILE_HDR_SIZE)); +} + +inline void log_t::files::set_lsn(lsn_t a_lsn) { + ut_ad(log_sys.mutex.is_owned() || log_sys.write_mutex.is_owned()); + lsn = a_lsn; +} + +inline void log_t::files::set_lsn_offset(lsn_t a_lsn) { + ut_ad(log_sys.mutex.is_owned() || log_sys.write_mutex.is_owned()); + ut_ad((lsn % OS_FILE_LOG_BLOCK_SIZE) == (a_lsn % OS_FILE_LOG_BLOCK_SIZE)); + lsn_offset = a_lsn; +} /** Test if flush order mutex is owned. */ #define log_flush_order_mutex_own() \ - mutex_own(&log_sys->log_flush_order_mutex) + mutex_own(&log_sys.log_flush_order_mutex) /** Acquire the flush order mutex. */ #define log_flush_order_mutex_enter() do { \ - mutex_enter(&log_sys->log_flush_order_mutex); \ + mutex_enter(&log_sys.log_flush_order_mutex); \ } while (0) /** Release the flush order mutex. */ # define log_flush_order_mutex_exit() do { \ - mutex_exit(&log_sys->log_flush_order_mutex); \ + mutex_exit(&log_sys.log_flush_order_mutex); \ } while (0) /** Test if log sys mutex is owned. */ -#define log_mutex_own() mutex_own(&log_sys->mutex) +#define log_mutex_own() mutex_own(&log_sys.mutex) /** Test if log sys write mutex is owned. */ -#define log_write_mutex_own() mutex_own(&log_sys->write_mutex) +#define log_write_mutex_own() mutex_own(&log_sys.write_mutex) /** Acquire the log sys mutex. */ -#define log_mutex_enter() mutex_enter(&log_sys->mutex) +#define log_mutex_enter() mutex_enter(&log_sys.mutex) /** Acquire the log sys write mutex. */ -#define log_write_mutex_enter() mutex_enter(&log_sys->write_mutex) +#define log_write_mutex_enter() mutex_enter(&log_sys.write_mutex) /** Acquire all the log sys mutexes. */ #define log_mutex_enter_all() do { \ - mutex_enter(&log_sys->write_mutex); \ - mutex_enter(&log_sys->mutex); \ + mutex_enter(&log_sys.write_mutex); \ + mutex_enter(&log_sys.mutex); \ } while (0) /** Release the log sys mutex. */ -#define log_mutex_exit() mutex_exit(&log_sys->mutex) +#define log_mutex_exit() mutex_exit(&log_sys.mutex) /** Release the log sys write mutex.*/ -#define log_write_mutex_exit() mutex_exit(&log_sys->write_mutex) +#define log_write_mutex_exit() mutex_exit(&log_sys.write_mutex) /** Release all the log sys mutexes. */ #define log_mutex_exit_all() do { \ - mutex_exit(&log_sys->mutex); \ - mutex_exit(&log_sys->write_mutex); \ + mutex_exit(&log_sys.mutex); \ + mutex_exit(&log_sys.write_mutex); \ } while (0) -/** Calculate the offset of an lsn within a log group. 
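
log_group_calc_lsn_offset() becomes the member function
log_t::files::calc_lsn_offset() shown above. The standalone model below repeats
that arithmetic with local names, assuming a 2048-byte per-file header purely
for the example (the value of LOG_FILE_HDR_SIZE is not visible in this hunk),
and asserts two properties the mapping must have: a computed offset never lands
inside a file header, and offsets repeat with a period of capacity().

#include <cassert>
#include <cstdint>

typedef uint64_t lsn_t;

constexpr lsn_t FILE_HDR_SIZE = 2048; /* assumed header size, example only */

struct log_files {
	lsn_t n_files;    /* number of log files */
	lsn_t file_size;  /* size of each file, header included */
	lsn_t lsn;        /* fix point: this LSN ... */
	lsn_t lsn_offset; /* ... lives at this byte offset within the group */

	lsn_t capacity() const { return (file_size - FILE_HDR_SIZE) * n_files; }

	lsn_t calc_lsn_offset(lsn_t a_lsn) const
	{
		const lsn_t group_size = capacity();
		lsn_t l = a_lsn - lsn;
		if (int64_t(l) < 0) {
			l = lsn_t(-int64_t(l)) % group_size;
			l = group_size - l;
		}
		l += lsn_offset - FILE_HDR_SIZE * (1 + lsn_offset / file_size);
		l %= group_size;
		return l + FILE_HDR_SIZE * (1 + l / (file_size - FILE_HDR_SIZE));
	}
};

int main()
{
	const log_files f = {2, 1U << 20, 8192, FILE_HDR_SIZE};
	for (lsn_t a_lsn = 0; a_lsn < 4 * f.capacity(); a_lsn += 512) {
		const lsn_t off = f.calc_lsn_offset(a_lsn);
		assert(off < f.n_files * f.file_size);      /* inside the group */
		assert(off % f.file_size >= FILE_HDR_SIZE); /* never in a header */
		assert(off == f.calc_lsn_offset(a_lsn + f.capacity()));
	}
	return 0;
}
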
-@param[in] lsn log sequence number -@param[in] group log group -@return offset within the log group */ -lsn_t -log_group_calc_lsn_offset( - lsn_t lsn, - const log_group_t* group); - /* log scrubbing speed, in bytes/sec */ extern ulonglong innodb_scrub_log_speed; diff --git a/storage/innobase/include/log0log.ic b/storage/innobase/include/log0log.ic index c366affcdef..be28fee7b8e 100644 --- a/storage/innobase/include/log0log.ic +++ b/storage/innobase/include/log0log.ic @@ -26,12 +26,12 @@ Created 12/9/1995 Heikki Tuuri #include "mach0data.h" #include "srv0mon.h" -#include "srv0srv.h" #include "ut0crc32.h" #ifdef UNIV_LOG_LSN_DEBUG #include "mtr0types.h" #endif /* UNIV_LOG_LSN_DEBUG */ +extern ulong srv_log_buffer_size; /************************************************************//** Gets a log block flush bit. @@ -309,15 +309,15 @@ log_reserve_and_write_fast( len - SIZE_OF_MLOG_CHECKPOINT] ? 0 : 1 - + mach_get_compressed_size(log_sys->lsn >> 32) - + mach_get_compressed_size(log_sys->lsn & 0xFFFFFFFFUL); + + mach_get_compressed_size(log_sys.lsn >> 32) + + mach_get_compressed_size(log_sys.lsn & 0xFFFFFFFFUL); #endif /* UNIV_LOG_LSN_DEBUG */ const ulint data_len = len #ifdef UNIV_LOG_LSN_DEBUG + lsn_len #endif /* UNIV_LOG_LSN_DEBUG */ - + log_sys->buf_free % OS_FILE_LOG_BLOCK_SIZE; + + log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE; if (data_len >= OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE) { @@ -327,44 +327,44 @@ log_reserve_and_write_fast( return(0); } - *start_lsn = log_sys->lsn; + *start_lsn = log_sys.lsn; #ifdef UNIV_LOG_LSN_DEBUG if (lsn_len) { /* Write the LSN pseudo-record. */ - byte* b = &log_sys->buf[log_sys->buf_free]; + byte* b = &log_sys.buf[log_sys.buf_free]; *b++ = MLOG_LSN | (MLOG_SINGLE_REC_FLAG & *(const byte*) str); /* Write the LSN in two parts, as a pseudo page number and space id. 
*/ - b += mach_write_compressed(b, log_sys->lsn >> 32); - b += mach_write_compressed(b, log_sys->lsn & 0xFFFFFFFFUL); - ut_a(b - lsn_len == &log_sys->buf[log_sys->buf_free]); + b += mach_write_compressed(b, log_sys.lsn >> 32); + b += mach_write_compressed(b, log_sys.lsn & 0xFFFFFFFFUL); + ut_a(b - lsn_len == &log_sys.buf[log_sys.buf_free]); ::memcpy(b, str, len); len += lsn_len; } else #endif /* UNIV_LOG_LSN_DEBUG */ - memcpy(log_sys->buf + log_sys->buf_free, str, len); + memcpy(log_sys.buf + log_sys.buf_free, str, len); log_block_set_data_len( reinterpret_cast<byte*>(ut_align_down( - log_sys->buf + log_sys->buf_free, + log_sys.buf + log_sys.buf_free, OS_FILE_LOG_BLOCK_SIZE)), data_len); - log_sys->buf_free += len; + log_sys.buf_free += ulong(len); - ut_ad(log_sys->buf_free <= log_sys->buf_size); + ut_ad(log_sys.buf_free <= srv_log_buffer_size); - log_sys->lsn += len; + log_sys.lsn += len; MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE, - log_sys->lsn - log_sys->last_checkpoint_lsn); + log_sys.lsn - log_sys.last_checkpoint_lsn); - return(log_sys->lsn); + return(log_sys.lsn); } /************************************************************//** @@ -379,7 +379,7 @@ log_get_lsn(void) log_mutex_enter(); - lsn = log_sys->lsn; + lsn = log_sys.lsn; log_mutex_exit(); @@ -397,7 +397,7 @@ log_get_flush_lsn(void) log_mutex_enter(); - lsn = log_sys->flushed_to_disk_lsn; + lsn = log_sys.flushed_to_disk_lsn; log_mutex_exit(); @@ -414,11 +414,11 @@ log_get_lsn_nowait(void) { lsn_t lsn=0; - if (!mutex_enter_nowait(&(log_sys->mutex))) { + if (!mutex_enter_nowait(&(log_sys.mutex))) { - lsn = log_sys->lsn; + lsn = log_sys.lsn; - mutex_exit(&(log_sys->mutex)); + mutex_exit(&(log_sys.mutex)); } return(lsn); @@ -426,14 +426,14 @@ log_get_lsn_nowait(void) /**************************************************************** Gets the log group capacity. It is OK to read the value without -holding log_sys->mutex because it is constant. +holding log_sys.mutex because it is constant. @return log group capacity */ UNIV_INLINE lsn_t log_get_capacity(void) /*==================*/ { - return(log_sys->log_group_capacity); + return(log_sys.log_group_capacity); } /**************************************************************** @@ -445,7 +445,7 @@ lsn_t log_get_max_modified_age_async(void) /*================================*/ { - return(log_sys->max_modified_age_async); + return(log_sys.max_modified_age_async); } /***********************************************************************//** @@ -477,7 +477,7 @@ log_free_check(void) sync_allowed_latches(latches, latches + UT_ARR_SIZE(latches)))); - if (log_sys->check_flush_or_checkpoint) { + if (log_sys.check_flush_or_checkpoint) { log_check_margins(); } diff --git a/storage/innobase/include/log0recv.h b/storage/innobase/include/log0recv.h index 6425e3c6c37..b8a96a9e431 100644 --- a/storage/innobase/include/log0recv.h +++ b/storage/innobase/include/log0recv.h @@ -86,20 +86,6 @@ void recv_sys_debug_free(void); /*=====================*/ -/** Read a log segment to a buffer. 
-@param[out] buf buffer -@param[in] group redo log files -@param[in, out] start_lsn in : read area start, out: the last read valid lsn -@param[in] end_lsn read area end -@param[out] invalid_block - invalid, (maybe incompletely written) block encountered -@return false, if invalid block encountered (e.g checksum mismatch), true otherwise */ -bool -log_group_read_log_seg( - byte* buf, - const log_group_t* group, - lsn_t* start_lsn, - lsn_t end_lsn); - /********************************************************//** Reset the state of the recovery system variables. */ void @@ -234,7 +220,7 @@ struct recv_sys_t{ ib_mutex_t writer_mutex;/*!< mutex coordinating flushing between recv_writer_thread and the recovery thread. */ - os_event_t flush_start;/*!< event to acticate + os_event_t flush_start;/*!< event to activate page cleaner threads */ os_event_t flush_end;/*!< event to signal that the page cleaner has finished the request */ @@ -250,6 +236,7 @@ struct recv_sys_t{ /*!< this is TRUE when a log rec application batch is running */ byte* buf; /*!< buffer for parsing log records */ + size_t buf_size; /*!< size of buf */ ulint len; /*!< amount of data in buf */ lsn_t parse_start_lsn; /*!< this is the lsn from which we were able to @@ -337,7 +324,7 @@ extern bool recv_no_ibuf_operations; extern bool recv_needed_recovery; #ifdef UNIV_DEBUG /** TRUE if writing to the redo log (mtr_commit) is forbidden. -Protected by log_sys->mutex. */ +Protected by log_sys.mutex. */ extern bool recv_no_log_write; #endif /* UNIV_DEBUG */ @@ -348,11 +335,11 @@ extern bool recv_lsn_checks_on; /** Size of the parsing buffer; it must accommodate RECV_SCAN_SIZE many times! */ -#define RECV_PARSING_BUF_SIZE (2 * 1024 * 1024) +#define RECV_PARSING_BUF_SIZE (2U << 20) /** Size of block reads when the log groups are scanned forward to do a roll-forward */ -#define RECV_SCAN_SIZE (4 * UNIV_PAGE_SIZE) +#define RECV_SCAN_SIZE (4U << srv_page_size_shift) /** This is a low level function for the recovery system to create a page which has buffered intialized redo log records. diff --git a/storage/innobase/include/mem0mem.h b/storage/innobase/include/mem0mem.h index 0cd15ebb261..18ae845955a 100644 --- a/storage/innobase/include/mem0mem.h +++ b/storage/innobase/include/mem0mem.h @@ -69,11 +69,11 @@ allocations of small buffers. */ #define MEM_BLOCK_START_SIZE 64 #define MEM_BLOCK_STANDARD_SIZE \ - (UNIV_PAGE_SIZE >= 16384 ? 8000 : MEM_MAX_ALLOC_IN_BUF) + (srv_page_size >= 16384 ? 8000 : MEM_MAX_ALLOC_IN_BUF) /** If a memory heap is allowed to grow into the buffer pool, the following is the maximum size for a single allocated buffer: */ -#define MEM_MAX_ALLOC_IN_BUF (UNIV_PAGE_SIZE - 200 + REDZONE_SIZE) +#define MEM_MAX_ALLOC_IN_BUF (srv_page_size - 200 + REDZONE_SIZE) /** Space needed when allocating for a user a field of length N. The space is allocated only in multiples of UNIV_MEM_ALIGNMENT. */ @@ -247,22 +247,26 @@ mem_heap_dup(mem_heap_t* heap, const void* data, size_t len) @param[in] heap memory heap where string is allocated @param[in] str string to be copied @return own: a copy of the string */ +inline char* -mem_heap_strdup( - mem_heap_t* heap, - const char* str); +mem_heap_strdup(mem_heap_t* heap, const char* str) +{ + return(static_cast<char*>(mem_heap_dup(heap, str, strlen(str) + 1))); +} -/**********************************************************************//** -Makes a NUL-terminated copy of a nonterminated string, -allocated from a memory heap. 
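The recovery buffer sizes are now written as shifts. With the default 16 KiB page size (srv_page_size_shift == 14, an assumption about build defaults rather than something stated in this hunk) they evaluate as follows:

#include <cstdio>

int main()
{
    const unsigned srv_page_size_shift = 14;                // assumed default: 16 KiB pages
    const unsigned parsing_buf = 2U << 20;                  // RECV_PARSING_BUF_SIZE: 2 MiB
    const unsigned scan_size   = 4U << srv_page_size_shift; // RECV_SCAN_SIZE: 4 pages = 64 KiB
    std::printf("parse buffer %u bytes, scan size %u bytes\n", parsing_buf, scan_size);
}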
-@return own: a copy of the string */ -UNIV_INLINE +/** Duplicate a string, allocated from a memory heap. +@param[in] heap memory heap where string is allocated +@param[in] str string to be copied +@param[in] len length of str, in bytes +@return own: a NUL-terminated copy of str */ +inline char* -mem_heap_strdupl( -/*=============*/ - mem_heap_t* heap, /*!< in: memory heap where string is allocated */ - const char* str, /*!< in: string to be copied */ - ulint len); /*!< in: length of str, in bytes */ +mem_heap_strdupl(mem_heap_t* heap, const char* str, size_t len) +{ + char* s = static_cast<char*>(mem_heap_alloc(heap, len + 1)); + s[len] = 0; + return(static_cast<char*>(memcpy(s, str, len))); +} /**********************************************************************//** Concatenate two strings and return the result, using a memory heap. diff --git a/storage/innobase/include/mem0mem.ic b/storage/innobase/include/mem0mem.ic index bd0db9bf503..9236bbef05d 100644 --- a/storage/innobase/include/mem0mem.ic +++ b/storage/innobase/include/mem0mem.ic @@ -203,7 +203,7 @@ mem_heap_alloc( mem_block_set_free(block, free + MEM_SPACE_NEEDED(n)); buf = buf + REDZONE_SIZE; - MEM_UNDEFINED(buf, n - REDZONE_SIZE); + MEM_MAKE_ADDRESSABLE(buf, n - REDZONE_SIZE); return(buf); } @@ -264,7 +264,8 @@ mem_heap_free_heap_top( ut_ad(block); /* Set the free field of block */ - mem_block_set_free(block, old_top - (byte*) block); + mem_block_set_free(block, + ulint(old_top - reinterpret_cast<byte*>(block))); ut_ad(mem_block_get_start(block) <= mem_block_get_free(block)); MEM_NOACCESS(old_top, (byte*) block + block->len - old_top); @@ -430,7 +431,7 @@ mem_heap_get_size( ulint size = heap->total_size; if (heap->free_block) { - size += UNIV_PAGE_SIZE; + size += srv_page_size; } return(size); @@ -463,20 +464,3 @@ mem_strdupl( s[len] = 0; return(static_cast<char*>(memcpy(s, str, len))); } - -/**********************************************************************//** -Makes a NUL-terminated copy of a nonterminated string, -allocated from a memory heap. -@return own: a copy of the string */ -UNIV_INLINE -char* -mem_heap_strdupl( -/*=============*/ - mem_heap_t* heap, /*!< in: memory heap where string is allocated */ - const char* str, /*!< in: string to be copied */ - ulint len) /*!< in: length of str, in bytes */ -{ - char* s = (char*) mem_heap_alloc(heap, len + 1); - s[len] = 0; - return((char*) memcpy(s, str, len)); -} diff --git a/storage/innobase/include/mtr0log.h b/storage/innobase/include/mtr0log.h index eaf2fad9e7f..dc76b40a3db 100644 --- a/storage/innobase/include/mtr0log.h +++ b/storage/innobase/include/mtr0log.h @@ -109,14 +109,6 @@ mlog_catenate_string( const byte* str, /*!< in: string to write */ ulint len); /*!< in: string length */ /********************************************************//** -Catenates a compressed ulint to mlog. */ -UNIV_INLINE -void -mlog_catenate_ulint_compressed( -/*===========================*/ - mtr_t* mtr, /*!< in: mtr */ - ulint val); /*!< in: value to write */ -/********************************************************//** Catenates a compressed 64-bit integer to mlog. */ UNIV_INLINE void diff --git a/storage/innobase/include/mtr0log.ic b/storage/innobase/include/mtr0log.ic index 5cfc08622d5..70bcaf43b9e 100644 --- a/storage/innobase/include/mtr0log.ic +++ b/storage/innobase/include/mtr0log.ic @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. 
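mem_heap_strdupl() above copies a string that need not be NUL-terminated and appends the terminator itself. The same idea expressed with plain malloc(), purely for illustration:

#include <cstdio>
#include <cstdlib>
#include <cstring>

// Make a NUL-terminated copy of the first 'len' bytes of 'str'.
char* strdupl_sketch(const char* str, size_t len)
{
    char* s = static_cast<char*>(std::malloc(len + 1));
    if (!s) return nullptr;
    s[len] = '\0';                               // terminate first, as the inline function does
    return static_cast<char*>(std::memcpy(s, str, len));
}

int main()
{
    const char buf[4] = {'a', 'b', 'c', 'd'};    // deliberately no terminator
    char* s = strdupl_sketch(buf, 3);
    std::puts(s);                                // prints "abc"
    std::free(s);
}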
-Copyright (c) 2017, MariaDB Corporation. +Copyright (c) 2017, 2019, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -118,30 +118,6 @@ mlog_catenate_ulint( } /********************************************************//** -Catenates a compressed ulint to mlog. */ -UNIV_INLINE -void -mlog_catenate_ulint_compressed( -/*===========================*/ - mtr_t* mtr, /*!< in: mtr */ - ulint val) /*!< in: value to write */ -{ - byte* log_ptr; - - log_ptr = mlog_open(mtr, 10); - - /* If no logging is requested, we may return now */ - if (log_ptr == NULL) { - - return; - } - - log_ptr += mach_write_compressed(log_ptr, val); - - mlog_close(mtr, log_ptr); -} - -/********************************************************//** Catenates a compressed 64-bit integer to mlog. */ UNIV_INLINE void @@ -187,7 +163,6 @@ mlog_write_initial_log_record_low( || type == MLOG_FILE_CREATE2 || type == MLOG_FILE_RENAME2 || type == MLOG_INDEX_LOAD - || type == MLOG_TRUNCATE || type == MLOG_FILE_WRITE_CRYPT_DATA || mtr->is_named_space(space_id)); @@ -225,7 +200,7 @@ mlog_write_initial_log_record_fast( ut_ad(log_ptr); ut_d(mtr->memo_modify_page(ptr)); - page = (const byte*) ut_align_down(ptr, UNIV_PAGE_SIZE); + page = (const byte*) ut_align_down(ptr, srv_page_size); space = mach_read_from_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); offset = mach_read_from_4(page + FIL_PAGE_OFFSET); diff --git a/storage/innobase/include/mtr0mtr.h b/storage/innobase/include/mtr0mtr.h index b57a38f8eab..1ddfda05186 100644 --- a/storage/innobase/include/mtr0mtr.h +++ b/storage/innobase/include/mtr0mtr.h @@ -28,9 +28,7 @@ Created 11/26/1995 Heikki Tuuri #ifndef mtr0mtr_h #define mtr0mtr_h -#include "log0types.h" -#include "mtr0types.h" -#include "buf0types.h" +#include "fil0fil.h" #include "dyn0buf.h" /** Start a mini-transaction. */ @@ -66,13 +64,6 @@ savepoint. */ (m)->memo_release((o), (t)) #ifdef UNIV_DEBUG - -/** Check if memo contains the given item. */ -#define mtr_is_block_fix(m, o, t, table) mtr_memo_contains(m, o, t) - -/** Check if memo contains the given page. */ -#define mtr_is_page_fix(m, p, t, table) mtr_memo_contains_page(m, p, t) - /** Check if memo contains the given item. @return TRUE if contains */ #define mtr_memo_contains(m, o, t) \ @@ -94,17 +85,12 @@ savepoint. */ /** Push an object to an mtr memo stack. */ #define mtr_memo_push(m, o, t) (m)->memo_push(o, t) -/** Lock an rw-lock in s-mode. */ -#define mtr_s_lock(l, m) (m)->s_lock((l), __FILE__, __LINE__) - -/** Lock an rw-lock in x-mode. */ -#define mtr_x_lock(l, m) (m)->x_lock((l), __FILE__, __LINE__) - -/** Lock a tablespace in x-mode. */ +#define mtr_s_lock_space(s, m) (m)->s_lock_space((s), __FILE__, __LINE__) #define mtr_x_lock_space(s, m) (m)->x_lock_space((s), __FILE__, __LINE__) -/** Lock an rw-lock in sx-mode. */ -#define mtr_sx_lock(l, m) (m)->sx_lock((l), __FILE__, __LINE__) +#define mtr_s_lock_index(i, m) (m)->s_lock(&(i)->lock, __FILE__, __LINE__) +#define mtr_x_lock_index(i, m) (m)->x_lock(&(i)->lock, __FILE__, __LINE__) +#define mtr_sx_lock_index(i, m) (m)->sx_lock(&(i)->lock, __FILE__, __LINE__) #define mtr_memo_contains_flagged(m, p, l) \ (m)->memo_contains_flagged((p), (l)) @@ -126,9 +112,6 @@ savepoint. */ @return true if the mtr is dirtying a clean page. 
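mlog_write_initial_log_record_fast() above finds the page frame by aligning the record pointer down to srv_page_size and then reads the space id and page number from the frame header. For a power-of-two page size the alignment is a single mask; a small sketch assuming 16 KiB pages:

#include <cstdint>
#include <cstdio>

// Align a pointer down to the start of its page; page_size must be a power of two.
static const void* align_down(const void* ptr, uintptr_t page_size)
{
    return reinterpret_cast<const void*>(
        reinterpret_cast<uintptr_t>(ptr) & ~(page_size - 1));
}

int main()
{
    alignas(16384) static unsigned char frame[16384];    // assumed 16 KiB page frame
    const void* somewhere = frame + 5000;                 // a record inside the frame
    std::printf("%s\n", align_down(somewhere, 16384) == frame ? "ok" : "mismatch");
}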
*/ #define mtr_block_dirtied(b) mtr_t::is_block_dirtied((b)) -/** Forward declaration of a tablespace object */ -struct fil_space_t; - /** Append records to the system-wide redo log buffer. @param[in] log redo log records */ void @@ -148,13 +131,6 @@ struct mtr_memo_slot_t { struct mtr_t { mtr_t() : m_state(MTR_STATE_INIT) {} - /** Release the free extents that was reserved using - fsp_reserve_free_extents(). This is equivalent to calling - fil_space_release_free_extents(). This is intended for use - with index pages. - @param[in] n_reserved number of reserved extents */ - void release_free_extents(ulint n_reserved); - /** Start a mini-transaction. */ void start(); @@ -206,17 +182,6 @@ struct mtr_t { @return old mode */ inline mtr_log_t set_log_mode(mtr_log_t mode); - /** Note that the mini-transaction is modifying the system tablespace - (for example, for the change buffer or for undo logs) - @return the system tablespace */ - fil_space_t* set_sys_modified() - { - if (!m_sys_space) { - lookup_sys_space(); - } - return m_sys_space; - } - /** Copy the tablespaces associated with the mini-transaction (needed for generating MLOG_FILE_NAME records) @param[in] mtr mini-transaction that may modify @@ -225,27 +190,26 @@ struct mtr_t { { ut_ad(!m_user_space_id); ut_ad(!m_user_space); - ut_ad(!m_undo_space); - ut_ad(!m_sys_space); ut_d(m_user_space_id = mtr.m_user_space_id); m_user_space = mtr.m_user_space; - m_undo_space = mtr.m_undo_space; - m_sys_space = mtr.m_sys_space; } /** Set the tablespace associated with the mini-transaction (needed for generating a MLOG_FILE_NAME record) @param[in] space_id user or system tablespace ID @return the tablespace */ - fil_space_t* set_named_space(ulint space_id) + fil_space_t* set_named_space_id(ulint space_id) { ut_ad(!m_user_space_id); ut_d(m_user_space_id = space_id); if (!space_id) { - return(set_sys_modified()); + return fil_system.sys_space; } else { - lookup_user_space(space_id); + ut_ad(m_user_space_id == space_id); + ut_ad(!m_user_space); + m_user_space = fil_space_get(space_id); + ut_ad(m_user_space); return m_user_space; } } @@ -253,7 +217,14 @@ struct mtr_t { /** Set the tablespace associated with the mini-transaction (needed for generating a MLOG_FILE_NAME record) @param[in] space user or system tablespace */ - void set_named_space(fil_space_t* space); + void set_named_space(fil_space_t* space) + { + ut_ad(!m_user_space_id); + ut_d(m_user_space_id = space->id); + if (space->id) { + m_user_space = space; + } + } #ifdef UNIV_DEBUG /** Check the tablespace associated with the mini-transaction @@ -261,6 +232,11 @@ struct mtr_t { @param[in] space tablespace @return whether the mini-transaction is associated with the space */ bool is_named_space(ulint space) const; + /** Check the tablespace associated with the mini-transaction + (needed for generating a MLOG_FILE_NAME record) + @param[in] space tablespace + @return whether the mini-transaction is associated with the space */ + bool is_named_space(const fil_space_t* space) const; #endif /* UNIV_DEBUG */ /** Read 1 - 4 bytes from a file page buffered in the buffer pool. @@ -270,29 +246,7 @@ struct mtr_t { inline ulint read_ulint(const byte* ptr, mlog_id_t type) const MY_ATTRIBUTE((warn_unused_result)); - /** Locks a rw-latch in S mode. - NOTE: use mtr_s_lock(). - @param lock rw-lock - @param file file name from where called - @param line line number in file */ - inline void s_lock(rw_lock_t* lock, const char* file, unsigned line); - - /** Locks a rw-latch in X mode. - NOTE: use mtr_x_lock(). 
- @param lock rw-lock - @param file file name from where called - @param line line number in file */ - inline void x_lock(rw_lock_t* lock, const char* file, unsigned line); - - /** Locks a rw-latch in X mode. - NOTE: use mtr_sx_lock(). - @param lock rw-lock - @param file file name from where called - @param line line number in file */ - inline void sx_lock(rw_lock_t* lock, const char* file, unsigned line); - /** Acquire a tablespace X-latch. - NOTE: use mtr_x_lock_space(). @param[in] space_id tablespace ID @param[in] file file name from where called @param[in] line line number in file @@ -302,11 +256,60 @@ struct mtr_t { const char* file, unsigned line); - /** Exclusively aqcuire a tablespace latch. - @param space tablespace - @param file source code file name of the caller - @param line source code line number */ - void x_lock_space(fil_space_t *space, const char *file, unsigned line); + /** Acquire a shared rw-latch. + @param[in] lock rw-latch + @param[in] file file name from where called + @param[in] line line number in file */ + void s_lock(rw_lock_t* lock, const char* file, unsigned line) + { + rw_lock_s_lock_inline(lock, 0, file, line); + memo_push(lock, MTR_MEMO_S_LOCK); + } + + /** Acquire an exclusive rw-latch. + @param[in] lock rw-latch + @param[in] file file name from where called + @param[in] line line number in file */ + void x_lock(rw_lock_t* lock, const char* file, unsigned line) + { + rw_lock_x_lock_inline(lock, 0, file, line); + memo_push(lock, MTR_MEMO_X_LOCK); + } + + /** Acquire an shared/exclusive rw-latch. + @param[in] lock rw-latch + @param[in] file file name from where called + @param[in] line line number in file */ + void sx_lock(rw_lock_t* lock, const char* file, unsigned line) + { + rw_lock_sx_lock_inline(lock, 0, file, line); + memo_push(lock, MTR_MEMO_SX_LOCK); + } + + /** Acquire a tablespace S-latch. + @param[in] space tablespace + @param[in] file file name from where called + @param[in] line line number in file */ + void s_lock_space(fil_space_t* space, const char* file, unsigned line) + { + ut_ad(space->purpose == FIL_TYPE_TEMPORARY + || space->purpose == FIL_TYPE_IMPORT + || space->purpose == FIL_TYPE_TABLESPACE); + s_lock(&space->latch, file, line); + } + + /** Acquire a tablespace X-latch. + @param[in] space tablespace + @param[in] file file name from where called + @param[in] line line number in file */ + void x_lock_space(fil_space_t* space, const char* file, unsigned line) + { + ut_ad(space->purpose == FIL_TYPE_TEMPORARY + || space->purpose == FIL_TYPE_IMPORT + || space->purpose == FIL_TYPE_TABLESPACE); + memo_push(space, MTR_MEMO_SPACE_X_LOCK); + rw_lock_x_lock_inline(&space->latch, 0, file, line); + } /** Release an object in the memo stack. @param object object @@ -438,12 +441,6 @@ struct mtr_t { @param block buffer pool block to search for */ bool have_x_latch(const buf_block_t& block) const; private: - /** Look up the system tablespace. */ - void lookup_sys_space(); - /** Look up the user tablespace. - @param[in] space_id tablespace ID */ - void lookup_user_space(ulint space_id); - /** Prepare to write the mini-transaction log to the redo log buffer. 
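The new inline s_lock()/x_lock()/sx_lock()/x_lock_space() members above all follow one shape: acquire the latch, then push an entry onto the mini-transaction memo so that commit can release whatever was taken. A toy version of that bookkeeping with standard shared mutexes instead of InnoDB rw-locks (everything here is illustrative, not the real memo format):

#include <shared_mutex>
#include <vector>

enum memo_type_t { MEMO_S_LOCK, MEMO_X_LOCK };

struct memo_slot_sketch { std::shared_mutex* latch; memo_type_t type; };

struct mtr_sketch_t {
    std::vector<memo_slot_sketch> memo;

    void s_lock(std::shared_mutex& l) { l.lock_shared(); memo.push_back({&l, MEMO_S_LOCK}); }
    void x_lock(std::shared_mutex& l) { l.lock();        memo.push_back({&l, MEMO_X_LOCK}); }

    // On commit, release everything in reverse order of acquisition.
    void commit()
    {
        for (auto it = memo.rbegin(); it != memo.rend(); ++it) {
            if (it->type == MEMO_S_LOCK) it->latch->unlock_shared();
            else                         it->latch->unlock();
        }
        memo.clear();
    }
};

int main()
{
    std::shared_mutex index_lock, space_latch;
    mtr_sketch_t mtr;
    mtr.s_lock(index_lock);     // like mtr_s_lock_index()
    mtr.x_lock(space_latch);    // like mtr_x_lock_space()
    mtr.commit();
}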
@return number of bytes to write in finish_write() */ inline ulint prepare_write(); @@ -485,10 +482,6 @@ private: #endif /* UNIV_DEBUG */ /** User tablespace that is being modified by the mini-transaction */ fil_space_t* m_user_space; - /** Undo tablespace that is being modified by the mini-transaction */ - fil_space_t* m_undo_space; - /** System tablespace if being modified by the mini-transaction */ - fil_space_t* m_sys_space; /** State of the transaction */ mtr_state_t m_state; diff --git a/storage/innobase/include/mtr0mtr.ic b/storage/innobase/include/mtr0mtr.ic index a45d088d5d7..17b7f04a29d 100644 --- a/storage/innobase/include/mtr0mtr.ic +++ b/storage/innobase/include/mtr0mtr.ic @@ -229,39 +229,6 @@ mtr_t::set_log_mode(mtr_log_t mode) } /** -Locks a lock in s-mode. */ - -void -mtr_t::s_lock(rw_lock_t* lock, const char* file, unsigned line) -{ - rw_lock_s_lock_inline(lock, 0, file, line); - - memo_push(lock, MTR_MEMO_S_LOCK); -} - -/** -Locks a lock in x-mode. */ - -void -mtr_t::x_lock(rw_lock_t* lock, const char* file, unsigned line) -{ - rw_lock_x_lock_inline(lock, 0, file, line); - - memo_push(lock, MTR_MEMO_X_LOCK); -} - -/** -Locks a lock in sx-mode. */ - -void -mtr_t::sx_lock(rw_lock_t* lock, const char* file, unsigned line) -{ - rw_lock_sx_lock_inline(lock, 0, file, line); - - memo_push(lock, MTR_MEMO_SX_LOCK); -} - -/** Reads 1 - 4 bytes from a file page buffered in the buffer pool. @return value read */ diff --git a/storage/innobase/include/mtr0types.h b/storage/innobase/include/mtr0types.h index 2d4cd7b97ac..25406fa6aee 100644 --- a/storage/innobase/include/mtr0types.h +++ b/storage/innobase/include/mtr0types.h @@ -102,16 +102,16 @@ enum mlog_id_t { /** Create an index page */ MLOG_PAGE_CREATE = 19, - /** Insert entry in an undo log */ + /** insert an undo log record */ MLOG_UNDO_INSERT = 20, - /** erase an undo log page end */ + /** erase an undo log page end (used in MariaDB 10.2) */ MLOG_UNDO_ERASE_END = 21, /** initialize a page in an undo log */ MLOG_UNDO_INIT = 22, - /** reuse an insert undo log header */ + /** reuse an insert undo log header (used in MariaDB 10.2) */ MLOG_UNDO_HDR_REUSE = 24, /** create an undo log header */ @@ -225,8 +225,12 @@ enum mlog_id_t { redo log about individual pages */ MLOG_INDEX_LOAD = 61, + /** write DB_TRX_ID,DB_ROLL_PTR to a clustered index leaf page + of a ROW_FORMAT=COMPRESSED table */ + MLOG_ZIP_WRITE_TRX_ID = 62, + /** biggest value (used in assertions) */ - MLOG_BIGGEST_TYPE = MLOG_INDEX_LOAD, + MLOG_BIGGEST_TYPE = MLOG_ZIP_WRITE_TRX_ID, /** log record for writing/updating crypt data of a tablespace */ diff --git a/storage/innobase/include/os0event.h b/storage/innobase/include/os0event.h index 55b9d054021..52f6500ae63 100644 --- a/storage/innobase/include/os0event.h +++ b/storage/innobase/include/os0event.h @@ -42,11 +42,7 @@ Creates an event semaphore, i.e., a semaphore which may just have two states: signaled and nonsignaled. The created event is manual reset: it must be reset explicitly by calling os_event_reset(). 
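MLOG_ZIP_WRITE_TRX_ID comes in as record type 62 and MLOG_BIGGEST_TYPE is moved up to match, which is what the "biggest value (used in assertions)" comment relies on. A compile-time restatement of that invariant (the enum below only repeats the values visible in this hunk):

#include <cstdint>

enum mlog_id_sketch : uint8_t {
    MLOG_INDEX_LOAD_SKETCH       = 61,
    MLOG_ZIP_WRITE_TRX_ID_SKETCH = 62,
    MLOG_BIGGEST_TYPE_SKETCH     = MLOG_ZIP_WRITE_TRX_ID_SKETCH
};

// A parser can reject any record type above the biggest known one.
static_assert(MLOG_BIGGEST_TYPE_SKETCH == 62,
              "adding a record type must also bump the maximum");

int main() { return 0; }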
@return the event handle */ -os_event_t -os_event_create( -/*============*/ - const char* name); /*!< in: the name of the event, if NULL - the event is created without a name */ +os_event_t os_event_create(const char*); /** Sets an event semaphore to the signaled state: lets waiting threads diff --git a/storage/innobase/include/os0file.h b/storage/innobase/include/os0file.h index 23de5bd0ef1..222f2cf6b22 100644 --- a/storage/innobase/include/os0file.h +++ b/storage/innobase/include/os0file.h @@ -50,7 +50,6 @@ struct fil_node_t; struct fil_space_t; extern bool os_has_said_disk_full; -extern my_bool srv_use_trim; /** File offset in bytes */ typedef ib_uint64_t os_offset_t; @@ -69,10 +68,6 @@ the OS actually supports it: Win 95 does not, NT does. */ /** File handle */ typedef HANDLE os_file_t; -/** Convert a C file descriptor to a native file handle -@param fd file descriptor -@return native file handle */ -# define OS_FILE_FROM_FD(fd) (HANDLE) _get_osfhandle(fd) #else /* _WIN32 */ @@ -81,14 +76,9 @@ typedef DIR* os_file_dir_t; /*!< directory stream */ /** File handle */ typedef int os_file_t; -/** Convert a C file descriptor to a native file handle -@param fd file descriptor -@return native file handle */ -# define OS_FILE_FROM_FD(fd) fd - #endif /* _WIN32 */ -static const os_file_t OS_FILE_CLOSED = os_file_t(~0); +static const os_file_t OS_FILE_CLOSED = IF_WIN(os_file_t(INVALID_HANDLE_VALUE),-1); /** File descriptor with optional PERFORMANCE_SCHEMA instrumentation */ struct pfs_os_file_t @@ -251,7 +241,7 @@ public: m_fil_node(NULL), m_type(static_cast<uint16_t>(type)) { - if (!is_punch_hole_supported() || !srv_use_trim) { + if (!is_punch_hole_supported()) { clear_punch_hole(); } } @@ -270,7 +260,7 @@ public: set_punch_hole(); } - if (!is_punch_hole_supported() || !srv_use_trim) { + if (!is_punch_hole_supported()) { clear_punch_hole(); } } @@ -357,7 +347,7 @@ public: /** Set the punch hole flag */ void set_punch_hole() { - if (is_punch_hole_supported() && srv_use_trim) { + if (is_punch_hole_supported()) { m_type |= PUNCH_HOLE; } } @@ -372,8 +362,7 @@ public: @param[in] node File node */ void set_fil_node(fil_node_t* node) { - if (!srv_use_trim || - (node && !fil_node_should_punch_hole(node))) { + if (node && !fil_node_should_punch_hole(node)) { clear_punch_hole(); } @@ -537,14 +526,11 @@ struct os_file_stat_t { }; /** Create a temporary file. This function is like tmpfile(3), but -the temporary file is created in the given parameter path. If the path -is null then it will create the file in the mysql server configuration +the temporary file is created in the in the mysql server configuration parameter (--tmpdir). -@param[in] path location for creating temporary file @return temporary file handle, or NULL on error */ FILE* -os_file_create_tmpfile( - const char* path); +os_file_create_tmpfile(); /** The os_file_opendir() function opens a directory stream corresponding to the directory named by the dirname argument. The directory stream is positioned @@ -810,9 +796,7 @@ os_file_rename os_aio os_file_read os_file_read_no_error_handling -os_file_read_no_error_handling_int_fd os_file_write -os_file_write_int_fd The wrapper functions have the prefix of "innodb_". */ @@ -848,18 +832,10 @@ The wrapper functions have the prefix of "innodb_". 
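With srv_use_trim gone, IORequest enables punch hole purely from is_punch_hole_supported(). On Linux the underlying operation is normally issued with fallocate(2); the following is a self-contained Linux-only sketch of punching a hole while keeping the file size, not the code path used by this engine:

#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <fcntl.h>
#include <unistd.h>
#include <cstdio>

// Punch a hole of 'len' bytes at 'off' without changing the file size.
// A caller would fall back to writing zeroes when this reports failure.
static bool punch_hole(int fd, off_t off, off_t len)
{
#ifdef FALLOC_FL_PUNCH_HOLE
    return fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, off, len) == 0;
#else
    (void) fd; (void) off; (void) len;
    return false;   // not supported on this platform
#endif
}

int main()
{
    int fd = open("punch_demo.tmp", O_CREAT | O_RDWR | O_TRUNC, 0600);
    if (fd < 0) return 1;
    if (ftruncate(fd, 1 << 20) != 0) { close(fd); return 1; }
    std::printf("punch hole: %s\n", punch_hole(fd, 0, 1 << 16) ? "ok" : "unsupported");
    close(fd);
    unlink("punch_demo.tmp");
}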
*/ pfs_os_file_read_no_error_handling_func( \ type, file, buf, offset, n, o, __FILE__, __LINE__) -# define os_file_read_no_error_handling_int_fd(type, file, buf, offset, n) \ - pfs_os_file_read_no_error_handling_int_fd_func( \ - type, file, buf, offset, n, __FILE__, __LINE__) - # define os_file_write(type, name, file, buf, offset, n) \ pfs_os_file_write_func(type, name, file, buf, offset, \ n, __FILE__, __LINE__) -# define os_file_write_int_fd(type, name, file, buf, offset, n) \ - pfs_os_file_write_int_fd_func(type, name, file, buf, offset, \ - n, __FILE__, __LINE__) - # define os_file_flush(file) \ pfs_os_file_flush_func(file, __FILE__, __LINE__) @@ -1196,13 +1172,9 @@ to original un-instrumented file I/O APIs */ # define os_file_read_no_error_handling(type, file, buf, offset, n, o) \ os_file_read_no_error_handling_func(type, file, buf, offset, n, o) -# define os_file_read_no_error_handling_int_fd(type, file, buf, offset, n) \ - os_file_read_no_error_handling_func(type, OS_FILE_FROM_FD(file), buf, offset, n, NULL) # define os_file_write(type, name, file, buf, offset, n) \ os_file_write_func(type, name, file, buf, offset, n) -# define os_file_write_int_fd(type, name, file, buf, offset, n) \ - os_file_write_func(type, name, OS_FILE_FROM_FD(file), buf, offset, n) # define os_file_flush(file) os_file_flush_func(file) @@ -1570,7 +1542,7 @@ path. If the path is NULL then it will be created on --tmpdir location. This function is defined in ha_innodb.cc. @param[in] path location for creating temporary file @return temporary file descriptor, or < 0 on error */ -int +os_file_t innobase_mysql_tmpfile( const char* path); diff --git a/storage/innobase/include/os0file.ic b/storage/innobase/include/os0file.ic index f363bd5135a..e01fcb41afb 100644 --- a/storage/innobase/include/os0file.ic +++ b/storage/innobase/include/os0file.ic @@ -338,50 +338,6 @@ pfs_os_file_read_no_error_handling_func( return(result); } -/** NOTE! Please use the corresponding macro -os_file_read_no_error_handling_int_fd() to request -a synchronous read operation. -@param[in] type read request -@param[in] file file handle -@param[out] buf buffer where to read -@param[in] offset file offset where to read -@param[in] n number of bytes to read -@param[in] src_file caller file name -@param[in] src_line caller line number -@return error code -@retval DB_SUCCESS if the operation succeeded */ -UNIV_INLINE -dberr_t -pfs_os_file_read_no_error_handling_int_fd_func( - const IORequest& type, - int file, - void* buf, - os_offset_t offset, - ulint n, - const char* src_file, - uint src_line) -{ - PSI_file_locker_state state; - - PSI_file_locker* locker = PSI_FILE_CALL( - get_thread_file_descriptor_locker)( - &state, file, PSI_FILE_READ); - if (locker != NULL) { - PSI_FILE_CALL(start_file_wait)( - locker, n, - __FILE__, __LINE__); - } - - dberr_t err = os_file_read_no_error_handling_func( - type, OS_FILE_FROM_FD(file), buf, offset, n, NULL); - - if (locker != NULL) { - PSI_FILE_CALL(end_file_wait)(locker, n); - } - - return err; -} - /** NOTE! Please use the corresponding macro os_file_write(), not directly this function! This is the performance schema instrumented wrapper function for @@ -424,52 +380,6 @@ pfs_os_file_write_func( return(result); } -/** NOTE! Please use the corresponding macro os_file_write_int_fd(), -not directly this function! -This is the performance schema instrumented wrapper function for -os_file_write_int_fd() which requests a synchronous write operation. 
-@param[in] type write request -@param[in] name file name -@param[in] file file handle -@param[in] buf buffer to write -@param[in] offset file offset -@param[in] n number of bytes -@param[in] src_file file name where func invoked -@param[in] src_line line where the func invoked -@return error code -@retval DB_SUCCESS if the operation succeeded */ -UNIV_INLINE -dberr_t -pfs_os_file_write_int_fd_func( - const IORequest& type, - const char* name, - int file, - const void* buf, - os_offset_t offset, - ulint n, - const char* src_file, - uint src_line) -{ - PSI_file_locker_state state; - struct PSI_file_locker* locker; - - locker = PSI_FILE_CALL(get_thread_file_descriptor_locker)( - &state, file, PSI_FILE_WRITE); - if (locker != NULL) { - PSI_FILE_CALL(start_file_wait)( - locker, n, - __FILE__, __LINE__); - } - - dberr_t err = os_file_write_func( - type, name, OS_FILE_FROM_FD(file), buf, offset, n); - - if (locker != NULL) { - PSI_FILE_CALL(end_file_wait)(locker, n); - } - - return err; -} /** NOTE! Please use the corresponding macro os_file_flush(), not directly this function! diff --git a/storage/innobase/include/os0thread.h b/storage/innobase/include/os0thread.h index 0218dea97bd..d99bc841de9 100644 --- a/storage/innobase/include/os0thread.h +++ b/storage/innobase/include/os0thread.h @@ -30,12 +30,6 @@ Created 9/8/1995 Heikki Tuuri #include "univ.i" -/* Maximum number of threads which can be created in the program; -this is also the size of the wait slot array for MySQL threads which -can wait inside InnoDB */ - -#define OS_THREAD_MAX_N srv_max_n_threads - /* Possible fixed priorities for threads */ #define OS_THREAD_PRIORITY_NONE 100 #define OS_THREAD_PRIORITY_BACKGROUND 1 @@ -53,12 +47,8 @@ typedef LPTHREAD_START_ROUTINE os_thread_func_t; /** Macro for specifying a Windows thread start function. */ #define DECLARE_THREAD(func) WINAPI func -/** Required to get around a build error on Windows. Even though our functions -are defined/declared as WINAPI f(LPVOID a); the compiler complains that they -are defined as: os_thread_ret_t (__cdecl*)(void*). Because our functions -don't access the arguments and don't return any value, we should be safe. */ #define os_thread_create(f,a,i) \ - os_thread_create_func(reinterpret_cast<os_thread_func_t>(f), a, i) + os_thread_create_func(f, a, i) #else diff --git a/storage/innobase/include/page0cur.h b/storage/innobase/include/page0cur.h index 817902b6404..3cba50947a7 100644 --- a/storage/innobase/include/page0cur.h +++ b/storage/innobase/include/page0cur.h @@ -155,10 +155,7 @@ page_cur_tuple_insert( rec_offs** offsets,/*!< out: offsets on *rec */ mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */ ulint n_ext, /*!< in: number of externally stored columns */ - mtr_t* mtr, /*!< in: mini-transaction handle, or NULL */ - bool use_cache = false) - /*!< in: if true, then use record cache to - hold the tuple converted record. */ + mtr_t* mtr) /*!< in: mini-transaction handle, or NULL */ MY_ATTRIBUTE((nonnull(1,2,3,4,5), warn_unused_result)); /***********************************************************//** Inserts a record next to page cursor. Returns pointer to inserted record if diff --git a/storage/innobase/include/page0cur.ic b/storage/innobase/include/page0cur.ic index e57d3f9dee1..7896aa64792 100644 --- a/storage/innobase/include/page0cur.ic +++ b/storage/innobase/include/page0cur.ic @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1994, 2014, Oracle and/or its affiliates. 
All Rights Reserved. -Copyright (c) 2015, MariaDB Corporation. +Copyright (c) 2015, 2021, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -262,10 +262,7 @@ page_cur_tuple_insert( rec_offs** offsets,/*!< out: offsets on *rec */ mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */ ulint n_ext, /*!< in: number of externally stored columns */ - mtr_t* mtr, /*!< in: mini-transaction handle, or NULL */ - bool use_cache) - /*!< in: if true, then use record cache to - hold the tuple converted record. */ + mtr_t* mtr) /*!< in: mini-transaction handle, or NULL */ { rec_t* rec; ulint size = rec_get_converted_size(index, tuple, n_ext); @@ -281,7 +278,8 @@ page_cur_tuple_insert( index, tuple, n_ext); *offsets = rec_get_offsets(rec, index, *offsets, - page_is_leaf(cursor->block->frame), + page_is_leaf(cursor->block->frame) + ? index->n_core_fields : 0, ULINT_UNDEFINED, heap); if (buf_block_get_page_zip(cursor->block)) { diff --git a/storage/innobase/include/page0page.h b/storage/innobase/include/page0page.h index a569912b82b..78cf3e26d4d 100644 --- a/storage/innobase/include/page0page.h +++ b/storage/innobase/include/page0page.h @@ -62,9 +62,42 @@ typedef byte page_header_t; #define PAGE_FREE 6 /* pointer to start of page free record list */ #define PAGE_GARBAGE 8 /* number of bytes in deleted records */ #define PAGE_LAST_INSERT 10 /* pointer to the last inserted record, or - NULL if this info has been reset by a delete, + 0 if this info has been reset by a delete, for example */ -#define PAGE_DIRECTION 12 /* last insert direction: PAGE_LEFT, ... */ + +/** This 10-bit field is usually 0. In B-tree index pages of +ROW_FORMAT=REDUNDANT tables, this byte can contain garbage if the .ibd +file was created in MySQL 4.1.0 or if the table resides in the system +tablespace and was created before MySQL 4.1.1 or MySQL 4.0.14. +In this case, the FIL_PAGE_TYPE would be FIL_PAGE_INDEX. + +In ROW_FORMAT=COMPRESSED tables, this field is always 0, because +instant ADD COLUMN is not supported. + +In ROW_FORMAT=COMPACT and ROW_FORMAT=DYNAMIC tables, this field is +always 0, except in the root page of the clustered index after instant +ADD COLUMN. + +Instant ADD COLUMN will change FIL_PAGE_TYPE to FIL_PAGE_TYPE_INSTANT +and initialize the PAGE_INSTANT field to the original number of +fields in the clustered index (dict_index_t::n_core_fields). The most +significant bits are in the first byte, and the least significant 5 +bits are stored in the most significant 5 bits of PAGE_DIRECTION_B. + +These FIL_PAGE_TYPE_INSTANT and PAGE_INSTANT may be assigned even if +instant ADD COLUMN was not committed. Changes to these page header fields +are not undo-logged, but changes to the hidden metadata record are. +If the server is killed and restarted, the page header fields could +remain set even though no metadata record is present. + +When the table becomes empty, the PAGE_INSTANT field and the +FIL_PAGE_TYPE can be reset and any metadata record be removed. */ +#define PAGE_INSTANT 12 + +/** last insert direction: PAGE_LEFT, .... +In ROW_FORMAT=REDUNDANT tables created before MySQL 4.1.1 or MySQL 4.0.14, +this byte can be garbage. */ +#define PAGE_DIRECTION_B 13 #define PAGE_N_DIRECTION 14 /* number of consecutive inserts to the same direction */ #define PAGE_N_RECS 16 /* number of user records on the page */ @@ -124,9 +157,9 @@ Otherwise written as 0. 
@see PAGE_ROOT_AUTO_INC */ /*-----------------------------*/ /* Heap numbers */ -#define PAGE_HEAP_NO_INFIMUM 0 /* page infimum */ -#define PAGE_HEAP_NO_SUPREMUM 1 /* page supremum */ -#define PAGE_HEAP_NO_USER_LOW 2 /* first user record in +#define PAGE_HEAP_NO_INFIMUM 0U /* page infimum */ +#define PAGE_HEAP_NO_SUPREMUM 1U /* page supremum */ +#define PAGE_HEAP_NO_USER_LOW 2U /* first user record in creation (insertion) order, not necessarily collation order; this record may have been deleted */ @@ -250,6 +283,18 @@ page_rec_is_comp(const byte* rec) return(page_is_comp(page_align(rec))); } +# ifdef UNIV_DEBUG +/** Determine if the record is the metadata pseudo-record +in the clustered index. +@param[in] rec leaf page record on an index page +@return whether the record is the metadata pseudo-record */ +inline bool page_rec_is_metadata(const rec_t* rec) +{ + return rec_get_info_bits(rec, page_rec_is_comp(rec)) + & REC_INFO_MIN_REC_FLAG; +} +# endif /* UNIV_DEBUG */ + /** Determine the offset of the infimum record on the page. @param[in] page index page @return offset of the infimum record in record list, relative from page */ @@ -685,7 +730,6 @@ ulint page_rec_get_heap_no( /*=================*/ const rec_t* rec); /*!< in: the physical record */ - /** Determine whether a page has any siblings. @param[in] page page frame @return true if the page has any siblings */ @@ -924,6 +968,45 @@ page_mem_free( const dict_index_t* index, /*!< in: index of rec */ const rec_offs* offsets);/*!< in: array returned by rec_get_offsets() */ + +/** Read the PAGE_DIRECTION field from a byte. +@param[in] ptr pointer to PAGE_DIRECTION_B +@return the value of the PAGE_DIRECTION field */ +inline +byte +page_ptr_get_direction(const byte* ptr); + +/** Set the PAGE_DIRECTION field. +@param[in] ptr pointer to PAGE_DIRECTION_B +@param[in] dir the value of the PAGE_DIRECTION field */ +inline +void +page_ptr_set_direction(byte* ptr, byte dir); + +/** Read the PAGE_DIRECTION field. +@param[in] page index page +@return the value of the PAGE_DIRECTION field */ +inline +byte +page_get_direction(const page_t* page) +{ + return page_ptr_get_direction(PAGE_HEADER + PAGE_DIRECTION_B + page); +} + +/** Read the PAGE_INSTANT field. +@param[in] page index page +@return the value of the PAGE_INSTANT field */ +inline +uint16_t +page_get_instant(const page_t* page); +/** Assign the PAGE_INSTANT field. +@param[in,out] page clustered index root page +@param[in] n original number of clustered index fields +@param[in,out] mtr mini-transaction */ +inline +void +page_set_instant(page_t* page, unsigned n, mtr_t* mtr); + /**********************************************************//** Create an uncompressed B-tree index page. @return pointer to the page */ @@ -1219,15 +1302,12 @@ ibool page_simple_validate_new( /*=====================*/ const page_t* page); /*!< in: index page in ROW_FORMAT!=REDUNDANT */ -/***************************************************************//** -This function checks the consistency of an index page. -@return TRUE if ok */ -ibool -page_validate( -/*==========*/ - const page_t* page, /*!< in: index page */ - dict_index_t* index); /*!< in: data dictionary index containing - the page record type definition */ +/** Check the consistency of an index page. 
+@param[in] page index page +@param[in] index B-tree or R-tree index +@return whether the page is valid */ +bool page_validate(const page_t* page, const dict_index_t* index) + MY_ATTRIBUTE((nonnull)); /***************************************************************//** Looks in the page record list for a record with the given heap number. @return record, NULL if not found */ @@ -1253,5 +1333,4 @@ page_find_rec_max_not_deleted( #include "page0page.ic" - #endif diff --git a/storage/innobase/include/page0page.ic b/storage/innobase/include/page0page.ic index 5f69925a41a..d1bf382c1d5 100644 --- a/storage/innobase/include/page0page.ic +++ b/storage/innobase/include/page0page.ic @@ -1052,6 +1052,75 @@ page_mem_free( } } +/** Read the PAGE_DIRECTION field from a byte. +@param[in] ptr pointer to PAGE_DIRECTION_B +@return the value of the PAGE_DIRECTION field */ +inline +byte +page_ptr_get_direction(const byte* ptr) +{ + ut_ad(page_offset(ptr) == PAGE_HEADER + PAGE_DIRECTION_B); + return *ptr & ((1U << 3) - 1); +} + +/** Set the PAGE_DIRECTION field. +@param[in] ptr pointer to PAGE_DIRECTION_B +@param[in] dir the value of the PAGE_DIRECTION field */ +inline +void +page_ptr_set_direction(byte* ptr, byte dir) +{ + ut_ad(page_offset(ptr) == PAGE_HEADER + PAGE_DIRECTION_B); + ut_ad(dir >= PAGE_LEFT); + ut_ad(dir <= PAGE_NO_DIRECTION); + *ptr = (*ptr & ~((1U << 3) - 1)) | dir; +} + +/** Read the PAGE_INSTANT field. +@param[in] page index page +@return the value of the PAGE_INSTANT field */ +inline +uint16_t +page_get_instant(const page_t* page) +{ + uint16_t i = page_header_get_field(page, PAGE_INSTANT); +#ifdef UNIV_DEBUG + switch (fil_page_get_type(page)) { + case FIL_PAGE_TYPE_INSTANT: + ut_ad(page_get_direction(page) <= PAGE_NO_DIRECTION); + ut_ad(i >> 3); + break; + case FIL_PAGE_INDEX: + ut_ad(i <= PAGE_NO_DIRECTION || !page_is_comp(page)); + break; + case FIL_PAGE_RTREE: + ut_ad(i <= PAGE_NO_DIRECTION); + break; + default: + ut_ad(!"invalid page type"); + break; + } +#endif /* UNIV_DEBUG */ + return(i >> 3); +} + +/** Assign the PAGE_INSTANT field. 
+@param[in,out] page clustered index root page +@param[in] n original number of clustered index fields +@param[in,out] mtr mini-transaction */ +inline +void +page_set_instant(page_t* page, unsigned n, mtr_t* mtr) +{ + ut_ad(fil_page_get_type(page) == FIL_PAGE_TYPE_INSTANT); + ut_ad(n > 0); + ut_ad(n < REC_MAX_N_FIELDS); + uint16_t i = page_header_get_field(page, PAGE_INSTANT); + ut_ad(i <= PAGE_NO_DIRECTION); + i |= n << 3; + mlog_write_ulint(PAGE_HEADER + PAGE_INSTANT + page, i, + MLOG_2BYTES, mtr); +} #endif /* !UNIV_INNOCHECKSUM */ #ifdef UNIV_MATERIALIZE diff --git a/storage/innobase/include/page0size.h b/storage/innobase/include/page0size.h index 74fcfb106ea..ca1e704eda1 100644 --- a/storage/innobase/include/page0size.h +++ b/storage/innobase/include/page0size.h @@ -29,7 +29,7 @@ Created Nov 14, 2013 Vasil Dimov #include "fsp0types.h" -#define FIELD_REF_SIZE 20 +#define FIELD_REF_SIZE 20U /** A BLOB field reference full of zero, for use in assertions and tests.Initially, BLOB field references are set to zero, in diff --git a/storage/innobase/include/page0zip.h b/storage/innobase/include/page0zip.h index f411fd0eee9..bf6ad5c860f 100644 --- a/storage/innobase/include/page0zip.h +++ b/storage/innobase/include/page0zip.h @@ -340,19 +340,39 @@ page_zip_write_node_ptr( ulint ptr, /*!< in: node pointer */ mtr_t* mtr); /*!< in: mini-transaction, or NULL */ -/**********************************************************************//** -Write the trx_id and roll_ptr of a record on a B-tree leaf node page. */ +/** Write the DB_TRX_ID,DB_ROLL_PTR into a clustered index leaf page record. +@param[in,out] page_zip compressed page +@param[in,out] rec record +@param[in] offsets rec_get_offsets(rec, index) +@param[in] trx_id_field field number of DB_TRX_ID (number of PK fields) +@param[in] trx_id DB_TRX_ID value (transaction identifier) +@param[in] roll_ptr DB_ROLL_PTR value (undo log pointer) +@param[in,out] mtr mini-transaction, or NULL to skip logging */ void page_zip_write_trx_id_and_roll_ptr( -/*===============================*/ - page_zip_des_t* page_zip,/*!< in/out: compressed page */ - byte* rec, /*!< in/out: record */ - const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */ - ulint trx_id_col,/*!< in: column number of TRX_ID in rec */ - trx_id_t trx_id, /*!< in: transaction identifier */ - roll_ptr_t roll_ptr)/*!< in: roll_ptr */ - MY_ATTRIBUTE((nonnull)); - + page_zip_des_t* page_zip, + byte* rec, + const rec_offs* offsets, + ulint trx_id_col, + trx_id_t trx_id, + roll_ptr_t roll_ptr, + mtr_t* mtr = NULL) + MY_ATTRIBUTE((nonnull(1,2,3))); + +/** Parse a MLOG_ZIP_WRITE_TRX_ID record. +@param[in] ptr redo log buffer +@param[in] end_ptr end of redo log buffer +@param[in,out] page uncompressed page +@param[in,out] page_zip compressed page +@return end of log record +@retval NULL if the log record is incomplete */ +byte* +page_zip_parse_write_trx_id( + byte* ptr, + byte* end_ptr, + page_t* page, + page_zip_des_t* page_zip) + MY_ATTRIBUTE((nonnull(1,2), warn_unused_result)); /**********************************************************************//** Write the "deleted" flag of a record on a compressed page. The flag must already have been written on the uncompressed page. 
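page_get_instant(), page_set_instant() and page_ptr_get_direction() in the hunks above share a single 16-bit header word: the low 3 bits of the former PAGE_DIRECTION byte keep the insert direction, and the remaining 13 bits hold the original clustered index field count after instant ADD COLUMN. The packing, reduced to plain integer arithmetic (the sample values are arbitrary):

#include <cassert>
#include <cstdint>

// Pack the original number of clustered index fields (n) together with the
// last-insert direction code (dir), the way PAGE_INSTANT and PAGE_DIRECTION_B overlap.
static uint16_t pack(uint16_t n, uint8_t dir)
{
    assert(dir < 8);          // direction fits in the low 3 bits
    assert(n < (1U << 13));   // field count fits in the upper 13 bits
    return uint16_t((n << 3) | dir);
}

static uint16_t instant_fields(uint16_t field) { return field >> 3; }
static uint8_t  direction(uint16_t field)      { return field & ((1U << 3) - 1); }

int main()
{
    uint16_t field = pack(7, 5);        // 7 core fields, direction code 5
    assert(instant_fields(field) == 7);
    assert(direction(field) == 5);
    return 0;
}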
*/ diff --git a/storage/innobase/include/page0zip.ic b/storage/innobase/include/page0zip.ic index 5a3b500e2c8..4e4ccdb492f 100644 --- a/storage/innobase/include/page0zip.ic +++ b/storage/innobase/include/page0zip.ic @@ -120,7 +120,7 @@ page_zip_get_size( size = (UNIV_ZIP_SIZE_MIN >> 1) << page_zip->ssize; ut_ad(size >= UNIV_ZIP_SIZE_MIN); - ut_ad(size <= UNIV_PAGE_SIZE); + ut_ad(size <= srv_page_size); return(size); } @@ -245,9 +245,9 @@ page_zip_get_trailer_len( ut_ad(!page_zip->n_blobs); } - return((page_dir_get_n_heap(page_zip->data) - 2) - * uncompressed_size - + page_zip->n_blobs * BTR_EXTERN_FIELD_REF_SIZE); + return (ulint(page_dir_get_n_heap(page_zip->data)) - 2) + * uncompressed_size + + ulint(page_zip->n_blobs) * BTR_EXTERN_FIELD_REF_SIZE; } /**********************************************************************//** diff --git a/storage/innobase/include/pars0pars.h b/storage/innobase/include/pars0pars.h index f54c50e5b85..73a19dd24d8 100644 --- a/storage/innobase/include/pars0pars.h +++ b/storage/innobase/include/pars0pars.h @@ -508,7 +508,7 @@ pars_info_add_int4_literal( /*=======================*/ pars_info_t* info, /*!< in: info struct */ const char* name, /*!< in: name */ - lint val); /*!< in: value */ + ulint val); /*!< in: value */ /****************************************************************//** Equivalent to: diff --git a/storage/innobase/include/que0que.h b/storage/innobase/include/que0que.h index 2798a4d40fb..fa408fb10a1 100644 --- a/storage/innobase/include/que0que.h +++ b/storage/innobase/include/que0que.h @@ -319,13 +319,6 @@ enum que_thr_lock_t { QUE_THR_LOCK_TABLE }; -/** From where the cursor position is counted */ -enum que_cur_t { - QUE_CUR_NOT_DEFINED, - QUE_CUR_START, - QUE_CUR_END -}; - /* Query graph query thread node: the fields are protected by the trx_t::mutex with the exceptions named below */ @@ -402,18 +395,7 @@ struct que_fork_t{ generated by the parser, or NULL if the graph was created 'by hand' */ pars_info_t* info; /*!< info struct, or NULL */ - /* The following cur_... fields are relevant only in a select graph */ - ulint cur_end; /*!< QUE_CUR_NOT_DEFINED, QUE_CUR_START, - QUE_CUR_END */ - ulint cur_pos; /*!< if there are n rows in the result - set, values 0 and n + 1 mean before - first row, or after last row, depending - on cur_end; values 1...n mean a row - index */ - ibool cur_on_row; /*!< TRUE if cursor is on a row, i.e., - it is not before the first row or - after the last row */ sel_node_t* last_sel_node; /*!< last executed select node, or NULL if none */ UT_LIST_NODE_T(que_fork_t) diff --git a/storage/innobase/include/read0read.h b/storage/innobase/include/read0read.h deleted file mode 100644 index 359db1d8c39..00000000000 --- a/storage/innobase/include/read0read.h +++ /dev/null @@ -1,124 +0,0 @@ -/***************************************************************************** - -Copyright (c) 1997, 2013, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2019, MariaDB Corporation. - -This program is free software; you can redistribute it and/or modify it under -the terms of the GNU General Public License as published by the Free Software -Foundation; version 2 of the License. - -This program is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
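page_zip_get_size() above reconstructs the physical page size as (UNIV_ZIP_SIZE_MIN >> 1) << ssize. Assuming UNIV_ZIP_SIZE_MIN is 1024 (a value taken from typical builds, not from this diff), the ssize codes map to the familiar compressed page sizes:

#include <cstdio>

int main()
{
    const unsigned UNIV_ZIP_SIZE_MIN = 1024;   // assumed; not defined in this hunk
    for (unsigned ssize = 1; ssize <= 5; ssize++) {
        unsigned size = (UNIV_ZIP_SIZE_MIN >> 1) << ssize;
        std::printf("ssize=%u -> %u bytes\n", ssize, size);   // 1K, 2K, 4K, 8K, 16K
    }
}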
- -You should have received a copy of the GNU General Public License along with -this program; if not, write to the Free Software Foundation, Inc., -51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA - -*****************************************************************************/ - -/**************************************************//** -@file include/read0read.h -Cursor read - -Created 2/16/1997 Heikki Tuuri -*******************************************************/ - -#ifndef read0read_h -#define read0read_h - -#include "read0types.h" - -#include <algorithm> - -/** The MVCC read view manager */ -class MVCC { -public: - /** Constructor - @param size Number of views to pre-allocate */ - explicit MVCC(ulint size); - - /** Destructor. - Free all the views in the m_free list */ - ~MVCC(); - - /** - Allocate and create a view. - @param view view owned by this class created for the - caller. Must be freed by calling close() - @param trx transaction creating the view */ - void view_open(ReadView*& view, trx_t* trx); - - /** - Close a view created by view_open(). - @param view view allocated by view_open() - @param own_mutex whether the caller owns trx_sys_t::mutex */ - void view_close(ReadView*& view, bool own_mutex); - - /** - Release a view that is inactive but not closed. Caller must own - the trx_sys_t::mutex. - @param view View to release */ - void view_release(ReadView*& view); - - /** Clones the oldest view and stores it in view. No need to - call view_close(). The caller owns the view that is passed in. - It will also move the closed views from the m_views list to the - m_free list. This function is called by Purge to create it view. - @param view Preallocated view, owned by the caller */ - void clone_oldest_view(ReadView* view); - - /** - @return the number of active views */ - ulint size() const; - - /** - @return true if the view is active and valid */ - static bool is_view_active(ReadView* view) - { - ut_a(view != reinterpret_cast<ReadView*>(0x1)); - - return(view != NULL && !(intptr_t(view) & 0x1)); - } - - /** - Set the view creator transaction id. Note: This shouldbe set only - for views created by RW transactions. */ - static void set_view_creator_trx_id(ReadView* view, trx_id_t id); - -private: - - /** - Validates a read view list. */ - bool validate() const; - - /** - Find a free view from the active list, if none found then allocate - a new view. This function will also attempt to move delete marked - views from the active list to the freed list. - @return a view to use */ - inline ReadView* get_view(); - - /** - Get the oldest view in the system. It will also move the delete - marked read views from the views list to the freed list. - @return oldest view if found or NULL */ - inline ReadView* get_oldest_view() const; - -private: - // Prevent copying - MVCC(const MVCC&); - MVCC& operator=(const MVCC&); - -private: - typedef UT_LIST_BASE_NODE_T(ReadView) view_list_t; - - /** Free views ready for reuse. */ - view_list_t m_free; - - /** Active and closed views, the closed views will have the - creator trx id set to TRX_ID_MAX */ - view_list_t m_views; -}; - -#endif /* read0read_h */ diff --git a/storage/innobase/include/read0types.h b/storage/innobase/include/read0types.h index 520b0324310..c0faf84cfbe 100644 --- a/storage/innobase/include/read0types.h +++ b/storage/innobase/include/read0types.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved. 
+Copyright (c) 2018, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -30,122 +31,163 @@ Created 2/16/1997 Heikki Tuuri #include "trx0types.h" #include <algorithm> -// Friend declaration -class MVCC; -/** Read view lists the trx ids of those transactions for which a consistent -read should not see the modifications to the database. */ +/** View is not visible to purge thread. */ +#define READ_VIEW_STATE_CLOSED 0 -class ReadView { - /** This is similar to a std::vector but it is not a drop - in replacement. It is specific to ReadView. */ - class ids_t { - typedef trx_ids_t::value_type value_type; +/** View is being opened, purge thread must wait for state change. */ +#define READ_VIEW_STATE_SNAPSHOT 1 - /** - Constructor */ - ids_t() : m_ptr(), m_size(), m_reserved() { } +/** View is visible to purge thread. */ +#define READ_VIEW_STATE_OPEN 2 - /** - Destructor */ - ~ids_t() { UT_DELETE_ARRAY(m_ptr); } - /** - Try and increase the size of the array. Old elements are - copied across. It is a no-op if n is < current size. +/** + Read view lists the trx ids of those transactions for which a consistent read + should not see the modifications to the database. +*/ +class ReadView +{ + /** + View state. - @param n Make space for n elements */ - void reserve(ulint n); + It is not defined as enum as it has to be updated using atomic operations. + Possible values are READ_VIEW_STATE_CLOSED, READ_VIEW_STATE_SNAPSHOT and + READ_VIEW_STATE_OPEN. - /** - Resize the array, sets the current element count. - @param n new size of the array, in elements */ - void resize(ulint n) - { - ut_ad(n <= capacity()); + Possible state transfers... - m_size = n; - } - - /** - Reset the size to 0 */ - void clear() { resize(0); } - - /** - @return the capacity of the array in elements */ - ulint capacity() const { return(m_reserved); } - - /** - Copy and overwrite the current array contents - - @param start Source array - @param end Pointer to end of array */ - void assign(const value_type* start, const value_type* end); - - /** - Insert the value in the correct slot, preserving the order. - Doesn't check for duplicates. */ - void insert(value_type value); - - /** - @return the value of the first element in the array */ - value_type front() const - { - ut_ad(!empty()); - - return(m_ptr[0]); - } - - /** - @return the value of the last element in the array */ - value_type back() const - { - ut_ad(!empty()); - - return(m_ptr[m_size - 1]); - } - - /** - Append a value to the array. 
- @param value the value to append */ - void push_back(value_type value); - - /** - @return a pointer to the start of the array */ - trx_id_t* data() { return(m_ptr); }; - - /** - @return a const pointer to the start of the array */ - const trx_id_t* data() const { return(m_ptr); }; + Start view open: + READ_VIEW_STATE_CLOSED -> READ_VIEW_STATE_SNAPSHOT - /** - @return the number of elements in the array */ - ulint size() const { return(m_size); } + Complete view open: + READ_VIEW_STATE_SNAPSHOT -> READ_VIEW_STATE_OPEN - /** - @return true if size() == 0 */ - bool empty() const { return(size() == 0); } + Close view: + READ_VIEW_STATE_OPEN -> READ_VIEW_STATE_CLOSED + */ + int32_t m_state; - private: - // Prevent copying - ids_t(const ids_t&); - ids_t& operator=(const ids_t&); - private: - /** Memory for the array */ - value_type* m_ptr; - - /** Number of active elements in the array */ - ulint m_size; +public: + ReadView(): m_state(READ_VIEW_STATE_CLOSED), m_low_limit_id(0) {} + + + /** + Copy state from another view. + + This method is used to find min(m_low_limit_no), min(m_low_limit_id) and + all transaction ids below min(m_low_limit_id). These values effectively + form oldest view. + + @param other view to copy from + */ + void copy(const ReadView &other) + { + ut_ad(&other != this); + if (m_low_limit_no > other.m_low_limit_no) + m_low_limit_no= other.m_low_limit_no; + if (m_low_limit_id > other.m_low_limit_id) + m_low_limit_id= other.m_low_limit_id; + + trx_ids_t::iterator dst= m_ids.begin(); + for (trx_ids_t::const_iterator src= other.m_ids.begin(); + src != other.m_ids.end(); src++) + { + if (*src >= m_low_limit_id) + break; +loop: + if (dst == m_ids.end()) + { + m_ids.push_back(*src); + dst= m_ids.end(); + continue; + } + if (*dst < *src) + { + dst++; + goto loop; + } + else if (*dst > *src) + dst= m_ids.insert(dst, *src) + 1; + } + m_ids.erase(std::lower_bound(dst, m_ids.end(), m_low_limit_id), + m_ids.end()); + + m_up_limit_id= m_ids.empty() ? m_low_limit_id : m_ids.front(); + ut_ad(m_up_limit_id <= m_low_limit_id); + } + + + /** + Opens a read view where exactly the transactions serialized before this + point in time are seen in the view. + + View becomes visible to purge thread. + + @param[in,out] trx transaction + */ + void open(trx_t *trx); + + + /** + Closes the view. + + View becomes not visible to purge thread. + */ + void close() + { + ut_ad(m_state == READ_VIEW_STATE_CLOSED || + m_state == READ_VIEW_STATE_OPEN); + if (m_state == READ_VIEW_STATE_OPEN) + my_atomic_store32_explicit(&m_state, READ_VIEW_STATE_CLOSED, + MY_MEMORY_ORDER_RELAXED); + } + + + /** m_state getter for trx_sys::clone_oldest_view() trx_sys::size(). */ + int32_t get_state() const + { + return my_atomic_load32_explicit(const_cast<int32*>(&m_state), + MY_MEMORY_ORDER_ACQUIRE); + } + + + /** + Returns true if view is open. + + Only used by view owner thread, thus we can omit atomic operations. + */ + bool is_open() const + { + ut_ad(m_state == READ_VIEW_STATE_OPEN || + m_state == READ_VIEW_STATE_CLOSED); + return m_state == READ_VIEW_STATE_OPEN; + } + + + /** + Creates a snapshot where exactly the transactions serialized before this + point in time are seen in the view. + + @param[in,out] trx transaction + */ + inline void snapshot(trx_t *trx); + + + /** + Sets the creator transaction id. + + This should be set only for views created by RW transactions. 
+ */ + void set_creator_trx_id(trx_id_t id) + { + ut_ad(id > 0); + ut_ad(m_creator_trx_id == 0); + m_creator_trx_id= id; + } - /** Size of m_ptr in elements */ - ulint m_reserved; - friend class ReadView; - }; -public: - ReadView(); - ~ReadView(); /** Check whether transaction id is valid. @param[in] id transaction id to check @param[in] name table name */ @@ -162,8 +204,6 @@ public: const table_name_t& name) const MY_ATTRIBUTE((warn_unused_result)) { - ut_ad(id > 0); - if (id < m_up_limit_id || id == m_creator_trx_id) { return(true); @@ -180,9 +220,7 @@ public: return(true); } - const ids_t::value_type* p = m_ids.data(); - - return(!std::binary_search(p, p + m_ids.size(), id)); + return(!std::binary_search(m_ids.begin(), m_ids.end(), id)); } /** @@ -194,21 +232,6 @@ public: } /** - Mark the view as closed */ - void close() - { - ut_ad(m_creator_trx_id != TRX_ID_MAX); - m_creator_trx_id = TRX_ID_MAX; - } - - /** - @return true if the view is closed */ - bool is_closed() const - { - return(m_closed); - } - - /** Write the limits to the file. @param file file to write to */ void print_limits(FILE* file) const @@ -233,66 +256,6 @@ public: return(m_low_limit_id); } - /** - @return true if there are no transaction ids in the snapshot */ - bool empty() const - { - return(m_ids.empty()); - } - -#ifdef UNIV_DEBUG - /** - @param rhs view to compare with - @return truen if this view is less than or equal rhs */ - bool le(const ReadView* rhs) const - { - return(m_low_limit_no <= rhs->m_low_limit_no); - } - - trx_id_t up_limit_id() const - { - return(m_up_limit_id); - } -#endif /* UNIV_DEBUG */ -private: - /** - Copy the transaction ids from the source vector */ - inline void copy_trx_ids(const trx_ids_t& trx_ids); - - /** - Opens a read view where exactly the transactions serialized before this - point in time are seen in the view. - @param id Creator transaction id */ - inline void prepare(trx_id_t id); - - /** - Complete the read view creation */ - inline void complete(); - - /** - Copy state from another view. Must call copy_complete() to finish. - @param other view to copy from */ - inline void copy_prepare(const ReadView& other); - - /** - Complete the copy, insert the creator transaction id into the - m_trx_ids too and adjust the m_up_limit_id *, if required */ - inline void copy_complete(); - - /** - Set the creator transaction id, existing id must be 0 */ - void creator_trx_id(trx_id_t id) - { - ut_ad(m_creator_trx_id == 0); - m_creator_trx_id = id; - } - - friend class MVCC; - -private: - // Disable copying - ReadView(const ReadView&); - ReadView& operator=(const ReadView&); private: /** The read should not see any transaction with trx id >= this @@ -310,21 +273,12 @@ private: /** Set of RW transactions that was active when this snapshot was taken */ - ids_t m_ids; + trx_ids_t m_ids; /** The view does not need to see the undo logs for transactions whose transaction number is strictly smaller (<) than this value: they can be removed in purge if not needed by other views */ trx_id_t m_low_limit_no; - - /** AC-NL-RO transaction view that has been "closed". 
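changes_visible() above is the usual MVCC test: ids below m_up_limit_id (or equal to the creator) are visible, ids at or above m_low_limit_id are not, and anything in between is looked up in the sorted list of transactions that were active when the view was opened. Stripped of InnoDB types, the check looks like this (the sample ids are invented):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

typedef uint64_t trx_id_t;

// 'ids' holds the RW transactions that were active, sorted ascending, when the
// view was opened; 'low_limit' is the next id that was going to be assigned then.
bool changes_visible(trx_id_t id, trx_id_t creator, trx_id_t up_limit,
                     trx_id_t low_limit, const std::vector<trx_id_t>& ids)
{
    if (id < up_limit || id == creator) return true;  // committed before the view, or our own change
    if (id >= low_limit) return false;                // started after the view was opened
    if (ids.empty()) return true;
    return !std::binary_search(ids.begin(), ids.end(), id);  // invisible only if still active then
}

int main()
{
    std::vector<trx_id_t> active = {90, 95};
    assert(changes_visible(80, 0, 90, 100, active));    // committed before the snapshot
    assert(!changes_visible(95, 0, 90, 100, active));   // active at snapshot time
    assert(!changes_visible(120, 0, 90, 100, active));  // started afterwards
    return 0;
}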
*/ - bool m_closed; - - typedef UT_LIST_NODE_T(ReadView) node_t; - - /** List of read views in trx_sys */ - byte pad1[64 - sizeof(node_t)]; - node_t m_view_list; }; #endif diff --git a/storage/innobase/include/rem0rec.h b/storage/innobase/include/rem0rec.h index 81a09afa3d8..7d7af9e2beb 100644 --- a/storage/innobase/include/rem0rec.h +++ b/storage/innobase/include/rem0rec.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, MariaDB Corporation. +Copyright (c) 2017, 2021, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -32,6 +32,7 @@ Created 5/30/1994 Heikki Tuuri #include "rem0types.h" #include "mtr0types.h" #include "page0types.h" +#include "dict0dict.h" #include "trx0types.h" #endif /*! UNIV_INNOCHECKSUM */ #include <ostream> @@ -53,11 +54,29 @@ in addition to the data and the offsets */ in addition to the data and the offsets */ #define REC_N_NEW_EXTRA_BYTES 5 -/* Record status values */ -#define REC_STATUS_ORDINARY 0 -#define REC_STATUS_NODE_PTR 1 -#define REC_STATUS_INFIMUM 2 -#define REC_STATUS_SUPREMUM 3 +/** Record status values for ROW_FORMAT=COMPACT,DYNAMIC,COMPRESSED */ +enum rec_comp_status_t { + /** User record (PAGE_LEVEL=0, heap>=PAGE_HEAP_NO_USER_LOW) */ + REC_STATUS_ORDINARY = 0, + /** Node pointer record (PAGE_LEVEL>=0, heap>=PAGE_HEAP_NO_USER_LOW) */ + REC_STATUS_NODE_PTR = 1, + /** The page infimum pseudo-record (heap=PAGE_HEAP_NO_INFIMUM) */ + REC_STATUS_INFIMUM = 2, + /** The page supremum pseudo-record (heap=PAGE_HEAP_NO_SUPREMUM) */ + REC_STATUS_SUPREMUM = 3, + /** Clustered index record that has been inserted or updated + after instant ADD COLUMN (more than dict_index_t::n_core_fields) */ + REC_STATUS_COLUMNS_ADDED = 4 +}; + +/** The dtuple_t::info_bits of the metadata pseudo-record. +@see rec_is_metadata() */ +static const byte REC_INFO_METADATA + = REC_INFO_MIN_REC_FLAG | REC_STATUS_COLUMNS_ADDED; + +#define REC_NEW_STATUS 3 /* This is single byte bit-field */ +#define REC_NEW_STATUS_MASK 0x7UL +#define REC_NEW_STATUS_SHIFT 0 /* The following four constants are needed in page0zip.cc in order to efficiently compress and decompress pages. */ @@ -84,12 +103,15 @@ static const rec_offs REC_2BYTE_EXTERN_MASK= 0x4000; static const size_t RECORD_OFFSET= 2; static const size_t INDEX_OFFSET= RECORD_OFFSET + sizeof(rec_t *) / sizeof(rec_offs); +#endif /* UNIV_INNOCHECKSUM */ /* Length of the rec_get_offsets() header */ static const size_t REC_OFFS_HEADER_SIZE= #ifdef UNIV_DEBUG +#ifndef UNIV_INNOCHECKSUM sizeof(rec_t *) / sizeof(rec_offs) + sizeof(dict_index_t *) / sizeof(rec_offs) + +#endif /* UNIV_INNOCHECKSUM */ #endif /* UNIV_DEBUG */ 2; @@ -101,9 +123,16 @@ static const size_t REC_OFFS_SEC_INDEX_SIZE= /* PK max key parts */ 16 + /* sec idx max key parts */ 16 + /* child page number for non-leaf pages */ 1; +/** Get the base address of offsets. The extra_size is stored at +this position, and following positions hold the end offsets of +the fields. */ +#define rec_offs_base(offsets) (offsets + REC_OFFS_HEADER_SIZE) + +#ifndef UNIV_INNOCHECKSUM /* Offset consists of two parts: 2 upper bits is type and all other bits is value */ +/** Only 4 different values is possible! 
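The per-field entries of the offsets array pack a 2-bit type into the top of each value, which is why only four field types are possible. A sketch of that encoding, assuming a 16-bit rec_offs; STORED_OFFPAGE, SQL_NULL and DEFAULT match the enum declared just below, while STORED_INPAGE is a placeholder name for the zero value, and the helpers mirror combine()/get_type()/get_value():

#include <cstdint>

typedef uint16_t rec_offs;     // assumed width for this illustration

enum field_type_t {
  STORED_INPAGE  = 0 << 14,    // normal field, stored in the record
  STORED_OFFPAGE = 1 << 14,    // stored off-page
  SQL_NULL       = 2 << 14,    // SQL NULL, no data bytes
  DEFAULT        = 3 << 14     // instantly added column, value not in record
};

static const rec_offs TYPE_MASK  = rec_offs(3u << 14);
static const rec_offs VALUE_MASK = rec_offs(~(3u << 14));

inline rec_offs combine(rec_offs value, field_type_t type)
{ return rec_offs((value & VALUE_MASK) | type); }

inline field_type_t type_of(rec_offs o)  { return field_type_t(o & TYPE_MASK); }
inline rec_offs     value_of(rec_offs o) { return rec_offs(o & VALUE_MASK); }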
*/ enum field_type_t { /** normal field */ @@ -111,7 +140,9 @@ enum field_type_t /** this field is stored off-page */ STORED_OFFPAGE= 1 << 14, /** just an SQL NULL */ - SQL_NULL= 2 << 14 + SQL_NULL= 2 << 14, + /** instantly added field */ + DEFAULT= 3 << 14, }; /** without 2 upper bits */ @@ -132,11 +163,13 @@ inline rec_offs combine(rec_offs value, field_type_t type) return get_value(value) | static_cast<rec_offs>(type); } -/** Compact flag ORed to the extra size returned by rec_offs_base()[0] */ -static const rec_offs REC_OFFS_COMPACT= 1 << 15; -/** External flag in offsets returned by rec_offs_base()[0] */ -static const rec_offs REC_OFFS_EXTERNAL= 1 << 14; - +/** Compact flag ORed to the extra size returned by rec_get_offsets() */ +const rec_offs REC_OFFS_COMPACT= ~(rec_offs(~0) >> 1); +/** External flag in offsets returned by rec_get_offsets() */ +const rec_offs REC_OFFS_EXTERNAL= REC_OFFS_COMPACT >> 1; +/** Default value flag in offsets returned by rec_get_offsets() */ +const rec_offs REC_OFFS_DEFAULT= REC_OFFS_COMPACT >> 2; +const rec_offs REC_OFFS_MASK= REC_OFFS_DEFAULT - 1; /******************************************************//** The following function is used to get the pointer of the next chained record on the same page. @@ -294,25 +327,55 @@ rec_set_info_bits_new( rec_t* rec, /*!< in/out: new-style physical record */ ulint bits) /*!< in: info bits */ MY_ATTRIBUTE((nonnull)); -/******************************************************//** -The following function retrieves the status bits of a new-style record. + +/** Determine the status bits of a non-REDUNDANT record. +@param[in] rec ROW_FORMAT=COMPACT,DYNAMIC,COMPRESSED record @return status bits */ -UNIV_INLINE -ulint -rec_get_status( -/*===========*/ - const rec_t* rec) /*!< in: physical record */ - MY_ATTRIBUTE((warn_unused_result)); +inline +rec_comp_status_t +rec_get_status(const rec_t* rec) +{ + byte bits = rec[-REC_NEW_STATUS] & REC_NEW_STATUS_MASK; + ut_ad(bits <= REC_STATUS_COLUMNS_ADDED); + return static_cast<rec_comp_status_t>(bits); +} -/******************************************************//** -The following function is used to set the status bits of a new-style record. */ -UNIV_INLINE +/** Set the status bits of a non-REDUNDANT record. +@param[in,out] rec ROW_FORMAT=COMPACT,DYNAMIC,COMPRESSED record +@param[in] bits status bits */ +inline void -rec_set_status( -/*===========*/ - rec_t* rec, /*!< in/out: physical record */ - ulint bits) /*!< in: info bits */ - MY_ATTRIBUTE((nonnull)); +rec_set_status(rec_t* rec, byte bits) +{ + ut_ad(bits <= REC_STATUS_COLUMNS_ADDED); + rec[-REC_NEW_STATUS] = (rec[-REC_NEW_STATUS] & ~REC_NEW_STATUS_MASK) + | bits; +} + +/** Get the length of added field count in a REC_STATUS_COLUMNS_ADDED record. +@param[in] n_add_field number of added fields, minus one +@return storage size of the field count, in bytes */ +inline unsigned rec_get_n_add_field_len(ulint n_add_field) +{ + ut_ad(n_add_field < REC_MAX_N_FIELDS); + return n_add_field < 0x80 ? 1 : 2; +} + +/** Set the added field count in a REC_STATUS_COLUMNS_ADDED record. 
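The count of instantly added fields is stored in the record header with a tiny variable-length code, as rec_get_n_add_field_len() and rec_set_n_add_field() above show: one byte when the count fits in 7 bits, otherwise two bytes written towards lower addresses with 0x80 as the continuation flag. A round-trip sketch of that code; decode_n_add() is the inverse written for this example, not a function from the header:

#include <cassert>

typedef unsigned char byte;

static void encode_n_add(byte*& header, unsigned n_add)
{
  assert(n_add < (1u << 14));          // must fit in two 7-bit groups
  if (n_add < 0x80) {
    *header-- = byte(n_add);
  } else {
    *header-- = byte(n_add) | 0x80;    // low 7 bits + continuation flag
    *header-- = byte(n_add >> 7);      // remaining high bits
  }
}

static unsigned decode_n_add(const byte*& header)
{
  unsigned n_add = *header--;
  if (n_add & 0x80)
    n_add = (n_add & 0x7f) | (unsigned(*header--) << 7);
  return n_add;
}

Writing towards lower addresses is consistent with the rec[-REC_NEW_STATUS] style of access above: the variable-size part of the compact header sits before the record origin.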
+@param[in,out] header variable header of a REC_STATUS_COLUMNS_ADDED record +@param[in] n_add number of added fields, minus 1 +@return record header before the number of added fields */ +inline void rec_set_n_add_field(byte*& header, ulint n_add) +{ + ut_ad(n_add < REC_MAX_N_FIELDS); + + if (n_add < 0x80) { + *header-- = byte(n_add); + } else { + *header-- = byte(n_add) | 0x80; + *header-- = byte(n_add >> 7); + } +} /******************************************************//** The following function is used to retrieve the info and status @@ -369,7 +432,7 @@ rec_set_deleted_flag_new( The following function tells if a new-style record is a node pointer. @return TRUE if node pointer */ UNIV_INLINE -ibool +bool rec_get_node_ptr_flag( /*==================*/ const rec_t* rec) /*!< in: physical record */ @@ -455,7 +518,7 @@ value. @return offset of the start of the field, SQL null flag and extern storage flag ORed */ UNIV_INLINE -rec_offs +uint16_t rec_2_get_field_end_info( /*=====================*/ const rec_t* rec, /*!< in: record */ @@ -491,7 +554,7 @@ rec_get_n_extern_new( @param[in] index the index that the record belongs to @param[in,out] offsets array comprising offsets[0] allocated elements, or an array from rec_get_offsets(), or NULL -@param[in] leaf whether this is a leaf-page record +@param[in] n_core 0, or index->n_core_fields for leaf page @param[in] n_fields maximum number of offsets to compute (ULINT_UNDEFINED to compute all offsets) @param[in,out] heap memory heap @@ -501,9 +564,7 @@ rec_get_offsets_func( const rec_t* rec, const dict_index_t* index, rec_offs* offsets, -#ifdef UNIV_DEBUG - bool leaf, -#endif /* UNIV_DEBUG */ + ulint n_core, ulint n_fields, #ifdef UNIV_DEBUG const char* file, /*!< in: file name where called */ @@ -513,7 +574,7 @@ rec_get_offsets_func( #ifdef UNIV_DEBUG MY_ATTRIBUTE((nonnull(1,2,6,8),warn_unused_result)); #else /* UNIV_DEBUG */ - MY_ATTRIBUTE((nonnull(1,2,5),warn_unused_result)); + MY_ATTRIBUTE((nonnull(1,2,6),warn_unused_result)); #endif /* UNIV_DEBUG */ #ifdef UNIV_DEBUG @@ -521,7 +582,7 @@ rec_get_offsets_func( rec_get_offsets_func(rec,index,offsets,leaf,n,__FILE__,__LINE__,heap) #else /* UNIV_DEBUG */ # define rec_get_offsets(rec, index, offsets, leaf, n, heap) \ - rec_get_offsets_func(rec, index, offsets, n, heap) + rec_get_offsets_func(rec, index, offsets, leaf, n, heap) #endif /* UNIV_DEBUG */ /******************************************************//** @@ -541,32 +602,31 @@ rec_get_offsets_reverse( offsets[0] allocated elements */ MY_ATTRIBUTE((nonnull)); #ifdef UNIV_DEBUG -/************************************************************//** -Validates offsets returned by rec_get_offsets(). -@return TRUE if valid */ -UNIV_INLINE -ibool +/** Validate offsets returned by rec_get_offsets(). +@param[in] rec record, or NULL +@param[in] index the index that the record belongs in, or NULL +@param[in,out] offsets the offsets of the record +@return true */ +bool rec_offs_validate( -/*==============*/ - const rec_t* rec, /*!< in: record or NULL */ - const dict_index_t* index, /*!< in: record descriptor or NULL */ - const rec_offs* offsets)/*!< in: array returned by - rec_get_offsets() */ + const rec_t* rec, + const dict_index_t* index, + const rec_offs* offsets) MY_ATTRIBUTE((nonnull(3), warn_unused_result)); -/************************************************************//** -Updates debug data in offsets, in order to avoid bogus -rec_offs_validate() failures. */ -UNIV_INLINE +/** Update debug data in offsets, in order to tame rec_offs_validate(). 
+@param[in] rec record +@param[in] index the index that the record belongs in +@param[in] leaf whether the record resides in a leaf page +@param[in,out] offsets offsets from rec_get_offsets() to adjust */ void rec_offs_make_valid( -/*================*/ - const rec_t* rec, /*!< in: record */ - const dict_index_t* index, /*!< in: record descriptor */ - rec_offs* offsets)/*!< in: array returned by - rec_get_offsets() */ + const rec_t* rec, + const dict_index_t* index, + bool leaf, + rec_offs* offsets) MY_ATTRIBUTE((nonnull)); #else -# define rec_offs_make_valid(rec, index, offsets) ((void) 0) +# define rec_offs_make_valid(rec, index, leaf, offsets) #endif /* UNIV_DEBUG */ /************************************************************//** @@ -611,26 +671,6 @@ rec_get_nth_field_offs( #define rec_get_nth_field(rec, offsets, n, len) \ ((rec) + rec_get_nth_field_offs(offsets, n, len)) /******************************************************//** -Determine if the offsets are for a record in the new -compact format. -@return nonzero if compact format */ -UNIV_INLINE -ulint -rec_offs_comp( -/*==========*/ - const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */ - MY_ATTRIBUTE((warn_unused_result)); -/******************************************************//** -Determine if the offsets are for a record containing -externally stored columns. -@return nonzero if externally stored */ -UNIV_INLINE -ulint -rec_offs_any_extern( -/*================*/ - const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */ - MY_ATTRIBUTE((warn_unused_result)); -/******************************************************//** Determine if the offsets are for a record containing null BLOB pointers. @return first field containing a null BLOB pointer, or NULL if none found */ UNIV_INLINE @@ -640,16 +680,6 @@ rec_offs_any_null_extern( const rec_t* rec, /*!< in: record */ const rec_offs* offsets) /*!< in: rec_get_offsets(rec) */ MY_ATTRIBUTE((warn_unused_result)); -/******************************************************//** -Returns nonzero if the extern bit is set in nth field of rec. -@return nonzero if externally stored */ -UNIV_INLINE -ulint -rec_offs_nth_extern( -/*================*/ - const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */ - ulint n) /*!< in: nth field */ - MY_ATTRIBUTE((warn_unused_result)); /** Mark the nth field as externally stored. @param[in] offsets array returned by rec_get_offsets() @@ -658,16 +688,158 @@ void rec_offs_make_nth_extern( rec_offs* offsets, const ulint n); -/******************************************************//** -Returns nonzero if the SQL NULL bit is set in nth field of rec. -@return nonzero if SQL NULL */ -UNIV_INLINE + +MY_ATTRIBUTE((nonnull)) +/** Determine the number of allocated elements for an array of offsets. +@param[in] offsets offsets after rec_offs_set_n_alloc() +@return number of elements */ +inline ulint rec_offs_get_n_alloc(const rec_offs *offsets) +{ + ut_ad(offsets); + ulint n_alloc= offsets[0]; + ut_ad(n_alloc > REC_OFFS_HEADER_SIZE); + MEM_CHECK_ADDRESSABLE(offsets, n_alloc * sizeof *offsets); + return n_alloc; +} + +/** Determine the number of fields for which offsets have been initialized. 
+@param[in] offsets rec_get_offsets() +@return number of fields */ +inline ulint -rec_offs_nth_sql_null( -/*==================*/ - const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */ - ulint n) /*!< in: nth field */ - MY_ATTRIBUTE((warn_unused_result)); +rec_offs_n_fields(const rec_offs* offsets) +{ + ulint n_fields; + ut_ad(offsets); + n_fields = offsets[1]; + ut_ad(n_fields > 0); + ut_ad(n_fields <= REC_MAX_N_FIELDS); + ut_ad(n_fields + REC_OFFS_HEADER_SIZE + <= rec_offs_get_n_alloc(offsets)); + return(n_fields); +} + +/** Get a flag of a record field. +@param[in] offsets rec_get_offsets() +@param[in] n nth field +@param[in] flag flag to extract +@return type of the record field */ +inline field_type_t rec_offs_nth_type(const rec_offs *offsets, ulint n) +{ + ut_ad(rec_offs_validate(NULL, NULL, offsets)); + ut_ad(n < rec_offs_n_fields(offsets)); + return get_type(rec_offs_base(offsets)[1 + n]); +} + +/** Determine if a record field is missing +(should be replaced by dict_index_t::instant_field_value()). +@param[in] offsets rec_get_offsets() +@param[in] n nth field +@return nonzero if default bit is set */ +inline ulint rec_offs_nth_default(const rec_offs *offsets, ulint n) +{ + return rec_offs_nth_type(offsets, n) == DEFAULT; +} + +/** Determine if a record field is SQL NULL +(should be replaced by dict_index_t::instant_field_value()). +@param[in] offsets rec_get_offsets() +@param[in] n nth field +@return nonzero if SQL NULL set */ +inline ulint rec_offs_nth_sql_null(const rec_offs *offsets, ulint n) +{ + return rec_offs_nth_type(offsets, n) == SQL_NULL; +} + +/** Determine if a record field is stored off-page. +@param[in] offsets rec_get_offsets() +@param[in] n nth field +Returns nonzero if the extern bit is set in nth field of rec. +@return nonzero if externally stored */ +inline ulint rec_offs_nth_extern(const rec_offs *offsets, ulint n) +{ + return rec_offs_nth_type(offsets, n) == STORED_OFFPAGE; +} + +/** Get a global flag of a record. +@param[in] offsets rec_get_offsets() +@param[in] flag flag to extract +@return the flag of the record field */ +inline ulint rec_offs_any_flag(const rec_offs *offsets, ulint flag) +{ + ut_ad(rec_offs_validate(NULL, NULL, offsets)); + return *rec_offs_base(offsets) & flag; +} + +/** Determine if the offsets are for a record containing off-page columns. +@param[in] offsets rec_get_offsets() +@return nonzero if any off-page columns exist */ +inline bool rec_offs_any_extern(const rec_offs *offsets) +{ + return rec_offs_any_flag(offsets, REC_OFFS_EXTERNAL); +} + +/** Determine if the offsets are for a record that is missing fields. +@param[in] offsets rec_get_offsets() +@return nonzero if any fields need to be replaced with + dict_index_t::instant_field_value() */ +inline ulint rec_offs_any_default(const rec_offs *offsets) +{ + return rec_offs_any_flag(offsets, REC_OFFS_DEFAULT); +} + +/** Determine if the offsets are for other than ROW_FORMAT=REDUNDANT. +@param[in] offsets rec_get_offsets() +@return nonzero if ROW_FORMAT is COMPACT,DYNAMIC or COMPRESSED +@retval 0 if ROW_FORMAT=REDUNDANT */ +inline ulint rec_offs_comp(const rec_offs *offsets) +{ + ut_ad(rec_offs_validate(NULL, NULL, offsets)); + return (*rec_offs_base(offsets) & REC_OFFS_COMPACT); +} + +/** Determine if the record is the metadata pseudo-record +in the clustered index. 
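Taken together, the accessors above define the layout of the array that rec_get_offsets() returns. A commented sketch of that layout; the two debug-only pointer slots are left out, so the header is taken as two elements, and a 16-bit rec_offs is assumed as before:

#include <cstdint>

typedef uint16_t rec_offs;

//   offsets[0]     number of allocated elements (rec_offs_get_n_alloc)
//   offsets[1]     number of initialized fields  (rec_offs_n_fields)
//   base[0]        extra (header) size of the record, ORed with the
//                  REC_OFFS_COMPACT / REC_OFFS_EXTERNAL / REC_OFFS_DEFAULT
//                  flags in the top three bits
//   base[1 + n]    end offset of field n within the data part, with the
//                  field's type (off-page, SQL NULL, DEFAULT) in the
//                  top two bits
// where base = offsets + REC_OFFS_HEADER_SIZE.

inline const rec_offs* offs_base(const rec_offs* offsets)
{ return offsets + 2; /* REC_OFFS_HEADER_SIZE without the debug slots */ }

// Start and end of field n in the data part, ignoring the type bits.
inline unsigned nth_field_start(const rec_offs* offsets, unsigned n)
{ return n ? offs_base(offsets)[n] & 0x3fff : 0; }

inline unsigned nth_field_end(const rec_offs* offsets, unsigned n)
{ return offs_base(offsets)[1 + n] & 0x3fff; }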
+@param[in] rec leaf page record +@param[in] index index of the record +@return whether the record is the metadata pseudo-record */ +inline bool rec_is_metadata(const rec_t* rec, const dict_index_t* index) +{ + bool is = rec_get_info_bits(rec, dict_table_is_comp(index->table)) + & REC_INFO_MIN_REC_FLAG; + ut_ad(!is || index->is_instant()); + ut_ad(!is || !dict_table_is_comp(index->table) + || rec_get_status(rec) == REC_STATUS_COLUMNS_ADDED); + return is; +} + +/** Get the nth field from an index. +@param[in] rec index record +@param[in] index index +@param[in] offsets rec_get_offsets(rec, index) +@param[in] n field number +@param[out] len length of the field in bytes, or UNIV_SQL_NULL +@return a read-only copy of the index field */ +inline +const byte* +rec_get_nth_cfield( + const rec_t* rec, + const dict_index_t* index, + const rec_offs* offsets, + ulint n, + ulint* len) +{ + /* Because this function may be invoked by innobase_rec_to_mysql() + for reporting a duplicate key during ALTER TABLE or + CREATE UNIQUE INDEX, and in that case the rec omit the fixed-size + header of 5 or 6 bytes, the check + rec_offs_validate(rec, index, offsets) must be avoided here. */ + if (!rec_offs_nth_default(offsets, n)) { + return rec_get_nth_field(rec, offsets, n, len); + } + return index->instant_field_value(n, len); +} + /******************************************************//** Gets the physical size of a field. @return length of field */ @@ -721,16 +893,6 @@ rec_get_data_size_old( const rec_t* rec) /*!< in: physical record */ MY_ATTRIBUTE((warn_unused_result)); /**********************************************************//** -The following function returns the number of allocated elements -for an array of offsets. -@return number of elements */ -UNIV_INLINE -ulint -rec_offs_get_n_alloc( -/*=================*/ - const rec_offs* offsets)/*!< in: array for rec_get_offsets() */ - MY_ATTRIBUTE((warn_unused_result)); -/**********************************************************//** The following function sets the number of allocated elements for an array of offsets. */ UNIV_INLINE @@ -744,15 +906,6 @@ rec_offs_set_n_alloc( #define rec_offs_init(offsets) \ rec_offs_set_n_alloc(offsets, (sizeof offsets) / sizeof *offsets) /**********************************************************//** -The following function returns the number of fields in a record. -@return number of fields */ -UNIV_INLINE -ulint -rec_offs_n_fields( -/*==============*/ - const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */ - MY_ATTRIBUTE((warn_unused_result)); -/**********************************************************//** The following function returns the data size of a physical record, that is the sum of field lengths. SQL null fields are counted as length 0 fields. The value returned by the function @@ -827,37 +980,62 @@ rec_copy( @param[in] fields data fields @param[in] n_fields number of data fields @param[out] extra record header size +@param[in] status REC_STATUS_ORDINARY or REC_STATUS_COLUMNS_ADDED @return total size, in bytes */ +template<bool redundant_temp> ulint rec_get_converted_size_temp( const dict_index_t* index, const dfield_t* fields, ulint n_fields, - ulint* extra) - MY_ATTRIBUTE((warn_unused_result, nonnull(1,2))); + ulint* extra, + rec_comp_status_t status = REC_STATUS_ORDINARY) + MY_ATTRIBUTE((warn_unused_result, nonnull)); -/******************************************************//** -Determine the offset to each field in temporary file. 
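rec_get_nth_cfield() above is what makes instant ADD COLUMN transparent to readers: a field flagged DEFAULT is not present in the physical record, so its value is fetched from the index instead. A usage sketch that walks all fields of a clustered index record this way; it presupposes the declarations from this header (rec_offs_n_fields(), rec_get_nth_cfield(), UNIV_SQL_NULL) and leaves the copy destination abstract:

void copy_all_fields(const rec_t* rec, const dict_index_t* index,
                     const rec_offs* offsets)
{
  for (ulint i = 0; i < rec_offs_n_fields(offsets); i++) {
    ulint       len;
    const byte* data = rec_get_nth_cfield(rec, index, offsets, i, &len);

    if (len == UNIV_SQL_NULL) {
      continue;             // SQL NULL (possibly a NULL default)
    }

    // ... copy len bytes from data into the destination row ...
  }
}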
-@see rec_convert_dtuple_to_temp() */ +/** Determine the offset to each field in temporary file. +@param[in] rec temporary file record +@param[in] index index of that the record belongs to +@param[in,out] offsets offsets to the fields; in: rec_offs_n_fields(offsets) +@param[in] n_core number of core fields (index->n_core_fields) +@param[in] def_val default values for non-core fields +@param[in] status REC_STATUS_ORDINARY or REC_STATUS_COLUMNS_ADDED */ void rec_init_offsets_temp( -/*==================*/ - const rec_t* rec, /*!< in: temporary file record */ - const dict_index_t* index, /*!< in: record descriptor */ - rec_offs* offsets)/*!< in/out: array of offsets; - in: n=rec_offs_n_fields(offsets) */ + const rec_t* rec, + const dict_index_t* index, + rec_offs* offsets, + ulint n_core, + const dict_col_t::def_t*def_val, + rec_comp_status_t status = REC_STATUS_ORDINARY) + MY_ATTRIBUTE((nonnull(1,2,3))); +/** Determine the offset to each field in temporary file. +@param[in] rec temporary file record +@param[in] index index of that the record belongs to +@param[in,out] offsets offsets to the fields; in: rec_offs_n_fields(offsets) +*/ +void +rec_init_offsets_temp( + const rec_t* rec, + const dict_index_t* index, + rec_offs* offsets) MY_ATTRIBUTE((nonnull)); -/*********************************************************//** -Builds a temporary file record out of a data tuple. -@see rec_init_offsets_temp() */ +/** Convert a data tuple prefix to the temporary file format. +@param[out] rec record in temporary file format +@param[in] index clustered or secondary index +@param[in] fields data fields +@param[in] n_fields number of data fields +@param[in] status REC_STATUS_ORDINARY or REC_STATUS_COLUMNS_ADDED +*/ +template<bool redundant_temp> void rec_convert_dtuple_to_temp( -/*=======================*/ - rec_t* rec, /*!< out: record */ - const dict_index_t* index, /*!< in: record descriptor */ - const dfield_t* fields, /*!< in: array of data fields */ - ulint n_fields); /*!< in: number of fields */ + rec_t* rec, + const dict_index_t* index, + const dfield_t* fields, + ulint n_fields, + rec_comp_status_t status = REC_STATUS_ORDINARY) + MY_ATTRIBUTE((nonnull)); /**************************************************************//** Copies the first n fields of a physical record to a new physical record in @@ -875,22 +1053,6 @@ rec_copy_prefix_to_buf( or NULL */ ulint* buf_size) /*!< in/out: buffer size */ MY_ATTRIBUTE((nonnull)); -/** Fold a prefix of a physical record. -@param[in] rec index record -@param[in] offsets return value of rec_get_offsets() -@param[in] n_fields number of complete fields to fold -@param[in] n_bytes number of bytes to fold in the last field -@param[in] index_id index tree ID -@return the folded value */ -UNIV_INLINE -ulint -rec_fold( - const rec_t* rec, - const rec_offs* offsets, - ulint n_fields, - ulint n_bytes, - index_id_t tree_id) - MY_ATTRIBUTE((warn_unused_result)); /*********************************************************//** Builds a physical record out of a data tuple and stores it into the given buffer. @@ -938,7 +1100,7 @@ rec_get_converted_size_comp( dict_table_is_comp() is assumed to hold, even if it does not */ - ulint status, /*!< in: status bits of the record */ + rec_comp_status_t status, /*!< in: status bits of the record */ const dfield_t* fields, /*!< in: array of data fields */ ulint n_fields,/*!< in: number of data fields */ ulint* extra) /*!< out: extra size */ @@ -959,27 +1121,20 @@ rec_get_converted_size( The fields are copied into the memory heap. 
@param[out] tuple data tuple @param[in] rec index record, or a copy thereof -@param[in] is_leaf whether rec is a leaf page record +@param[in] index index of rec +@param[in] n_core index->n_core_fields at the time rec was + copied, or 0 if non-leaf page record @param[in] n_fields number of fields to copy @param[in,out] heap memory heap */ void -rec_copy_prefix_to_dtuple_func( +rec_copy_prefix_to_dtuple( dtuple_t* tuple, const rec_t* rec, const dict_index_t* index, -#ifdef UNIV_DEBUG - bool is_leaf, -#endif /* UNIV_DEBUG */ + ulint n_core, ulint n_fields, mem_heap_t* heap) MY_ATTRIBUTE((nonnull)); -#ifdef UNIV_DEBUG -# define rec_copy_prefix_to_dtuple(tuple,rec,index,leaf,n_fields,heap) \ - rec_copy_prefix_to_dtuple_func(tuple,rec,index,leaf,n_fields,heap) -#else /* UNIV_DEBUG */ -# define rec_copy_prefix_to_dtuple(tuple,rec,index,leaf,n_fields,heap) \ - rec_copy_prefix_to_dtuple_func(tuple,rec,index,n_fields,heap) -#endif /* UNIV_DEBUG */ /***************************************************************//** Validates the consistency of a physical record. @return TRUE if ok */ diff --git a/storage/innobase/include/rem0rec.ic b/storage/innobase/include/rem0rec.ic index f65bca8181d..48898b1f916 100644 --- a/storage/innobase/include/rem0rec.ic +++ b/storage/innobase/include/rem0rec.ic @@ -26,7 +26,6 @@ Created 5/30/1994 Heikki Tuuri #include "mach0data.h" #include "ut0byte.h" -#include "dict0dict.h" #include "dict0boot.h" #include "btr0types.h" @@ -62,12 +61,13 @@ most significant bytes and bits are written below less significant. we can calculate the offset of the next record with the formula: relative_offset + offset_of_this_record - mod UNIV_PAGE_SIZE + mod srv_page_size 3 3 bits status: - 000=conventional record - 001=node pointer record (inside B-tree) - 010=infimum record - 011=supremum record + 000=REC_STATUS_ORDINARY + 001=REC_STATUS_NODE_PTR + 010=REC_STATUS_INFIMUM + 011=REC_STATUS_SUPREMUM + 100=REC_STATUS_COLUMNS_ADDED 1xx=reserved 5 bits heap number 4 8 bits heap number @@ -90,10 +90,6 @@ and the shift needed to obtain each bit-field of the record. */ #define REC_OLD_N_FIELDS_MASK 0x7FEUL #define REC_OLD_N_FIELDS_SHIFT 1 -#define REC_NEW_STATUS 3 /* This is single byte bit-field */ -#define REC_NEW_STATUS_MASK 0x7UL -#define REC_NEW_STATUS_SHIFT 0 - #define REC_OLD_HEAP_NO 5 #define REC_HEAP_NO_MASK 0xFFF8UL #if 0 /* defined in rem0rec.h for use of page0zip.cc */ @@ -239,8 +235,8 @@ rec_get_next_ptr_const( { ulint field_value; - ut_ad(REC_NEXT_MASK == 0xFFFFUL); - ut_ad(REC_NEXT_SHIFT == 0); + compile_time_assert(REC_NEXT_MASK == 0xFFFFUL); + compile_time_assert(REC_NEXT_SHIFT == 0); field_value = mach_read_from_2(rec - REC_NEXT); @@ -258,13 +254,13 @@ rec_get_next_ptr_const( as signed 16-bit integer in 2's complement arithmetics. If all platforms defined int16_t in the standard headers, the expression could be written simpler as - (int16_t) field_value + ut_align_offset(...) < UNIV_PAGE_SIZE + (int16_t) field_value + ut_align_offset(...) < srv_page_size */ ut_ad((field_value >= 32768 ? field_value - 65536 : field_value) - + ut_align_offset(rec, UNIV_PAGE_SIZE) - < UNIV_PAGE_SIZE); + + ut_align_offset(rec, srv_page_size) + < srv_page_size); #endif /* There must be at least REC_N_NEW_EXTRA_BYTES + 1 between each record. 
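In ROW_FORMAT=COMPACT the two bytes before the record origin hold the next-record link as an offset relative to this record, modulo the page size, which is what the wrap-around arithmetic above computes. A standalone sketch of that computation, with plain pointer math standing in for ut_align_down() and ut_align_offset():

#include <cstddef>
#include <cstdint>

static const unsigned char* next_record(const unsigned char* rec,
                                        uint16_t relative_offset,
                                        size_t page_size /* power of two */)
{
  uintptr_t addr        = reinterpret_cast<uintptr_t>(rec);
  uintptr_t page_start  = addr & ~uintptr_t(page_size - 1);
  // (rec + offset) mod page_size: the successor's byte offset in the page.
  size_t    next_offset = size_t(addr + relative_offset) & (page_size - 1);
  return reinterpret_cast<const unsigned char*>(page_start) + next_offset;
}

Because the addition is done modulo the page size, a link that is really a small negative distance (stored as an unsigned 16-bit value) still lands on the right record, which is the point of the signed/unsigned remark in the comment above.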
*/ @@ -272,12 +268,12 @@ rec_get_next_ptr_const( && field_value < 32768) || field_value < (uint16) -REC_N_NEW_EXTRA_BYTES); - return((byte*) ut_align_down(rec, UNIV_PAGE_SIZE) - + ut_align_offset(rec + field_value, UNIV_PAGE_SIZE)); + return((byte*) ut_align_down(rec, srv_page_size) + + ut_align_offset(rec + field_value, srv_page_size)); } else { - ut_ad(field_value < UNIV_PAGE_SIZE); + ut_ad(field_value < srv_page_size); - return((byte*) ut_align_down(rec, UNIV_PAGE_SIZE) + return((byte*) ut_align_down(rec, srv_page_size) + field_value); } } @@ -308,12 +304,8 @@ rec_get_next_offs( ulint comp) /*!< in: nonzero=compact page format */ { ulint field_value; -#if REC_NEXT_MASK != 0xFFFFUL -# error "REC_NEXT_MASK != 0xFFFFUL" -#endif -#if REC_NEXT_SHIFT -# error "REC_NEXT_SHIFT != 0" -#endif + compile_time_assert(REC_NEXT_MASK == 0xFFFFUL); + compile_time_assert(REC_NEXT_SHIFT == 0); field_value = mach_read_from_2(rec - REC_NEXT); @@ -326,13 +318,13 @@ rec_get_next_offs( as signed 16-bit integer in 2's complement arithmetics. If all platforms defined int16_t in the standard headers, the expression could be written simpler as - (int16_t) field_value + ut_align_offset(...) < UNIV_PAGE_SIZE + (int16_t) field_value + ut_align_offset(...) < srv_page_size */ ut_ad((field_value >= 32768 ? field_value - 65536 : field_value) - + ut_align_offset(rec, UNIV_PAGE_SIZE) - < UNIV_PAGE_SIZE); + + ut_align_offset(rec, srv_page_size) + < srv_page_size); #endif if (field_value == 0) { @@ -345,9 +337,9 @@ rec_get_next_offs( && field_value < 32768) || field_value < (uint16) -REC_N_NEW_EXTRA_BYTES); - return(ut_align_offset(rec + field_value, UNIV_PAGE_SIZE)); + return(ut_align_offset(rec + field_value, srv_page_size)); } else { - ut_ad(field_value < UNIV_PAGE_SIZE); + ut_ad(field_value < srv_page_size); return(field_value); } @@ -392,7 +384,7 @@ rec_set_next_offs_new( field_value = (ulint) ((lint) next - - (lint) ut_align_offset(rec, UNIV_PAGE_SIZE)); + - (lint) ut_align_offset(rec, srv_page_size)); field_value &= REC_NEXT_MASK; } @@ -441,26 +433,6 @@ rec_set_n_fields_old( } /******************************************************//** -The following function retrieves the status bits of a new-style record. -@return status bits */ -UNIV_INLINE -ulint -rec_get_status( -/*===========*/ - const rec_t* rec) /*!< in: physical record */ -{ - ulint ret; - - ut_ad(rec); - - ret = rec_get_bit_field_1(rec, REC_NEW_STATUS, - REC_NEW_STATUS_MASK, REC_NEW_STATUS_SHIFT); - ut_ad((ret & ~REC_NEW_STATUS_MASK) == 0); - - return(ret); -} - -/******************************************************//** The following function is used to get the number of fields in a record. 
@return number of data fields */ @@ -479,6 +451,7 @@ rec_get_n_fields( } switch (rec_get_status(rec)) { + case REC_STATUS_COLUMNS_ADDED: case REC_STATUS_ORDINARY: return(dict_index_get_n_fields(index)); case REC_STATUS_NODE_PTR: @@ -486,10 +459,10 @@ rec_get_n_fields( case REC_STATUS_INFIMUM: case REC_STATUS_SUPREMUM: return(1); - default: - ut_error; - return(ULINT_UNDEFINED); } + + ut_error; + return(ULINT_UNDEFINED); } /** Confirms the n_fields of the entry is sane with comparing the other @@ -505,13 +478,15 @@ rec_n_fields_is_sane( const rec_t* rec, const dtuple_t* entry) { - return(rec_get_n_fields(rec, index) - == dtuple_get_n_fields(entry) + const ulint n_fields = rec_get_n_fields(rec, index); + + return(n_fields == dtuple_get_n_fields(entry) + || (index->is_instant() + && n_fields >= index->n_core_fields) /* a record for older SYS_INDEXES table (missing merge_threshold column) is acceptable. */ || (index->table->id == DICT_INDEXES_ID - && rec_get_n_fields(rec, index) - == dtuple_get_n_fields(entry) - 1)); + && n_fields == dtuple_get_n_fields(entry) - 1)); } /******************************************************//** @@ -630,19 +605,6 @@ rec_set_info_bits_new( } /******************************************************//** -The following function is used to set the status bits of a new-style record. */ -UNIV_INLINE -void -rec_set_status( -/*===========*/ - rec_t* rec, /*!< in/out: physical record */ - ulint bits) /*!< in: info bits */ -{ - rec_set_bit_field_1(rec, bits, REC_NEW_STATUS, - REC_NEW_STATUS_MASK, REC_NEW_STATUS_SHIFT); -} - -/******************************************************//** The following function is used to retrieve the info and status bits of a record. (Only compact records have status bits.) @return info bits */ @@ -654,12 +616,11 @@ rec_get_info_and_status_bits( ulint comp) /*!< in: nonzero=compact page format */ { ulint bits; -#if (REC_NEW_STATUS_MASK >> REC_NEW_STATUS_SHIFT) \ -& (REC_INFO_BITS_MASK >> REC_INFO_BITS_SHIFT) -# error "REC_NEW_STATUS_MASK and REC_INFO_BITS_MASK overlap" -#endif + compile_time_assert(!((REC_NEW_STATUS_MASK >> REC_NEW_STATUS_SHIFT) + & (REC_INFO_BITS_MASK >> REC_INFO_BITS_SHIFT))); if (comp) { - bits = rec_get_info_bits(rec, TRUE) | rec_get_status(rec); + bits = rec_get_info_bits(rec, TRUE) + | ulint(rec_get_status(rec)); } else { bits = rec_get_info_bits(rec, FALSE); ut_ad(!(bits & ~(REC_INFO_BITS_MASK >> REC_INFO_BITS_SHIFT))); @@ -676,10 +637,8 @@ rec_set_info_and_status_bits( rec_t* rec, /*!< in/out: physical record */ ulint bits) /*!< in: info bits */ { -#if (REC_NEW_STATUS_MASK >> REC_NEW_STATUS_SHIFT) \ -& (REC_INFO_BITS_MASK >> REC_INFO_BITS_SHIFT) -# error "REC_NEW_STATUS_MASK and REC_INFO_BITS_MASK overlap" -#endif + compile_time_assert(!((REC_NEW_STATUS_MASK >> REC_NEW_STATUS_SHIFT) + & (REC_INFO_BITS_MASK >> REC_INFO_BITS_SHIFT))); rec_set_status(rec, bits & REC_NEW_STATUS_MASK); rec_set_info_bits_new(rec, bits & ~REC_NEW_STATUS_MASK); } @@ -758,7 +717,7 @@ rec_set_deleted_flag_new( The following function tells if a new-style record is a node pointer. 
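rec_get_info_and_status_bits() and rec_set_info_and_status_bits() above can fold the two bit-fields into one value only because the shifted masks never overlap, which the compile_time_assert now enforces; the same property is what allows REC_INFO_METADATA to be defined as REC_INFO_MIN_REC_FLAG | REC_STATUS_COLUMNS_ADDED. A sketch of that fold; the 0x07 status mask matches REC_NEW_STATUS_MASK, while the info-bit values are assumptions of this example:

#include <cassert>

enum {
  STATUS_MASK       = 0x07,  // REC_NEW_STATUS_MASK >> REC_NEW_STATUS_SHIFT
  INFO_MIN_REC_FLAG = 0x10,  // assumed value of REC_INFO_MIN_REC_FLAG
  INFO_DELETED_FLAG = 0x20   // assumed value of REC_INFO_DELETED_FLAG
};

inline unsigned combine_bits(unsigned info_bits, unsigned status)
{
  assert(!(info_bits & STATUS_MASK));   // the two fields must not overlap
  return info_bits | status;
}

inline unsigned status_of(unsigned combined)
{ return combined & STATUS_MASK; }

inline unsigned info_bits_of(unsigned combined)
{ return combined & ~STATUS_MASK; }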
@return TRUE if node pointer */ UNIV_INLINE -ibool +bool rec_get_node_ptr_flag( /*==================*/ const rec_t* rec) /*!< in: physical record */ @@ -832,10 +791,6 @@ rec_get_1byte_offs_flag( /*====================*/ const rec_t* rec) /*!< in: physical record */ { -#if TRUE != 1 -#error "TRUE != 1" -#endif - return(rec_get_bit_field_1(rec, REC_OLD_SHORT, REC_OLD_SHORT_MASK, REC_OLD_SHORT_SHIFT)); } @@ -849,10 +804,7 @@ rec_set_1byte_offs_flag( rec_t* rec, /*!< in: physical record */ ibool flag) /*!< in: TRUE if 1byte form */ { -#if TRUE != 1 -#error "TRUE != 1" -#endif - ut_ad(flag <= TRUE); + ut_ad(flag <= 1); rec_set_bit_field_1(rec, flag, REC_OLD_SHORT, REC_OLD_SHORT_MASK, REC_OLD_SHORT_SHIFT); @@ -883,7 +835,7 @@ value. @return offset of the start of the field, SQL null flag and extern storage flag ORed */ UNIV_INLINE -rec_offs +uint16_t rec_2_get_field_end_info( /*=====================*/ const rec_t* rec, /*!< in: record */ @@ -909,29 +861,6 @@ rec_2_is_field_extern( return(rec_2_get_field_end_info(rec, n) & REC_2BYTE_EXTERN_MASK); } -/* Get the base address of offsets. The extra_size is stored at -this position, and following positions hold the end offsets of -the fields. */ -#define rec_offs_base(offsets) (offsets + REC_OFFS_HEADER_SIZE) - -/**********************************************************//** -The following function returns the number of allocated elements -for an array of offsets. -@return number of elements */ -UNIV_INLINE -ulint -rec_offs_get_n_alloc( -/*=================*/ - const rec_offs* offsets)/*!< in: array for rec_get_offsets() */ -{ - ulint n_alloc; - ut_ad(offsets); - n_alloc = offsets[0]; - ut_ad(n_alloc > REC_OFFS_HEADER_SIZE); - MEM_CHECK_ADDRESSABLE(offsets, n_alloc * sizeof *offsets); - return(n_alloc); -} - /**********************************************************//** The following function sets the number of allocated elements for an array of offsets. */ @@ -948,102 +877,6 @@ rec_offs_set_n_alloc( offsets[0] = static_cast<rec_offs>(n_alloc); } -/**********************************************************//** -The following function returns the number of fields in a record. -@return number of fields */ -UNIV_INLINE -ulint -rec_offs_n_fields( -/*==============*/ - const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */ -{ - ulint n_fields; - ut_ad(offsets); - n_fields = offsets[1]; - ut_ad(n_fields > 0); - ut_ad(n_fields <= REC_MAX_N_FIELDS); - ut_ad(n_fields + REC_OFFS_HEADER_SIZE - <= rec_offs_get_n_alloc(offsets)); - return(n_fields); -} - -/************************************************************//** -Validates offsets returned by rec_get_offsets(). 
-@return TRUE if valid */ -UNIV_INLINE -ibool -rec_offs_validate( -/*==============*/ - const rec_t* rec, /*!< in: record or NULL */ - const dict_index_t* index, /*!< in: record descriptor or NULL */ - const rec_offs* offsets)/*!< in: array returned by - rec_get_offsets() */ -{ - ulint i = rec_offs_n_fields(offsets); - ulint last = ULINT_MAX; - bool comp = rec_offs_base(offsets)[0] & REC_OFFS_COMPACT; - - if (rec) { - ut_ad(!memcmp(&rec, &offsets[RECORD_OFFSET], sizeof(rec))); - if (!comp) { - ut_a(rec_get_n_fields_old(rec) >= i); - } - } - if (index) { - ulint max_n_fields; - ut_ad(!memcmp(&index, &offsets[INDEX_OFFSET], sizeof(index))); - max_n_fields = ut_max( - dict_index_get_n_fields(index), - dict_index_get_n_unique_in_tree(index) + 1); - if (comp && rec) { - switch (rec_get_status(rec)) { - case REC_STATUS_ORDINARY: - break; - case REC_STATUS_NODE_PTR: - max_n_fields = dict_index_get_n_unique_in_tree( - index) + 1; - break; - case REC_STATUS_INFIMUM: - case REC_STATUS_SUPREMUM: - max_n_fields = 1; - break; - default: - ut_error; - } - } - /* index->n_def == 0 for dummy indexes if !comp */ - ut_a(!comp || index->n_def); - ut_a(!index->n_def || i <= max_n_fields); - } - while (i--) { - rec_offs curr = get_value(rec_offs_base(offsets)[1 + i]); - ut_a(curr <= last); - last = curr; - } - return(TRUE); -} -#ifdef UNIV_DEBUG -/************************************************************//** -Updates debug data in offsets, in order to avoid bogus -rec_offs_validate() failures. */ -UNIV_INLINE -void -rec_offs_make_valid( -/*================*/ - const rec_t* rec, /*!< in: record */ - const dict_index_t* index, /*!< in: record descriptor */ - rec_offs* offsets)/*!< in: array returned by - rec_get_offsets() */ -{ - ut_ad(rec); - ut_ad(index); - ut_ad(offsets); - ut_ad(rec_get_n_fields(rec, index) >= rec_offs_n_fields(offsets)); - memcpy(&offsets[RECORD_OFFSET], &rec, sizeof(rec)); - memcpy(&offsets[INDEX_OFFSET], &index, sizeof(index)); -} -#endif /* UNIV_DEBUG */ - /************************************************************//** The following function is used to get an offset to the nth data field in a record. @@ -1055,7 +888,7 @@ rec_get_nth_field_offs( const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */ ulint n, /*!< in: index of the field */ ulint* len) /*!< out: length of the field; UNIV_SQL_NULL - if SQL null */ + if SQL null; UNIV_SQL_DEFAULT is default value */ { ut_ad(n < rec_offs_n_fields(offsets)); @@ -1064,8 +897,9 @@ rec_get_nth_field_offs( if (get_type(next_offs) == SQL_NULL) { *len = UNIV_SQL_NULL; + } else if (get_type(next_offs) == DEFAULT) { + *len = UNIV_SQL_DEFAULT; } else { - *len = get_value(next_offs) - offs; } @@ -1073,34 +907,6 @@ rec_get_nth_field_offs( } /******************************************************//** -Determine if the offsets are for a record in the new -compact format. -@return nonzero if compact format */ -UNIV_INLINE -ulint -rec_offs_comp( -/*==========*/ - const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */ -{ - ut_ad(rec_offs_validate(NULL, NULL, offsets)); - return(*rec_offs_base(offsets) & REC_OFFS_COMPACT); -} - -/******************************************************//** -Determine if the offsets are for a record containing -externally stored columns. 
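The core invariant that rec_offs_validate() checks in the loop above (now moved out of line) is that the stored end offsets never decrease from one field to the next. A forward-iterating sketch of the same check, again assuming a 16-bit rec_offs so that stripping the two type bits is a mask with 0x3fff:

#include <cstdint>

typedef uint16_t rec_offs;

// base points at rec_offs_base(offsets): base[1 + i] is the end offset of
// field i, with the field type in the top two bits.
static bool end_offsets_monotonic(const rec_offs* base, unsigned n_fields)
{
  unsigned prev_end = 0;
  for (unsigned i = 0; i < n_fields; i++) {
    unsigned end = base[1 + i] & 0x3fff;   // get_value(): drop the type bits
    if (end < prev_end)
      return false;        // a field cannot end before its predecessor does
    prev_end = end;
  }
  return true;
}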
-@return nonzero if externally stored */ -UNIV_INLINE -ulint -rec_offs_any_extern( -/*================*/ - const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */ -{ - ut_ad(rec_offs_validate(NULL, NULL, offsets)); - return(*rec_offs_base(offsets) & REC_OFFS_EXTERNAL); -} - -/******************************************************//** Determine if the offsets are for a record containing null BLOB pointers. @return first field containing a null BLOB pointer, or NULL if none found */ UNIV_INLINE @@ -1137,36 +943,6 @@ rec_offs_any_null_extern( } /******************************************************//** -Returns nonzero if the extern bit is set in nth field of rec. -@return nonzero if externally stored */ -UNIV_INLINE -ulint -rec_offs_nth_extern( -/*================*/ - const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */ - ulint n) /*!< in: nth field */ -{ - ut_ad(rec_offs_validate(NULL, NULL, offsets)); - ut_ad(n < rec_offs_n_fields(offsets)); - return get_type(rec_offs_base(offsets)[1 + n]) == STORED_OFFPAGE; -} - -/******************************************************//** -Returns nonzero if the SQL NULL bit is set in nth field of rec. -@return nonzero if SQL NULL */ -UNIV_INLINE -ulint -rec_offs_nth_sql_null( -/*==================*/ - const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */ - ulint n) /*!< in: nth field */ -{ - ut_ad(rec_offs_validate(NULL, NULL, offsets)); - ut_ad(n < rec_offs_n_fields(offsets)); - return get_type(rec_offs_base(offsets)[1 + n]) == SQL_NULL; -} - -/******************************************************//** Gets the physical size of a field. @return length of field */ UNIV_INLINE @@ -1374,7 +1150,7 @@ rec_get_nth_field_size( os = rec_get_field_start_offs(rec, n); next_os = rec_get_field_start_offs(rec, n + 1); - ut_ad(next_os - os < UNIV_PAGE_SIZE); + ut_ad(next_os - os < srv_page_size); return(next_os - os); } @@ -1400,6 +1176,7 @@ rec_set_nth_field( ulint len2; ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_ad(!rec_offs_nth_default(offsets, n)); if (len == UNIV_SQL_NULL) { if (!rec_offs_nth_sql_null(offsets, n)) { @@ -1410,7 +1187,7 @@ rec_set_nth_field( return; } - data2 = rec_get_nth_field(rec, offsets, n, &len2); + data2 = (byte*)rec_get_nth_field(rec, offsets, n, &len2); if (len2 == UNIV_SQL_NULL) { ut_ad(!rec_offs_comp(offsets)); rec_set_nth_field_null_bit(rec, n, FALSE); @@ -1473,7 +1250,7 @@ rec_offs_data_size( ut_ad(rec_offs_validate(NULL, NULL, offsets)); size = get_value(rec_offs_base(offsets)[rec_offs_n_fields(offsets)]); - ut_ad(size < UNIV_PAGE_SIZE); + ut_ad(size < srv_page_size); return(size); } @@ -1490,8 +1267,8 @@ rec_offs_extra_size( { ulint size; ut_ad(rec_offs_validate(NULL, NULL, offsets)); - size = *rec_offs_base(offsets) & ~(REC_OFFS_COMPACT | REC_OFFS_EXTERNAL); - ut_ad(size < UNIV_PAGE_SIZE); + size = *rec_offs_base(offsets) & REC_OFFS_MASK; + ut_ad(size < srv_page_size); return(size); } @@ -1601,27 +1378,34 @@ rec_get_converted_size( ulint extra_size; ut_ad(dtuple_check_typed(dtuple)); - - ut_ad(dict_index_is_ibuf(index) - - || dtuple_get_n_fields(dtuple) - == (((dtuple_get_info_bits(dtuple) & REC_NEW_STATUS_MASK) - == REC_STATUS_NODE_PTR) - ? dict_index_get_n_unique_in_tree_nonleaf(index) + 1 - : dict_index_get_n_fields(index)) - - /* a record for older SYS_INDEXES table - (missing merge_threshold column) is acceptable. 
*/ - || (index->table->id == DICT_INDEXES_ID - && dtuple_get_n_fields(dtuple) - == dict_index_get_n_fields(index) - 1)); +#ifdef UNIV_DEBUG + if (dict_index_is_ibuf(index)) { + ut_ad(dtuple->n_fields > 1); + } else if ((dtuple_get_info_bits(dtuple) & REC_NEW_STATUS_MASK) + == REC_STATUS_NODE_PTR) { + ut_ad(dtuple->n_fields + == dict_index_get_n_unique_in_tree_nonleaf(index) + 1); + } else if (index->table->id == DICT_INDEXES_ID) { + /* The column SYS_INDEXES.MERGE_THRESHOLD was + instantly added in MariaDB 10.2.2 (MySQL 5.7). */ + ut_ad(index->n_fields == DICT_NUM_FIELDS__SYS_INDEXES); + ut_ad(dtuple->n_fields == DICT_NUM_FIELDS__SYS_INDEXES + || dtuple->n_fields + == DICT_FLD__SYS_INDEXES__MERGE_THRESHOLD); + } else { + ut_ad(dtuple->n_fields >= index->n_core_fields); + ut_ad(dtuple->n_fields <= index->n_fields); + } +#endif if (dict_table_is_comp(index->table)) { - return(rec_get_converted_size_comp(index, - dtuple_get_info_bits(dtuple) - & REC_NEW_STATUS_MASK, - dtuple->fields, - dtuple->n_fields, NULL)); + return(rec_get_converted_size_comp( + index, + static_cast<rec_comp_status_t>( + dtuple->info_bits + & REC_NEW_STATUS_MASK), + dtuple->fields, + dtuple->n_fields, NULL)); } data_size = dtuple_get_data_size(dtuple, 0); @@ -1634,105 +1418,5 @@ rec_get_converted_size( extra_size = rec_get_converted_extra_size( data_size, dtuple_get_n_fields(dtuple), n_ext); -#if 0 - /* This code is inactive since it may be the wrong place to add - in the size of node pointers used in parent pages AND it is not - currently needed since ha_innobase::max_supported_key_length() - ensures that the key size limit for each page size is well below - the actual limit ((free space on page / 4) - record overhead). - But those limits will need to be raised when InnoDB can - support multiple page sizes. At that time, we will need - to consider the node pointer on these universal btrees. */ - - if (dict_index_is_ibuf(index)) { - /* This is for the insert buffer B-tree. - All fields in the leaf tuple ascend to the - parent node plus the child page pointer. */ - - /* ibuf cannot contain externally stored fields */ - ut_ad(n_ext == 0); - - /* Add the data pointer and recompute extra_size - based on one more field. */ - data_size += REC_NODE_PTR_SIZE; - extra_size = rec_get_converted_extra_size( - data_size, - dtuple_get_n_fields(dtuple) + 1, - 0); - - /* Be sure dtuple->n_fields has this node ptr - accounted for. This function should correspond to - what rec_convert_dtuple_to_rec() needs in storage. - In optimistic insert or update-not-in-place, we will - have to ensure that if the record is converted to a - node pointer, it will not become too large.*/ - } -#endif - return(data_size + extra_size); } - -/** Fold a prefix of a physical record. 
-@param[in] rec index record -@param[in] offsets return value of rec_get_offsets() -@param[in] n_fields number of complete fields to fold -@param[in] n_bytes number of bytes to fold in the last field -@param[in] index_id index tree ID -@return the folded value */ -UNIV_INLINE -ulint -rec_fold( - const rec_t* rec, - const rec_offs* offsets, - ulint n_fields, - ulint n_bytes, - index_id_t tree_id) -{ - ulint i; - const byte* data; - ulint len; - ulint fold; - ulint n_fields_rec; - - ut_ad(rec_offs_validate(rec, NULL, offsets)); - ut_ad(rec_validate(rec, offsets)); - ut_ad(n_fields > 0 || n_bytes > 0); - - n_fields_rec = rec_offs_n_fields(offsets); - ut_ad(n_fields <= n_fields_rec); - ut_ad(n_fields < n_fields_rec || n_bytes == 0); - - if (n_fields > n_fields_rec) { - n_fields = n_fields_rec; - } - - if (n_fields == n_fields_rec) { - n_bytes = 0; - } - - fold = ut_fold_ull(tree_id); - - for (i = 0; i < n_fields; i++) { - data = rec_get_nth_field(rec, offsets, i, &len); - - if (len != UNIV_SQL_NULL) { - fold = ut_fold_ulint_pair(fold, - ut_fold_binary(data, len)); - } - } - - if (n_bytes > 0) { - data = rec_get_nth_field(rec, offsets, i, &len); - - if (len != UNIV_SQL_NULL) { - if (len > n_bytes) { - len = n_bytes; - } - - fold = ut_fold_ulint_pair(fold, - ut_fold_binary(data, len)); - } - } - - return(fold); -} diff --git a/storage/innobase/include/rem0types.h b/storage/innobase/include/rem0types.h index 9172385a802..11a164f5130 100644 --- a/storage/innobase/include/rem0types.h +++ b/storage/innobase/include/rem0types.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1994, 2015, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2019, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under @@ -58,8 +58,7 @@ This constant MUST NOT BE CHANGED, or the compatibility of InnoDB data files would be at risk! */ #define REC_ANTELOPE_MAX_INDEX_COL_LEN 768 -/** Maximum indexed field length for table format UNIV_FORMAT_B and -beyond. +/** Maximum indexed field length for tables that have atomic BLOBs. This (3072) is the maximum index row length allowed, so we cannot create index prefix column longer than that. */ #define REC_VERSION_56_MAX_INDEX_COL_LEN 3072 diff --git a/storage/innobase/include/row0ftsort.h b/storage/innobase/include/row0ftsort.h index 84368374430..3a65e1c58da 100644 --- a/storage/innobase/include/row0ftsort.h +++ b/storage/innobase/include/row0ftsort.h @@ -60,6 +60,8 @@ struct fts_psort_t; struct fts_psort_common_t { row_merge_dup_t* dup; /*!< descriptor of FTS index */ dict_table_t* new_table; /*!< source table */ + /* Old table page size */ + page_size_t old_page_size; trx_t* trx; /*!< transaction */ fts_psort_t* all_info; /*!< all parallel sort info */ os_event_t sort_event; /*!< sort event */ @@ -181,36 +183,37 @@ tokenized doc string. The index has three "fields": dict_index_t* row_merge_create_fts_sort_index( /*============================*/ - dict_index_t* index, /*!< in: Original FTS index - based on which this sort index - is created */ - const dict_table_t* table, /*!< in: table that FTS index - is being created on */ - ibool* opt_doc_id_size); - /*!< out: whether to use 4 bytes - instead of 8 bytes integer to - store Doc ID during sort */ - -/********************************************************************//** -Initialize FTS parallel sort structures. 
+ dict_index_t* index, /*!< in: Original FTS index + based on which this sort index + is created */ + dict_table_t* table, /*!< in,out: table that FTS index + is being created on */ + ibool* opt_doc_id_size); + /*!< out: whether to use 4 bytes + instead of 8 bytes integer to + store Doc ID during sort */ + +/** Initialize FTS parallel sort structures. +@param[in] trx transaction +@param[in,out] dup descriptor of FTS index being created +@param[in] new_table table where indexes are created +@param[in] opt_doc_id_size whether to use 4 bytes instead of 8 bytes + integer to store Doc ID during sort +@param[in] old_page_size page size of the old table during alter +@param[out] psort parallel sort info to be instantiated +@param[out] merge parallel merge info to be instantiated @return TRUE if all successful */ ibool row_fts_psort_info_init( -/*====================*/ - trx_t* trx, /*!< in: transaction */ - row_merge_dup_t* dup, /*!< in,own: descriptor of - FTS index being created */ - const dict_table_t* new_table,/*!< in: table where indexes are - created */ + trx_t* trx, + row_merge_dup_t* dup, + const dict_table_t* new_table, ibool opt_doc_id_size, - /*!< in: whether to use 4 bytes - instead of 8 bytes integer to - store Doc ID during sort */ - fts_psort_t** psort, /*!< out: parallel sort info to be - instantiated */ - fts_psort_t** merge) /*!< out: parallel merge info - to be instantiated */ + const page_size_t old_page_size, + fts_psort_t** psort, + fts_psort_t** merge) MY_ATTRIBUTE((nonnull)); + /********************************************************************//** Clean up and deallocate FTS parallel sort structures, and close temparary merge sort files */ diff --git a/storage/innobase/include/row0import.h b/storage/innobase/include/row0import.h index b553f169c91..fd2651da39e 100644 --- a/storage/innobase/include/row0import.h +++ b/storage/innobase/include/row0import.h @@ -46,21 +46,13 @@ row_import_for_mysql( in MySQL */ MY_ATTRIBUTE((nonnull, warn_unused_result)); -/*****************************************************************//** -Update the DICT_TF2_DISCARDED flag in SYS_TABLES. -@return DB_SUCCESS or error code. */ -dberr_t -row_import_update_discarded_flag( -/*=============================*/ - trx_t* trx, /*!< in/out: transaction that - covers the update */ - table_id_t table_id, /*!< in: Table for which we want - to set the root table->flags2 */ - bool discarded, /*!< in: set MIX_LEN column bit - to discarded, if true */ - bool dict_locked) /*!< in: Set to true if the - caller already owns the - dict_sys_t:: mutex. */ +/** Update the DICT_TF2_DISCARDED flag in SYS_TABLES.MIX_LEN. +@param[in,out] trx dictionary transaction +@param[in] table_id table identifier +@param[in] discarded whether to set or clear the flag +@return DB_SUCCESS or error code */ +dberr_t row_import_update_discarded_flag(trx_t* trx, table_id_t table_id, + bool discarded) MY_ATTRIBUTE((nonnull, warn_unused_result)); /** Update the root page numbers and tablespace ID of a table. diff --git a/storage/innobase/include/row0ins.h b/storage/innobase/include/row0ins.h index 27fe442f6ff..9a16394a052 100644 --- a/storage/innobase/include/row0ins.h +++ b/storage/innobase/include/row0ins.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2019, 2020 MariaDB Corporation. +Copyright (c) 2017, 2020, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -53,15 +53,7 @@ row_ins_check_foreign_constraint( dtuple_t* entry, /*!< in: index entry for index */ que_thr_t* thr) /*!< in: query thread */ MY_ATTRIBUTE((nonnull, warn_unused_result)); -/*********************************************************************//** -Creates an insert node struct. -@return own: insert node struct */ -ins_node_t* -ins_node_create( -/*============*/ - ulint ins_type, /*!< in: INS_VALUES, ... */ - dict_table_t* table, /*!< in: table where to insert */ - mem_heap_t* heap); /*!< in: mem heap where created */ + /*********************************************************************//** Sets a new row to insert for an INS_DIRECT node. This function is only used if we have constructed the row separately, which is a rare case; this @@ -146,7 +138,9 @@ row_ins_sec_index_entry( /*====================*/ dict_index_t* index, /*!< in: secondary index */ dtuple_t* entry, /*!< in/out: index entry to insert */ - que_thr_t* thr) /*!< in: query thread */ + que_thr_t* thr, /*!< in: query thread */ + bool check_foreign = true) /*!< in: true if check + foreign table is needed, false otherwise */ MY_ATTRIBUTE((warn_unused_result)); /***********************************************************//** Inserts a row to a table. This is a high-level function used in @@ -157,10 +151,30 @@ row_ins_step( /*=========*/ que_thr_t* thr); /*!< in: query thread */ -/* Insert node structure */ +/* Insert node types */ +#define INS_SEARCHED 0 /* INSERT INTO ... SELECT ... */ +#define INS_VALUES 1 /* INSERT INTO ... VALUES ... */ +#define INS_DIRECT 2 /* this is for internal use in dict0crea: + insert the row directly */ -struct ins_node_t{ - ins_node_t() : common(QUE_NODE_INSERT, NULL), entry(entry_list.end()) +/* Node execution states */ +#define INS_NODE_SET_IX_LOCK 1 /* we should set an IX lock on table */ +#define INS_NODE_ALLOC_ROW_ID 2 /* row id should be allocated */ +#define INS_NODE_INSERT_ENTRIES 3 /* index entries should be built and + inserted */ + +struct row_prebuilt_t; + +/** Insert node structure */ +struct ins_node_t +{ + explicit ins_node_t(ulint ins_type, dict_table_t *table) : + common(QUE_NODE_INSERT, NULL), + ins_type(ins_type), + row(NULL), table(table), select(NULL), values_list(NULL), + state(INS_NODE_SET_IX_LOCK), index(NULL), + entry_list(), entry(entry_list.end()), + trx_id(0), entry_sys_heap(mem_heap_create(128)) { } que_common_t common; /*!< node type: QUE_NODE_INSERT */ @@ -184,26 +198,27 @@ struct ins_node_t{ + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN]; trx_id_t trx_id; /*!< trx id or the last trx which executed the node */ + byte vers_start_buf[8]; /* Buffers for System Versioning */ + byte vers_end_buf[8]; /* system fields. */ mem_heap_t* entry_sys_heap; /* memory heap used as auxiliary storage; entry_list and sys fields are stored here; if this is NULL, entry list should be created and buffers for sys fields in row allocated */ - ulint magic_n; + void vers_update_end(row_prebuilt_t *prebuilt, bool history_row); + bool vers_history_row() const; /* true if 'row' is historical */ }; -#define INS_NODE_MAGIC_N 15849075 - -/* Insert node types */ -#define INS_SEARCHED 0 /* INSERT INTO ... SELECT ... */ -#define INS_VALUES 1 /* INSERT INTO ... VALUES ... 
*/ -#define INS_DIRECT 2 /* this is for internal use in dict0crea: - insert the row directly */ - -/* Node execution states */ -#define INS_NODE_SET_IX_LOCK 1 /* we should set an IX lock on table */ -#define INS_NODE_ALLOC_ROW_ID 2 /* row id should be allocated */ -#define INS_NODE_INSERT_ENTRIES 3 /* index entries should be built and - inserted */ +/** Create an insert object. +@param ins_type INS_VALUES, ... +@param table table where to insert +@param heap memory heap +@return the created object */ +inline ins_node_t *ins_node_create(ulint ins_type, dict_table_t *table, + mem_heap_t *heap) +{ + return new (mem_heap_alloc(heap, sizeof(ins_node_t))) + ins_node_t(ins_type, table); +} #endif diff --git a/storage/innobase/include/row0log.h b/storage/innobase/include/row0log.h index 1e46d65e427..1e1b31c2547 100644 --- a/storage/innobase/include/row0log.h +++ b/storage/innobase/include/row0log.h @@ -47,17 +47,21 @@ for online creation. bool row_log_allocate( /*=============*/ + const trx_t* trx, /*!< in: the ALTER TABLE transaction */ dict_index_t* index, /*!< in/out: index */ dict_table_t* table, /*!< in/out: new table being rebuilt, or NULL when creating a secondary index */ bool same_pk,/*!< in: whether the definition of the PRIMARY KEY has remained the same */ - const dtuple_t* add_cols, + const dtuple_t* defaults, /*!< in: default values of - added columns, or NULL */ + added, changed columns, or NULL */ const ulint* col_map,/*!< in: mapping of old column numbers to new ones, or NULL if !table */ - const char* path) /*!< in: where to create temporary file */ + const char* path, /*!< in: where to create temporary file */ + const TABLE* old_table, /*!< in:table definition before alter */ + bool allow_not_null) /*!< in: allow null to non-null + conversion */ MY_ATTRIBUTE((nonnull(1), warn_unused_result)); /******************************************************//** @@ -205,13 +209,15 @@ row_log_table_blob_alloc( @param[in,out] stage performance schema accounting object, used by ALTER TABLE. stage->begin_phase_log_table() will be called initially and then stage->inc() will be called for each block of log that is applied. +@param[in] new_table Altered table @return DB_SUCCESS, or error code on failure */ dberr_t row_log_table_apply( que_thr_t* thr, dict_table_t* old_table, struct TABLE* table, - ut_stage_alter_t* stage) + ut_stage_alter_t* stage, + dict_table_t* new_table) MY_ATTRIBUTE((warn_unused_result)); /******************************************************//** @@ -241,6 +247,11 @@ row_log_apply( ut_stage_alter_t* stage) MY_ATTRIBUTE((warn_unused_result)); +/** Get the n_core_fields of online log for the index +@param index index whose n_core_fields of log to be accessed +@return number of n_core_fields */ +unsigned row_log_get_n_core_fields(const dict_index_t *index); + #ifdef HAVE_PSI_STAGE_INTERFACE /** Estimate how much work is to be done by the log apply phase of an ALTER TABLE for this index. diff --git a/storage/innobase/include/row0merge.h b/storage/innobase/include/row0merge.h index dfd1d9fb9fd..3252af0062b 100644 --- a/storage/innobase/include/row0merge.h +++ b/storage/innobase/include/row0merge.h @@ -58,11 +58,11 @@ struct ib_sequence_t; /** @brief Block size for I/O operations in merge sort. -The minimum is UNIV_PAGE_SIZE, or page_get_free_space_of_empty() +The minimum is srv_page_size, or page_get_free_space_of_empty() rounded to a power of 2. When not creating a PRIMARY KEY that contains column prefixes, this -can be set as small as UNIV_PAGE_SIZE / 2. 
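The new inline ins_node_create() above allocates raw storage from a mem_heap_t and constructs the node in place with placement new, so the object's lifetime is tied to the heap rather than to operator new/delete. A generic sketch of that pattern, with Arena standing in for mem_heap_t and Node for ins_node_t (both are inventions of this example):

#include <cstddef>
#include <new>

struct Arena {
  char*  buf;
  size_t used;
  explicit Arena(size_t size)
    : buf(static_cast<char*>(::operator new(size))), used(0) {}
  ~Arena() { ::operator delete(buf); }   // frees everything at once;
                                         // destructors are not run
  void* alloc(size_t n)
  {
    void* p = buf + used;
    // bump by a max-aligned amount so later allocations stay aligned
    used += (n + alignof(std::max_align_t) - 1)
            & ~(alignof(std::max_align_t) - 1);
    return p;                            // no overflow checking in this sketch
  }
};

struct Node {
  int type;
  explicit Node(int t) : type(t) {}
};

inline Node* node_create(int type, Arena& heap)
{
  // Construct directly in arena storage, as ins_node_create() does.
  return new (heap.alloc(sizeof(Node))) Node(type);
}

As with mem_heap allocation, the arena releases storage wholesale and never runs destructors, so the pattern fits nodes whose owned resources are freed explicitly elsewhere.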
*/ +can be set as small as srv_page_size / 2. */ typedef byte row_merge_block_t; /** @brief Secondary buffer for I/O operations of merge records. @@ -98,7 +98,7 @@ struct row_merge_buf_t { /** Information about temporary files used in merge sort */ struct merge_file_t { - int fd; /*!< file descriptor */ + pfs_os_file_t fd; /*!< file descriptor */ ulint offset; /*!< file offset (end of file) */ ib_uint64_t n_rec; /*!< number of records in the file */ }; @@ -192,7 +192,7 @@ row_merge_drop_temp_indexes(void); UNIV_PFS_IO defined, register the file descriptor with Performance Schema. @param[in] path location for creating temporary merge files, or NULL @return File descriptor */ -int +pfs_os_file_t row_merge_file_create_low( const char* path) MY_ATTRIBUTE((warn_unused_result)); @@ -202,7 +202,7 @@ if UNIV_PFS_IO is defined. */ void row_merge_file_destroy_low( /*=======================*/ - int fd); /*!< in: merge file descriptor */ + const pfs_os_file_t& fd); /*!< in: merge file descriptor */ /*********************************************************************//** Rename the tables in the data dictionary. The data dictionary must @@ -247,7 +247,6 @@ row_merge_rename_index_to_drop( MY_ATTRIBUTE((nonnull(1), warn_unused_result)); /** Create the index and load in to the dictionary. -@param[in,out] trx trx (sets error_state) @param[in,out] table the index is on this table @param[in] index_def the index definition @param[in] add_v new virtual columns added along with add @@ -255,7 +254,6 @@ row_merge_rename_index_to_drop( @return index, or NULL on error */ dict_index_t* row_merge_create_index( - trx_t* trx, dict_table_t* table, const index_def_t* index_def, const dict_add_v_col_t* add_v) @@ -303,7 +301,7 @@ old_table unless creating a PRIMARY KEY @param[in] n_indexes size of indexes[] @param[in,out] table MySQL table, for reporting erroneous key value if applicable -@param[in] add_cols default values of added columns, or NULL +@param[in] defaults default values of added, changed columns, or NULL @param[in] col_map mapping of old column numbers to new ones, or NULL if old_table == new_table @param[in] add_autoinc number of added AUTO_INCREMENT columns, or @@ -317,6 +315,7 @@ this function and it will be passed to other functions for further accounting. @param[in] add_v new virtual columns added along with indexes @param[in] eval_table mysql table used to evaluate virtual column value, see innobase_get_computed_value(). 
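/* Throughout row0merge.h the raw "int fd" is replaced by pfs_os_file_t so
   that the file handle carries its Performance Schema state along with the
   descriptor.  The wrapper below is only an illustration of the typed-handle
   idea, not the real pfs_os_file_t: */

struct file_handle_t {
	int	fd = -1;			/* underlying descriptor */
	bool	is_open() const { return fd >= 0; }
};

inline bool merge_write_sketch(const file_handle_t& fh)
{
	if (!fh.is_open()) {
		return false;			/* reject an invalid handle */
	}
	/* ... the real code would issue os_file_write()/pwrite() here ... */
	return true;
}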
+@param[in] allow_non_null allow the conversion from null to not-null @return DB_SUCCESS or error code */ dberr_t row_merge_build_indexes( @@ -328,14 +327,15 @@ row_merge_build_indexes( const ulint* key_numbers, ulint n_indexes, struct TABLE* table, - const dtuple_t* add_cols, + const dtuple_t* defaults, const ulint* col_map, ulint add_autoinc, ib_sequence_t& sequence, bool skip_pk_sort, ut_stage_alter_t* stage, const dict_add_v_col_t* add_v, - struct TABLE* eval_table) + struct TABLE* eval_table, + bool allow_non_null) MY_ATTRIBUTE((warn_unused_result)); /********************************************************************//** @@ -367,7 +367,7 @@ UNIV_INTERN bool row_merge_write( /*============*/ - int fd, /*!< in: file descriptor */ + const pfs_os_file_t& fd, /*!< in: file descriptor */ ulint offset, /*!< in: offset where to write, in number of row_merge_block_t elements */ const void* buf, /*!< in: data */ @@ -388,7 +388,7 @@ row_merge_buf_empty( @param[out] merge_file merge file structure @param[in] path location for creating temporary file, or NULL @return file descriptor, or -1 on failure */ -int +pfs_os_file_t row_merge_file_create( merge_file_t* merge_file, const char* path) @@ -416,7 +416,7 @@ row_merge_sort( const row_merge_dup_t* dup, merge_file_t* file, row_merge_block_t* block, - int* tmpfd, + pfs_os_file_t* tmpfd, const bool update_progress, const double pct_progress, const double pct_cost, @@ -455,7 +455,7 @@ row_merge_file_destroy( bool row_merge_read( /*===========*/ - int fd, /*!< in: file descriptor */ + const pfs_os_file_t& fd, /*!< in: file descriptor */ ulint offset, /*!< in: offset where to read in number of row_merge_block_t elements */ @@ -474,7 +474,7 @@ row_merge_read_rec( mrec_buf_t* buf, /*!< in/out: secondary buffer */ const byte* b, /*!< in: pointer to record */ const dict_index_t* index, /*!< in: index of the record */ - int fd, /*!< in: file descriptor */ + const pfs_os_file_t& fd, /*!< in: file descriptor */ ulint* foffs, /*!< in/out: file offset */ const mrec_t** mrec, /*!< out: pointer to merge record, or NULL on end of list diff --git a/storage/innobase/include/row0mysql.h b/storage/innobase/include/row0mysql.h index 8738f991368..aa72a1f822a 100644 --- a/storage/innobase/include/row0mysql.h +++ b/storage/innobase/include/row0mysql.h @@ -223,14 +223,26 @@ row_lock_table_autoinc_for_mysql( dberr_t row_lock_table(row_prebuilt_t* prebuilt); +/** System Versioning: row_insert_for_mysql() modes */ +enum ins_mode_t { + /* plain row (without versioning) */ + ROW_INS_NORMAL = 0, + /* row_start = TRX_ID, row_end = MAX */ + ROW_INS_VERSIONED, + /* row_end = TRX_ID */ + ROW_INS_HISTORICAL +}; + /** Does an insert for MySQL. 
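/* A hypothetical caller-side helper showing how the new ins_mode_t argument
   of row_insert_for_mysql() might be chosen; the real decision is made in
   the handler layer, and nothing below is InnoDB code: */

enum sketch_ins_mode_t { SKETCH_NORMAL = 0, SKETCH_VERSIONED, SKETCH_HISTORICAL };

inline sketch_ins_mode_t pick_ins_mode(bool versioned_table, bool history_row)
{
	if (!versioned_table) {
		return SKETCH_NORMAL;		/* plain row */
	}
	return history_row
		? SKETCH_HISTORICAL		/* row_end = TRX_ID */
		: SKETCH_VERSIONED;		/* row_start = TRX_ID, row_end = MAX */
}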
@param[in] mysql_rec row in the MySQL format @param[in,out] prebuilt prebuilt struct in MySQL handle +@param[in] ins_mode what row type we're inserting @return error code or DB_SUCCESS*/ dberr_t row_insert_for_mysql( const byte* mysql_rec, - row_prebuilt_t* prebuilt) + row_prebuilt_t* prebuilt, + ins_mode_t ins_mode) MY_ATTRIBUTE((warn_unused_result)); /*********************************************************************//** @@ -254,7 +266,8 @@ row_get_prebuilt_update_vector( @param[in,out] prebuilt prebuilt struct in MySQL handle @return error code or DB_SUCCESS */ dberr_t -row_update_for_mysql(row_prebuilt_t* prebuilt) +row_update_for_mysql( + row_prebuilt_t* prebuilt) MY_ATTRIBUTE((warn_unused_result)); /** This can only be used when srv_locks_unsafe_for_binlog is TRUE or this @@ -616,6 +629,8 @@ struct row_prebuilt_t { not to be confused with InnoDB externally stored columns (VARCHAR can be off-page too) */ + unsigned versioned_write:1;/*!< whether this is + a versioned write */ mysql_row_templ_t* mysql_template;/*!< template used to transform rows fast between MySQL and Innobase formats; memory for this template @@ -731,7 +746,7 @@ struct row_prebuilt_t { allocated mem buf start, because there is a 4 byte magic number at the start and at the end */ - ibool keep_other_fields_on_keyread; /*!< when using fetch + bool keep_other_fields_on_keyread; /*!< when using fetch cache with HA_EXTRA_KEYREAD, don't overwrite other fields in mysql row row buffer.*/ @@ -786,6 +801,20 @@ struct row_prebuilt_t { uint srch_key_val_len; /*!< Size of search key */ /** The MySQL table object */ TABLE* m_mysql_table; + + /** Get template by dict_table_t::cols[] number */ + const mysql_row_templ_t* get_template_by_col(ulint col) const + { + ut_ad(col < n_template); + ut_ad(mysql_template); + for (ulint i = col; i < n_template; ++i) { + const mysql_row_templ_t* templ = &mysql_template[i]; + if (!templ->is_virtual && templ->col_no == col) { + return templ; + } + } + return NULL; + } }; /** Callback for row_mysql_sys_index_iterate() */ diff --git a/storage/innobase/include/row0purge.h b/storage/innobase/include/row0purge.h index c4ddff4243c..1505fb9663a 100644 --- a/storage/innobase/include/row0purge.h +++ b/storage/innobase/include/row0purge.h @@ -105,7 +105,7 @@ public: upd_t* update; /*!< update vector for a clustered index record */ - dtuple_t* ref; /*!< NULL, or row reference to the next row to + const dtuple_t* ref; /*!< NULL, or row reference to the next row to handle */ dtuple_t* row; /*!< NULL, or a copy (also fields copied to heap) of the indexed fields of the row to @@ -172,7 +172,7 @@ public: @param[in] limit last transaction for which to skip */ void skip(table_id_t id, trx_id_t limit) { - DBUG_ASSERT(limit >= trx_id || !srv_safe_truncate); + DBUG_ASSERT(limit >= trx_id); unavailable_table_id = id; def_trx_id = limit; } diff --git a/storage/innobase/include/row0row.h b/storage/innobase/include/row0row.h index 1c7e40ac690..5268d684529 100644 --- a/storage/innobase/include/row0row.h +++ b/storage/innobase/include/row0row.h @@ -87,8 +87,8 @@ row_build_index_entry_low( inserted or purged */ const row_ext_t* ext, /*!< in: externally stored column prefixes, or NULL */ - dict_index_t* index, /*!< in: index on the table */ - mem_heap_t* heap, /*!< in: memory heap from which + const dict_index_t* index, /*!< in: index on the table */ + mem_heap_t* heap, /*!< in,out: memory heap from which the memory for the index entry is allocated */ ulint flag) /*!< in: ROW_BUILD_NORMAL, @@ -109,8 +109,8 @@ 
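/* get_template_by_col() above is a linear scan that starts at index col,
   presumably because a template for column col cannot appear before that
   position.  The same lookup expressed with <algorithm>, using stand-in
   types rather than mysql_row_templ_t: */

#include <algorithm>
#include <vector>

struct templ_t {
	bool		is_virtual;
	unsigned	col_no;
};

inline const templ_t* find_template(const std::vector<templ_t>& t, unsigned col)
{
	if (col >= t.size()) {
		return nullptr;
	}
	auto it = std::find_if(t.begin() + col, t.end(),
			       [col](const templ_t& x) {
				       return !x.is_virtual && x.col_no == col;
			       });
	return it == t.end() ? nullptr : &*it;
}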
row_build_index_entry( inserted or purged */ const row_ext_t* ext, /*!< in: externally stored column prefixes, or NULL */ - dict_index_t* index, /*!< in: index on the table */ - mem_heap_t* heap) /*!< in: memory heap from which + const dict_index_t* index, /*!< in: index on the table */ + mem_heap_t* heap) /*!< in,out: memory heap from which the memory for the index entry is allocated */ MY_ATTRIBUTE((warn_unused_result, nonnull(1,3,4))); @@ -150,9 +150,9 @@ row_build( consulted instead; the user columns in this table should be the same columns as in index->table */ - const dtuple_t* add_cols, + const dtuple_t* defaults, /*!< in: default values of - added columns, or NULL */ + added, changed columns, or NULL */ const ulint* col_map,/*!< in: mapping of old column numbers to new ones, or NULL */ row_ext_t** ext, /*!< out, own: cache of @@ -174,7 +174,7 @@ addition of new virtual columns. of an index, or NULL if index->table should be consulted instead -@param[in] add_cols default values of added columns, or NULL +@param[in] defaults default values of added, changed columns, or NULL @param[in] add_v new virtual columns added along with new indexes @param[in] col_map mapping of old column @@ -191,7 +191,7 @@ row_build_w_add_vcol( const rec_t* rec, const rec_offs* offsets, const dict_table_t* col_table, - const dtuple_t* add_cols, + const dtuple_t* defaults, const dict_add_v_col_t* add_v, const ulint* col_map, row_ext_t** ext, @@ -262,9 +262,8 @@ row_build_row_ref_in_tuple( held as long as the row reference is used! */ const dict_index_t* index, /*!< in: secondary index */ - rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) + rec_offs* offsets)/*!< in: rec_get_offsets(rec, index) or NULL */ - trx_t* trx) /*!< in: transaction or NULL */ MY_ATTRIBUTE((nonnull(1,2,3))); /*******************************************************************//** Builds from a secondary index record a row reference with which we can @@ -278,8 +277,8 @@ row_build_row_ref_fast( const ulint* map, /*!< in: array of field numbers in rec telling how ref should be built from the fields of rec */ - const rec_t* rec, /*!< in: record in the index; must be - preserved while ref is used, as we do + const rec_t* rec, /*!< in: secondary index record; + must be preserved while ref is used, as we do not copy field values to heap */ const rec_offs* offsets);/*!< in: array returned by rec_get_offsets() */ /***************************************************************//** @@ -391,7 +390,7 @@ row_mtr_start(mtr_t* mtr, dict_index_t* index, bool pessimistic) { mtr->start(); - switch (index->space) { + switch (index->table->space_id) { case IBUF_SPACE_ID: if (pessimistic && !(index->type & (DICT_UNIQUE | DICT_SPATIAL))) { @@ -402,7 +401,7 @@ row_mtr_start(mtr_t* mtr, dict_index_t* index, bool pessimistic) mtr->set_log_mode(MTR_LOG_NO_REDO); break; default: - mtr->set_named_space(index->space); + index->set_modified(*mtr); break; } diff --git a/storage/innobase/include/row0row.ic b/storage/innobase/include/row0row.ic index a3dd51aaac2..18e6959e6f3 100644 --- a/storage/innobase/include/row0row.ic +++ b/storage/innobase/include/row0row.ic @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -118,8 +119,8 @@ row_build_index_entry( inserted or purged */ const row_ext_t* ext, /*!< in: externally stored column prefixes, or NULL */ - dict_index_t* index, /*!< in: index on the table */ - mem_heap_t* heap) /*!< in: memory heap from which + const dict_index_t* index, /*!< in: index on the table */ + mem_heap_t* heap) /*!< in,out: memory heap from which the memory for the index entry is allocated */ { @@ -144,8 +145,8 @@ row_build_row_ref_fast( const ulint* map, /*!< in: array of field numbers in rec telling how ref should be built from the fields of rec */ - const rec_t* rec, /*!< in: record in the index; must be - preserved while ref is used, as we do + const rec_t* rec, /*!< in: secondary index record; + must be preserved while ref is used, as we do not copy field values to heap */ const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */ { diff --git a/storage/innobase/include/row0sel.h b/storage/innobase/include/row0sel.h index d9c08243a91..717e1a2be0b 100644 --- a/storage/innobase/include/row0sel.h +++ b/storage/innobase/include/row0sel.h @@ -133,8 +133,7 @@ row_sel_convert_mysql_key_to_innobase( ulint buf_len, /*!< in: buffer length */ dict_index_t* index, /*!< in: index of the key value */ const byte* key_ptr, /*!< in: MySQL key value */ - ulint key_len, /*!< in: MySQL key value length */ - trx_t* trx); /*!< in: transaction */ + ulint key_len); /*!< in: MySQL key value length */ /** Searches for rows in the database. This is used in the interface to diff --git a/storage/innobase/include/row0trunc.h b/storage/innobase/include/row0trunc.h index bd890fe7b73..c5f89f7cfdb 100644 --- a/storage/innobase/include/row0trunc.h +++ b/storage/innobase/include/row0trunc.h @@ -181,19 +181,16 @@ public: /** Create an index for a table. @param[in] table_name table name, for which to create the index - @param[in] space_id space id where we have to - create the index - @param[in] page_size page size of the .ibd file + @param[in,out] space tablespace @param[in] index_type type of index to truncate @param[in] index_id id of index to truncate @param[in] btr_redo_create_info control info for ::btr_create() @param[in,out] mtr mini-transaction covering the create index @return root page no or FIL_NULL on failure */ - ulint create_index( + inline ulint create_index( const char* table_name, - ulint space_id, - const page_size_t& page_size, + fil_space_t* space, ulint index_type, index_id_t index_id, const btr_create_t& btr_redo_create_info, @@ -202,31 +199,27 @@ public: /** Create the indexes for a table @param[in] table_name table name, for which to create the indexes - @param[in] space_id space id where we have to create the - indexes - @param[in] page_size page size of the .ibd file - @param[in] flags tablespace flags + @param[in,out] space tablespace @param[in] format_flags page format flags @return DB_SUCCESS or error code. */ - dberr_t create_indexes( + inline dberr_t create_indexes( const char* table_name, - ulint space_id, - const page_size_t& page_size, - ulint flags, + fil_space_t* space, ulint format_flags); /** Check if index has been modified since TRUNCATE log snapshot was recorded. - @param space_id space_id where table/indexes resides. 
+ @param[in] space tablespace + @param[in] root_page_no index root page number @return true if modified else false */ - bool is_index_modified_since_logged( - ulint space_id, - ulint root_page_no) const; + inline bool is_index_modified_since_logged( + const fil_space_t* space, + ulint root_page_no) const; /** Drop indexes for a table. - @param space_id space_id where table/indexes resides. + @param[in,out] space tablespace @return DB_SUCCESS or error code. */ - void drop_indexes(ulint space_id) const; + void drop_indexes(fil_space_t* space) const; /** Parses log record during recovery @@ -420,9 +413,4 @@ private: const char* log_file_name); }; -/** MySQL 5.7 TRUNCATE TABLE. -@param table table being truncated -@param trx transaction covering the truncate -@return error code or DB_SUCCESS */ -dberr_t row_truncate_table_for_mysql(dict_table_t* table, trx_t* trx); #endif /* row0trunc_h */ diff --git a/storage/innobase/include/row0undo.h b/storage/innobase/include/row0undo.h index a461b96b919..a18d154c132 100644 --- a/storage/innobase/include/row0undo.h +++ b/storage/innobase/include/row0undo.h @@ -107,7 +107,7 @@ struct undo_node_t{ ulint cmpl_info;/*!< compiler analysis of an update */ upd_t* update; /*!< update vector for a clustered index record */ - dtuple_t* ref; /*!< row reference to the next row to handle */ + const dtuple_t* ref; /*!< row reference to the next row to handle */ dtuple_t* row; /*!< a copy (also fields copied to heap) of the row to handle */ row_ext_t* ext; /*!< NULL, or prefixes of the externally diff --git a/storage/innobase/include/row0upd.h b/storage/innobase/include/row0upd.h index b60770b01fb..e58e8dd01fb 100644 --- a/storage/innobase/include/row0upd.h +++ b/storage/innobase/include/row0upd.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2018, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2018, 2020, MariaDB Corporation. +Copyright (c) 2017, 2020, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -235,27 +235,19 @@ row_upd_build_difference_binary( TABLE* mysql_table, dberr_t* error) MY_ATTRIBUTE((nonnull(1,2,3,7,9), warn_unused_result)); -/***********************************************************//** -Replaces the new column values stored in the update vector to the index entry -given. */ +/** Apply an update vector to an index entry. +@param[in,out] entry index entry to be updated; the clustered index record + must be covered by a lock or a page latch to prevent + deletion (rollback or purge) +@param[in] index index of the entry +@param[in] update update vector built for the entry +@param[in,out] heap memory heap for copying off-page columns */ void row_upd_index_replace_new_col_vals_index_pos( -/*=========================================*/ - dtuple_t* entry, /*!< in/out: index entry where replaced; - the clustered index record must be - covered by a lock or a page latch to - prevent deletion (rollback or purge) */ - dict_index_t* index, /*!< in: index; NOTE that this may also be a - non-clustered index */ - const upd_t* update, /*!< in: an update vector built for the index so - that the field number in an upd_field is the - index position */ - ibool order_only, - /*!< in: if TRUE, limit the replacement to - ordering fields of index; note that this - does not work for non-clustered indexes. 
*/ - mem_heap_t* heap) /*!< in: memory heap for allocating and - copying the new values */ + dtuple_t* entry, + const dict_index_t* index, + const upd_t* update, + mem_heap_t* heap) MY_ATTRIBUTE((nonnull)); /** Replace the new column values stored in the update vector, during trx_undo_prev_version_build(). @@ -447,6 +439,7 @@ struct upd_t{ virtual column update now */ ulint n_fields; /*!< number of update fields */ upd_field_t* fields; /*!< array of update fields */ + byte vers_sys_value[8]; /*!< buffer for updating system fields */ /** Append an update field to the end of array @param[in] field an update field */ @@ -455,7 +448,32 @@ struct upd_t{ fields[n_fields++] = field; } - /** Determine if the given field_no is modified. + void remove_element(ulint i) + { + ut_ad(n_fields > 0); + ut_ad(i < n_fields); + while (i < n_fields - 1) + { + fields[i]= fields[i + 1]; + i++; + } + n_fields--; + } + + bool remove(const ulint field_no) + { + for (ulint i= 0; i < n_fields; ++i) + { + if (field_no == fields[i].field_no) + { + remove_element(i); + return true; + } + } + return false; + } + + /** Determine if the given field_no is modified. @return true if modified, false otherwise. */ bool is_modified(const ulint field_no) const { @@ -467,6 +485,22 @@ struct upd_t{ return(false); } + /** Determine if the update affects a system versioned column or row_end. */ + bool affects_versioned() const + { + for (ulint i = 0; i < n_fields; i++) { + dtype_t type = fields[i].new_val.type; + if (type.is_versioned()) { + return true; + } + // versioned DELETE is UPDATE SET row_end=NOW + if (type.vers_sys_end()) { + return true; + } + } + return false; + } + #ifdef UNIV_DEBUG bool validate() const { @@ -483,17 +517,24 @@ struct upd_t{ }; +/** Kinds of update operation */ +enum delete_mode_t { + NO_DELETE = 0, /*!< this operation does not delete */ + PLAIN_DELETE, /*!< ordinary delete */ + VERSIONED_DELETE /*!< update old and insert a new row */ +}; + /* Update node structure which also implements the delete operation of a row */ struct upd_node_t{ que_common_t common; /*!< node type: QUE_NODE_UPDATE */ - ibool is_delete;/* TRUE if delete, FALSE if update */ + delete_mode_t is_delete; /*!< kind of DELETE */ ibool searched_update; /* TRUE if searched update, FALSE if positioned */ - ibool in_mysql_interface; - /* TRUE if the update node was created + bool in_mysql_interface; + /* whether the update node was created for the MySQL interface */ dict_foreign_t* foreign;/* NULL or pointer to a foreign key constraint if this update node is used in @@ -538,6 +579,12 @@ struct upd_node_t{ dtuple_t* row; /*!< NULL, or a copy (also fields copied to heap) of the row to update; this must be reset to NULL after a successful update */ + dtuple_t* historical_row; /*!< historical row used in + CASCADE UPDATE/SET NULL; + allocated from historical_heap */ + mem_heap_t* historical_heap; /*!< heap for historical row insertion; + created when row to update is located; + freed right before row update */ row_ext_t* ext; /*!< NULL, or prefixes of the externally stored columns in the old row */ dtuple_t* upd_row;/* NULL, or a copy of the updated row */ @@ -552,6 +599,32 @@ struct upd_node_t{ /* column assignment list */ ulint magic_n; +private: + /** Appends row_start or row_end field to update vector and sets a + CURRENT_TIMESTAMP/trx->id value to it. + Supposed to be called only by make_versioned_update() and + make_versioned_delete(). 
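/* upd_t::remove_element() above deletes one update field while keeping the
   remaining fields in their original order by shifting the array tail one
   slot to the left.  The same idea in a self-contained form: */

#include <cstddef>

template <typename T>
void remove_at(T* arr, std::size_t& n, std::size_t i)
{
	/* shift everything after position i one slot left, then shrink */
	for (; i + 1 < n; ++i) {
		arr[i] = arr[i + 1];
	}
	--n;
}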
+ @param[in] trx transaction + @param[in] vers_sys_idx table->row_start or table->row_end */ + void vers_update_fields(const trx_t *trx, ulint idx); + +public: + /** Also set row_start = CURRENT_TIMESTAMP/trx->id + @param[in] trx transaction */ + void vers_make_update(const trx_t *trx) + { + vers_update_fields(trx, table->vers_start); + } + + /** Only set row_end = CURRENT_TIMESTAMP/trx->id. + Do not touch other fields at all. + @param[in] trx transaction */ + void vers_make_delete(const trx_t *trx) + { + update->n_fields = 0; + is_delete = VERSIONED_DELETE; + vers_update_fields(trx, table->vers_end); + } }; #define UPD_NODE_MAGIC_N 1579975 diff --git a/storage/innobase/include/row0upd.ic b/storage/innobase/include/row0upd.ic index 07e1c13d771..e1368a14e63 100644 --- a/storage/innobase/include/row0upd.ic +++ b/storage/innobase/include/row0upd.ic @@ -181,9 +181,8 @@ row_upd_rec_sys_fields( offset = row_get_trx_id_offset(index, offsets); } -#if DATA_TRX_ID + 1 != DATA_ROLL_PTR -# error "DATA_TRX_ID + 1 != DATA_ROLL_PTR" -#endif + compile_time_assert(DATA_TRX_ID + 1 == DATA_ROLL_PTR); + /* During IMPORT the trx id in the record can be in the future, if the .ibd file is being imported from another instance. During IMPORT roll_ptr will be 0. */ diff --git a/storage/innobase/include/row0vers.h b/storage/innobase/include/row0vers.h index 3b1f16ef15f..032801335f8 100644 --- a/storage/innobase/include/row0vers.h +++ b/storage/innobase/include/row0vers.h @@ -40,6 +40,7 @@ class ReadView; /** Determine if an active transaction has inserted or modified a secondary index record. +@param[in,out] caller_trx trx of current thread @param[in] rec secondary index record @param[in] index secondary index @param[in] offsets rec_get_offsets(rec, index) @@ -48,26 +49,11 @@ trx_mutex_enter(), and trx->release_reference() must be invoked @retval NULL if the record was committed */ trx_t* row_vers_impl_x_locked( + trx_t* caller_trx, const rec_t* rec, dict_index_t* index, const rec_offs* offsets); -/*****************************************************************//** -Finds out if we must preserve a delete marked earlier version of a clustered -index record, because it is >= the purge view. -@param[in] trx_id transaction id in the version -@param[in] name table name -@param[in,out] mtr mini transaction holding the latch on the - clustered index record; it will also hold - the latch on purge_view -@return TRUE if earlier version should be preserved */ -ibool -row_vers_must_preserve_del_marked( -/*==============================*/ - trx_id_t trx_id, - const table_name_t& name, - mtr_t* mtr); - /** Finds out if a version of the record, where the version >= the current purge view, should have ientry as its secondary index entry. We check if there is any not delete marked version of the record where the trx @@ -133,6 +119,7 @@ which should be seen by a semi-consistent read. */ void row_vers_build_for_semi_consistent_read( /*====================================*/ + trx_t* caller_trx,/*!<in/out: trx of current thread */ const rec_t* rec, /*!< in: record in a clustered index; the caller must have a latch on the page; this latch locks the top of the stack of versions diff --git a/storage/innobase/include/srv0conc.h b/storage/innobase/include/srv0conc.h index d6682e19539..d24107735ed 100644 --- a/storage/innobase/include/srv0conc.h +++ b/storage/innobase/include/srv0conc.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 2011, 2014, Oracle and/or its affiliates. 
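/* row0upd.ic above replaces the preprocessor #if/#error pair with
   compile_time_assert().  In standard C++11 the same check is spelled with
   static_assert; the constants below are illustrative, not InnoDB's real
   DATA_TRX_ID / DATA_ROLL_PTR definitions: */

constexpr int TRX_ID_POS   = 1;
constexpr int ROLL_PTR_POS = 2;

static_assert(TRX_ID_POS + 1 == ROLL_PTR_POS,
	      "the roll pointer column must immediately follow the trx id");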
All Rights Reserved. +Copyright (c) 2018, MariaDB Corporation. Portions of this file contain modifications contributed and copyrighted by Google, Inc. Those modifications are gratefully acknowledged and are described @@ -41,9 +42,7 @@ Created 2011/04/18 Sunny Bains #define srv_conc_h /** We are prepared for a situation that we have this many threads waiting for -a semaphore inside InnoDB. innobase_start_or_create_for_mysql() sets the -value. */ - +a semaphore inside InnoDB. srv_start() sets the value. */ extern ulint srv_max_n_threads; /** The following controls how many threads we let inside InnoDB concurrently: diff --git a/storage/innobase/include/srv0mon.h b/storage/innobase/include/srv0mon.h index ccc70206ede..eaf47789486 100644 --- a/storage/innobase/include/srv0mon.h +++ b/storage/innobase/include/srv0mon.h @@ -503,18 +503,18 @@ extern ulint monitor_set_tbl[(NUM_MONITOR + NUM_BITS_ULINT - 1) / /** Macros to turn on/off the control bit in monitor_set_tbl for a monitor counter option. */ -#define MONITOR_ON(monitor) \ - (monitor_set_tbl[monitor / NUM_BITS_ULINT] |= \ - ((ulint)1 << (monitor % NUM_BITS_ULINT))) +#define MONITOR_ON(monitor) \ + (monitor_set_tbl[unsigned(monitor) / NUM_BITS_ULINT] |= \ + (ulint(1) << (unsigned(monitor) % NUM_BITS_ULINT))) -#define MONITOR_OFF(monitor) \ - (monitor_set_tbl[monitor / NUM_BITS_ULINT] &= \ - ~((ulint)1 << (monitor % NUM_BITS_ULINT))) +#define MONITOR_OFF(monitor) \ + (monitor_set_tbl[unsigned(monitor) / NUM_BITS_ULINT] &= \ + ~(ulint(1) << (unsigned(monitor) % NUM_BITS_ULINT))) /** Check whether the requested monitor is turned on/off */ -#define MONITOR_IS_ON(monitor) \ - (monitor_set_tbl[monitor / NUM_BITS_ULINT] & \ - ((ulint)1 << (monitor % NUM_BITS_ULINT))) +#define MONITOR_IS_ON(monitor) \ + (monitor_set_tbl[unsigned(monitor) / NUM_BITS_ULINT] & \ + (ulint(1) << (unsigned(monitor) % NUM_BITS_ULINT))) /** The actual monitor counter array that records each monintor counter value */ @@ -608,8 +608,9 @@ Use MONITOR_INC if appropriate mutex protection exists. #define MONITOR_ATOMIC_INC_LOW(monitor, enabled) \ if (enabled) { \ ib_uint64_t value; \ - value = my_atomic_add64( \ - (int64*) &MONITOR_VALUE(monitor), 1) + 1; \ + value = my_atomic_add64_explicit( \ + (int64*) &MONITOR_VALUE(monitor), 1, \ + MY_MEMORY_ORDER_RELAXED) + 1; \ /* Note: This is not 100% accurate because of the \ inherent race, we ignore it due to performance. */ \ if (value > (ib_uint64_t) MONITOR_MAX_VALUE(monitor)) { \ @@ -624,8 +625,9 @@ Use MONITOR_DEC if appropriate mutex protection exists. #define MONITOR_ATOMIC_DEC_LOW(monitor, enabled) \ if (enabled) { \ ib_uint64_t value; \ - value = my_atomic_add64( \ - (int64*) &MONITOR_VALUE(monitor), -1) - 1; \ + value = my_atomic_add64_explicit( \ + (int64*) &MONITOR_VALUE(monitor), -1, \ + MY_MEMORY_ORDER_RELAXED) - 1; \ /* Note: This is not 100% accurate because of the \ inherent race, we ignore it due to performance. */ \ if (value < (ib_uint64_t) MONITOR_MIN_VALUE(monitor)) { \ @@ -652,14 +654,14 @@ Use MONITOR_DEC if appropriate mutex protection exists. 
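/* MONITOR_ON/MONITOR_OFF/MONITOR_IS_ON above implement a bitset spread over
   an array of ulint words; the revised macros cast the enum to unsigned and
   shift ulint(1) to keep the arithmetic well defined.  A standalone
   equivalent of the three operations (word_t merely stands in for ulint): */

#include <climits>
#include <cstdint>

using word_t = std::uintptr_t;
constexpr unsigned WORD_BITS = sizeof(word_t) * CHAR_BIT;

inline void bit_set(word_t* tbl, unsigned i)
{ tbl[i / WORD_BITS] |= word_t(1) << (i % WORD_BITS); }

inline void bit_clear(word_t* tbl, unsigned i)
{ tbl[i / WORD_BITS] &= ~(word_t(1) << (i % WORD_BITS)); }

inline bool bit_test(const word_t* tbl, unsigned i)
{ return (tbl[i / WORD_BITS] >> (i % WORD_BITS)) & 1; }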
} \ } -#ifdef HAVE_valgrind_or_MSAN +#ifdef HAVE_valgrind # define MONITOR_CHECK_DEFINED(value) do { \ mon_type_t m = value; \ MEM_CHECK_DEFINED(&m, sizeof m); \ } while (0) -#else /* HAVE_valgrind_or_MSAN */ +#else /* HAVE_valgrind */ # define MONITOR_CHECK_DEFINED(value) (void) 0 -#endif /* HAVE_valgrind_or_MSAN */ +#endif /* HAVE_valgrind */ #define MONITOR_INC_VALUE(monitor, value) \ MONITOR_CHECK_DEFINED(value); \ diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h index d196a4d6db6..6aa079676a0 100644 --- a/storage/innobase/include/srv0srv.h +++ b/storage/innobase/include/srv0srv.h @@ -77,7 +77,7 @@ struct srv_stats_t lsn_ctr_1_t os_log_written; /** Number of writes being done to the log files. - Protected by log_sys->write_mutex. */ + Protected by log_sys.write_mutex. */ ulint_ctr_1_t os_log_pending_writes; /** We increase this counter, when we don't have enough @@ -144,7 +144,7 @@ struct srv_stats_t ulint_ctr_1_t n_lock_wait_count; /** Number of threads currently waiting on database locks */ - simple_counter<ulint, true> n_lock_wait_current_count; + simple_atomic_counter<> n_lock_wait_current_count; /** Number of rows read. */ ulint_ctr_64_t n_rows_read; @@ -258,20 +258,11 @@ extern my_bool high_level_read_only; /** store to its own file each table created by an user; data dictionary tables are in the system tablespace 0 */ extern my_bool srv_file_per_table; -/** whether to use backup-safe TRUNCATE and crash-safe RENAME -instead of the MySQL 5.7 WL#6501 TRUNCATE TABLE implementation */ -extern my_bool srv_safe_truncate; /** Sleep delay for threads waiting to enter InnoDB. In micro-seconds. */ extern ulong srv_thread_sleep_delay; /** Maximum sleep delay (in micro-seconds), value of 0 disables it.*/ extern ulong srv_adaptive_max_sleep_delay; -/** The file format to use on new *.ibd files. */ -extern ulint srv_file_format; -/** Whether to check file format during startup. A value of -UNIV_FORMAT_MAX + 1 means no checking ie. FALSE. The default is to -set it to the highest format we support. */ -extern ulint srv_max_file_format_at_startup; /** Place locks to records only i.e. do not use next-key locking except on duplicate key checking and foreign key checking */ extern ibool srv_locks_unsafe_for_binlog; @@ -288,25 +279,12 @@ Currently we support native aio on windows and linux */ extern my_bool srv_use_native_aio; extern my_bool srv_numa_interleave; -/* Use trim operation */ -extern my_bool srv_use_trim; - /* Use atomic writes i.e disable doublewrite buffer */ extern my_bool srv_use_atomic_writes; /* Compression algorithm*/ extern ulong innodb_compression_algorithm; -/* Number of flush threads */ -#define MTFLUSH_MAX_WORKER 64 -#define MTFLUSH_DEFAULT_WORKER 8 - -/* Number of threads used for multi-threaded flush */ -extern long srv_mtflush_threads; - -/* If this flag is TRUE, then we will use multi threaded flush. */ -extern my_bool srv_use_mtflush; - /** TRUE if the server was successfully started */ extern bool srv_was_started; @@ -364,17 +342,15 @@ extern const ulint SRV_UNDO_TABLESPACE_SIZE_IN_PAGES; extern char* srv_log_group_home_dir; -/** Maximum number of srv_n_log_files, or innodb_log_files_in_group */ -#define SRV_N_LOG_FILES_MAX 100 extern ulong srv_n_log_files; /** The InnoDB redo log file size, or 0 when changing the redo log format at startup (while disallowing writes to the redo log). 
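/* srv_stats_t above switches n_lock_wait_current_count to an atomic counter.
   The struct below is a minimal stand-in, not MariaDB's simple_atomic_counter
   template: the value only needs atomicity, not ordering, so relaxed
   operations are enough. */

#include <atomic>
#include <cstddef>

struct atomic_counter {
	std::atomic<std::size_t> value{0};

	void inc() { value.fetch_add(1, std::memory_order_relaxed); }
	void dec() { value.fetch_sub(1, std::memory_order_relaxed); }
	std::size_t get() const { return value.load(std::memory_order_relaxed); }
};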
*/ extern ulonglong srv_log_file_size; -extern ulint srv_log_buffer_size; +extern ulong srv_log_buffer_size; extern ulong srv_flush_log_at_trx_commit; extern uint srv_flush_log_at_timeout; extern ulong srv_log_write_ahead_size; -extern char srv_adaptive_flushing; +extern my_bool srv_adaptive_flushing; extern my_bool srv_flush_sync; #ifdef WITH_INNODB_DISALLOW_WRITES @@ -405,8 +381,6 @@ extern ulong srv_n_page_hash_locks; /** Scan depth for LRU flush batch i.e.: number of blocks scanned*/ extern ulong srv_LRU_scan_depth; /** Whether or not to flush neighbors of a block */ -extern ulong srv_buf_pool_dump_pct; /*!< dump that may % of each buffer - pool during BP dump */ extern ulong srv_flush_neighbors; /** Previously requested size */ extern ulint srv_buf_pool_old_size; @@ -416,14 +390,18 @@ extern ulint srv_buf_pool_base_size; extern ulint srv_buf_pool_curr_size; /** Dump this % of each buffer pool during BP dump */ extern ulong srv_buf_pool_dump_pct; +#ifdef UNIV_DEBUG +/** Abort load after this amount of pages */ +extern ulong srv_buf_pool_load_pages_abort; +#endif /** Lock table size in bytes */ extern ulint srv_lock_table_size; extern ulint srv_n_file_io_threads; extern my_bool srv_random_read_ahead; extern ulong srv_read_ahead_threshold; -extern ulint srv_n_read_io_threads; -extern ulint srv_n_write_io_threads; +extern ulong srv_n_read_io_threads; +extern ulong srv_n_write_io_threads; /* Defragmentation, Origianlly facebook default value is 100, but it's too high */ #define SRV_DEFRAGMENT_FREQUENCY_DEFAULT 40 @@ -455,8 +433,6 @@ to treat NULL value when collecting statistics. It is not defined as enum type because the configure option takes unsigned integer type. */ extern ulong srv_innodb_stats_method; -extern char* srv_file_flush_method_str; - extern ulint srv_max_n_open_files; extern ulong srv_n_page_cleaners; @@ -491,7 +467,7 @@ extern my_bool srv_stats_include_delete_marked; extern unsigned long long srv_stats_modified_counter; extern my_bool srv_stats_sample_traditional; -extern ibool srv_use_doublewrite_buf; +extern my_bool srv_use_doublewrite_buf; extern ulong srv_doublewrite_batch_size; extern ulong srv_checksum_algorithm; @@ -575,6 +551,7 @@ extern my_bool srv_ibuf_disable_background_merge; #endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ #ifdef UNIV_DEBUG +extern my_bool innodb_evict_tables_on_commit_debug; extern my_bool srv_sync_debug; extern my_bool srv_purge_view_update_only_debug; @@ -652,16 +629,16 @@ extern mysql_pfs_key_t trx_rollback_clean_thread_key; schema */ # define pfs_register_thread(key) \ do { \ - struct PSI_thread* psi = PSI_THREAD_CALL(new_thread)(key, NULL, 0);\ + struct PSI_thread* psi = PSI_CALL_new_thread(key, NULL, 0);\ /* JAN: TODO: MYSQL 5.7 PSI \ - PSI_THREAD_CALL(set_thread_os_id)(psi); */ \ - PSI_THREAD_CALL(set_thread)(psi); \ + PSI_CALL_set_thread_os_id(psi); */ \ + PSI_CALL_set_thread(psi); \ } while (0) /* This macro delist the current thread from performance schema */ # define pfs_delete_thread() \ do { \ - PSI_THREAD_CALL(delete_current_thread)(); \ + PSI_CALL_delete_current_thread(); \ } while (0) # else # define pfs_register_thread(key) @@ -702,10 +679,9 @@ extern PSI_stage_info srv_stage_buffer_pool_load; #endif /* HAVE_PSI_STAGE_INTERFACE */ -/** Alternatives for the file flush option in Unix; see the InnoDB manual -about what these mean */ +/** Alternatives for innodb_flush_method */ enum srv_flush_t { - SRV_FSYNC = 1, /*!< fsync, the default */ + SRV_FSYNC = 0, /*!< fsync, the default */ SRV_O_DSYNC, /*!< open log files in O_SYNC mode */ 
SRV_LITTLESYNC, /*!< do not call os_file_flush() when writing data files, but do flush @@ -717,18 +693,21 @@ enum srv_flush_t { the reason for which is that some FS do not flush meta-data when unbuffered IO happens */ - SRV_O_DIRECT_NO_FSYNC, + SRV_O_DIRECT_NO_FSYNC /*!< do not use fsync() when using direct IO i.e.: it can be set to avoid the fsync() call that we make when using SRV_UNIX_O_DIRECT. However, in this case user/DBA should be sure about the integrity of the meta-data */ - SRV_ALL_O_DIRECT_FSYNC +#ifdef _WIN32 + ,SRV_ALL_O_DIRECT_FSYNC /*!< Traditional Windows appoach to open all files without caching, and do FileFlushBuffers()*/ +#endif }; -extern enum srv_flush_t srv_file_flush_method; +/** innodb_flush_method */ +extern ulong srv_file_flush_method; /** Alternatives for srv_force_recovery. Non-zero values are intended to help the user get a damaged database up so that he can dump intact @@ -965,16 +944,10 @@ srv_was_tablespace_truncated(const fil_space_t* space); #ifdef UNIV_DEBUG /** Disables master thread. It's used by: SET GLOBAL innodb_master_thread_disabled_debug = 1 (0). -@param[in] thd thread handle -@param[in] var pointer to system variable -@param[out] var_ptr where the formal string goes @param[in] save immediate result from check function */ void -srv_master_thread_disabled_debug_update( - THD* thd, - struct st_mysql_sys_var* var, - void* var_ptr, - const void* save); +srv_master_thread_disabled_debug_update(THD*, st_mysql_sys_var*, void*, + const void* save); #endif /* UNIV_DEBUG */ /** Status variables to be passed to MySQL */ @@ -990,6 +963,7 @@ struct export_var_t{ char innodb_buffer_pool_dump_status[OS_FILE_MAX_PATH + 128];/*!< Buf pool dump status */ char innodb_buffer_pool_load_status[OS_FILE_MAX_PATH + 128];/*!< Buf pool load status */ char innodb_buffer_pool_resize_status[512];/*!< Buf pool resize status */ + my_bool innodb_buffer_pool_load_incomplete;/*!< Buf pool load incomplete */ ulint innodb_buffer_pool_pages_total; /*!< Buffer pool size */ ulint innodb_buffer_pool_pages_data; /*!< Data pages */ ulint innodb_buffer_pool_bytes_data; /*!< File bytes used */ @@ -1018,7 +992,7 @@ struct export_var_t{ ulint innodb_os_log_fsyncs; /*!< fil_n_log_flushes */ ulint innodb_os_log_pending_writes; /*!< srv_os_log_pending_writes */ ulint innodb_os_log_pending_fsyncs; /*!< fil_n_pending_log_flushes */ - ulint innodb_page_size; /*!< UNIV_PAGE_SIZE */ + ulint innodb_page_size; /*!< srv_page_size */ ulint innodb_pages_created; /*!< buf_pool->stat.n_pages_created */ ulint innodb_pages_read; /*!< buf_pool->stat.n_pages_read*/ ulint innodb_page0_read; /*!< srv_stats.page0_read */ @@ -1055,6 +1029,9 @@ struct export_var_t{ ulint innodb_defragment_count; /*!< Number of defragment operations*/ + /** Number of instant ALTER TABLE operations that affect columns */ + ulong innodb_instant_alter_column; + ulint innodb_onlineddl_rowlog_rows; /*!< Online alter rows */ ulint innodb_onlineddl_rowlog_pct_used; /*!< Online alter percentage of used row log buffer */ diff --git a/storage/innobase/include/srv0start.h b/storage/innobase/include/srv0start.h index e559814b33d..c1be0016bb3 100644 --- a/storage/innobase/include/srv0start.h +++ b/storage/innobase/include/srv0start.h @@ -43,20 +43,16 @@ only one buffer pool instance is used. */ dberr_t srv_undo_tablespaces_init(bool create_new_db); -/****************************************************************//** -Starts Innobase and creates a new database if database files -are not found and the user wants. +/** Start InnoDB. 
+@param[in] create_new_db whether to create a new database @return DB_SUCCESS or error code */ -dberr_t -innobase_start_or_create_for_mysql(); +dberr_t srv_start(bool create_new_db); /** Shut down InnoDB. */ -void -innodb_shutdown(); +void innodb_shutdown(); /** Shut down background threads that can generate undo log. */ -void -srv_shutdown_bg_undo_sources(); +void srv_shutdown_bg_undo_sources(); /*************************************************************//** Copy the file path component of the physical file to parameter. It will diff --git a/storage/innobase/include/sync0arr.h b/storage/innobase/include/sync0arr.h index 7a8366b933b..e4186b74370 100644 --- a/storage/innobase/include/sync0arr.h +++ b/storage/innobase/include/sync0arr.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2015, 2017, MariaDB Corporation. +Copyright (c) 2015, 2018, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -98,16 +98,11 @@ void sync_array_print( FILE* file); /*!< in: file where to print */ -/**********************************************************************//** -Create the primary system wait array(s), they are protected by an OS mutex */ -void -sync_array_init( - ulint n_threads); /*!< in: Number of slots to create */ +/** Create the primary system wait arrays */ +void sync_array_init(); -/**********************************************************************//** -Close sync array wait sub-system. */ -void -sync_array_close(); +/** Destroy the sync array wait sub-system. */ +void sync_array_close(); /**********************************************************************//** Get an instance of the sync wait array. */ diff --git a/storage/innobase/include/sync0policy.h b/storage/innobase/include/sync0policy.h index c7c348bd489..4e48f1e2720 100644 --- a/storage/innobase/include/sync0policy.h +++ b/storage/innobase/include/sync0policy.h @@ -48,7 +48,7 @@ public: m_mutex(), m_filename(), m_line(), - m_thread_id(os_thread_id_t(ULINT_UNDEFINED)) + m_thread_id(ULINT_UNDEFINED) { /* No op */ } @@ -74,7 +74,8 @@ public: { m_mutex = mutex; - m_thread_id = os_thread_get_curr_id(); + my_atomic_storelint(&m_thread_id, + ulint(os_thread_get_curr_id())); m_filename = filename; @@ -87,7 +88,7 @@ public: { m_mutex = NULL; - m_thread_id = os_thread_id_t(ULINT_UNDEFINED); + my_atomic_storelint(&m_thread_id, ULINT_UNDEFINED); m_filename = NULL; @@ -103,7 +104,7 @@ public: msg << m_mutex->policy().to_string(); - if (os_thread_pf(m_thread_id) != ULINT_UNDEFINED) { + if (m_thread_id != ULINT_UNDEFINED) { msg << " addr: " << m_mutex << " acquired: " << locked_from().c_str(); @@ -136,7 +137,7 @@ public: unsigned m_line; /** Thread ID of the thread that own(ed) the mutex */ - os_thread_id_t m_thread_id; + ulint m_thread_id; }; /** Constructor. */ @@ -155,7 +156,7 @@ public: /** Mutex is being destroyed. */ void destroy() UNIV_NOTHROW { - ut_ad(m_context.m_thread_id == os_thread_id_t(ULINT_UNDEFINED)); + ut_ad((ulint)my_atomic_loadlint(&m_context.m_thread_id) == ULINT_UNDEFINED); m_magic_n = 0; @@ -165,8 +166,7 @@ public: /** Called when the mutex is "created". Note: Not from the constructor but when the mutex is initialised. 
@param[in] id Mutex ID */ - void init(latch_id_t id) - UNIV_NOTHROW; + void init(latch_id_t id) UNIV_NOTHROW; /** Called when an attempt is made to lock the mutex @param[in] mutex Mutex instance to be locked @@ -197,7 +197,7 @@ public: bool is_owned() const UNIV_NOTHROW { return(os_thread_eq( - m_context.m_thread_id, + (os_thread_id_t)my_atomic_loadlint(&m_context.m_thread_id), os_thread_get_curr_id())); } @@ -219,7 +219,7 @@ public: os_thread_id_t get_thread_id() const UNIV_NOTHROW { - return(m_context.m_thread_id); + return((os_thread_id_t)my_atomic_loadlint(&m_context.m_thread_id)); } /** Magic number to check for memory corruption. */ @@ -239,7 +239,7 @@ struct NoPolicy { void init(const Mutex&, latch_id_t, const char*, uint32_t) UNIV_NOTHROW { } void destroy() UNIV_NOTHROW { } - void enter(const Mutex&, const char*, unsigned line) UNIV_NOTHROW { } + void enter(const Mutex&, const char*, unsigned) UNIV_NOTHROW { } void add(uint32_t, uint32_t) UNIV_NOTHROW { } void locked(const Mutex&, const char*, ulint) UNIV_NOTHROW { } void release(const Mutex&) UNIV_NOTHROW { } @@ -273,12 +273,11 @@ public: /** Called when the mutex is "created". Note: Not from the constructor but when the mutex is initialised. - @param[in] mutex Mutex instance to track @param[in] id Mutex ID @param[in] filename File where mutex was created @param[in] line Line in filename */ void init( - const MutexType& mutex, + const Mutex&, latch_id_t id, const char* filename, uint32_t line) @@ -421,15 +420,8 @@ public: /** Called when the mutex is "created". Note: Not from the constructor but when the mutex is initialised. - @param[in] mutex Mutex instance to track - @param[in] id Mutex ID - @param[in] filename File where mutex was created - @param[in] line Line in filename */ - void init( - const MutexType& mutex, - latch_id_t id, - const char* filename, - uint32_t line) + @param[in] id Mutex ID */ + void init(const Mutex&, latch_id_t id, const char*, uint32) UNIV_NOTHROW { /* It can be LATCH_ID_BUF_BLOCK_MUTEX or diff --git a/storage/innobase/include/sync0policy.ic b/storage/innobase/include/sync0policy.ic index b86dee0a3b8..e7aeb2e16bb 100644 --- a/storage/innobase/include/sync0policy.ic +++ b/storage/innobase/include/sync0policy.ic @@ -80,7 +80,7 @@ void MutexDebug<Mutex>::locked( UNIV_NOTHROW { ut_ad(!is_owned()); - ut_ad(m_context.m_thread_id == os_thread_id_t(ULINT_UNDEFINED)); + ut_ad(m_context.m_thread_id == ULINT_UNDEFINED); m_context.locked(mutex, name, line); @@ -88,7 +88,7 @@ void MutexDebug<Mutex>::locked( } template <typename Mutex> -void MutexDebug<Mutex>::release(const Mutex* mutex) +void MutexDebug<Mutex>::release(const Mutex*) UNIV_NOTHROW { ut_ad(is_owned()); diff --git a/storage/innobase/include/sync0rw.h b/storage/innobase/include/sync0rw.h index 855d4439280..84e8800a447 100644 --- a/storage/innobase/include/sync0rw.h +++ b/storage/innobase/include/sync0rw.h @@ -499,13 +499,13 @@ bool rw_lock_lock_word_decr( /*===================*/ rw_lock_t* lock, /*!< in/out: rw-lock */ - ulint amount, /*!< in: amount to decrement */ - lint threshold); /*!< in: threshold of judgement */ + int32_t amount, /*!< in: amount to decrement */ + int32_t threshold); /*!< in: threshold of judgement */ #ifdef UNIV_DEBUG /******************************************************************//** Checks if the thread has locked the rw-lock in the specified mode, with the pass value == 0. 
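/* MutexDebug above now reads and writes the owner thread id through atomic
   loads and stores, presumably so that a concurrent is_owned() check or a
   diagnostic print never observes a torn value.  A rough standard-C++ sketch
   of the same bookkeeping (not the real policy class): */

#include <atomic>
#include <thread>

struct owner_tracker {
	std::atomic<std::thread::id> owner{std::thread::id()};

	void locked()
	{ owner.store(std::this_thread::get_id(), std::memory_order_relaxed); }

	void released()
	{ owner.store(std::thread::id(), std::memory_order_relaxed); }

	bool is_owned() const
	{ return owner.load(std::memory_order_relaxed) == std::this_thread::get_id(); }
};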
*/ -ibool +bool rw_lock_own( /*========*/ const rw_lock_t*lock, /*!< in: rw-lock */ @@ -569,10 +569,10 @@ struct rw_lock_t #endif /* UNIV_DEBUG */ { /** Holds the state of the lock. */ - volatile lint lock_word; + int32_t lock_word; /** 1: there are waiters */ - volatile uint32_t waiters; + int32_t waiters; /** number of granted SX locks. */ volatile ulint sx_recursive; @@ -595,9 +595,6 @@ struct rw_lock_t /** File name where lock created */ const char* cfile_name; - /** last s-lock file/line is not guaranteed to be correct */ - const char* last_s_file_name; - /** File name where last x-locked */ const char* last_x_file_name; @@ -607,9 +604,6 @@ struct rw_lock_t /** If 1 then the rw-lock is a block lock */ unsigned is_block_lock:1; - /** Line number where last time s-locked */ - unsigned last_s_line:14; - /** Line number where last time x-locked */ unsigned last_x_line:14; diff --git a/storage/innobase/include/sync0rw.ic b/storage/innobase/include/sync0rw.ic index d0be5f0ece1..15f8ff3fe62 100644 --- a/storage/innobase/include/sync0rw.ic +++ b/storage/innobase/include/sync0rw.ic @@ -2,7 +2,7 @@ Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, Google Inc. -Copyright (c) 2017, MariaDB Corporation. All Rights Reserved. +Copyright (c) 2017, 2018, MariaDB Corporation. Portions of this file contain modifications contributed and copyrighted by Google, Inc. Those modifications are gratefully acknowledged and are described @@ -77,7 +77,8 @@ rw_lock_get_writer( /*===============*/ const rw_lock_t* lock) /*!< in: rw-lock */ { - lint lock_word = lock->lock_word; + int32_t lock_word = my_atomic_load32_explicit(const_cast<int32_t*>(&lock->lock_word), + MY_MEMORY_ORDER_RELAXED); ut_ad(lock_word <= X_LOCK_DECR); if (lock_word > X_LOCK_HALF_DECR) { @@ -109,15 +110,16 @@ rw_lock_get_reader_count( /*=====================*/ const rw_lock_t* lock) /*!< in: rw-lock */ { - lint lock_word = lock->lock_word; + int32_t lock_word = my_atomic_load32_explicit(const_cast<int32_t*>(&lock->lock_word), + MY_MEMORY_ORDER_RELAXED); ut_ad(lock_word <= X_LOCK_DECR); if (lock_word > X_LOCK_HALF_DECR) { /* s-locked, no x-waiter */ - return(X_LOCK_DECR - lock_word); + return ulint(X_LOCK_DECR - lock_word); } else if (lock_word > 0) { /* s-locked, with sx-locks only */ - return(X_LOCK_HALF_DECR - lock_word); + return ulint(X_LOCK_HALF_DECR - lock_word); } else if (lock_word == 0) { /* x-locked */ return(0); @@ -145,7 +147,8 @@ rw_lock_get_x_lock_count( /*=====================*/ const rw_lock_t* lock) /*!< in: rw-lock */ { - lint lock_copy = lock->lock_word; + int32_t lock_copy = my_atomic_load32_explicit(const_cast<int32_t*>(&lock->lock_word), + MY_MEMORY_ORDER_RELAXED); ut_ad(lock_copy <= X_LOCK_DECR); if (lock_copy == 0 || lock_copy == -X_LOCK_HALF_DECR) { @@ -158,12 +161,12 @@ rw_lock_get_x_lock_count( /* no s-lock, no sx-lock, 2 or more x-locks. First 2 x-locks are set with -X_LOCK_DECR, all other recursive x-locks are set with -1 */ - return(2 - (lock_copy + X_LOCK_DECR)); + return ulint(2 - X_LOCK_DECR - lock_copy); } else { /* no s-lock, 1 or more sx-lock, 2 or more x-locks. 
First 2 x-locks are set with -(X_LOCK_DECR + X_LOCK_HALF_DECR), all other recursive x-locks are set with -1 */ - return(2 - (lock_copy + X_LOCK_DECR + X_LOCK_HALF_DECR)); + return ulint(2 - X_LOCK_DECR - X_LOCK_HALF_DECR - lock_copy); } } @@ -178,7 +181,8 @@ rw_lock_get_sx_lock_count( const rw_lock_t* lock) /*!< in: rw-lock */ { #ifdef UNIV_DEBUG - lint lock_copy = lock->lock_word; + int32_t lock_copy = my_atomic_load32_explicit(const_cast<int32_t*>(&lock->lock_word), + MY_MEMORY_ORDER_RELAXED); ut_ad(lock_copy <= X_LOCK_DECR); @@ -197,9 +201,7 @@ rw_lock_get_sx_lock_count( } /******************************************************************//** -Two different implementations for decrementing the lock_word of a rw_lock: -one for systems supporting atomic operations, one for others. This does -does not support recusive x-locks: they should be handled by the caller and +Recursive x-locks are not supported: they should be handled by the caller and need not be atomic since they are performed by the current lock holder. Returns true if the decrement was made, false if not. @return true if decr occurs */ @@ -208,16 +210,17 @@ bool rw_lock_lock_word_decr( /*===================*/ rw_lock_t* lock, /*!< in/out: rw-lock */ - ulint amount, /*!< in: amount to decrement */ - lint threshold) /*!< in: threshold of judgement */ + int32_t amount, /*!< in: amount to decrement */ + int32_t threshold) /*!< in: threshold of judgement */ { - lint local_lock_word; - - local_lock_word = lock->lock_word; - while (local_lock_word > threshold) { - if (my_atomic_caslint(&lock->lock_word, - &local_lock_word, - local_lock_word - amount)) { + int32_t lock_copy = my_atomic_load32_explicit(&lock->lock_word, + MY_MEMORY_ORDER_RELAXED); + while (lock_copy > threshold) { + if (my_atomic_cas32_strong_explicit(&lock->lock_word, + &lock_copy, + lock_copy - amount, + MY_MEMORY_ORDER_ACQUIRE, + MY_MEMORY_ORDER_RELAXED)) { return(true); } } @@ -246,11 +249,6 @@ rw_lock_s_lock_low( ut_d(rw_lock_add_debug_info(lock, pass, RW_LOCK_S, file_name, line)); - /* These debugging values are not set safely: they may be incorrect - or even refer to a line that is invalid for the file name. */ - lock->last_s_file_name = file_name; - lock->last_s_line = line; - return(TRUE); /* locking succeeded */ } @@ -304,29 +302,32 @@ rw_lock_x_lock_func_nowait( const char* file_name,/*!< in: file name where lock requested */ unsigned line) /*!< in: line where requested */ { - lint oldval = X_LOCK_DECR; + int32_t oldval = X_LOCK_DECR; - if (my_atomic_caslint(&lock->lock_word, &oldval, 0)) { + if (my_atomic_cas32_strong_explicit(&lock->lock_word, &oldval, 0, + MY_MEMORY_ORDER_ACQUIRE, + MY_MEMORY_ORDER_RELAXED)) { lock->writer_thread = os_thread_get_curr_id(); } else if (os_thread_eq(lock->writer_thread, os_thread_get_curr_id())) { - /* Relock: this lock_word modification is safe since no other - threads can modify (lock, unlock, or reserve) lock_word while - there is an exclusive writer and this is the writer thread. */ - if (lock->lock_word == 0 || lock->lock_word == -X_LOCK_HALF_DECR) { + /* Relock: even though no other thread can modify (lock, unlock + or reserve) lock_word while there is an exclusive writer and + this is the writer thread, we still want concurrent threads to + observe consistent values. 
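/* rw_lock_lock_word_decr() above is a classic compare-and-swap retry loop:
   succeed with acquire ordering, reload and retry on failure, and give up as
   soon as the word drops to the threshold.  The same loop with std::atomic,
   as a sketch rather than the InnoDB implementation: */

#include <atomic>
#include <cstdint>

inline bool lock_word_decr_sketch(std::atomic<int32_t>& word,
				  int32_t amount, int32_t threshold)
{
	int32_t cur = word.load(std::memory_order_relaxed);
	while (cur > threshold) {
		if (word.compare_exchange_strong(cur, cur - amount,
						 std::memory_order_acquire,
						 std::memory_order_relaxed)) {
			return true;	/* decrement applied */
		}
		/* cur was refreshed by the failed CAS; re-check threshold */
	}
	return false;
}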
*/ + if (oldval == 0 || oldval == -X_LOCK_HALF_DECR) { /* There are 1 x-locks */ - lock->lock_word -= X_LOCK_DECR; - } else if (lock->lock_word <= -X_LOCK_DECR) { + my_atomic_add32_explicit(&lock->lock_word, -X_LOCK_DECR, + MY_MEMORY_ORDER_RELAXED); + } else if (oldval <= -X_LOCK_DECR) { /* There are 2 or more x-locks */ - lock->lock_word--; + my_atomic_add32_explicit(&lock->lock_word, -1, + MY_MEMORY_ORDER_RELAXED); + /* Watch for too many recursive locks */ + ut_ad(oldval < 1); } else { /* Failure */ return(FALSE); } - - /* Watch for too many recursive locks */ - ut_ad(lock->lock_word < 0); - } else { /* Failure */ return(FALSE); @@ -354,14 +355,19 @@ rw_lock_s_unlock_func( #endif /* UNIV_DEBUG */ rw_lock_t* lock) /*!< in/out: rw-lock */ { - ut_ad(lock->lock_word > -X_LOCK_DECR); - ut_ad(lock->lock_word != 0); - ut_ad(lock->lock_word < X_LOCK_DECR); +#ifdef UNIV_DEBUG + int32_t dbg_lock_word = my_atomic_load32_explicit(&lock->lock_word, + MY_MEMORY_ORDER_RELAXED); + ut_ad(dbg_lock_word > -X_LOCK_DECR); + ut_ad(dbg_lock_word != 0); + ut_ad(dbg_lock_word < X_LOCK_DECR); +#endif ut_d(rw_lock_remove_debug_info(lock, pass, RW_LOCK_S)); /* Increment lock_word to indicate 1 less reader */ - lint lock_word = my_atomic_addlint(&lock->lock_word, 1) + 1; + int32_t lock_word = my_atomic_add32_explicit(&lock->lock_word, 1, + MY_MEMORY_ORDER_RELEASE) + 1; if (lock_word == 0 || lock_word == -X_LOCK_HALF_DECR) { /* wait_ex waiter exists. It may not be asleep, but we signal @@ -387,41 +393,49 @@ rw_lock_x_unlock_func( #endif /* UNIV_DEBUG */ rw_lock_t* lock) /*!< in/out: rw-lock */ { - ut_ad(lock->lock_word == 0 || lock->lock_word == -X_LOCK_HALF_DECR - || lock->lock_word <= -X_LOCK_DECR); + int32_t lock_word = my_atomic_load32_explicit(&lock->lock_word, + MY_MEMORY_ORDER_RELAXED); - if (lock->lock_word == 0) { + ut_ad(lock_word == 0 || lock_word == -X_LOCK_HALF_DECR + || lock_word <= -X_LOCK_DECR); + + if (lock_word == 0) { /* Last caller in a possible recursive chain. */ lock->writer_thread = 0; } ut_d(rw_lock_remove_debug_info(lock, pass, RW_LOCK_X)); - if (lock->lock_word == 0 || lock->lock_word == -X_LOCK_HALF_DECR) { - /* There is 1 x-lock */ - /* atomic increment is needed, because it is last */ - if (my_atomic_addlint(&lock->lock_word, X_LOCK_DECR) <= -X_LOCK_DECR) { - ut_error; - } + if (lock_word == 0 || lock_word == -X_LOCK_HALF_DECR) { + /* Last X-lock owned by this thread, it may still hold SX-locks. + ACQ_REL due to... + RELEASE: we release rw-lock + ACQUIRE: we want waiters to be loaded after lock_word is stored */ + my_atomic_add32_explicit(&lock->lock_word, X_LOCK_DECR, + MY_MEMORY_ORDER_ACQ_REL); /* This no longer has an X-lock but it may still have an SX-lock. So it is now free for S-locks by other threads. We need to signal read/write waiters. We do not need to signal wait_ex waiters, since they cannot exist when there is a writer. 
*/ - if (lock->waiters) { - my_atomic_store32((int32*) &lock->waiters, 0); + if (my_atomic_load32_explicit(&lock->waiters, + MY_MEMORY_ORDER_RELAXED)) { + my_atomic_store32_explicit(&lock->waiters, 0, + MY_MEMORY_ORDER_RELAXED); os_event_set(lock->event); sync_array_object_signalled(); } - } else if (lock->lock_word == -X_LOCK_DECR - || lock->lock_word == -(X_LOCK_DECR + X_LOCK_HALF_DECR)) { + } else if (lock_word == -X_LOCK_DECR + || lock_word == -(X_LOCK_DECR + X_LOCK_HALF_DECR)) { /* There are 2 x-locks */ - lock->lock_word += X_LOCK_DECR; + my_atomic_add32_explicit(&lock->lock_word, X_LOCK_DECR, + MY_MEMORY_ORDER_RELAXED); } else { /* There are more than 2 x-locks. */ - ut_ad(lock->lock_word < -X_LOCK_DECR); - lock->lock_word += 1; + ut_ad(lock_word < -X_LOCK_DECR); + my_atomic_add32_explicit(&lock->lock_word, 1, + MY_MEMORY_ORDER_RELAXED); } ut_ad(rw_lock_validate(lock)); @@ -447,28 +461,37 @@ rw_lock_sx_unlock_func( ut_d(rw_lock_remove_debug_info(lock, pass, RW_LOCK_SX)); if (lock->sx_recursive == 0) { + int32_t lock_word = my_atomic_load32_explicit(&lock->lock_word, + MY_MEMORY_ORDER_RELAXED); /* Last caller in a possible recursive chain. */ - if (lock->lock_word > 0) { + if (lock_word > 0) { lock->writer_thread = 0; + ut_ad(lock_word <= INT_MAX32 - X_LOCK_HALF_DECR); + + /* Last SX-lock owned by this thread, doesn't own X-lock. + ACQ_REL due to... + RELEASE: we release rw-lock + ACQUIRE: we want waiters to be loaded after lock_word is stored */ + my_atomic_add32_explicit(&lock->lock_word, X_LOCK_HALF_DECR, + MY_MEMORY_ORDER_ACQ_REL); - if (my_atomic_addlint(&lock->lock_word, X_LOCK_HALF_DECR) <= 0) { - ut_error; - } /* Lock is now free. May have to signal read/write waiters. We do not need to signal wait_ex waiters, since they cannot exist when there is an sx-lock holder. 
*/ - if (lock->waiters) { - my_atomic_store32((int32*) &lock->waiters, 0); + if (my_atomic_load32_explicit(&lock->waiters, + MY_MEMORY_ORDER_RELAXED)) { + my_atomic_store32_explicit(&lock->waiters, 0, + MY_MEMORY_ORDER_RELAXED); os_event_set(lock->event); sync_array_object_signalled(); } } else { /* still has x-lock */ - ut_ad(lock->lock_word == -X_LOCK_HALF_DECR - || lock->lock_word <= -(X_LOCK_DECR - + X_LOCK_HALF_DECR)); - lock->lock_word += X_LOCK_HALF_DECR; + ut_ad(lock_word == -X_LOCK_HALF_DECR || + lock_word <= -(X_LOCK_DECR + X_LOCK_HALF_DECR)); + my_atomic_add32_explicit(&lock->lock_word, X_LOCK_HALF_DECR, + MY_MEMORY_ORDER_RELAXED); } } diff --git a/storage/innobase/include/sync0sync.h b/storage/innobase/include/sync0sync.h index 037bf8047cd..a54e0f6a40e 100644 --- a/storage/innobase/include/sync0sync.h +++ b/storage/innobase/include/sync0sync.h @@ -57,7 +57,6 @@ extern mysql_pfs_key_t buf_pool_zip_mutex_key; extern mysql_pfs_key_t cache_last_read_mutex_key; extern mysql_pfs_key_t dict_foreign_err_mutex_key; extern mysql_pfs_key_t dict_sys_mutex_key; -extern mysql_pfs_key_t file_format_max_mutex_key; extern mysql_pfs_key_t fil_system_mutex_key; extern mysql_pfs_key_t flush_list_mutex_key; extern mysql_pfs_key_t fts_delete_mutex_key; @@ -92,7 +91,6 @@ extern mysql_pfs_key_t srv_innodb_monitor_mutex_key; extern mysql_pfs_key_t srv_misc_tmpfile_mutex_key; extern mysql_pfs_key_t srv_monitor_file_mutex_key; extern mysql_pfs_key_t buf_dblwr_mutex_key; -extern mysql_pfs_key_t trx_undo_mutex_key; extern mysql_pfs_key_t trx_mutex_key; extern mysql_pfs_key_t trx_pool_mutex_key; extern mysql_pfs_key_t trx_pool_manager_mutex_key; @@ -107,6 +105,7 @@ extern mysql_pfs_key_t sync_array_mutex_key; extern mysql_pfs_key_t thread_mutex_key; extern mysql_pfs_key_t zip_pad_mutex_key; extern mysql_pfs_key_t row_drop_list_mutex_key; +extern mysql_pfs_key_t rw_trx_hash_element_mutex_key; #endif /* UNIV_PFS_MUTEX */ #ifdef UNIV_PFS_RWLOCK diff --git a/storage/innobase/include/sync0types.h b/storage/innobase/include/sync0types.h index 563176f4abe..9a4b34e0c83 100644 --- a/storage/innobase/include/sync0types.h +++ b/storage/innobase/include/sync0types.h @@ -106,16 +106,6 @@ V Transaction system header | V -Transaction undo mutex The undo log entry must be written -| before any index page is modified. -| Transaction undo mutex is for the undo -| logs the analogue of the tree latch -| for a B-tree. If a thread has the -| trx undo mutex reserved, it is allowed -| to latch the undo log pages in any -| order, and also after it has acquired -| the fsp latch. -V Rollback segment mutex The rollback segment mutex must be | reserved, if, e.g., a new page must | be added to an undo log. 
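The latch_level_t changes above (SYNC_TRX_UNDO and SYNC_FILE_FORMAT_TAG removed, SYNC_RW_TRX_HASH_ELEMENT added) only make sense together with the rule the enum encodes: it defines a global latching order that the debug build uses to catch potential deadlocks. The sketch below shows only the basic idea of such a checker, namely that a latch may be acquired only while every latch already held has a higher level; the real checks in sync0debug.cc have several documented exceptions, and all names here are illustrative.

#include <cassert>
#include <vector>

enum toy_latch_level { TOY_SYNC_TREE_NODE = 1, TOY_SYNC_INDEX_TREE, TOY_SYNC_DICT };

thread_local std::vector<toy_latch_level> toy_held_levels;

void toy_latch_enter(toy_latch_level level)
{
    for (toy_latch_level held : toy_held_levels) {
        assert(held > level && "latching order violation");
    }
    toy_held_levels.push_back(level);
}

void toy_latch_exit() { toy_held_levels.pop_back(); }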
The rollback @@ -158,7 +148,7 @@ V lock_sys_mutex Mutex protecting lock_sys_t | V -trx_sys->mutex Mutex protecting trx_sys_t +trx_sys.mutex Mutex protecting trx_sys_t | V Threads mutex Background thread scheduling mutex @@ -229,6 +219,7 @@ enum latch_level_t { SYNC_TRX_SYS_HEADER, SYNC_THREADS, SYNC_TRX, + SYNC_RW_TRX_HASH_ELEMENT, SYNC_TRX_SYS, SYNC_LOCK_SYS, SYNC_LOCK_WAIT_SYS, @@ -251,7 +242,6 @@ enum latch_level_t { SYNC_RSEG_HEADER_NEW, SYNC_NOREDO_RSEG, SYNC_REDO_RSEG, - SYNC_TRX_UNDO, SYNC_PURGE_LATCH, SYNC_TREE_NODE, SYNC_TREE_NODE_FROM_HASH, @@ -266,8 +256,6 @@ enum latch_level_t { SYNC_DICT, SYNC_FTS_CACHE, - SYNC_FILE_FORMAT_TAG, - SYNC_DICT_OPERATION, SYNC_TRX_I_S_LAST_READ, @@ -332,7 +320,6 @@ enum latch_id_t { LATCH_ID_SRV_MISC_TMPFILE, LATCH_ID_SRV_MONITOR_FILE, LATCH_ID_BUF_DBLWR, - LATCH_ID_TRX_UNDO, LATCH_ID_TRX_POOL, LATCH_ID_TRX_POOL_MANAGER, LATCH_ID_TRX, @@ -373,11 +360,10 @@ enum latch_id_t { LATCH_ID_SCRUB_STAT_MUTEX, LATCH_ID_DEFRAGMENT_MUTEX, LATCH_ID_BTR_DEFRAGMENT_MUTEX, - LATCH_ID_MTFLUSH_THREAD_MUTEX, - LATCH_ID_MTFLUSH_MUTEX, LATCH_ID_FIL_CRYPT_STAT_MUTEX, LATCH_ID_FIL_CRYPT_DATA_MUTEX, LATCH_ID_FIL_CRYPT_THREADS_MUTEX, + LATCH_ID_RW_TRX_HASH_ELEMENT, LATCH_ID_TEST_MUTEX, LATCH_ID_MAX = LATCH_ID_TEST_MUTEX }; @@ -485,10 +471,10 @@ struct OSMutex { } private: -#ifdef UNIV_DEBUG +#ifdef DBUG_ASSERT_EXISTS /** true if the mutex has been freed/destroyed. */ bool m_freed; -#endif /* UNIV_DEBUG */ +#endif /* DBUG_ASSERT_EXISTS */ sys_mutex_t m_mutex; }; @@ -987,8 +973,7 @@ struct latch_t { UNIV_NOTHROW : m_id(id), - m_rw_lock(), - m_temp_fsp() { } + m_rw_lock() {} /** Destructor */ virtual ~latch_t() UNIV_NOTHROW { } @@ -1022,24 +1007,6 @@ struct latch_t { return(sync_latch_get_level(m_id)); } - /** @return true if the latch is for a temporary file space*/ - bool is_temp_fsp() const - UNIV_NOTHROW - { - return(m_temp_fsp); - } - - /** Set the temporary tablespace flag. (For internal temporary - tables, MySQL 5.7 does not always acquire the index->lock. We - need to figure out the context and add some special rules - during the checks.) */ - void set_temp_fsp() - UNIV_NOTHROW - { - ut_ad(get_id() == LATCH_ID_FIL_SPACE); - m_temp_fsp = true; - } - /** @return the latch name, m_id must be set */ const char* get_name() const UNIV_NOTHROW @@ -1055,9 +1022,6 @@ struct latch_t { /** true if it is a rw-lock. In debug mode, rw_lock_t derives from this class and sets this variable. */ bool m_rw_lock; - - /** true if it is an temporary space latch */ - bool m_temp_fsp; }; /** Subclass this to iterate over a thread's acquired latch levels. 
*/ @@ -1149,92 +1113,88 @@ enum rw_lock_flag_t { #endif /* UNIV_INNOCHECKSUM */ -#ifdef _WIN64 static inline ulint my_atomic_addlint(ulint *A, ulint B) { +#ifdef _WIN64 return ulint(my_atomic_add64((volatile int64*)A, B)); +#else + return ulint(my_atomic_addlong(A, B)); +#endif } static inline ulint my_atomic_loadlint(const ulint *A) { +#ifdef _WIN64 return ulint(my_atomic_load64((volatile int64*)A)); +#else + return ulint(my_atomic_loadlong(A)); +#endif } static inline lint my_atomic_addlint(volatile lint *A, lint B) { +#ifdef _WIN64 return my_atomic_add64((volatile int64*)A, B); +#else + return my_atomic_addlong(A, B); +#endif } static inline lint my_atomic_loadlint(const lint *A) { +#ifdef _WIN64 return lint(my_atomic_load64((volatile int64*)A)); +#else + return my_atomic_loadlong(A); +#endif } static inline void my_atomic_storelint(ulint *A, ulint B) { +#ifdef _WIN64 my_atomic_store64((volatile int64*)A, B); +#else + my_atomic_storelong(A, B); +#endif } -static inline lint my_atomic_caslint(volatile lint *A, lint *B, lint C) +/** Simple non-atomic counter aligned to CACHE_LINE_SIZE +@tparam Type the integer type of the counter */ +template <typename Type> +struct MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE) simple_counter { - return my_atomic_cas64((volatile int64*)A, (int64 *)B, C); -} + /** Increment the counter */ + Type inc() { return add(1); } + /** Decrement the counter */ + Type dec() { return add(Type(~0)); } -static inline ulint my_atomic_caslint(ulint *A, ulint *B, ulint C) -{ - return my_atomic_cas64((volatile int64*)A, (int64 *)B, (int64)C); -} + /** Add to the counter + @param[in] i amount to be added + @return the value of the counter after adding */ + Type add(Type i) { return m_counter += i; } -#else -#define my_atomic_addlint my_atomic_addlong -#define my_atomic_loadlint my_atomic_loadlong -#define my_atomic_caslint my_atomic_caslong -#endif + /** @return the value of the counter */ + operator Type() const { return m_counter; } -/** Simple counter aligned to CACHE_LINE_SIZE -@tparam Type the integer type of the counter -@tparam atomic whether to use atomic memory access */ -template <typename Type = ulint, bool atomic = false> -struct MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE) simple_counter +private: + /** The counter */ + Type m_counter; +}; + +/** Simple atomic counter aligned to CACHE_LINE_SIZE +@tparam Type lint or ulint */ +template <typename Type = ulint> +struct MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE) simple_atomic_counter { /** Increment the counter */ Type inc() { return add(1); } /** Decrement the counter */ - Type dec() { return sub(1); } + Type dec() { return add(Type(~0)); } /** Add to the counter @param[in] i amount to be added - @return the value of the counter after adding */ - Type add(Type i) - { - compile_time_assert(!atomic || sizeof(Type) == sizeof(lint)); - if (atomic) { -#ifdef _MSC_VER -// Suppress type conversion/ possible loss of data warning -#pragma warning (push) -#pragma warning (disable : 4244) -#endif - return Type(my_atomic_addlint(reinterpret_cast<ulint*> - (&m_counter), i)); -#ifdef _MSC_VER -#pragma warning (pop) -#endif - } else { - return m_counter += i; - } - } - /** Subtract from the counter - @param[in] i amount to be subtracted - @return the value of the counter after adding */ - Type sub(Type i) - { - compile_time_assert(!atomic || sizeof(Type) == sizeof(lint)); - if (atomic) { - return Type(my_atomic_addlint(&m_counter, -lint(i))); - } else { - return m_counter -= i; - } - } + @return the value of the counter before adding */ + Type 
add(Type i) { return my_atomic_addlint(&m_counter, i); } /** @return the value of the counter (non-atomic access)! */ operator Type() const { return m_counter; } diff --git a/storage/innobase/include/trx0i_s.h b/storage/innobase/include/trx0i_s.h index 7e766072272..65c7d321597 100644 --- a/storage/innobase/include/trx0i_s.h +++ b/storage/innobase/include/trx0i_s.h @@ -263,10 +263,10 @@ trx_i_s_possibly_fetch_data_into_cache( trx_i_s_cache_t* cache); /*!< in/out: cache */ /*******************************************************************//** -Returns TRUE if the data in the cache is truncated due to the memory +Returns true, if the data in the cache is truncated due to the memory limit posed by TRX_I_S_MEM_LIMIT. @return TRUE if truncated */ -ibool +bool trx_i_s_cache_is_truncated( /*=======================*/ trx_i_s_cache_t* cache); /*!< in: cache */ diff --git a/storage/innobase/include/trx0purge.h b/storage/innobase/include/trx0purge.h index 73d497dd64a..4bc5aded341 100644 --- a/storage/innobase/include/trx0purge.h +++ b/storage/innobase/include/trx0purge.h @@ -27,9 +27,8 @@ Created 3/26/1996 Heikki Tuuri #ifndef trx0purge_h #define trx0purge_h -#include "trx0sys.h" +#include "trx0rseg.h" #include "que0types.h" -#include "page0page.h" #include <queue> @@ -47,16 +46,13 @@ trx_purge_get_log_from_hist( /*========================*/ fil_addr_t node_addr); /*!< in: file address of the history list node of the log */ -/************************************************************************ -Adds the update undo log as the first log in the history list. Removes the -update undo log segment from the rseg slot if it is too big for reuse. */ +/** Prepend the history list with an undo log. +Remove the undo log segment from the rseg slot if it is too big for reuse. +@param[in] trx transaction +@param[in,out] undo undo log +@param[in,out] mtr mini-transaction */ void -trx_purge_add_update_undo_to_history( -/*=================================*/ - trx_t* trx, /*!< in: transaction */ - page_t* undo_page, /*!< in: update undo log header page, - x-latched */ - mtr_t* mtr); /*!< in: mtr */ +trx_purge_add_undo_to_history(const trx_t* trx, trx_undo_t*& undo, mtr_t* mtr); /*******************************************************************//** This function runs a purge batch. @return number of undo log pages handled in the batch */ @@ -65,40 +61,12 @@ trx_purge( /*======*/ ulint n_purge_threads, /*!< in: number of purge tasks to submit to task queue. */ - ulint limit, /*!< in: the maximum number of - records to purge in one batch */ bool truncate /*!< in: truncate history if true */ #ifdef UNIV_DEBUG , srv_slot_t *slot /*!< in/out: purge coordinator thread slot */ #endif ); -/*******************************************************************//** -Stop purge and wait for it to stop, move to PURGE_STATE_STOP. */ -void -trx_purge_stop(void); -/*================*/ -/*******************************************************************//** -Resume purge, move to PURGE_STATE_RUN. */ -void -trx_purge_run(void); -/*================*/ - -/** Purge states */ -enum purge_state_t { - PURGE_STATE_INIT, /*!< Purge instance created */ - PURGE_STATE_RUN, /*!< Purge should be running */ - PURGE_STATE_STOP, /*!< Purge should be stopped */ - PURGE_STATE_EXIT, /*!< Purge has been shutdown */ - PURGE_STATE_DISABLED /*!< Purge was never started */ -}; - -/*******************************************************************//** -Get the purge state. -@return purge state. 
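The simple_counter and simple_atomic_counter templates split out above share one small idiom worth spelling out: dec() is implemented as add(Type(~0)), so for an unsigned Type the all-ones value wraps around and the addition behaves as "minus one". A stand-alone sketch with illustrative names:

#include <cassert>
#include <cstdint>

// Non-atomic variant only; the atomic variant routes add() through
// my_atomic_addlint() instead of operator+=.
template <typename Type>
struct toy_counter {
    Type add(Type i) { return m_counter += i; }
    Type inc() { return add(1); }
    Type dec() { return add(Type(~0)); }   // wraps to "minus 1" for unsigned Type
    operator Type() const { return m_counter; }
private:
    Type m_counter = 0;
};

int main()
{
    toy_counter<uint32_t> c;
    c.inc(); c.inc(); c.dec();
    assert(uint32_t(c) == 1);
}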
*/ -purge_state_t -trx_purge_state(void); -/*=================*/ /** Rollback segements from a given transaction with trx-no scheduled for purge. */ @@ -108,69 +76,28 @@ private: trx_rsegs_t; public: typedef trx_rsegs_t::iterator iterator; + typedef trx_rsegs_t::const_iterator const_iterator; /** Default constructor */ - TrxUndoRsegs() : m_trx_no() { } - - explicit TrxUndoRsegs(trx_id_t trx_no) - : - m_trx_no(trx_no) - { - // Do nothing - } - - /** Get transaction number - @return trx_id_t - get transaction number. */ - trx_id_t get_trx_no() const - { - return(m_trx_no); - } - - /** Add rollback segment. - @param rseg rollback segment to add. */ - void push_back(trx_rseg_t* rseg) - { - m_rsegs.push_back(rseg); - } - - /** Erase the element pointed by given iterator. - @param[in] iterator iterator */ - void erase(iterator& it) - { - m_rsegs.erase(it); - } - - /** Number of registered rsegs. - @return size of rseg list. */ - ulint size() const - { - return(m_rsegs.size()); - } - - /** - @return an iterator to the first element */ - iterator begin() - { - return(m_rsegs.begin()); - } - - /** - @return an iterator to the end */ - iterator end() - { - return(m_rsegs.end()); - } + TrxUndoRsegs() {} + /** Constructor */ + TrxUndoRsegs(trx_rseg_t& rseg) + : m_commit(rseg.last_commit), m_rsegs(1, &rseg) {} + /** Constructor */ + TrxUndoRsegs(trx_id_t trx_no, trx_rseg_t& rseg) + : m_commit(trx_no << 1), m_rsegs(1, &rseg) {} - /** Append rollback segments from referred instance to current - instance. */ - void append(const TrxUndoRsegs& append_from) - { - ut_ad(get_trx_no() == append_from.get_trx_no()); + /** @return the transaction commit identifier */ + trx_id_t trx_no() const { return m_commit >> 1; } - m_rsegs.insert(m_rsegs.end(), - append_from.m_rsegs.begin(), - append_from.m_rsegs.end()); - } + bool operator!=(const TrxUndoRsegs& other) const + { return m_commit != other.m_commit; } + bool empty() const { return m_rsegs.empty(); } + void erase(iterator& it) { m_rsegs.erase(it); } + iterator begin() { return(m_rsegs.begin()); } + iterator end() { return(m_rsegs.end()); } + const_iterator begin() const { return m_rsegs.begin(); } + const_iterator end() const { return m_rsegs.end(); } /** Compare two TrxUndoRsegs based on trx_no. @param elem1 first element to compare @@ -178,17 +105,12 @@ public: @return true if elem1 > elem2 else false.*/ bool operator()(const TrxUndoRsegs& lhs, const TrxUndoRsegs& rhs) { - return(lhs.m_trx_no > rhs.m_trx_no); + return(lhs.m_commit > rhs.m_commit); } - /** Compiler defined copy-constructor/assignment operator - should be fine given that there is no reference to a memory - object outside scope of class object.*/ - private: - /** The rollback segments transaction number. */ - trx_id_t m_trx_no; - + /** Copy trx_rseg_t::last_commit */ + trx_id_t m_commit; /** Rollback segments of a transaction, scheduled for purge. */ trx_rsegs_t m_rsegs; }; @@ -198,16 +120,14 @@ typedef std::priority_queue< std::vector<TrxUndoRsegs, ut_allocator<TrxUndoRsegs> >, TrxUndoRsegs> purge_pq_t; -/** -Chooses the rollback segment with the smallest trx_no. */ +/** Chooses the rollback segment with the oldest committed transaction */ struct TrxUndoRsegsIterator { - /** Constructor */ TrxUndoRsegsIterator(); - /** Sets the next rseg to purge in purge_sys. + Executed in the purge coordinator thread. 
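The reworked TrxUndoRsegs above stores the transaction number and a flag bit in a single value: m_commit is the transaction number shifted left by one, with the low bit marking the old_insert/is_update case (the same encoding appears later in this patch as trx_rseg_t::last_commit and purge_sys_t::iterator::commit). A stand-alone sketch of the packing, with illustrative names:

#include <cassert>
#include <cstdint>

typedef uint64_t toy_trx_id_t;

// commit = (trx_no << 1) | flag; ordering on the packed value is primarily
// by transaction number, which is what the purge priority queue relies on.
inline toy_trx_id_t toy_encode(toy_trx_id_t trx_no, bool flag)
{ return (trx_no << 1) | toy_trx_id_t(flag); }

inline toy_trx_id_t toy_trx_no(toy_trx_id_t commit) { return commit >> 1; }
inline bool         toy_flag(toy_trx_id_t commit)   { return commit & 1; }

int main()
{
    toy_trx_id_t c = toy_encode(42, true);
    assert(toy_trx_no(c) == 42 && toy_flag(c));
    assert(toy_encode(41, true) < toy_encode(42, false));
}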
@return whether anything is to be purged */ - bool set_next(); + inline bool set_next(); private: // Disable copying @@ -215,38 +135,11 @@ private: TrxUndoRsegsIterator& operator=(const TrxUndoRsegsIterator&); /** The current element to process */ - TrxUndoRsegs m_trx_undo_rsegs; - - /** Track the current element in m_trx_undo_rseg */ - TrxUndoRsegs::iterator m_iter; - - /** Sentinel value */ - static const TrxUndoRsegs NullElement; + TrxUndoRsegs m_rsegs; + /** Track the current element in m_rsegs */ + TrxUndoRsegs::const_iterator m_iter; }; -/** This is the purge pointer/iterator. We need both the undo no and the -transaction no up to which purge has parsed and applied the records. */ -struct purge_iter_t { - purge_iter_t() - : - trx_no(), - undo_no(), - undo_rseg_space(ULINT_UNDEFINED) - { - // Do nothing - } - - trx_id_t trx_no; /*!< Purge has advanced past all - transactions whose number is less - than this */ - undo_no_t undo_no; /*!< Purge has advanced past all records - whose undo number is less than this */ - ulint undo_rseg_space; - /*!< Last undo record resided in this - space id. */ -}; - - /* Namespace to hold all the related functions and variables need for truncate of undo tablespace. */ namespace undo { @@ -271,17 +164,12 @@ namespace undo { /** Track UNDO tablespace mark for truncate. */ class Truncate { public: - - Truncate() - : - m_undo_for_trunc(ULINT_UNDEFINED), - m_rseg_for_trunc(), - m_scan_start(1), - m_purge_rseg_truncate_frequency( - static_cast<ulint>( - srv_purge_rseg_truncate_frequency)) + void create() { - /* Do Nothing. */ + m_undo_for_trunc = ULINT_UNDEFINED; + m_scan_start = 1; + m_purge_rseg_truncate_frequency = + ulint(srv_purge_rseg_truncate_frequency); } /** Clear the cached rollback segment. Normally done @@ -452,53 +340,58 @@ namespace undo { class purge_sys_t { public: - /** Construct the purge system. */ - purge_sys_t(); - /** Destruct the purge system. */ - ~purge_sys_t(); - - rw_lock_t latch; /*!< The latch protecting the purge - view. A purge operation must acquire an - x-latch here for the instant at which - it changes the purge view: an undo - log operation can prevent this by - obtaining an s-latch here. It also - protects state and running */ - os_event_t event; /*!< State signal event; - os_event_set() and os_event_reset() - are protected by purge_sys_t::latch - X-lock */ - ulint n_stop; /*!< Counter to track number stops */ - volatile bool running; /*!< true, if purge is active, - we check this without the latch too */ - volatile purge_state_t state; /*!< Purge coordinator thread states, - we check this in several places - without holding the latch. 
*/ + /** signal state changes; os_event_reset() and os_event_set() + are protected by rw_lock_x_lock(latch) */ + MY_ALIGNED(CACHE_LINE_SIZE) + os_event_t event; + /** latch protecting view, m_enabled */ + MY_ALIGNED(CACHE_LINE_SIZE) + rw_lock_t latch; +private: + /** whether purge is enabled; protected by latch and my_atomic */ + int32_t m_enabled; + /** number of pending stop() calls without resume() */ + int32_t m_paused; +public: que_t* query; /*!< The query graph which will do the parallelized purge operation */ + MY_ALIGNED(CACHE_LINE_SIZE) ReadView view; /*!< The purge will not remove undo logs which are >= this view (purge view) */ - ulint n_submitted; /*!< Count of total tasks submitted - to the task queue */ - ulint n_completed; /*!< Count of total tasks completed */ - - /*------------------------------*/ - /* The following two fields form the 'purge pointer' which advances - during a purge, and which is used in history list truncation */ - - purge_iter_t iter; /* Limit up to which we have read and - parsed the UNDO log records. Not - necessarily purged from the indexes. - Note that this can never be less than - the limit below, we check for this - invariant in trx0purge.cc */ - purge_iter_t limit; /* The 'purge pointer' which advances - during a purge, and which is used in - history list truncation */ -#ifdef UNIV_DEBUG - purge_iter_t done; /* Indicate 'purge pointer' which have - purged already accurately. */ -#endif /* UNIV_DEBUG */ + /** Total number of tasks submitted by srv_purge_coordinator_thread. + Not accessed by other threads. */ + ulint n_submitted; + /** Number of completed tasks. Accessed by srv_purge_coordinator + and srv_worker_thread by my_atomic. */ + ulint n_completed; + + /** Iterator to the undo log records of committed transactions */ + struct iterator + { + bool operator<=(const iterator& other) const + { + if (commit < other.commit) return true; + if (commit > other.commit) return false; + return undo_no <= other.undo_no; + } + + /** @return the commit number of the transaction */ + trx_id_t trx_no() const { return commit >> 1; } + void reset_trx_no(trx_id_t trx_no) { commit = trx_no << 1; } + + /** 2 * trx_t::no + old_insert of the committed transaction */ + trx_id_t commit; + /** The record number within the committed transaction's undo + log, increasing, purged from from 0 onwards */ + undo_no_t undo_no; + }; + + /** The tail of the purge queue; the last parsed undo log of a + committed transaction. */ + iterator tail; + /** The head of the purge queue; any older undo logs of committed + transactions may be discarded (history list truncation). */ + iterator head; /*-----------------------------*/ bool next_stored; /*!< whether rseg holds the next record to purge */ @@ -526,10 +419,70 @@ public: undo::Truncate undo_trunc; /*!< Track UNDO tablespace marked for truncate. */ + + + /** + Constructor. + + Some members may require late initialisation, thus we just mark object as + uninitialised. Real initialisation happens in create(). 
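purge_sys_t::iterator above replaces the old purge_iter_t: it carries the packed commit value plus an undo record number, and operator<= compares first by commit, then by undo_no. A compact stand-alone version of that comparison (same field layout, illustrative names):

#include <cassert>
#include <cstdint>

struct toy_purge_iter {
    uint64_t commit;    // 2 * trx_no + old_insert, as in purge_sys_t::iterator
    uint64_t undo_no;   // record number within the transaction's undo log

    bool operator<=(const toy_purge_iter& other) const
    {
        if (commit < other.commit) return true;
        if (commit > other.commit) return false;
        return undo_no <= other.undo_no;
    }
};

int main()
{
    toy_purge_iter head = {84, 1}, tail = {84, 3};
    // head trails tail: history up to head has been processed and may be
    // truncated, while parsing has advanced to tail.
    assert(head <= tail && !(tail <= head));
}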
+ */ + + purge_sys_t() : event(NULL), m_enabled(false) {} + + + /** Create the instance */ + void create(); + + /** Close the purge system on shutdown */ + void close(); + + /** @return whether purge is enabled */ + bool enabled() + { + return my_atomic_load32_explicit(&m_enabled, MY_MEMORY_ORDER_RELAXED); + } + /** @return whether purge is enabled */ + bool enabled_latched() + { + ut_ad(rw_lock_own_flagged(&latch, RW_LOCK_FLAG_X | RW_LOCK_FLAG_S)); + return bool(m_enabled); + } + /** @return whether the purge coordinator is paused */ + bool paused() + { return my_atomic_load32_explicit(&m_paused, MY_MEMORY_ORDER_RELAXED); } + /** @return whether the purge coordinator is paused */ + bool paused_latched() + { + ut_ad(rw_lock_own_flagged(&latch, RW_LOCK_FLAG_X | RW_LOCK_FLAG_S)); + return m_paused != 0; + } + + /** Enable purge at startup. Not protected by latch; the main thread + will wait for purge_sys.enabled() in srv_start() */ + void coordinator_startup() + { + ut_ad(!enabled()); + my_atomic_store32_explicit(&m_enabled, true, MY_MEMORY_ORDER_RELAXED); + } + + /** Disable purge at shutdown */ + void coordinator_shutdown() + { + ut_ad(enabled()); + my_atomic_store32_explicit(&m_enabled, false, MY_MEMORY_ORDER_RELAXED); + } + + /** @return whether the purge coordinator thread is active */ + bool running(); + /** Stop purge during FLUSH TABLES FOR EXPORT */ + void stop(); + /** Resume purge at UNLOCK TABLES after FLUSH TABLES FOR EXPORT */ + void resume(); }; /** The global data structure coordinating a purge */ -extern purge_sys_t* purge_sys; +extern purge_sys_t purge_sys; /** Info required to purge a record */ struct trx_purge_rec_t { diff --git a/storage/innobase/include/trx0purge.ic b/storage/innobase/include/trx0purge.ic index 0ccff6f7798..e460676d58e 100644 --- a/storage/innobase/include/trx0purge.ic +++ b/storage/innobase/include/trx0purge.ic @@ -40,24 +40,3 @@ trx_purge_get_log_from_hist( return(node_addr); } - -/********************************************************************//** -address of its history list node. -@return true if purge_sys_t::limit <= purge_sys_t::iter */ -UNIV_INLINE -bool -trx_purge_check_limit(void) -/*=======================*/ -{ - /* limit is used to track till what point purge element has been - processed and so limit <= iter. - undo_no ordering is enforced only within the same rollback segment. - If a transaction uses multiple rollback segments then we need to - consider the rollback segment space id too. */ - return(purge_sys->iter.trx_no > purge_sys->limit.trx_no - || (purge_sys->iter.trx_no == purge_sys->limit.trx_no - && ((purge_sys->iter.undo_no >= purge_sys->limit.undo_no) - || (purge_sys->iter.undo_rseg_space - != purge_sys->limit.undo_rseg_space)))); -} - diff --git a/storage/innobase/include/trx0rec.h b/storage/innobase/include/trx0rec.h index be06db4c954..f27d8af0368 100644 --- a/storage/innobase/include/trx0rec.h +++ b/storage/innobase/include/trx0rec.h @@ -53,22 +53,6 @@ trx_undo_rec_get_type( /*==================*/ const trx_undo_rec_t* undo_rec); /*!< in: undo log record */ /**********************************************************************//** -Reads from an undo log record the record compiler info. -@return compiler info */ -UNIV_INLINE -ulint -trx_undo_rec_get_cmpl_info( -/*=======================*/ - const trx_undo_rec_t* undo_rec); /*!< in: undo log record */ -/**********************************************************************//** -Returns TRUE if an undo log record contains an extern storage field. 
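purge_sys_t::stop() and resume() above replace trx_purge_stop()/trx_purge_run() and the purge_state_t state machine; the header only declares them next to the m_paused counter. The sketch below is a guess at the general shape of such a pause counter (it is not taken from the patch), together with the call pattern around FLUSH TABLES FOR EXPORT that the comments describe.

#include <atomic>
#include <cassert>

// Guessed, simplified pause/resume bookkeeping: stop() increments a counter,
// resume() decrements it, and the purge coordinator only works while the
// counter is zero and purge is enabled.
struct toy_purge_sys {
    std::atomic<int32_t> m_enabled{0};
    std::atomic<int32_t> m_paused{0};

    bool enabled() const { return m_enabled.load(std::memory_order_relaxed) != 0; }
    bool paused()  const { return m_paused.load(std::memory_order_relaxed) != 0; }

    void stop()   { m_paused.fetch_add(1, std::memory_order_relaxed); }
    void resume() { m_paused.fetch_sub(1, std::memory_order_relaxed); }
};

int main()
{
    toy_purge_sys ps;
    ps.stop();      // e.g. at FLUSH TABLES ... FOR EXPORT
    assert(ps.paused());
    ps.resume();    // at the following UNLOCK TABLES
    assert(!ps.paused());
}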
-@return TRUE if extern */ -UNIV_INLINE -ibool -trx_undo_rec_get_extern_storage( -/*============================*/ - const trx_undo_rec_t* undo_rec); /*!< in: undo log record */ -/**********************************************************************//** Reads the undo log record number. @return undo no */ UNIV_INLINE @@ -111,7 +95,7 @@ trx_undo_rec_get_row_ref( used, as we do NOT copy the data in the record! */ dict_index_t* index, /*!< in: clustered index */ - dtuple_t** ref, /*!< out, own: row reference */ + const dtuple_t**ref, /*!< out, own: row reference */ mem_heap_t* heap); /*!< in: memory heap from which the memory needed is allocated */ /**********************************************************************//** @@ -256,25 +240,22 @@ trx_undo_prev_version_build( into this function by purge thread or not. And if we read "after image" of undo log */ -/***********************************************************//** -Parses a redo log record of adding an undo log record. -@return end of log record or NULL */ +/** Parse MLOG_UNDO_INSERT. +@param[in] ptr log record +@param[in] end_ptr end of log record buffer +@param[in,out] page page or NULL +@return end of log record +@retval NULL if the log record is incomplete */ byte* trx_undo_parse_add_undo_rec( -/*========================*/ - byte* ptr, /*!< in: buffer */ - byte* end_ptr,/*!< in: buffer end */ - page_t* page); /*!< in: page or NULL */ -/***********************************************************//** -Parses a redo log record of erasing of an undo page end. -@return end of log record or NULL */ -byte* -trx_undo_parse_erase_page_end( -/*==========================*/ - byte* ptr, /*!< in: buffer */ - byte* end_ptr,/*!< in: buffer end */ - page_t* page, /*!< in: page or NULL */ - mtr_t* mtr); /*!< in: mtr or NULL */ + const byte* ptr, + const byte* end_ptr, + page_t* page); +/** Erase the unused undo log page end. +@param[in,out] undo_page undo log page +@return whether the page contained something */ +bool +trx_undo_erase_page_end(page_t* undo_page); /** Read from an undo log record a non-virtual column value. @param[in,out] ptr pointer to remaining part of the undo record @@ -326,6 +307,8 @@ compilation info multiplied by 16 is ORed to this value in an undo log record */ #define TRX_UNDO_RENAME_TABLE 9 /*!< RENAME TABLE */ +#define TRX_UNDO_INSERT_METADATA 10 /*!< insert a metadata + pseudo-record for instant ALTER */ #define TRX_UNDO_INSERT_REC 11 /* fresh insert into clustered index */ #define TRX_UNDO_UPD_EXIST_REC 12 /* update of a non-delete-marked record */ @@ -341,6 +324,9 @@ record */ storage fields: used by purge to free the external storage */ +/** The search tuple corresponding to TRX_UNDO_INSERT_METADATA */ +extern const dtuple_t trx_undo_metadata; + #include "trx0rec.ic" #endif /* trx0rec_h */ diff --git a/storage/innobase/include/trx0rec.ic b/storage/innobase/include/trx0rec.ic index ecae142d8f5..02244d68b6f 100644 --- a/storage/innobase/include/trx0rec.ic +++ b/storage/innobase/include/trx0rec.ic @@ -36,35 +36,6 @@ trx_undo_rec_get_type( } /**********************************************************************//** -Reads from an undo log record the record compiler info. 
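The undo record type constants above share one byte with the compilation info: as the comment says, the compilation info multiplied by 16 is ORed into the type value, which is why the removed trx_undo_rec_get_cmpl_info() (its body is visible a little further down) simply divides that byte by TRX_UNDO_CMPL_INFO_MULT, and why the removed trx_undo_rec_get_extern_storage() tests a flag bit in the same byte. A stand-alone sketch of the packing, ignoring the extern-storage flag, with illustrative names:

#include <cassert>
#include <cstdint>

static const unsigned TOY_CMPL_INFO_MULT = 16;
static const unsigned TOY_UPD_EXIST_REC  = 12;   // TRX_UNDO_UPD_EXIST_REC

inline uint8_t  toy_pack(unsigned type, unsigned cmpl_info)
{ return uint8_t(type | cmpl_info * TOY_CMPL_INFO_MULT); }

inline unsigned toy_type(uint8_t b)      { return b % TOY_CMPL_INFO_MULT; }
inline unsigned toy_cmpl_info(uint8_t b) { return b / TOY_CMPL_INFO_MULT; }

int main()
{
    uint8_t b = toy_pack(TOY_UPD_EXIST_REC, 3);
    assert(toy_type(b) == TOY_UPD_EXIST_REC);
    assert(toy_cmpl_info(b) == 3);
}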
-@return compiler info */ -UNIV_INLINE -ulint -trx_undo_rec_get_cmpl_info( -/*=======================*/ - const trx_undo_rec_t* undo_rec) /*!< in: undo log record */ -{ - return(mach_read_from_1(undo_rec + 2) / TRX_UNDO_CMPL_INFO_MULT); -} - -/**********************************************************************//** -Returns TRUE if an undo log record contains an extern storage field. -@return TRUE if extern */ -UNIV_INLINE -ibool -trx_undo_rec_get_extern_storage( -/*============================*/ - const trx_undo_rec_t* undo_rec) /*!< in: undo log record */ -{ - if (mach_read_from_1(undo_rec + 2) & TRX_UNDO_UPD_EXTERN) { - - return(TRUE); - } - - return(FALSE); -} - -/**********************************************************************//** Reads the undo log record number. @return undo no */ UNIV_INLINE @@ -93,8 +64,8 @@ trx_undo_rec_copy( ulint len; len = mach_read_from_2(undo_rec) - - ut_align_offset(undo_rec, UNIV_PAGE_SIZE); - ut_ad(len < UNIV_PAGE_SIZE); + - ut_align_offset(undo_rec, srv_page_size); + ut_ad(len < srv_page_size); trx_undo_rec_t* rec = static_cast<trx_undo_rec_t*>( mem_heap_dup(heap, undo_rec, len)); mach_write_to_2(rec, len); diff --git a/storage/innobase/include/trx0roll.h b/storage/innobase/include/trx0roll.h index 0fd6973e551..a23b57ccc3e 100644 --- a/storage/innobase/include/trx0roll.h +++ b/storage/innobase/include/trx0roll.h @@ -31,7 +31,7 @@ Created 3/26/1996 Heikki Tuuri #include "mtr0mtr.h" #include "trx0sys.h" -extern bool trx_rollback_or_clean_is_active; +extern bool trx_rollback_is_active; extern const trx_t* trx_roll_crash_recv_trx; /*******************************************************************//** @@ -52,20 +52,17 @@ trx_undo_rec_t* trx_roll_pop_top_rec_of_trx(trx_t* trx, roll_ptr_t* roll_ptr, mem_heap_t* heap) MY_ATTRIBUTE((nonnull, warn_unused_result)); -/** Report progress when rolling back a row of a recovered transaction. -@return whether the rollback should be aborted due to pending shutdown */ -bool -trx_roll_must_shutdown(); +/** Report progress when rolling back a row of a recovered transaction. */ +void trx_roll_report_progress(); /*******************************************************************//** Rollback or clean up any incomplete transactions which were encountered in crash recovery. If the transaction already was committed, then we clean up a possible insert undo log. If the -transaction was not yet committed, then we roll it back. */ +transaction was not yet committed, then we roll it back. +@param all true=roll back all recovered active transactions; +false=roll back any incomplete dictionary transaction */ void -trx_rollback_or_clean_recovered( -/*============================*/ - ibool all); /*!< in: FALSE=roll back dictionary transactions; - TRUE=roll back all non-PREPARED transactions */ +trx_rollback_recovered(bool all); /*******************************************************************//** Rollback or clean up any incomplete transactions which were encountered in crash recovery. If the transaction already was @@ -75,11 +72,7 @@ Note: this is done in a background thread. @return a dummy parameter */ extern "C" os_thread_ret_t -DECLARE_THREAD(trx_rollback_or_clean_all_recovered)( -/*================================================*/ - void* arg MY_ATTRIBUTE((unused))); - /*!< in: a dummy parameter required by - os_thread_create */ +DECLARE_THREAD(trx_rollback_all_recovered)(void*); /*********************************************************************//** Creates a rollback command node struct. 
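In the trx_undo_rec_copy() hunk above, the record length is the two-byte value stored at the start of the record (the offset of the next record on the page) minus the record's own offset within its page frame, which ut_align_offset() computes, in effect masking the address with srv_page_size - 1 on a page-size aligned frame. A stand-alone sketch of that computation; the helper and constants are illustrative, and a 4 KiB page is used so the alignment request stays portable.

#include <cassert>
#include <cstdint>

static const uintptr_t TOY_PAGE_SIZE = 4096;   // stand-in for srv_page_size

// ut_align_offset() equivalent for a power-of-two alignment.
inline uintptr_t toy_align_offset(const void* ptr, uintptr_t align)
{ return uintptr_t(ptr) & (align - 1); }

int main()
{
    // A fake aligned "page" holding one record at offset 100 whose first
    // two (big-endian) bytes point to the next record at offset 140.
    alignas(4096) static unsigned char page[TOY_PAGE_SIZE];
    unsigned char* rec = page + 100;
    rec[0] = 0; rec[1] = 140;

    uintptr_t len = (uintptr_t(rec[0]) << 8 | rec[1])
                  - toy_align_offset(rec, TOY_PAGE_SIZE);
    assert(len == 40);
}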
@return own: rollback node struct */ @@ -212,6 +205,4 @@ struct trx_named_savept_t{ transaction */ }; -#include "trx0roll.ic" - #endif diff --git a/storage/innobase/include/trx0roll.ic b/storage/innobase/include/trx0roll.ic deleted file mode 100644 index 79b8e9083dd..00000000000 --- a/storage/innobase/include/trx0roll.ic +++ /dev/null @@ -1,62 +0,0 @@ -/***************************************************************************** - -Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. - -This program is free software; you can redistribute it and/or modify it under -the terms of the GNU General Public License as published by the Free Software -Foundation; version 2 of the License. - -This program is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - -You should have received a copy of the GNU General Public License along with -this program; if not, write to the Free Software Foundation, Inc., -51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA - -*****************************************************************************/ - -/**************************************************//** -@file include/trx0roll.ic -Transaction rollback - -Created 3/26/1996 Heikki Tuuri -*******************************************************/ - -#ifdef UNIV_DEBUG -/*******************************************************************//** -Check if undo numbering is maintained while processing undo records -for rollback. -@return true if undo numbering is maintained. */ -UNIV_INLINE -bool -trx_roll_check_undo_rec_ordering( -/*=============================*/ - undo_no_t curr_undo_rec_no, /*!< in: record number of - undo record to process. */ - ulint curr_undo_space_id, /*!< in: space-id of rollback - segment that contains the - undo record to process. */ - const trx_t* trx) /*!< in: transaction */ -{ - /* Each transaction now can have multiple rollback segments. - If a transaction involves temp and non-temp tables, both the rollback - segments will be active. In this case undo records will be distrubuted - across the two rollback segments. - CASE-1: UNDO action will apply all undo records from one rollback - segment before moving to next. This means undo record numbers can't be - sequential but ordering is still enforced as next undo record number - should be < processed undo record number. - CASE-2: For normal rollback (not initiated by crash) all rollback - segments will be active (including non-redo). - Based on transaction operation pattern undo record number of first - undo record from this new rollback segment can be > last undo number - from previous rollback segment and so we ignore this check if - rollback segments are switching. Once switched new rollback segment - should re-follow undo record number pattern (as mentioned in CASE-1). */ - - return(curr_undo_space_id != trx->undo_rseg_space - || curr_undo_rec_no + 1 <= trx->undo_no); -} -#endif /* UNIV_DEBUG */ - diff --git a/storage/innobase/include/trx0rseg.h b/storage/innobase/include/trx0rseg.h index 8ca17998df4..d4fdb19a988 100644 --- a/storage/innobase/include/trx0rseg.h +++ b/storage/innobase/include/trx0rseg.h @@ -27,10 +27,8 @@ Created 3/26/1996 Heikki Tuuri #ifndef trx0rseg_h #define trx0rseg_h -#include "trx0types.h" #include "trx0sys.h" #include "fut0lst.h" -#include <vector> /** Gets a rollback segment header. 
@param[in] space space where placed @@ -39,10 +37,7 @@ Created 3/26/1996 Heikki Tuuri @return rollback segment header, page x-latched */ UNIV_INLINE trx_rsegf_t* -trx_rsegf_get( - ulint space, - ulint page_no, - mtr_t* mtr); +trx_rsegf_get(fil_space_t* space, ulint page_no, mtr_t* mtr); /** Gets a newly created rollback segment header. @param[in] space space where placed @@ -57,16 +52,6 @@ trx_rsegf_get_new( mtr_t* mtr); /***************************************************************//** -Gets the file page number of the nth undo log slot. -@return page number of the undo log segment */ -UNIV_INLINE -ulint -trx_rsegf_get_nth_undo( -/*===================*/ - trx_rsegf_t* rsegf, /*!< in: rollback segment header */ - ulint n, /*!< in: index of slot */ - mtr_t* mtr); /*!< in: mtr */ -/***************************************************************//** Sets the file page number of the nth undo log slot. */ UNIV_INLINE void @@ -81,26 +66,21 @@ Looks for a free slot for an undo log segment. @return slot index or ULINT_UNDEFINED if not found */ UNIV_INLINE ulint -trx_rsegf_undo_find_free( -/*=====================*/ - trx_rsegf_t* rsegf, /*!< in: rollback segment header */ - mtr_t* mtr); /*!< in: mtr */ +trx_rsegf_undo_find_free(const trx_rsegf_t* rsegf); -/** Creates a rollback segment header. -This function is called only when a new rollback segment is created in -the database. -@param[in] space space id -@param[in] max_size max size in pages -@param[in] rseg_slot_no rseg id == slot number in trx sys +/** Create a rollback segment header. +@param[in,out] space system, undo, or temporary tablespace +@param[in] rseg_id rollback segment identifier +@param[in,out] sys_header the TRX_SYS page (NULL for temporary rseg) @param[in,out] mtr mini-transaction @return the created rollback segment @retval NULL on failure */ buf_block_t* trx_rseg_header_create( - ulint space, - ulint max_size, - ulint rseg_slot_no, - mtr_t* mtr); + fil_space_t* space, + ulint rseg_id, + buf_block_t* sys_header, + mtr_t* mtr); /** Initialize the rollback segments in memory at database startup. 
*/ void @@ -134,7 +114,7 @@ trx_rseg_get_n_undo_tablespaces( ulint* space_ids); /*!< out: array of space ids of UNDO tablespaces */ /* Number of undo log slots in a rollback segment file copy */ -#define TRX_RSEG_N_SLOTS (UNIV_PAGE_SIZE / 16) +#define TRX_RSEG_N_SLOTS (srv_page_size / 16) /* Maximum number of transactions supported by a single rollback segment */ #define TRX_RSEG_MAX_N_TRXS (TRX_RSEG_N_SLOTS / 2) @@ -151,32 +131,25 @@ struct trx_rseg_t { RsegMutex mutex; /** space where the rollback segment header is placed */ - ulint space; + fil_space_t* space; /** page number of the rollback segment header */ ulint page_no; - /** maximum allowed size in pages */ - ulint max_size; - /** current size in pages */ ulint curr_size; /*--------------------------------------------------------*/ - /* Fields for update undo logs */ - /** List of update undo logs */ - UT_LIST_BASE_NODE_T(trx_undo_t) update_undo_list; - - /** List of update undo log segments cached for fast reuse */ - UT_LIST_BASE_NODE_T(trx_undo_t) update_undo_cached; + /* Fields for undo logs */ + /** List of undo logs */ + UT_LIST_BASE_NODE_T(trx_undo_t) undo_list; - /*--------------------------------------------------------*/ - /* Fields for insert undo logs */ - /** List of insert undo logs */ - UT_LIST_BASE_NODE_T(trx_undo_t) insert_undo_list; + /** List of undo log segments cached for fast reuse */ + UT_LIST_BASE_NODE_T(trx_undo_t) undo_cached; - /** List of insert undo log segments cached for fast reuse */ - UT_LIST_BASE_NODE_T(trx_undo_t) insert_undo_cached; + /** List of recovered old insert_undo logs of incomplete + transactions (to roll back or XA COMMIT & purge) */ + UT_LIST_BASE_NODE_T(trx_undo_t) old_insert_list; /*--------------------------------------------------------*/ @@ -187,11 +160,11 @@ struct trx_rseg_t { /** Byte offset of the last not yet purged log header */ ulint last_offset; - /** Transaction number of the last not yet purged log */ - trx_id_t last_trx_no; + /** trx_t::no * 2 + old_insert of the last not yet purged log */ + trx_id_t last_commit; - /** TRUE if the last not yet purged log needs purging */ - ibool last_del_marks; + /** Whether the log segment needs purge */ + bool needs_purge; /** Reference counter to track rseg allocated transactions. */ ulint trx_ref_count; @@ -200,23 +173,31 @@ struct trx_rseg_t { UNDO-tablespace marked for truncate. 
*/ bool skip_allocation; + /** @return the commit ID of the last committed transaction */ + trx_id_t last_trx_no() const { return last_commit >> 1; } + + void set_last_trx_no(trx_id_t trx_no, bool is_update) + { + last_commit = trx_no << 1 | trx_id_t(is_update); + } + /** @return whether the rollback segment is persistent */ bool is_persistent() const { - ut_ad(space == SRV_TMP_SPACE_ID - || space == TRX_SYS_SPACE + ut_ad(space == fil_system.temp_space + || space == fil_system.sys_space || (srv_undo_space_id_start > 0 - && space >= srv_undo_space_id_start - && space <= srv_undo_space_id_start + && space->id >= srv_undo_space_id_start + && space->id <= srv_undo_space_id_start + TRX_SYS_MAX_UNDO_SPACES)); - ut_ad(space == SRV_TMP_SPACE_ID - || space == TRX_SYS_SPACE + ut_ad(space == fil_system.temp_space + || space == fil_system.sys_space || (srv_undo_space_id_start > 0 - && space >= srv_undo_space_id_start - && space <= srv_undo_space_id_start + && space->id >= srv_undo_space_id_start + && space->id <= srv_undo_space_id_start + srv_undo_tablespaces_open) || !srv_was_started); - return(space != SRV_TMP_SPACE_ID); + return(space->id != SRV_TMP_SPACE_ID); } }; @@ -233,19 +214,99 @@ struct trx_rseg_t { /* Transaction rollback segment header */ /*-------------------------------------------------------------*/ -#define TRX_RSEG_MAX_SIZE 0 /* Maximum allowed size for rollback - segment in pages */ -#define TRX_RSEG_HISTORY_SIZE 4 /* Number of file pages occupied - by the logs in the history list */ -#define TRX_RSEG_HISTORY 8 /* The update undo logs for committed - transactions */ +/** 0xfffffffe = pre-MariaDB 10.3.5 format; 0=MariaDB 10.3.5 or later */ +#define TRX_RSEG_FORMAT 0 +/** Number of pages in the TRX_RSEG_HISTORY list */ +#define TRX_RSEG_HISTORY_SIZE 4 +/** Committed transaction logs that have not been purged yet */ +#define TRX_RSEG_HISTORY 8 #define TRX_RSEG_FSEG_HEADER (8 + FLST_BASE_NODE_SIZE) /* Header for the file segment where this page is placed */ #define TRX_RSEG_UNDO_SLOTS (8 + FLST_BASE_NODE_SIZE + FSEG_HEADER_SIZE) /* Undo log segment slots */ +/** Maximum transaction ID (valid only if TRX_RSEG_FORMAT is 0) */ +#define TRX_RSEG_MAX_TRX_ID (TRX_RSEG_UNDO_SLOTS + TRX_RSEG_N_SLOTS \ + * TRX_RSEG_SLOT_SIZE) + +/** 8 bytes offset within the binlog file */ +#define TRX_RSEG_BINLOG_OFFSET TRX_RSEG_MAX_TRX_ID + 8 +/** MySQL log file name, 512 bytes, including terminating NUL +(valid only if TRX_RSEG_FORMAT is 0). +If no binlog information is present, the first byte is NUL. */ +#define TRX_RSEG_BINLOG_NAME TRX_RSEG_MAX_TRX_ID + 16 +/** Maximum length of binlog file name, including terminating NUL, in bytes */ +#define TRX_RSEG_BINLOG_NAME_LEN 512 + +#ifdef WITH_WSREP +/** The offset to WSREP XID headers */ +#define TRX_RSEG_WSREP_XID_INFO TRX_RSEG_MAX_TRX_ID + 16 + 512 + +/** WSREP XID format (1 if present and valid, 0 if not present) */ +#define TRX_RSEG_WSREP_XID_FORMAT TRX_RSEG_WSREP_XID_INFO +/** WSREP XID GTRID length */ +#define TRX_RSEG_WSREP_XID_GTRID_LEN TRX_RSEG_WSREP_XID_INFO + 4 +/** WSREP XID bqual length */ +#define TRX_RSEG_WSREP_XID_BQUAL_LEN TRX_RSEG_WSREP_XID_INFO + 8 +/** WSREP XID data (XIDDATASIZE bytes) */ +#define TRX_RSEG_WSREP_XID_DATA TRX_RSEG_WSREP_XID_INFO + 12 +#endif /* WITH_WSREP*/ + /*-------------------------------------------------------------*/ +/** Read the page number of an undo log slot. 
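The TRX_RSEG_MAX_TRX_ID / TRX_RSEG_BINLOG_OFFSET / TRX_RSEG_BINLOG_NAME fields above persist the binlog position in each rollback segment header from 10.3.5 on, with a NUL first byte of the name meaning that no binlog information is present. The sketch below reads such a layout from a fake in-memory header; read_8() stands in for mach_read_from_8(), and the numeric base offsets are invented for the example rather than computed from TRX_RSEG_UNDO_SLOTS.

#include <cassert>
#include <cstdint>
#include <cstring>
#include <string>

inline uint64_t read_8(const unsigned char* p)     // big-endian, like mach_read_from_8
{
    uint64_t v = 0;
    for (int i = 0; i < 8; i++) v = v << 8 | p[i];
    return v;
}

int main()
{
    unsigned char header[4096] = {0};
    const size_t max_trx_id    = 1024;             // stand-in for TRX_RSEG_MAX_TRX_ID
    const size_t binlog_offset = max_trx_id + 8;   // TRX_RSEG_BINLOG_OFFSET
    const size_t binlog_name   = max_trx_id + 16;  // TRX_RSEG_BINLOG_NAME

    // Pretend a transaction committed at mysql-bin.000042, position 1234.
    std::strcpy(reinterpret_cast<char*>(header + binlog_name), "mysql-bin.000042");
    header[binlog_offset + 6] = 0x04;
    header[binlog_offset + 7] = 0xD2;              // 0x04D2 == 1234

    if (header[binlog_name] != 0) {                // NUL first byte => nothing stored
        assert(read_8(header + binlog_offset) == 1234);
        assert(std::string(reinterpret_cast<const char*>(header + binlog_name))
               == "mysql-bin.000042");
    }
}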
+@param[in] rsegf rollback segment header +@param[in] n slot number */ +inline +uint32_t +trx_rsegf_get_nth_undo(const trx_rsegf_t* rsegf, ulint n) +{ + ut_ad(n < TRX_RSEG_N_SLOTS); + return mach_read_from_4(rsegf + TRX_RSEG_UNDO_SLOTS + + n * TRX_RSEG_SLOT_SIZE); +} + +#ifdef WITH_WSREP +/** Update the WSREP XID information in rollback segment header. +@param[in,out] rseg_header rollback segment header +@param[in] xid WSREP XID +@param[in,out] mtr mini-transaction */ +void +trx_rseg_update_wsrep_checkpoint( + trx_rsegf_t* rseg_header, + const XID* xid, + mtr_t* mtr); + +/** Update WSREP checkpoint XID in first rollback segment header +as part of wsrep_set_SE_checkpoint() when it is guaranteed that there +are no wsrep transactions committing. +If the UUID part of the WSREP XID does not match to the UUIDs of XIDs already +stored into rollback segments, the WSREP XID in all the remaining rollback +segments will be reset. +@param[in] xid WSREP XID */ +void trx_rseg_update_wsrep_checkpoint(const XID* xid); + +/** Recover the latest WSREP checkpoint XID. +@param[out] xid WSREP XID +@return whether the WSREP XID was found */ +bool trx_rseg_read_wsrep_checkpoint(XID& xid); +#endif /* WITH_WSREP */ + +/** Upgrade a rollback segment header page to MariaDB 10.3 format. +@param[in,out] rseg_header rollback segment header page +@param[in,out] mtr mini-transaction */ +void trx_rseg_format_upgrade(trx_rsegf_t* rseg_header, mtr_t* mtr); + +/** Update the offset information about the end of the binlog entry +which corresponds to the transaction just being committed. +In a replication slave, this updates the master binlog position +up to which replication has proceeded. +@param[in,out] rseg_header rollback segment header +@param[in] trx committing transaction +@param[in,out] mtr mini-transaction */ +void +trx_rseg_update_binlog_offset(byte* rseg_header, const trx_t* trx, mtr_t* mtr); + #include "trx0rseg.ic" #endif diff --git a/storage/innobase/include/trx0rseg.ic b/storage/innobase/include/trx0rseg.ic index eed487176e8..687a1d5b8d8 100644 --- a/storage/innobase/include/trx0rseg.ic +++ b/storage/innobase/include/trx0rseg.ic @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, MariaDB Corporation. +Copyright (c) 2017, 2018, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -34,28 +34,18 @@ Created 3/26/1996 Heikki Tuuri @return rollback segment header, page x-latched */ UNIV_INLINE trx_rsegf_t* -trx_rsegf_get( - ulint space, - ulint page_no, - mtr_t* mtr) +trx_rsegf_get(fil_space_t* space, ulint page_no, mtr_t* mtr) { - buf_block_t* block; - trx_rsegf_t* header; - - ut_ad(space <= srv_undo_space_id_start + srv_undo_tablespaces_active - || space == SRV_TMP_SPACE_ID + ut_ad(space == fil_system.sys_space || space == fil_system.temp_space + || srv_is_undo_tablespace(space->id) || !srv_was_started); - ut_ad(space <= srv_undo_space_id_start + TRX_SYS_MAX_UNDO_SPACES - || space == SRV_TMP_SPACE_ID); - block = buf_page_get( - page_id_t(space, page_no), univ_page_size, RW_X_LATCH, mtr); + buf_block_t* block = buf_page_get(page_id_t(space->id, page_no), + univ_page_size, RW_X_LATCH, mtr); buf_block_dbg_add_level(block, SYNC_RSEG_HEADER); - header = TRX_RSEG + buf_block_get_frame(block); - - return(header); + return TRX_RSEG + block->frame; } /** Gets a newly created rollback segment header. @@ -88,23 +78,6 @@ trx_rsegf_get_new( } /***************************************************************//** -Gets the file page number of the nth undo log slot. -@return page number of the undo log segment */ -UNIV_INLINE -ulint -trx_rsegf_get_nth_undo( -/*===================*/ - trx_rsegf_t* rsegf, /*!< in: rollback segment header */ - ulint n, /*!< in: index of slot */ - mtr_t* mtr) /*!< in: mtr */ -{ - ut_a(n < TRX_RSEG_N_SLOTS); - - return(mtr_read_ulint(rsegf + TRX_RSEG_UNDO_SLOTS - + n * TRX_RSEG_SLOT_SIZE, MLOG_4BYTES, mtr)); -} - -/***************************************************************//** Sets the file page number of the nth undo log slot. */ UNIV_INLINE void @@ -126,10 +99,7 @@ Looks for a free slot for an undo log segment. @return slot index or ULINT_UNDEFINED if not found */ UNIV_INLINE ulint -trx_rsegf_undo_find_free( -/*=====================*/ - trx_rsegf_t* rsegf, /*!< in: rollback segment header */ - mtr_t* mtr) /*!< in: mtr */ +trx_rsegf_undo_find_free(const trx_rsegf_t* rsegf) { ulint i; ulint page_no; @@ -143,7 +113,7 @@ trx_rsegf_undo_find_free( #endif for (i = 0; i < max_slots; i++) { - page_no = trx_rsegf_get_nth_undo(rsegf, i, mtr); + page_no = trx_rsegf_get_nth_undo(rsegf, i); if (page_no == FIL_NULL) { return(i); diff --git a/storage/innobase/include/trx0sys.h b/storage/innobase/include/trx0sys.h index c4b1636cfd2..35ac8e12001 100644 --- a/storage/innobase/include/trx0sys.h +++ b/storage/innobase/include/trx0sys.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2019, MariaDB Corporation. +Copyright (c) 2017, 2021, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -33,7 +33,6 @@ Created 3/26/1996 Heikki Tuuri #include "mem0mem.h" #include "mtr0mtr.h" #include "ut0byte.h" -#include "mem0mem.h" #include "ut0lst.h" #include "read0types.h" #include "page0types.h" @@ -45,173 +44,81 @@ Created 3/26/1996 Heikki Tuuri typedef UT_LIST_BASE_NODE_T(trx_t) trx_ut_list_t; -// Forward declaration -class MVCC; -class ReadView; - -/** The transaction system */ -extern trx_sys_t* trx_sys; - /** Checks if a page address is the trx sys header page. @param[in] page_id page id @return true if trx sys header page */ -inline bool trx_sys_hdr_page(const page_id_t page_id); - -/** Initialize the transaction system main-memory data structures. */ -void trx_sys_init_at_db_start(); +inline bool trx_sys_hdr_page(const page_id_t& page_id) +{ + return(page_id.space() == TRX_SYS_SPACE + && page_id.page_no() == TRX_SYS_PAGE_NO); +} /*****************************************************************//** -Creates the trx_sys instance and initializes purge_queue and mutex. */ -void -trx_sys_create(void); -/*================*/ -/*****************************************************************//** Creates and initializes the transaction system at the database creation. */ void trx_sys_create_sys_pages(void); /*==========================*/ -/** @return an unallocated rollback segment slot in the TRX_SYS header +/** Find an available rollback segment. +@param[in] sys_header +@return an unallocated rollback segment slot in the TRX_SYS header @retval ULINT_UNDEFINED if not found */ ulint -trx_sysf_rseg_find_free(mtr_t* mtr); -/**********************************************************************//** -Gets a pointer to the transaction system file copy and x-locks its page. -@return pointer to system file copy, page x-locked */ -UNIV_INLINE -trx_sysf_t* -trx_sysf_get( -/*=========*/ - mtr_t* mtr); /*!< in: mtr */ -/*****************************************************************//** -Gets the space of the nth rollback segment slot in the trx system -file copy. -@return space id */ -UNIV_INLINE -ulint -trx_sysf_rseg_get_space( -/*====================*/ - trx_sysf_t* sys_header, /*!< in: trx sys file copy */ - ulint i, /*!< in: slot index == rseg id */ - mtr_t* mtr); /*!< in: mtr */ -/*****************************************************************//** -Gets the page number of the nth rollback segment slot in the trx system -file copy. -@return page number, FIL_NULL if slot unused */ -UNIV_INLINE -ulint -trx_sysf_rseg_get_page_no( -/*======================*/ - trx_sysf_t* sys_header, /*!< in: trx sys file copy */ - ulint i, /*!< in: slot index == rseg id */ - mtr_t* mtr); /*!< in: mtr */ -/*****************************************************************//** -Sets the space id of the nth rollback segment slot in the trx system -file copy. */ -UNIV_INLINE -void -trx_sysf_rseg_set_space( -/*====================*/ - trx_sysf_t* sys_header, /*!< in: trx sys file copy */ - ulint i, /*!< in: slot index == rseg id */ - ulint space, /*!< in: space id */ - mtr_t* mtr); /*!< in: mtr */ -/*****************************************************************//** -Sets the page number of the nth rollback segment slot in the trx system -file copy. 
*/ -UNIV_INLINE -void -trx_sysf_rseg_set_page_no( -/*======================*/ - trx_sysf_t* sys_header, /*!< in: trx sys file copy */ - ulint i, /*!< in: slot index == rseg id */ - ulint page_no, /*!< in: page number, FIL_NULL if - the slot is reset to unused */ - mtr_t* mtr); /*!< in: mtr */ -/*****************************************************************//** -Allocates a new transaction id. -@return new, allocated trx id */ -UNIV_INLINE -trx_id_t -trx_sys_get_new_trx_id(); -/*===================*/ -/*****************************************************************//** -Determines the maximum transaction id. -@return maximum currently allocated trx id; will be stale after the -next call to trx_sys_get_new_trx_id() */ -UNIV_INLINE -trx_id_t -trx_sys_get_max_trx_id(void); -/*========================*/ +trx_sys_rseg_find_free(const buf_block_t* sys_header); +/** Request the TRX_SYS page. +@param[in] rw whether to lock the page for writing +@return the TRX_SYS page +@retval NULL if the page cannot be read */ +inline +buf_block_t* +trx_sysf_get(mtr_t* mtr, bool rw = true) +{ + buf_block_t* block = buf_page_get( + page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO), + univ_page_size, rw ? RW_X_LATCH : RW_S_LATCH, mtr); + if (block) { + buf_block_dbg_add_level(block, SYNC_TRX_SYS_HEADER); + } + return block; +} #ifdef UNIV_DEBUG /* Flag to control TRX_RSEG_N_SLOTS behavior debugging. */ extern uint trx_rseg_n_slots_debug; #endif -/*****************************************************************//** -Writes a trx id to an index page. In case that the id size changes in -some future version, this function should be used instead of -mach_write_... */ +/** Write DB_TRX_ID. +@param[out] db_trx_id the DB_TRX_ID field to be written to +@param[in] id transaction ID */ UNIV_INLINE void -trx_write_trx_id( -/*=============*/ - byte* ptr, /*!< in: pointer to memory where written */ - trx_id_t id); /*!< in: id */ -/*****************************************************************//** -Reads a trx id from an index page. In case that the id size changes in -some future version, this function should be used instead of -mach_read_... +trx_write_trx_id(byte* db_trx_id, trx_id_t id) +{ + compile_time_assert(DATA_TRX_ID_LEN == 6); + mach_write_to_6(db_trx_id, id); +} + +/** Read a transaction identifier. @return id */ -UNIV_INLINE +inline trx_id_t -trx_read_trx_id( -/*============*/ - const byte* ptr); /*!< in: pointer to memory from where to read */ -/****************************************************************//** -Looks for the trx instance with the given id in the rw trx_list. -@return the trx handle or NULL if not found */ -UNIV_INLINE -trx_t* -trx_get_rw_trx_by_id( -/*=================*/ - trx_id_t trx_id);/*!< in: trx id to search for */ -/****************************************************************//** -Returns the minimum trx id in rw trx list. This is the smallest id for which -the trx can possibly be active. (But, you must look at the trx->state to -find out if the minimum trx id transaction itself is active, or already -committed.) -@return the minimum trx id, or trx_sys->max_trx_id if the trx list is empty */ -UNIV_INLINE -trx_id_t -trx_rw_min_trx_id(void); -/*===================*/ -/** Look up a rw transaction with the given id. -@param[in] trx_id transaction identifier -@param[out] corrupt flag that will be set if trx_id is corrupted -@return transaction; its state should be rechecked after acquiring trx_t::mutex -@retval NULL if there is no transaction identified by trx_id. 
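trx_write_trx_id() and trx_read_trx_id() above fix DB_TRX_ID to a 6-byte (48-bit) on-page value written with mach_write_to_6()/mach_read_from_6(), which like the rest of the InnoDB on-disk format is big-endian. A stand-alone round-trip sketch of that encoding without the mach_* helpers:

#include <cassert>
#include <cstdint>

inline void toy_write_6(unsigned char* b, uint64_t id)
{
    for (int i = 0; i < 6; i++)
        b[i] = static_cast<unsigned char>(id >> (8 * (5 - i)));
}

inline uint64_t toy_read_6(const unsigned char* b)
{
    uint64_t id = 0;
    for (int i = 0; i < 6; i++) id = id << 8 | b[i];
    return id;
}

int main()
{
    unsigned char db_trx_id[6];                 // DATA_TRX_ID_LEN == 6
    toy_write_6(db_trx_id, 0x112233445566ULL);
    assert(toy_read_6(db_trx_id) == 0x112233445566ULL);
}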
*/ -inline trx_t* trx_rw_is_active_low(trx_id_t trx_id, bool* corrupt); - -/** Look up a rw transaction with the given id. -@param[in] trx_id transaction identifier -@param[out] corrupt flag that will be set if trx_id is corrupted -@param[in] ref_count whether to increment trx->n_ref -@return transaction; its state should be rechecked after acquiring trx_t::mutex -@retval NULL if there is no active transaction identified by trx_id. */ -inline trx_t* trx_rw_is_active(trx_id_t trx_id, bool* corrupt, bool ref_count); - -#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG -/***********************************************************//** -Assert that a transaction has been recovered. -@return TRUE */ -UNIV_INLINE -ibool -trx_assert_recovered( -/*=================*/ - trx_id_t trx_id) /*!< in: transaction identifier */ - MY_ATTRIBUTE((warn_unused_result)); -#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ +trx_read_trx_id(const byte* ptr) +{ + compile_time_assert(DATA_TRX_ID_LEN == 6); + return(mach_read_from_6(ptr)); +} + +#ifdef UNIV_DEBUG +/** Check that the DB_TRX_ID in a record is valid. +@param[in] db_trx_id the DB_TRX_ID column to validate +@param[in] trx_id the id of the ALTER TABLE transaction */ +inline bool trx_id_check(const void* db_trx_id, trx_id_t trx_id) +{ + trx_id_t id = trx_read_trx_id(static_cast<const byte*>(db_trx_id)); + ut_ad(id == 0 || id > trx_id); + return true; +} +#endif + /*****************************************************************//** Updates the offset information about the end of the MySQL binlog entry which corresponds to the transaction just being committed. In a MySQL @@ -222,138 +129,17 @@ trx_sys_update_mysql_binlog_offset( /*===============================*/ const char* file_name,/*!< in: MySQL log file name */ int64_t offset, /*!< in: position in that log file */ - trx_sysf_t* sys_header, /*!< in: trx sys header */ - mtr_t* mtr); /*!< in: mtr */ + buf_block_t* sys_header, /*!< in,out: trx sys header */ + mtr_t* mtr); /*!< in,out: mini-transaction */ /** Display the MySQL binlog offset info if it is present in the trx system header. */ void trx_sys_print_mysql_binlog_offset(); -#ifdef WITH_WSREP - -/** Update WSREP XID info in sys_header of TRX_SYS_PAGE_NO = 5. -@param[in] xid Transaction XID -@param[in,out] sys_header sys_header -@param[in] mtr minitransaction */ -UNIV_INTERN -void -trx_sys_update_wsrep_checkpoint( - const XID* xid, - trx_sysf_t* sys_header, - mtr_t* mtr); - -/** Read WSREP checkpoint XID from sys header. -@param[out] xid WSREP XID -@return whether the checkpoint was present */ -UNIV_INTERN -bool -trx_sys_read_wsrep_checkpoint(XID* xid); -#endif /* WITH_WSREP */ - -/** Initializes the tablespace tag system. */ -void -trx_sys_file_format_init(void); -/*==========================*/ - -/*****************************************************************//** -Closes the tablespace tag system. */ -void -trx_sys_file_format_close(void); -/*===========================*/ - -/********************************************************************//** -Tags the system table space with minimum format id if it has not been -tagged yet. -WARNING: This function is only called during the startup and AFTER the -redo log application during recovery has finished. */ -void -trx_sys_file_format_tag_init(void); -/*==============================*/ -/*****************************************************************//** -Shutdown/Close the transaction system. 
*/ -void -trx_sys_close(void); -/*===============*/ -/*****************************************************************//** -Get the name representation of the file format from its id. -@return pointer to the name */ -const char* -trx_sys_file_format_id_to_name( -/*===========================*/ - const ulint id); /*!< in: id of the file format */ -/*****************************************************************//** -Set the file format id unconditionally except if it's already the -same value. -@return TRUE if value updated */ -ibool -trx_sys_file_format_max_set( -/*========================*/ - ulint format_id, /*!< in: file format id */ - const char** name); /*!< out: max file format name or - NULL if not needed. */ /** Create the rollback segments. @return whether the creation succeeded */ bool trx_sys_create_rsegs(); -/*****************************************************************//** -Get the number of transaction in the system, independent of their state. -@return count of transactions in trx_sys_t::trx_list */ -UNIV_INLINE -ulint -trx_sys_get_n_rw_trx(void); -/*======================*/ - -/********************************************************************* -Check if there are any active (non-prepared) transactions. -@return total number of active transactions or 0 if none */ -ulint -trx_sys_any_active_transactions(void); -/*=================================*/ -/*****************************************************************//** -Get the name representation of the file format from its id. -@return pointer to the max format name */ -const char* -trx_sys_file_format_max_get(void); -/*=============================*/ -/*****************************************************************//** -Check for the max file format tag stored on disk. -@return DB_SUCCESS or error code */ -dberr_t -trx_sys_file_format_max_check( -/*==========================*/ - ulint max_format_id); /*!< in: the max format id to check */ -/********************************************************************//** -Update the file format tag in the system tablespace only if the given -format id is greater than the known max id. -@return TRUE if format_id was bigger than the known max id */ -ibool -trx_sys_file_format_max_upgrade( -/*============================*/ - const char** name, /*!< out: max file format name */ - ulint format_id); /*!< in: file format identifier */ -/*****************************************************************//** -Get the name representation of the file format from its id. -@return pointer to the name */ -const char* -trx_sys_file_format_id_to_name( -/*===========================*/ - const ulint id); /*!< in: id of the file format */ - -/** -Add the transaction to the RW transaction set -@param trx transaction instance to add */ -UNIV_INLINE -void -trx_sys_rw_trx_add(trx_t* trx); - -#ifdef UNIV_DEBUG -/*************************************************************//** -Validate the trx_sys_t::rw_trx_list. 
-@return true if the list is valid */ -bool -trx_sys_validate_trx_list(); -/*========================*/ -#endif /* UNIV_DEBUG */ /** The automatically created system rollback segment has this id */ #define TRX_SYS_SYSTEM_RSEG_ID 0 @@ -363,18 +149,13 @@ trx_sys_validate_trx_list(); /** Transaction system header */ /*------------------------------------------------------------- @{ */ -#define TRX_SYS_TRX_ID_STORE 0 /*!< the maximum trx id or trx - number modulo - TRX_SYS_TRX_ID_UPDATE_MARGIN - written to a file page by any - transaction; the assignment of - transaction ids continues from - this number rounded up by - TRX_SYS_TRX_ID_UPDATE_MARGIN - plus - TRX_SYS_TRX_ID_UPDATE_MARGIN - when the database is - started */ +/** In old versions of InnoDB, this persisted the value of +trx_sys.get_max_trx_id(). Starting with MariaDB 10.3.5, +the field TRX_RSEG_MAX_TRX_ID in rollback segment header pages +and the fields TRX_UNDO_TRX_ID, TRX_UNDO_TRX_NO in undo log pages +are used instead. The field only exists for the purpose of upgrading +from older MySQL or MariaDB versions. */ +#define TRX_SYS_TRX_ID_STORE 0 #define TRX_SYS_FSEG_HEADER 8 /*!< segment header for the tablespace segment the trx system is created into */ @@ -384,16 +165,52 @@ trx_sys_validate_trx_list(); slots */ /*------------------------------------------------------------- @} */ -/* Max number of rollback segments: the number of segment specification slots -in the transaction system array; rollback segment id must fit in one (signed) -byte, therefore 128; each slot is currently 8 bytes in size. If you want -to raise the level to 256 then you will need to fix some assertions that -impose the 7 bit restriction. e.g., mach_write_to_3() */ +/** The number of rollback segments; rollback segment id must fit in +the 7 bits reserved for it in DB_ROLL_PTR. */ #define TRX_SYS_N_RSEGS 128 /** Maximum number of undo tablespaces (not counting the system tablespace) */ #define TRX_SYS_MAX_UNDO_SPACES (TRX_SYS_N_RSEGS - 1) -/** Maximum length of MySQL binlog file name, in bytes. */ +/* Rollback segment specification slot offsets */ + +/** the tablespace ID of an undo log header; starting with +MySQL/InnoDB 5.1.7, this is FIL_NULL if the slot is unused */ +#define TRX_SYS_RSEG_SPACE 0 +/** the page number of an undo log header, or FIL_NULL if unused */ +#define TRX_SYS_RSEG_PAGE_NO 4 +/** Size of a rollback segment specification slot */ +#define TRX_SYS_RSEG_SLOT_SIZE 8 + +/** Read the tablespace ID of a rollback segment slot. +@param[in] sys_header TRX_SYS page +@param[in] rseg_id rollback segment identifier +@return undo tablespace id */ +inline +uint32_t +trx_sysf_rseg_get_space(const buf_block_t* sys_header, ulint rseg_id) +{ + ut_ad(rseg_id < TRX_SYS_N_RSEGS); + return mach_read_from_4(TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_SPACE + + rseg_id * TRX_SYS_RSEG_SLOT_SIZE + + sys_header->frame); +} + +/** Read the page number of a rollback segment slot. +@param[in] sys_header TRX_SYS page +@param[in] rseg_id rollback segment identifier +@return undo page number */ +inline +uint32_t +trx_sysf_rseg_get_page_no(const buf_block_t* sys_header, ulint rseg_id) +{ + ut_ad(rseg_id < TRX_SYS_N_RSEGS); + return mach_read_from_4(TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_PAGE_NO + + rseg_id * TRX_SYS_RSEG_SLOT_SIZE + + sys_header->frame); +} + +/** Maximum length of MySQL binlog file name, in bytes. +(Used before MariaDB 10.3.5.) 
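As a usage illustration of the slot layout and the inline readers above, the sketch below scans the TRX_SYS page for an unused rollback segment slot inside a mini-transaction; a slot is unused when its page number is FIL_NULL. This is only a hedged sketch of the idea behind trx_sys_rseg_find_free(), not its actual body; the caller is assumed to have started the mini-transaction.

static ulint find_free_rseg_slot(mtr_t* mtr)
{
	if (const buf_block_t* sys_header = trx_sysf_get(mtr, false)) {
		for (ulint rseg_id = 0; rseg_id < TRX_SYS_N_RSEGS; rseg_id++) {
			if (trx_sysf_rseg_get_page_no(sys_header, rseg_id)
			    == FIL_NULL) {
				return rseg_id;	/* unused slot */
			}
		}
	}

	return ULINT_UNDEFINED;	/* all slots in use, or page unreadable */
}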
*/ #define TRX_SYS_MYSQL_LOG_NAME_LEN 512 /** Contents of TRX_SYS_MYSQL_LOG_MAGIC_N_FLD */ #define TRX_SYS_MYSQL_LOG_MAGIC_N 873422344 @@ -402,7 +219,7 @@ impose the 7 bit restriction. e.g., mach_write_to_3() */ # error "UNIV_PAGE_SIZE_MIN < 4096" #endif /** The offset of the MySQL binlog offset info in the trx system header */ -#define TRX_SYS_MYSQL_LOG_INFO (UNIV_PAGE_SIZE - 1000) +#define TRX_SYS_MYSQL_LOG_INFO (srv_page_size - 1000) #define TRX_SYS_MYSQL_LOG_MAGIC_N_FLD 0 /*!< magic number which is TRX_SYS_MYSQL_LOG_MAGIC_N if we have valid data in the @@ -411,7 +228,7 @@ impose the 7 bit restriction. e.g., mach_write_to_3() */ within that file */ #define TRX_SYS_MYSQL_LOG_NAME 12 /*!< MySQL log file name */ -/** Memory map TRX_SYS_PAGE_NO = 5 when UNIV_PAGE_SIZE = 4096 +/** Memory map TRX_SYS_PAGE_NO = 5 when srv_page_size = 4096 0...37 FIL_HEADER 38...45 TRX_SYS_TRX_ID_STORE @@ -427,7 +244,7 @@ impose the 7 bit restriction. e.g., mach_write_to_3() */ ... ...1063 TRX_SYS_RSEG_PAGE_NO for slot 126 -(UNIV_PAGE_SIZE-3500 WSREP ::: FAIL would overwrite undo tablespace +(srv_page_size-3500 WSREP ::: FAIL would overwrite undo tablespace space_id, page_no pairs :::) 596 TRX_SYS_WSREP_XID_INFO TRX_SYS_WSREP_XID_MAGIC_N_FLD 600 TRX_SYS_WSREP_XID_FORMAT @@ -437,7 +254,7 @@ space_id, page_no pairs :::) 739 TRX_SYS_WSREP_XID_DATA_END FIXED WSREP XID info offsets for 4k page size 10.0.32-galera -(UNIV_PAGE_SIZE-2500) +(srv_page_size-2500) 1596 TRX_SYS_WSREP_XID_INFO TRX_SYS_WSREP_XID_MAGIC_N_FLD 1600 TRX_SYS_WSREP_XID_FORMAT 1604 TRX_SYS_WSREP_XID_GTRID_LEN @@ -445,19 +262,19 @@ FIXED WSREP XID info offsets for 4k page size 10.0.32-galera 1612 TRX_SYS_WSREP_XID_DATA (len = 128) 1739 TRX_SYS_WSREP_XID_DATA_END -(UNIV_PAGE_SIZE - 2000 MYSQL MASTER LOG) +(srv_page_size - 2000 MYSQL MASTER LOG) 2096 TRX_SYS_MYSQL_MASTER_LOG_INFO TRX_SYS_MYSQL_LOG_MAGIC_N_FLD 2100 TRX_SYS_MYSQL_LOG_OFFSET_HIGH 2104 TRX_SYS_MYSQL_LOG_OFFSET_LOW 2108 TRX_SYS_MYSQL_LOG_NAME -(UNIV_PAGE_SIZE - 1000 MYSQL LOG) +(srv_page_size - 1000 MYSQL LOG) 3096 TRX_SYS_MYSQL_LOG_INFO TRX_SYS_MYSQL_LOG_MAGIC_N_FLD 3100 TRX_SYS_MYSQL_LOG_OFFSET_HIGH 3104 TRX_SYS_MYSQL_LOG_OFFSET_LOW 3108 TRX_SYS_MYSQL_LOG_NAME -(UNIV_PAGE_SIZE - 200 DOUBLEWRITE) +(srv_page_size - 200 DOUBLEWRITE) 3896 TRX_SYS_DOUBLEWRITE TRX_SYS_DOUBLEWRITE_FSEG 3906 TRX_SYS_DOUBLEWRITE_MAGIC 3910 TRX_SYS_DOUBLEWRITE_BLOCK1 @@ -465,12 +282,12 @@ FIXED WSREP XID info offsets for 4k page size 10.0.32-galera 3918 TRX_SYS_DOUBLEWRITE_REPEAT 3930 TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N -(UNIV_PAGE_SIZE - 8, TAILER) +(srv_page_size - 8, TAILER) 4088..4096 FIL_TAILER */ #ifdef WITH_WSREP -/** The offset to WSREP XID headers */ +/** The offset to WSREP XID headers (used before MariaDB 10.3.5) */ #define TRX_SYS_WSREP_XID_INFO std::max(srv_page_size - 3500, 1596UL) #define TRX_SYS_WSREP_XID_MAGIC_N_FLD 0 #define TRX_SYS_WSREP_XID_MAGIC_N 0x77737265 @@ -486,7 +303,7 @@ FIXED WSREP XID info offsets for 4k page size 10.0.32-galera /** Doublewrite buffer */ /* @{ */ /** The offset of the doublewrite buffer header on the trx system header page */ -#define TRX_SYS_DOUBLEWRITE (UNIV_PAGE_SIZE - 200) +#define TRX_SYS_DOUBLEWRITE (srv_page_size - 200) /*-------------------------------------------------------------*/ #define TRX_SYS_DOUBLEWRITE_FSEG 0 /*!< fseg header of the fseg containing the doublewrite @@ -534,83 +351,489 @@ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID. 
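The numeric offsets in the memory map above correspond to srv_page_size = 4096; for other page sizes they shift according to the formulas repeated below. A small standalone computation in plain C++ (constants mirrored from this header) reproduces the documented values:

#include <algorithm>
#include <cstdio>
#include <initializer_list>

int main()
{
	for (unsigned long page_size : {4096UL, 16384UL}) {
		unsigned long xid = std::max(page_size - 3500, 1596UL);
		unsigned long binlog = page_size - 1000;	/* TRX_SYS_MYSQL_LOG_INFO */
		unsigned long dblwr = page_size - 200;	/* TRX_SYS_DOUBLEWRITE */
		std::printf("%lu:\tWSREP XID %lu\tbinlog %lu\tdoublewrite %lu\n",
			    page_size, xid, binlog, dblwr);
		/* prints 4096: WSREP XID 1596, binlog 3096, doublewrite 3896,
		matching the map above */
	}
}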
*/ #define TRX_SYS_DOUBLEWRITE_BLOCK_SIZE FSP_EXTENT_SIZE /* @} */ -/** File format tag */ -/* @{ */ -/** The offset of the file format tag on the trx system header page -(TRX_SYS_PAGE_NO of TRX_SYS_SPACE) */ -#define TRX_SYS_FILE_FORMAT_TAG (UNIV_PAGE_SIZE - 16) - -/** Contents of TRX_SYS_FILE_FORMAT_TAG when valid. The file format -identifier is added to this constant. */ -#define TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_LOW 3645922177UL -/** Contents of TRX_SYS_FILE_FORMAT_TAG+4 when valid */ -#define TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_HIGH 2745987765UL -/** Contents of TRX_SYS_FILE_FORMAT_TAG when valid. The file format -identifier is added to this 64-bit constant. */ -#define TRX_SYS_FILE_FORMAT_TAG_MAGIC_N \ - ((ib_uint64_t) TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_HIGH << 32 \ - | TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_LOW) -/* @} */ +trx_t* current_trx(); + +struct rw_trx_hash_element_t +{ + rw_trx_hash_element_t(): trx(0) + { + mutex_create(LATCH_ID_RW_TRX_HASH_ELEMENT, &mutex); + } + + + ~rw_trx_hash_element_t() + { + mutex_free(&mutex); + } + + + trx_id_t id; /* lf_hash_init() relies on this to be first in the struct */ + trx_id_t no; + trx_t *trx; + ib_mutex_t mutex; +}; + + +/** + Wrapper around LF_HASH to store set of in memory read-write transactions. +*/ + +class rw_trx_hash_t +{ + LF_HASH hash; + + + /** + Constructor callback for lock-free allocator. + + Object is just allocated and is not yet accessible via rw_trx_hash by + concurrent threads. Object can be reused multiple times before it is freed. + Every time object is being reused initializer() callback is called. + */ + + static void rw_trx_hash_constructor(uchar *arg) + { + new(arg + LF_HASH_OVERHEAD) rw_trx_hash_element_t(); + } + + + /** + Destructor callback for lock-free allocator. + + Object is about to be freed and is not accessible via rw_trx_hash by + concurrent threads. + */ + + static void rw_trx_hash_destructor(uchar *arg) + { + reinterpret_cast<rw_trx_hash_element_t*> + (arg + LF_HASH_OVERHEAD)->~rw_trx_hash_element_t(); + } + + + /** + Destructor callback for lock-free allocator. + + This destructor is used at shutdown. It frees remaining transaction + objects. + + XA PREPARED transactions may remain if they haven't been committed or + rolled back. ACTIVE transactions may remain if startup was interrupted or + server is running in read-only mode or for certain srv_force_recovery + levels. + */ + + static void rw_trx_hash_shutdown_destructor(uchar *arg) + { + rw_trx_hash_element_t *element= + reinterpret_cast<rw_trx_hash_element_t*>(arg + LF_HASH_OVERHEAD); + if (trx_t *trx= element->trx) + { + ut_ad(trx_state_eq(trx, TRX_STATE_PREPARED) || + trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED) || + (trx_state_eq(trx, TRX_STATE_ACTIVE) && + (!srv_was_started || + srv_read_only_mode || + srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO))); + trx_free_at_shutdown(trx); + } + element->~rw_trx_hash_element_t(); + } + + + /** + Initializer callback for lock-free hash. + + Object is not yet accessible via rw_trx_hash by concurrent threads, but is + about to become such. Object id can be changed only by this callback and + remains the same until all pins to this object are released. + + Object trx can be changed to 0 by erase() under object mutex protection, + which indicates it is about to be removed from lock-free hash and become + not accessible by concurrent threads. 
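The removal protocol mentioned in the initializer comment above (clear the payload under the element mutex first, unlink the element from the lock-free hash second) is what lets a concurrent lookup that already pinned the element notice the removal. A conceptual standalone sketch in plain C++, not the LF_HASH implementation:

#include <mutex>

struct element_t {
	std::mutex	mutex;
	void*		payload;	/* stands in for rw_trx_hash_element_t::trx */
};

/* What a concurrent reader does once it holds a pinned element. */
static void* lookup(element_t* e)
{
	std::lock_guard<std::mutex> g(e->mutex);
	return e->payload;	/* nullptr means "being erased": give up */
}

/* First step of removal: make the element logically empty. */
static void logical_erase(element_t* e)
{
	std::lock_guard<std::mutex> g(e->mutex);
	e->payload = nullptr;
	/* Second step (not shown): delete the element from the hash itself;
	pinned readers may still dereference it safely until they unpin. */
}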
+ */ + + static void rw_trx_hash_initializer(LF_HASH *, + rw_trx_hash_element_t *element, + trx_t *trx) + { + ut_ad(element->trx == 0); + element->trx= trx; + element->id= trx->id; + element->no= TRX_ID_MAX; + trx->rw_trx_hash_element= element; + } + + + /** + Gets LF_HASH pins. + + Pins are used to protect object from being destroyed or reused. They are + normally stored in trx object for quick access. If caller doesn't have trx + available, we try to get it using currnet_trx(). If caller doesn't have trx + at all, temporary pins are allocated. + */ + + LF_PINS *get_pins(trx_t *trx) + { + if (!trx->rw_trx_hash_pins) + { + trx->rw_trx_hash_pins= lf_hash_get_pins(&hash); + ut_a(trx->rw_trx_hash_pins); + } + return trx->rw_trx_hash_pins; + } + + + struct eliminate_duplicates_arg + { + trx_ids_t ids; + my_hash_walk_action action; + void *argument; + eliminate_duplicates_arg(size_t size, my_hash_walk_action act, void* arg): + action(act), argument(arg) { ids.reserve(size); } + }; + + + static my_bool eliminate_duplicates(rw_trx_hash_element_t *element, + eliminate_duplicates_arg *arg) + { + for (trx_ids_t::iterator it= arg->ids.begin(); it != arg->ids.end(); it++) + { + if (*it == element->id) + return 0; + } + arg->ids.push_back(element->id); + return arg->action(element, arg->argument); + } + -/** The transaction system central memory data structure. */ -struct trx_sys_t { - - TrxSysMutex mutex; /*!< mutex protecting most fields in - this structure except when noted - otherwise */ - - MVCC* mvcc; /*!< Multi version concurrency control - manager */ - volatile trx_id_t - max_trx_id; /*!< The smallest number not yet - assigned as a transaction id or - transaction number. This is declared - volatile because it can be accessed - without holding any mutex during - AC-NL-RO view creation. */ - trx_ut_list_t serialisation_list; - /*!< Ordered on trx_t::no of all the - currenrtly active RW transactions */ #ifdef UNIV_DEBUG - trx_id_t rw_max_trx_id; /*!< Max trx id of read-write - transactions which exist or existed */ -#endif /* UNIV_DEBUG */ - - /** Avoid false sharing */ - const char pad1[CACHE_LINE_SIZE]; - trx_ut_list_t rw_trx_list; /*!< List of active and committed in - memory read-write transactions, sorted - on trx id, biggest first. Recovered - transactions are always on this list. */ - - /** Avoid false sharing */ - const char pad2[CACHE_LINE_SIZE]; - trx_ut_list_t mysql_trx_list; /*!< List of transactions created - for MySQL. All user transactions are - on mysql_trx_list. The rw_trx_list - can contain system transactions and - recovered transactions that will not - be in the mysql_trx_list. - mysql_trx_list may additionally contain - transactions that have not yet been - started in InnoDB. */ - - trx_ids_t rw_trx_ids; /*!< Array of Read write transaction IDs - for MVCC snapshot. A ReadView would take - a snapshot of these transactions whose - changes are not visible to it. We should - remove transactions from the list before - committing in memory and releasing locks - to ensure right order of removal and - consistent snapshot. 
*/ - - /** Avoid false sharing */ - const char pad3[CACHE_LINE_SIZE]; + static void validate_element(trx_t *trx) + { + ut_ad(!trx->read_only || !trx->rsegs.m_redo.rseg); + ut_ad(!trx_is_autocommit_non_locking(trx)); + /* trx->state can be anything except TRX_STATE_NOT_STARTED */ + mutex_enter(&trx->mutex); + ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE) || + trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY) || + trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED) || + trx_state_eq(trx, TRX_STATE_PREPARED)); + mutex_exit(&trx->mutex); + } + + + struct debug_iterator_arg + { + my_hash_walk_action action; + void *argument; + }; + + + static my_bool debug_iterator(rw_trx_hash_element_t *element, + debug_iterator_arg *arg) + { + mutex_enter(&element->mutex); + if (element->trx) + validate_element(element->trx); + mutex_exit(&element->mutex); + return arg->action(element, arg->argument); + } +#endif + + +public: + void init() + { + lf_hash_init(&hash, sizeof(rw_trx_hash_element_t), LF_HASH_UNIQUE, 0, + sizeof(trx_id_t), 0, &my_charset_bin); + hash.alloc.constructor= rw_trx_hash_constructor; + hash.alloc.destructor= rw_trx_hash_destructor; + hash.initializer= + reinterpret_cast<lf_hash_initializer>(rw_trx_hash_initializer); + } + + + void destroy() + { + hash.alloc.destructor= rw_trx_hash_shutdown_destructor; + lf_hash_destroy(&hash); + } + + + /** + Releases LF_HASH pins. + + Must be called by thread that owns trx_t object when the latter is being + "detached" from thread (e.g. released to the pool by trx_t::free()). Can be + called earlier if thread is expected not to use rw_trx_hash. + + Since pins are not allowed to be transferred to another thread, + initialisation thread calls this for recovered transactions. + */ + + void put_pins(trx_t *trx) + { + if (trx->rw_trx_hash_pins) + { + lf_hash_put_pins(trx->rw_trx_hash_pins); + trx->rw_trx_hash_pins= 0; + } + } + + + /** + Finds trx object in lock-free hash with given id. + + Only ACTIVE or PREPARED trx objects may participate in hash. Nevertheless + the transaction may get committed before this method returns. + + With do_ref_count == false the caller may dereference returned trx pointer + only if lock_sys.mutex was acquired before calling find(). + + With do_ref_count == true caller may dereference trx even if it is not + holding lock_sys.mutex. Caller is responsible for calling + trx->release_reference() when it is done playing with trx. + + Ideally this method should get caller rw_trx_hash_pins along with trx + object as a parameter, similar to insert() and erase(). However most + callers lose trx early in their call chains and it is not that easy to pass + them through. + + So we take more expensive approach: get trx through current_thd()->ha_data. + Some threads don't have trx attached to THD, and at least server + initialisation thread, fts_optimize_thread, srv_master_thread, + dict_stats_thread, srv_monitor_thread, btr_defragment_thread don't even + have THD at all. For such cases we allocate pins only for duration of + search and free them immediately. + + This has negative performance impact and should be fixed eventually (by + passing caller_trx as a parameter). Still stream of DML is more or less Ok. + + @return + @retval 0 not found + @retval pointer to trx + */ + + trx_t *find(trx_t *caller_trx, trx_id_t trx_id, bool do_ref_count) + { + /* + In MariaDB 10.3, purge will reset DB_TRX_ID to 0 + when the history is lost. 
Read/write transactions will + always have a nonzero trx_t::id; there the value 0 is + reserved for transactions that did not write or lock + anything yet. + + The caller should already have handled trx_id==0 specially. + */ + ut_ad(trx_id); + ut_ad(!caller_trx || caller_trx->id != trx_id || !do_ref_count); + + trx_t *trx= 0; + LF_PINS *pins= caller_trx ? get_pins(caller_trx) : lf_hash_get_pins(&hash); + ut_a(pins); + + rw_trx_hash_element_t *element= reinterpret_cast<rw_trx_hash_element_t*> + (lf_hash_search(&hash, pins, reinterpret_cast<const void*>(&trx_id), + sizeof(trx_id_t))); + if (element) + { + mutex_enter(&element->mutex); + lf_hash_search_unpin(pins); + if ((trx= element->trx)) { + DBUG_ASSERT(trx_id == trx->id); + ut_d(validate_element(trx)); + if (do_ref_count) + { + /* + We have an early state check here to avoid committer + starvation in a wait loop for transaction references, + when there's a stream of trx_sys.find() calls from other + threads. The trx->state may change to COMMITTED after + trx->mutex is released, and it will have to be rechecked + by the caller after reacquiring the mutex. + */ + trx_mutex_enter(trx); + const trx_state_t state= trx->state; + trx_mutex_exit(trx); + if (state == TRX_STATE_COMMITTED_IN_MEMORY) + trx= NULL; + else + trx->reference(); + } + } + mutex_exit(&element->mutex); + } + if (!caller_trx) + lf_hash_put_pins(pins); + return trx; + } + + + /** + Inserts trx to lock-free hash. + + Object becomes accessible via rw_trx_hash. + */ + + void insert(trx_t *trx) + { + ut_d(validate_element(trx)); + int res= lf_hash_insert(&hash, get_pins(trx), + reinterpret_cast<void*>(trx)); + ut_a(res == 0); + } + + + /** + Removes trx from lock-free hash. + + Object becomes not accessible via rw_trx_hash. But it still can be pinned + by concurrent find(), which is supposed to release it immediately after + it sees object trx is 0. + */ + + void erase(trx_t *trx) + { + ut_d(validate_element(trx)); + mutex_enter(&trx->rw_trx_hash_element->mutex); + trx->rw_trx_hash_element->trx= 0; + mutex_exit(&trx->rw_trx_hash_element->mutex); + int res= lf_hash_delete(&hash, get_pins(trx), + reinterpret_cast<const void*>(&trx->id), + sizeof(trx_id_t)); + ut_a(res == 0); + } + + + /** + Returns the number of elements in the hash. + + The number is exact only if hash is protected against concurrent + modifications (e.g. single threaded startup or hash is protected + by some mutex). Otherwise the number may be used as a hint only, + because it may change even before this method returns. + */ + + uint32_t size() + { + return uint32_t(my_atomic_load32_explicit(&hash.count, + MY_MEMORY_ORDER_RELAXED)); + } + + + /** + Iterates the hash. + + @param caller_trx used to get/set pins + @param action called for every element in hash + @param argument opque argument passed to action + + May return the same element multiple times if hash is under contention. + If caller doesn't like to see the same transaction multiple times, it has + to call iterate_no_dups() instead. + + May return element with committed transaction. If caller doesn't like to + see committed transactions, it has to skip those under element mutex: + + mutex_enter(&element->mutex); + if (trx_t trx= element->trx) + { + // trx is protected against commit in this branch + } + mutex_exit(&element->mutex); + + May miss concurrently inserted transactions. 
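As a caller-side illustration of the reference-counting contract spelled out above, the hedged sketch below (an assumed helper, not part of this header) looks up a read-write transaction by identifier, keeps it alive without holding lock_sys.mutex, and releases the reference when done.

static bool trx_id_refers_to_registered_rw(trx_t* caller_trx, trx_id_t id)
{
	if (!id) {
		return false;	/* 0 is reserved: nothing written or locked */
	}

	if (trx_t* trx = trx_sys.rw_trx_hash.find(caller_trx, id, true)) {
		/* The state may still change to committed; a real caller
		would recheck it under trx->mutex before relying on it.
		Here we only report that the transaction was registered. */
		trx->release_reference();
		return true;
	}

	return false;
}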
+ + @return + @retval 0 iteration completed successfully + @retval 1 iteration was interrupted (action returned 1) + */ + + int iterate(trx_t *caller_trx, my_hash_walk_action action, void *argument) + { + LF_PINS *pins= caller_trx ? get_pins(caller_trx) : lf_hash_get_pins(&hash); + ut_a(pins); +#ifdef UNIV_DEBUG + debug_iterator_arg debug_arg= { action, argument }; + action= reinterpret_cast<my_hash_walk_action>(debug_iterator); + argument= &debug_arg; +#endif + int res= lf_hash_iterate(&hash, pins, action, argument); + if (!caller_trx) + lf_hash_put_pins(pins); + return res; + } + + + int iterate(my_hash_walk_action action, void *argument) + { + return iterate(current_trx(), action, argument); + } + + + /** + Iterates the hash and eliminates duplicate elements. + + @sa iterate() + */ + + int iterate_no_dups(trx_t *caller_trx, my_hash_walk_action action, + void *argument) + { + eliminate_duplicates_arg arg(size() + 32, action, argument); + return iterate(caller_trx, reinterpret_cast<my_hash_walk_action> + (eliminate_duplicates), &arg); + } + + + int iterate_no_dups(my_hash_walk_action action, void *argument) + { + return iterate_no_dups(current_trx(), action, argument); + } +}; + + +/** The transaction system central memory data structure. */ +class trx_sys_t +{ + /** + The smallest number not yet assigned as a transaction id or transaction + number. Accessed and updated with atomic operations. + */ + MY_ALIGNED(CACHE_LINE_SIZE) trx_id_t m_max_trx_id; + + + /** + Solves race conditions between register_rw() and snapshot_ids() as well as + race condition between assign_new_trx_no() and snapshot_ids(). + + @sa register_rw() + @sa assign_new_trx_no() + @sa snapshot_ids() + */ + MY_ALIGNED(CACHE_LINE_SIZE) trx_id_t m_rw_trx_hash_version; + + + /** + TRX_RSEG_HISTORY list length (number of committed transactions to purge) + */ + MY_ALIGNED(CACHE_LINE_SIZE) int32 rseg_history_len; + + bool m_initialised; + +public: + /** Mutex protecting trx_list. */ + MY_ALIGNED(CACHE_LINE_SIZE) mutable TrxSysMutex mutex; + + /** List of all transactions. */ + MY_ALIGNED(CACHE_LINE_SIZE) trx_ut_list_t trx_list; + + MY_ALIGNED(CACHE_LINE_SIZE) /** Temporary rollback segments */ trx_rseg_t* temp_rsegs[TRX_SYS_N_RSEGS]; - /** Avoid false sharing */ - const char pad4[CACHE_LINE_SIZE]; + MY_ALIGNED(CACHE_LINE_SIZE) trx_rseg_t* rseg_array[TRX_SYS_N_RSEGS]; /*!< Pointer array to rollback segments; NULL if slot not in use; @@ -618,34 +841,380 @@ struct trx_sys_t { single-threaded mode; not protected by any mutex, because it is read-only during multi-threaded operation */ - ulint rseg_history_len; - /*!< Length of the TRX_RSEG_HISTORY - list (update undo logs for committed - transactions), protected by - rseg->mutex */ - - TrxIdSet rw_trx_set; /*!< Mapping from transaction id - to transaction instance */ -}; -/** When a trx id which is zero modulo this number (which must be a power of -two) is assigned, the field TRX_SYS_TRX_ID_STORE on the transaction system -page is updated */ -#define TRX_SYS_TRX_ID_WRITE_MARGIN ((trx_id_t) 256) + /** + Lock-free hash of in memory read-write transactions. + Works faster when it is on it's own cache line (tested). 
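A hedged sketch of an iterate() callback following the element-mutex pattern documented above; count_active and its argument struct are illustrative and not part of this header. It counts elements whose transaction has not yet been committed.

struct active_counter_arg {
	ulint	count;
};

static my_bool count_active(rw_trx_hash_element_t* element,
			    active_counter_arg* arg)
{
	mutex_enter(&element->mutex);
	if (element->trx) {
		/* The transaction cannot be committed and reused
		while we hold the element mutex. */
		arg->count++;
	}
	mutex_exit(&element->mutex);
	return 0;	/* a nonzero return would stop the iteration */
}

/* Typical call, mirroring the casts used elsewhere in this header:

	active_counter_arg arg = { 0 };
	trx_sys.rw_trx_hash.iterate(
		reinterpret_cast<my_hash_walk_action>(count_active), &arg);
*/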
+ */ + + MY_ALIGNED(CACHE_LINE_SIZE) rw_trx_hash_t rw_trx_hash; + + +#ifdef WITH_WSREP + /** Latest recovered XID during startup */ + XID recovered_wsrep_xid; +#endif + /** Latest recovered binlog offset */ + uint64_t recovered_binlog_offset; + /** Latest recovered binlog file name */ + char recovered_binlog_filename[TRX_SYS_MYSQL_LOG_NAME_LEN]; + /** FIL_PAGE_LSN of the page with the latest recovered binlog metadata */ + lsn_t recovered_binlog_lsn; + + + /** + Constructor. + + Some members may require late initialisation, thus we just mark object as + uninitialised. Real initialisation happens in create(). + */ -/** Test if trx_sys->mutex is owned. */ -#define trx_sys_mutex_own() (trx_sys->mutex.is_owned()) + trx_sys_t(): m_initialised(false) {} -/** Acquire the trx_sys->mutex. */ -#define trx_sys_mutex_enter() do { \ - mutex_enter(&trx_sys->mutex); \ -} while (0) -/** Release the trx_sys->mutex. */ -#define trx_sys_mutex_exit() do { \ - trx_sys->mutex.exit(); \ -} while (0) + /** + Returns the minimum trx id in rw trx list. -#include "trx0sys.ic" + This is the smallest id for which the trx can possibly be active. (But, you + must look at the trx->state to find out if the minimum trx id transaction + itself is active, or already committed.) + + @return the minimum trx id, or m_max_trx_id if the trx list is empty + */ + + trx_id_t get_min_trx_id() + { + trx_id_t id= get_max_trx_id(); + rw_trx_hash.iterate(reinterpret_cast<my_hash_walk_action> + (get_min_trx_id_callback), &id); + return id; + } + + + /** + Determines the maximum transaction id. + + @return maximum currently allocated trx id; will be stale after the + next call to trx_sys.get_new_trx_id() + */ + + trx_id_t get_max_trx_id() + { + return static_cast<trx_id_t> + (my_atomic_load64_explicit(reinterpret_cast<int64*>(&m_max_trx_id), + MY_MEMORY_ORDER_RELAXED)); + } + + + /** + Allocates a new transaction id. + @return new, allocated trx id + */ + + trx_id_t get_new_trx_id() + { + trx_id_t id= get_new_trx_id_no_refresh(); + refresh_rw_trx_hash_version(); + return id; + } + + + /** + Allocates and assigns new transaction serialisation number. + + There's a gap between m_max_trx_id increment and transaction serialisation + number becoming visible through rw_trx_hash. While we're in this gap + concurrent thread may come and do MVCC snapshot without seeing allocated + but not yet assigned serialisation number. Then at some point purge thread + may clone this view. As a result it won't see newly allocated serialisation + number and may remove "unnecessary" history data of this transaction from + rollback segments. + + m_rw_trx_hash_version is intended to solve this problem. MVCC snapshot has + to wait until m_max_trx_id == m_rw_trx_hash_version, which effectively + means that all transaction serialisation numbers up to m_max_trx_id are + available through rw_trx_hash. + + We rely on refresh_rw_trx_hash_version() to issue RELEASE memory barrier so + that m_rw_trx_hash_version increment happens after + trx->rw_trx_hash_element->no becomes visible through rw_trx_hash. + + @param trx transaction + */ + void assign_new_trx_no(trx_t *trx) + { + trx->no= get_new_trx_id_no_refresh(); + my_atomic_store64_explicit(reinterpret_cast<int64*> + (&trx->rw_trx_hash_element->no), + trx->no, MY_MEMORY_ORDER_RELAXED); + refresh_rw_trx_hash_version(); + } + + + /** + Takes MVCC snapshot. + + To reduce malloc probablility we reserve rw_trx_hash.size() + 32 elements + in ids. 
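The spin referred to here (and in the register_rw()/assign_new_trx_no() comments above) is a version gate: a writer bumps the id counter first and the version counter only after its payload is visible, and a snapshot reader waits until the two counters agree. A minimal standalone sketch of the idea using plain C++ atomics, not the InnoDB members or its my_atomic wrappers:

#include <atomic>
#include <cstdint>

static std::atomic<uint64_t> max_id{1};	/* next id to hand out */
static std::atomic<uint64_t> published{1};	/* ids whose payload is visible */

static uint64_t allocate_and_publish()
{
	uint64_t id = max_id.fetch_add(1, std::memory_order_relaxed);
	/* ... make the payload reachable here, e.g. insert into a hash ... */
	published.fetch_add(1, std::memory_order_release);
	return id;
}

static uint64_t snapshot_high_water_mark()
{
	uint64_t seen;
	/* Wait until every allocated id has also been published. */
	while ((seen = published.load(std::memory_order_acquire))
	       != max_id.load(std::memory_order_relaxed)) {
		/* spin; InnoDB calls ut_delay(1) here */
	}
	return seen;
}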
+ + For details about get_rw_trx_hash_version() != get_max_trx_id() spin + @sa register_rw() and @sa assign_new_trx_no(). + + We rely on get_rw_trx_hash_version() to issue ACQUIRE memory barrier so + that loading of m_rw_trx_hash_version happens before accessing rw_trx_hash. + + To optimise snapshot creation rw_trx_hash.iterate() is being used instead + of rw_trx_hash.iterate_no_dups(). It means that some transaction + identifiers may appear multiple times in ids. + + @param[in,out] caller_trx used to get access to rw_trx_hash_pins + @param[out] ids array to store registered transaction identifiers + @param[out] max_trx_id variable to store m_max_trx_id value + @param[out] mix_trx_no variable to store min(trx->no) value + */ + + void snapshot_ids(trx_t *caller_trx, trx_ids_t *ids, trx_id_t *max_trx_id, + trx_id_t *min_trx_no) + { + ut_ad(!mutex_own(&mutex)); + snapshot_ids_arg arg(ids); + + while ((arg.m_id= get_rw_trx_hash_version()) != get_max_trx_id()) + ut_delay(1); + arg.m_no= arg.m_id; + + ids->clear(); + ids->reserve(rw_trx_hash.size() + 32); + rw_trx_hash.iterate(caller_trx, + reinterpret_cast<my_hash_walk_action>(copy_one_id), + &arg); + + *max_trx_id= arg.m_id; + *min_trx_no= arg.m_no; + } + + + /** Initialiser for m_max_trx_id and m_rw_trx_hash_version. */ + void init_max_trx_id(trx_id_t value) + { + m_max_trx_id= m_rw_trx_hash_version= value; + } + + + bool is_initialised() { return m_initialised; } + + + /** Initialise the transaction subsystem. */ + void create(); + + /** Close the transaction subsystem on shutdown. */ + void close(); + + /** @return total number of active (non-prepared) transactions */ + ulint any_active_transactions(); + + + /** + Registers read-write transaction. + + Transaction becomes visible to MVCC. + + There's a gap between m_max_trx_id increment and transaction becoming + visible through rw_trx_hash. While we're in this gap concurrent thread may + come and do MVCC snapshot. As a result concurrent read view will be able to + observe records owned by this transaction even before it was committed. + + m_rw_trx_hash_version is intended to solve this problem. MVCC snapshot has + to wait until m_max_trx_id == m_rw_trx_hash_version, which effectively + means that all transactions up to m_max_trx_id are available through + rw_trx_hash. + + We rely on refresh_rw_trx_hash_version() to issue RELEASE memory barrier so + that m_rw_trx_hash_version increment happens after transaction becomes + visible through rw_trx_hash. + */ + + void register_rw(trx_t *trx) + { + trx->id= get_new_trx_id_no_refresh(); + rw_trx_hash.insert(trx); + refresh_rw_trx_hash_version(); + } + + + /** + Deregisters read-write transaction. + + Transaction is removed from rw_trx_hash, which releases all implicit locks. + MVCC snapshot won't see this transaction anymore. + */ + + void deregister_rw(trx_t *trx) + { + rw_trx_hash.erase(trx); + } + + + bool is_registered(trx_t *caller_trx, trx_id_t id) + { + return id && find(caller_trx, id, false); + } + + + trx_t *find(trx_t *caller_trx, trx_id_t id, bool do_ref_count= true) + { + return rw_trx_hash.find(caller_trx, id, do_ref_count); + } + + + /** + Registers transaction in trx_sys. + + @param trx transaction + */ + void register_trx(trx_t *trx) + { + mutex_enter(&mutex); + UT_LIST_ADD_FIRST(trx_list, trx); + mutex_exit(&mutex); + } + + + /** + Deregisters transaction in trx_sys. 
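A hedged usage sketch of snapshot_ids(): take a snapshot and use it to decide whether the changes of a given transaction id should be visible. This is only an illustration, not the ReadView implementation; it needs <algorithm> for std::find, and duplicates in ids are harmless here.

struct snapshot_sketch {
	trx_ids_t	ids;		/* may contain duplicates, see above */
	trx_id_t	max_trx_id;	/* ids at or above this were not yet
					assigned when the snapshot was taken */
	trx_id_t	min_trx_no;

	void open(trx_t* caller_trx)
	{
		trx_sys.snapshot_ids(caller_trx, &ids, &max_trx_id,
				     &min_trx_no);
	}

	/* Changes of transaction "id" are invisible if the id had not been
	assigned yet, or if the transaction was still registered
	(uncommitted) at snapshot time. */
	bool changes_visible(trx_id_t id) const
	{
		if (id >= max_trx_id) {
			return false;
		}
		return std::find(ids.begin(), ids.end(), id) == ids.end();
	}
};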
+ + @param trx transaction + */ + void deregister_trx(trx_t *trx) + { + mutex_enter(&mutex); + UT_LIST_REMOVE(trx_list, trx); + mutex_exit(&mutex); + } + + + /** + Clones the oldest view and stores it in view. + + No need to call ReadView::close(). The caller owns the view that is passed + in. This function is called by purge thread to determine whether it should + purge the delete marked record or not. + */ + void clone_oldest_view(); + + + /** @return the number of active views */ + size_t view_count() const + { + size_t count= 0; + + mutex_enter(&mutex); + for (const trx_t *trx= UT_LIST_GET_FIRST(trx_list); trx; + trx= UT_LIST_GET_NEXT(trx_list, trx)) + { + if (trx->read_view.get_state() == READ_VIEW_STATE_OPEN) + ++count; + } + mutex_exit(&mutex); + return count; + } + + /** @return number of committed transactions waiting for purge */ + uint32 history_size() const + { + return uint32(my_atomic_load32(&const_cast<trx_sys_t*>(this) + ->rseg_history_len)); + } + /** Add to the TRX_RSEG_HISTORY length (on database startup). */ + void history_add(int32 len) + { + my_atomic_add32(&rseg_history_len, len); + } + /** Register a committed transaction. */ + void history_insert() { history_add(1); } + /** Note that a committed transaction was purged. */ + void history_remove() { history_add(-1); } + +private: + static my_bool get_min_trx_id_callback(rw_trx_hash_element_t *element, + trx_id_t *id) + { + if (element->id < *id) + { + mutex_enter(&element->mutex); + /* We don't care about read-only transactions here. */ + if (element->trx && element->trx->rsegs.m_redo.rseg) + *id= element->id; + mutex_exit(&element->mutex); + } + return 0; + } + + + struct snapshot_ids_arg + { + snapshot_ids_arg(trx_ids_t *ids): m_ids(ids) {} + trx_ids_t *m_ids; + trx_id_t m_id; + trx_id_t m_no; + }; + + + static my_bool copy_one_id(rw_trx_hash_element_t *element, + snapshot_ids_arg *arg) + { + if (element->id < arg->m_id) + { + trx_id_t no= static_cast<trx_id_t>(my_atomic_load64_explicit( + reinterpret_cast<int64*>(&element->no), MY_MEMORY_ORDER_RELAXED)); + arg->m_ids->push_back(element->id); + if (no < arg->m_no) + arg->m_no= no; + } + return 0; + } + + + /** Getter for m_rw_trx_hash_version, must issue ACQUIRE memory barrier. */ + trx_id_t get_rw_trx_hash_version() + { + return static_cast<trx_id_t> + (my_atomic_load64_explicit(reinterpret_cast<int64*> + (&m_rw_trx_hash_version), + MY_MEMORY_ORDER_ACQUIRE)); + } + + + /** Increments m_rw_trx_hash_version, must issue RELEASE memory barrier. */ + void refresh_rw_trx_hash_version() + { + my_atomic_add64_explicit(reinterpret_cast<int64*>(&m_rw_trx_hash_version), + 1, MY_MEMORY_ORDER_RELEASE); + } + + + /** + Allocates new transaction id without refreshing rw_trx_hash version. + + This method is extracted for exclusive use by register_rw() and + assign_new_trx_no() where new id must be allocated atomically with + payload of these methods from MVCC snapshot point of view. 
+ + @sa get_new_trx_id() + @sa assign_new_trx_no() + + @return new transaction id + */ + + trx_id_t get_new_trx_id_no_refresh() + { + return static_cast<trx_id_t>(my_atomic_add64_explicit( + reinterpret_cast<int64*>(&m_max_trx_id), 1, MY_MEMORY_ORDER_RELAXED)); + } +}; + + +/** The transaction system */ +extern trx_sys_t trx_sys; #endif diff --git a/storage/innobase/include/trx0sys.ic b/storage/innobase/include/trx0sys.ic deleted file mode 100644 index c85695630f0..00000000000 --- a/storage/innobase/include/trx0sys.ic +++ /dev/null @@ -1,458 +0,0 @@ -/***************************************************************************** - -Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2018, 2019, MariaDB Corporation. - -This program is free software; you can redistribute it and/or modify it under -the terms of the GNU General Public License as published by the Free Software -Foundation; version 2 of the License. - -This program is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - -You should have received a copy of the GNU General Public License along with -this program; if not, write to the Free Software Foundation, Inc., -51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA - -*****************************************************************************/ - -/**************************************************//** -@file include/trx0sys.ic -Transaction system - -Created 3/26/1996 Heikki Tuuri -*******************************************************/ - -#include "trx0trx.h" -#include "data0type.h" -#include "srv0srv.h" -#include "mtr0log.h" - -/* The typedef for rseg slot in the file copy */ -typedef byte trx_sysf_rseg_t; - -/* Rollback segment specification slot offsets */ -/*-------------------------------------------------------------*/ -#define TRX_SYS_RSEG_SPACE 0 /* space where the segment - header is placed; starting with - MySQL/InnoDB 5.1.7, this is - UNIV_UNDEFINED if the slot is unused */ -#define TRX_SYS_RSEG_PAGE_NO 4 /* page number where the segment - header is placed; this is FIL_NULL - if the slot is unused */ -/*-------------------------------------------------------------*/ -/* Size of a rollback segment specification slot */ -#define TRX_SYS_RSEG_SLOT_SIZE 8 - -/*****************************************************************//** -Writes the value of max_trx_id to the file based trx system header. */ -void -trx_sys_flush_max_trx_id(void); -/*==========================*/ - -/** Checks if a page address is the trx sys header page. -@param[in] page_id page id -@return true if trx sys header page */ -inline bool trx_sys_hdr_page(const page_id_t page_id) -{ - return(page_id.space() == TRX_SYS_SPACE - && page_id.page_no() == TRX_SYS_PAGE_NO); -} - -/**********************************************************************//** -Gets a pointer to the transaction system header and x-latches its page. -@return pointer to system header, page x-latched. 
*/ -UNIV_INLINE -trx_sysf_t* -trx_sysf_get( -/*=========*/ - mtr_t* mtr) /*!< in: mtr */ -{ - buf_block_t* block = NULL; - trx_sysf_t* header = NULL; - - ut_ad(mtr); - - block = buf_page_get(page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO), - univ_page_size, RW_X_LATCH, mtr); - - if (block) { - buf_block_dbg_add_level(block, SYNC_TRX_SYS_HEADER); - - header = TRX_SYS + buf_block_get_frame(block); - } - - return(header); -} - -/*****************************************************************//** -Gets the space of the nth rollback segment slot in the trx system -file copy. -@return space id */ -UNIV_INLINE -ulint -trx_sysf_rseg_get_space( -/*====================*/ - trx_sysf_t* sys_header, /*!< in: trx sys header */ - ulint i, /*!< in: slot index == rseg id */ - mtr_t* mtr) /*!< in: mtr */ -{ - ut_ad(sys_header); - ut_ad(i < TRX_SYS_N_RSEGS); - - return(mtr_read_ulint(sys_header + TRX_SYS_RSEGS - + i * TRX_SYS_RSEG_SLOT_SIZE - + TRX_SYS_RSEG_SPACE, MLOG_4BYTES, mtr)); -} - -/*****************************************************************//** -Gets the page number of the nth rollback segment slot in the trx system -header. -@return page number, FIL_NULL if slot unused */ -UNIV_INLINE -ulint -trx_sysf_rseg_get_page_no( -/*======================*/ - trx_sysf_t* sys_header, /*!< in: trx system header */ - ulint i, /*!< in: slot index == rseg id */ - mtr_t* mtr) /*!< in: mtr */ -{ - ut_ad(sys_header); - ut_ad(i < TRX_SYS_N_RSEGS); - - return(mtr_read_ulint(sys_header + TRX_SYS_RSEGS - + i * TRX_SYS_RSEG_SLOT_SIZE - + TRX_SYS_RSEG_PAGE_NO, MLOG_4BYTES, mtr)); -} - -/*****************************************************************//** -Sets the space id of the nth rollback segment slot in the trx system -file copy. */ -UNIV_INLINE -void -trx_sysf_rseg_set_space( -/*====================*/ - trx_sysf_t* sys_header, /*!< in: trx sys file copy */ - ulint i, /*!< in: slot index == rseg id */ - ulint space, /*!< in: space id */ - mtr_t* mtr) /*!< in: mtr */ -{ - ut_ad(sys_header); - ut_ad(i < TRX_SYS_N_RSEGS); - - mlog_write_ulint(sys_header + TRX_SYS_RSEGS - + i * TRX_SYS_RSEG_SLOT_SIZE - + TRX_SYS_RSEG_SPACE, - space, - MLOG_4BYTES, mtr); -} - -/*****************************************************************//** -Sets the page number of the nth rollback segment slot in the trx system -header. */ -UNIV_INLINE -void -trx_sysf_rseg_set_page_no( -/*======================*/ - trx_sysf_t* sys_header, /*!< in: trx sys header */ - ulint i, /*!< in: slot index == rseg id */ - ulint page_no, /*!< in: page number, FIL_NULL if the - slot is reset to unused */ - mtr_t* mtr) /*!< in: mtr */ -{ - ut_ad(sys_header); - ut_ad(i < TRX_SYS_N_RSEGS); - - mlog_write_ulint(sys_header + TRX_SYS_RSEGS - + i * TRX_SYS_RSEG_SLOT_SIZE - + TRX_SYS_RSEG_PAGE_NO, - page_no, - MLOG_4BYTES, mtr); -} - -/*****************************************************************//** -Writes a trx id to an index page. In case that the id size changes in -some future version, this function should be used instead of -mach_write_... */ -UNIV_INLINE -void -trx_write_trx_id( -/*=============*/ - byte* ptr, /*!< in: pointer to memory where written */ - trx_id_t id) /*!< in: id */ -{ -#if DATA_TRX_ID_LEN != 6 -# error "DATA_TRX_ID_LEN != 6" -#endif - mach_write_to_6(ptr, id); -} - -/*****************************************************************//** -Reads a trx id from an index page. In case that the id size changes in -some future version, this function should be used instead of -mach_read_... 
-@return id */ -UNIV_INLINE -trx_id_t -trx_read_trx_id( -/*============*/ - const byte* ptr) /*!< in: pointer to memory from where to read */ -{ -#if DATA_TRX_ID_LEN != 6 -# error "DATA_TRX_ID_LEN != 6" -#endif - return(mach_read_from_6(ptr)); -} - -/****************************************************************//** -Looks for the trx handle with the given id in rw_trx_list. -The caller must be holding trx_sys->mutex. -@return the trx handle or NULL if not found; -the pointer must not be dereferenced unless lock_sys->mutex was -acquired before calling this function and is still being held */ -UNIV_INLINE -trx_t* -trx_get_rw_trx_by_id( -/*=================*/ - trx_id_t trx_id) /*!< in: trx id to search for */ -{ - ut_ad(trx_id > 0); - ut_ad(trx_sys_mutex_own()); - - if (trx_sys->rw_trx_set.empty()) { - return(NULL); - } - - TrxIdSet::iterator it; - - it = trx_sys->rw_trx_set.find(TrxTrack(trx_id)); - - return(it == trx_sys->rw_trx_set.end() ? NULL : it->m_trx); -} - -/****************************************************************//** -Returns the minimum trx id in trx list. This is the smallest id for which -the trx can possibly be active. (But, you must look at the trx->state -to find out if the minimum trx id transaction itself is active, or already -committed.). The caller must be holding the trx_sys_t::mutex in shared mode. -@return the minimum trx id, or trx_sys->max_trx_id if the trx list is empty */ -UNIV_INLINE -trx_id_t -trx_rw_min_trx_id_low(void) -/*=======================*/ -{ - trx_id_t id; - - ut_ad(trx_sys_mutex_own()); - - const trx_t* trx = UT_LIST_GET_LAST(trx_sys->rw_trx_list); - - if (trx == NULL) { - id = trx_sys->max_trx_id; - } else { - assert_trx_in_rw_list(trx); - id = trx->id; - } - - return(id); -} - -#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG -/***********************************************************//** -Assert that a transaction has been recovered. -@return TRUE */ -UNIV_INLINE -ibool -trx_assert_recovered( -/*=================*/ - trx_id_t trx_id) /*!< in: transaction identifier */ -{ - const trx_t* trx; - - trx_sys_mutex_enter(); - - trx = trx_get_rw_trx_by_id(trx_id); - ut_a(trx->is_recovered); - - trx_sys_mutex_exit(); - - return(TRUE); -} -#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ - -/****************************************************************//** -Returns the minimum trx id in rw trx list. This is the smallest id for which -the rw trx can possibly be active. (But, you must look at the trx->state -to find out if the minimum trx id transaction itself is active, or already -committed.) -@return the minimum trx id, or trx_sys->max_trx_id if rw trx list is empty */ -UNIV_INLINE -trx_id_t -trx_rw_min_trx_id(void) -/*===================*/ -{ - trx_sys_mutex_enter(); - - trx_id_t id = trx_rw_min_trx_id_low(); - - trx_sys_mutex_exit(); - - return(id); -} - -/** Look up a rw transaction with the given id. -@param[in] trx_id transaction identifier -@param[out] corrupt flag that will be set if trx_id is corrupted -@return transaction; its state should be rechecked after acquiring trx_t::mutex -@retval NULL if there is no transaction identified by trx_id. */ -inline trx_t* trx_rw_is_active_low(trx_id_t trx_id, bool* corrupt) -{ - ut_ad(trx_sys_mutex_own()); - - if (trx_id < trx_rw_min_trx_id_low()) { - } else if (trx_id >= trx_sys->max_trx_id) { - - /* There must be corruption: we let the caller handle the - diagnostic prints in this case. 
*/ - - if (corrupt != NULL) { - *corrupt = true; - } - } else if (trx_t* trx = trx_get_rw_trx_by_id(trx_id)) { - return trx; - } - - return NULL; -} - -/** Look up a rw transaction with the given id. -@param[in] trx_id transaction identifier -@param[out] corrupt flag that will be set if trx_id is corrupted -@param[in] ref_count whether to increment trx->n_ref -@return transaction; its state should be rechecked after acquiring trx_t::mutex -@retval NULL if there is no active transaction identified by trx_id. */ -inline trx_t* trx_rw_is_active(trx_id_t trx_id, bool* corrupt, bool ref_count) -{ - ut_ad(trx_id); - - trx_sys_mutex_enter(); - - trx_t* trx = trx_rw_is_active_low(trx_id, corrupt); - - if (trx && ref_count) { - TrxMutex* trx_mutex = &trx->mutex; - mutex_enter(trx_mutex); - ut_ad(!trx_state_eq(trx, TRX_STATE_NOT_STARTED)); - ut_ad(trx->id == trx_id); - if (trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY)) { - /* We have an early state check here to avoid - committer starvation in a wait loop for - transaction references, when there's a stream of - trx_rw_is_active() calls from other threads. - The trx->state may change to COMMITTED after - trx_mutex is released, and it will have to be - rechecked by the caller after reacquiring the mutex. */ - trx = NULL; - } else { - /* The reference could be safely incremented after - releasing one of trx_mutex or trx_sys->mutex. - Holding trx->mutex here may prevent a few false - references that could have a negative performance - impact on trx_commit_in_memory(). */ - trx->reference(); - } - mutex_exit(trx_mutex); - } - - trx_sys_mutex_exit(); - - return(trx); -} - -/*****************************************************************//** -Allocates a new transaction id. -@return new, allocated trx id */ -UNIV_INLINE -trx_id_t -trx_sys_get_new_trx_id() -/*====================*/ -{ - /* wsrep_fake_trx_id violates this assert */ - ut_ad(trx_sys_mutex_own()); - - /* VERY important: after the database is started, max_trx_id value is - divisible by TRX_SYS_TRX_ID_WRITE_MARGIN, and the following if - will evaluate to TRUE when this function is first time called, - and the value for trx id will be written to disk-based header! - Thus trx id values will not overlap when the database is - repeatedly started! */ - - if (!(trx_sys->max_trx_id % TRX_SYS_TRX_ID_WRITE_MARGIN)) { - - trx_sys_flush_max_trx_id(); - } - - return(trx_sys->max_trx_id++); -} - -/*****************************************************************//** -Determines the maximum transaction id. -@return maximum currently allocated trx id; will be stale after the -next call to trx_sys_get_new_trx_id() */ -UNIV_INLINE -trx_id_t -trx_sys_get_max_trx_id(void) -/*========================*/ -{ - ut_ad(!trx_sys_mutex_own()); - -#if UNIV_WORD_SIZE < DATA_TRX_ID_LEN - /* Avoid torn reads. */ - - trx_sys_mutex_enter(); - - trx_id_t max_trx_id = trx_sys->max_trx_id; - - trx_sys_mutex_exit(); - - return(max_trx_id); -#else - /* Perform a dirty read. Callers should be prepared for stale - values, and we know that the value fits in a machine word, so - that it will be read and written atomically. */ - return(trx_sys->max_trx_id); -#endif /* UNIV_WORD_SIZE < DATA_TRX_ID_LEN */ -} - -/*****************************************************************//** -Get the number of transaction in the system, independent of their state. 
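The deleted trx_sys_get_new_trx_id() above persisted the counter only every TRX_SYS_TRX_ID_WRITE_MARGIN (256) allocations; on startup the stored value was advanced far enough that ids handed out after the last flush could never be reused. A standalone arithmetic sketch of that invariant in plain C++; the restart formula follows the removed TRX_SYS_TRX_ID_STORE comment and should be read as an illustration rather than the exact startup code.

#include <cassert>
#include <cstdint>

static const uint64_t margin = 256;	/* TRX_SYS_TRX_ID_WRITE_MARGIN */

static uint64_t align_up(uint64_t n, uint64_t m)
{
	return ((n + m - 1) / m) * m;
}

/* Value the id counter resumes from after a restart, given what the page
last stored: rounded up to a multiple of the margin, plus two more margins. */
static uint64_t restart_value(uint64_t stored_on_page)
{
	return align_up(stored_on_page, margin) + 2 * margin;
}

int main()
{
	const uint64_t stored = 512;	/* flushed when divisible by 256 */
	const uint64_t last_handed_out = stored + margin - 1;	/* worst case
					before the next flush would occur */
	assert(restart_value(stored) > last_handed_out);
}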
-@return count of transactions in trx_sys_t::rw_trx_list */ -UNIV_INLINE -ulint -trx_sys_get_n_rw_trx(void) -/*======================*/ -{ - ulint n_trx; - - trx_sys_mutex_enter(); - - n_trx = UT_LIST_GET_LEN(trx_sys->rw_trx_list); - - trx_sys_mutex_exit(); - - return(n_trx); -} - -/** -Add the transaction to the RW transaction set -@param trx transaction instance to add */ -UNIV_INLINE -void -trx_sys_rw_trx_add(trx_t* trx) -{ - ut_ad(trx->id != 0); - - trx_sys->rw_trx_set.insert(TrxTrack(trx->id, trx)); - ut_d(trx->in_rw_trx_list = true); -} diff --git a/storage/innobase/include/trx0trx.h b/storage/innobase/include/trx0trx.h index 5354c77db25..c1572a0d07f 100644 --- a/storage/innobase/include/trx0trx.h +++ b/storage/innobase/include/trx0trx.h @@ -34,15 +34,15 @@ Created 3/26/1996 Heikki Tuuri #include "trx0xa.h" #include "ut0vec.h" #include "fts0fts.h" +#include "read0types.h" #include <vector> #include <set> // Forward declaration struct mtr_t; -class ReadView; class FlushObserver; -class ut_stage_alter_t; +struct rw_trx_hash_element_t; /******************************************************************//** Set detailed error message for the transaction. */ @@ -67,51 +67,19 @@ const dict_index_t* trx_get_error_info( /*===============*/ const trx_t* trx); /*!< in: trx object */ -/********************************************************************//** -Creates a transaction object for MySQL. -@return own: transaction object */ -trx_t* -trx_allocate_for_mysql(void); -/*========================*/ - -/** @return allocated transaction object for internal operations */ -trx_t *trx_allocate_for_background(); - -/** Frees and initialize a transaction object instantinated during recovery. -@param trx trx object to free and initialize during recovery */ -void -trx_free_resurrected(trx_t* trx); -/** Free a transaction that was allocated by background or user threads. -@param trx trx object to free */ -void -trx_free_for_background(trx_t* trx); +/** @return an allocated transaction */ +trx_t *trx_create(); -/********************************************************************//** -At shutdown, frees a transaction object that is in the PREPARED state. */ -void -trx_free_prepared( -/*==============*/ - trx_t* trx); /*!< in, own: trx object */ - -/** Free a transaction object for MySQL. -@param[in,out] trx transaction */ -void -trx_free_for_mysql(trx_t* trx); - -/** Disconnect a transaction from MySQL. -@param[in,out] trx transaction */ -void -trx_disconnect_plain(trx_t* trx); +/** At shutdown, frees a transaction object. */ +void trx_free_at_shutdown(trx_t *trx); /** Disconnect a prepared transaction from MySQL. @param[in,out] trx transaction */ -void -trx_disconnect_prepared(trx_t* trx); +void trx_disconnect_prepared(trx_t *trx); /** Initialize (resurrect) transactions at startup. */ -void -trx_lists_init_at_db_start(); +void trx_lists_init_at_db_start(); /*************************************************************//** Starts the transaction if it is not yet started. */ @@ -211,22 +179,10 @@ trx_commit( /*=======*/ trx_t* trx); /*!< in/out: transaction */ -/****************************************************************//** -Commits a transaction and a mini-transaction. */ -void -trx_commit_low( -/*===========*/ - trx_t* trx, /*!< in/out: transaction */ - mtr_t* mtr); /*!< in/out: mini-transaction (will be committed), - or NULL if trx made no modifications */ -/****************************************************************//** -Cleans up a transaction at database startup. 
The cleanup is needed if -the transaction already got to the middle of a commit when the database -crashed, and we cannot roll it back. */ -void -trx_cleanup_at_db_startup( -/*======================*/ - trx_t* trx); /*!< in: transaction */ +/** Commit a transaction and a mini-transaction. +@param[in,out] trx transaction +@param[in,out] mtr mini-transaction (NULL if no modifications) */ +void trx_commit_low(trx_t* trx, mtr_t* mtr); /**********************************************************************//** Does the transaction commit for MySQL. @return DB_SUCCESS or error number */ @@ -245,7 +201,7 @@ int trx_recover_for_mysql( /*==================*/ XID* xid_list, /*!< in/out: prepared transactions */ - ulint len); /*!< in: number of slots in xid_list */ + uint len); /*!< in: number of slots in xid_list */ /** Look up an X/Open distributed transaction in XA PREPARE state. @param[in] xid X/Open XA transaction identifier @return transaction on match (the trx_t::xid will be invalidated); @@ -266,31 +222,6 @@ void trx_mark_sql_stat_end( /*==================*/ trx_t* trx); /*!< in: trx handle */ -/********************************************************************//** -Assigns a read view for a consistent read query. All the consistent reads -within the same transaction will get the same read view, which is created -when this function is first called for a new started transaction. */ -ReadView* -trx_assign_read_view( -/*=================*/ - trx_t* trx); /*!< in: active transaction */ - -/****************************************************************//** -@return the transaction's read view or NULL if one not assigned. */ -UNIV_INLINE -ReadView* -trx_get_read_view( -/*==============*/ - trx_t* trx); - -/****************************************************************//** -@return the transaction's read view or NULL if one not assigned. */ -UNIV_INLINE -const ReadView* -trx_get_read_view( -/*==============*/ - const trx_t* trx); - /****************************************************************//** Prepares a transaction for commit/rollback. */ void @@ -314,7 +245,7 @@ trx_commit_step( /**********************************************************************//** Prints info about a transaction. -Caller must hold trx_sys->mutex. */ +Caller must hold trx_sys.mutex. */ void trx_print_low( /*==========*/ @@ -334,7 +265,7 @@ trx_print_low( /**********************************************************************//** Prints info about a transaction. -The caller must hold lock_sys->mutex and trx_sys->mutex. +The caller must hold lock_sys.mutex and trx_sys.mutex. When possible, use trx_print() instead. */ void trx_print_latched( @@ -344,25 +275,9 @@ trx_print_latched( ulint max_query_len); /*!< in: max query length to print, or 0 to use the default max length */ -#ifdef WITH_WSREP /**********************************************************************//** Prints info about a transaction. -Transaction information may be retrieved without having trx_sys->mutex acquired -so it may not be completely accurate. The caller must own lock_sys->mutex -and the trx must have some locks to make sure that it does not escape -without locking lock_sys->mutex. 
*/ -UNIV_INTERN -void -wsrep_trx_print_locking( - FILE* f, /*!< in: output stream */ - const trx_t* trx, /*!< in: transaction */ - ulint max_query_len) /*!< in: max query length to print, - or 0 to use the default max length */ - MY_ATTRIBUTE((nonnull)); -#endif /* WITH_WSREP */ -/**********************************************************************//** -Prints info about a transaction. -Acquires and releases lock_sys->mutex and trx_sys->mutex. */ +Acquires and releases lock_sys.mutex. */ void trx_print( /*======*/ @@ -392,9 +307,9 @@ trx_set_dict_operation( /**********************************************************************//** Determines if a transaction is in the given state. -The caller must hold trx_sys->mutex, or it must be the thread +The caller must hold trx_sys.mutex, or it must be the thread that is serving a running transaction. -A running RW transaction must be in trx_sys->rw_trx_list. +A running RW transaction must be in trx_sys.rw_trx_hash. @return TRUE if trx->state == state */ UNIV_INLINE bool @@ -410,22 +325,11 @@ trx_state_eq( trx->state == TRX_STATE_NOT_STARTED after an error has been reported */ MY_ATTRIBUTE((nonnull, warn_unused_result)); -# ifdef UNIV_DEBUG -/**********************************************************************//** -Asserts that a transaction has been started. -The caller must hold trx_sys->mutex. -@return TRUE if started */ -ibool -trx_assert_started( -/*===============*/ - const trx_t* trx) /*!< in: transaction */ - MY_ATTRIBUTE((warn_unused_result)); -# endif /* UNIV_DEBUG */ /**********************************************************************//** Determines if the currently running transaction has been interrupted. -@return TRUE if interrupted */ -ibool +@return true if interrupted */ +bool trx_is_interrupted( /*===============*/ const trx_t* trx); /*!< in: transaction */ @@ -519,15 +423,6 @@ with an explicit check for the read-only status. ((t)->read_only && trx_is_autocommit_non_locking((t))) /** -Assert that the transaction is in the trx_sys_t::rw_trx_list */ -#define assert_trx_in_rw_list(t) do { \ - ut_ad(!(t)->read_only); \ - ut_ad((t)->in_rw_trx_list \ - == !((t)->read_only || !(t)->rsegs.m_redo.rseg)); \ - check_trx_state(t); \ -} while (0) - -/** Check transaction state */ #define check_trx_state(t) do { \ ut_ad(!trx_is_autocommit_non_locking((t))); \ @@ -551,7 +446,7 @@ Check transaction state */ ut_ad(!(t)->has_logged()); \ ut_ad(!(t)->is_referenced()); \ ut_ad(!(t)->is_wsrep()); \ - ut_ad(!MVCC::is_view_active((t)->read_view)); \ + ut_ad(!(t)->read_view.is_open()); \ ut_ad((t)->lock.wait_thr == NULL); \ ut_ad(UT_LIST_GET_LEN((t)->lock.trx_locks) == 0); \ ut_ad((t)->lock.table_locks.empty()); \ @@ -560,27 +455,18 @@ Check transaction state */ ut_ad((t)->dict_operation == TRX_DICT_OP_NONE); \ } while(0) -/** Check if transaction is in-active so that it can be freed and put back to -transaction pool. -@param t transaction handle */ -#define assert_trx_is_inactive(t) do { \ - assert_trx_is_free((t)); \ - ut_ad((t)->dict_operation_lock_mode == 0); \ -} while(0) - #ifdef UNIV_DEBUG /*******************************************************************//** Assert that an autocommit non-locking select cannot be in the -rw_trx_list and that it is a read-only transaction. -The tranasction must be in the mysql_trx_list. */ +rw_trx_hash and that it is a read-only transaction. +The transaction must have mysql_thd assigned. 
*/ # define assert_trx_nonlocking_or_in_list(t) \ do { \ if (trx_is_autocommit_non_locking(t)) { \ trx_state_t t_state = (t)->state; \ ut_ad((t)->read_only); \ ut_ad(!(t)->is_recovered); \ - ut_ad(!(t)->in_rw_trx_list); \ - ut_ad((t)->in_mysql_trx_list); \ + ut_ad((t)->mysql_thd); \ ut_ad(t_state == TRX_STATE_NOT_STARTED \ || t_state == TRX_STATE_ACTIVE); \ } else { \ @@ -590,8 +476,8 @@ The tranasction must be in the mysql_trx_list. */ #else /* UNIV_DEBUG */ /*******************************************************************//** Assert that an autocommit non-locking slect cannot be in the -rw_trx_list and that it is a read-only transaction. -The tranasction must be in the mysql_trx_list. */ +rw_trx_hash and that it is a read-only transaction. +The transaction must have mysql_thd assigned. */ # define assert_trx_nonlocking_or_in_list(trx) ((void)0) #endif /* UNIV_DEBUG */ @@ -618,7 +504,7 @@ To query the state either of the mutexes is sufficient within the locking code and no mutex is required when the query thread is no longer waiting. */ /** The locks and state of an active transaction. Protected by -lock_sys->mutex, trx->mutex or both. */ +lock_sys.mutex, trx->mutex or both. */ struct trx_lock_t { ulint n_active_thrs; /*!< number of active query threads */ @@ -630,10 +516,10 @@ struct trx_lock_t { TRX_QUE_LOCK_WAIT, this points to the lock request, otherwise this is NULL; set to non-NULL when holding - both trx->mutex and lock_sys->mutex; + both trx->mutex and lock_sys.mutex; set to NULL when holding - lock_sys->mutex; readers should - hold lock_sys->mutex, except when + lock_sys.mutex; readers should + hold lock_sys.mutex, except when they are holding trx->mutex and wait_lock==NULL */ ib_uint64_t deadlock_mark; /*!< A mark field that is initialized @@ -647,13 +533,13 @@ struct trx_lock_t { resolution, it sets this to true. Protected by trx->mutex. */ time_t wait_started; /*!< lock wait started at this time, - protected only by lock_sys->mutex */ + protected only by lock_sys.mutex */ que_thr_t* wait_thr; /*!< query thread belonging to this trx that is in QUE_THR_LOCK_WAIT state. For threads suspended in a lock wait, this is protected by - lock_sys->mutex. Otherwise, this may + lock_sys.mutex. Otherwise, this may only be modified by the thread that is serving the running transaction. */ @@ -672,12 +558,12 @@ struct trx_lock_t { unsigned table_cached; mem_heap_t* lock_heap; /*!< memory heap for trx_locks; - protected by lock_sys->mutex */ + protected by lock_sys.mutex */ trx_lock_list_t trx_locks; /*!< locks requested by the transaction; insertions are protected by trx->mutex - and lock_sys->mutex; removals are - protected by lock_sys->mutex */ + and lock_sys.mutex; removals are + protected by lock_sys.mutex */ lock_list table_locks; /*!< All table locks requested by this transaction, including AUTOINC locks */ @@ -696,14 +582,73 @@ struct trx_lock_t { ulint n_rec_locks; /*!< number of rec locks in this trx */ }; -/** Type used to store the list of tables that are modified by a given -transaction. We store pointers to the table objects in memory because +/** Logical first modification time of a table in a transaction */ +class trx_mod_table_time_t +{ + /** First modification of the table */ + undo_no_t first; + /** First modification of a system versioned column */ + undo_no_t first_versioned; + + /** Magic value signifying that a system versioned column of a + table was never modified in a transaction. 
*/ + static const undo_no_t UNVERSIONED = IB_ID_MAX; + +public: + /** Constructor + @param[in] rows number of modified rows so far */ + trx_mod_table_time_t(undo_no_t rows) + : first(rows), first_versioned(UNVERSIONED) {} + +#ifdef UNIV_DEBUG + /** Validation + @param[in] rows number of modified rows so far + @return whether the object is valid */ + bool valid(undo_no_t rows = UNVERSIONED) const + { + return first <= first_versioned && first <= rows; + } +#endif /* UNIV_DEBUG */ + /** @return if versioned columns were modified */ + bool is_versioned() const { return first_versioned != UNVERSIONED; } + + /** After writing an undo log record, set is_versioned() if needed + @param[in] rows number of modified rows so far */ + void set_versioned(undo_no_t rows) + { + ut_ad(!is_versioned()); + first_versioned = rows; + ut_ad(valid()); + } + + /** Invoked after partial rollback + @param[in] limit number of surviving modified rows + @return whether this should be erased from trx_t::mod_tables */ + bool rollback(undo_no_t limit) + { + ut_ad(valid()); + if (first >= limit) { + return true; + } + + if (first_versioned < limit && is_versioned()) { + first_versioned = UNVERSIONED; + } + + return false; + } +}; + +/** Collection of persistent tables and their first modification +in a transaction. +We store pointers to the table objects in memory because we know that a table object will not be destroyed while a transaction that modified it is running. */ -typedef std::set< - dict_table_t*, +typedef std::map< + dict_table_t*, trx_mod_table_time_t, std::less<dict_table_t*>, - ut_allocator<dict_table_t*> > trx_mod_tables_t; + ut_allocator<std::pair<dict_table_t* const, trx_mod_table_time_t> > > + trx_mod_tables_t; /** The transaction handle @@ -731,32 +676,32 @@ Normally, only the thread that is currently associated with a running transaction may access (read and modify) the trx object, and it may do so without holding any mutex. The following are exceptions to this: -* trx_rollback_resurrected() may access resurrected (connectionless) -transactions while the system is already processing new user -transactions. The trx_sys->mutex and trx->is_recovered prevent -a race condition between it and trx_commit(). +* trx_rollback_recovered() may access resurrected (connectionless) +transactions (state == TRX_STATE_ACTIVE && is_recovered) +while the system is already processing new user transactions (!is_recovered). * trx_print_low() may access transactions not associated with the current -thread. The caller must be holding trx_sys->mutex and lock_sys->mutex. +thread. The caller must be holding lock_sys.mutex. -* When a transaction handle is in the trx_sys->mysql_trx_list or -trx_sys->trx_list, some of its fields must not be modified without -holding trx_sys->mutex exclusively. +* When a transaction handle is in the trx_sys.trx_list, some of its fields +must not be modified without holding trx->mutex. * The locking code (in particular, lock_deadlock_recursive() and lock_rec_convert_impl_to_expl()) will access transactions associated to other connections. The locks of transactions are protected by -lock_sys->mutex (insertions also by trx->mutex). */ +lock_sys.mutex (insertions also by trx->mutex). 
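As an illustration of the trx_mod_tables_t bookkeeping introduced above, here is a minimal standalone sketch (placeholder table type and undo numbers, not InnoDB code; the versioned-column refinement of trx_mod_table_time_t is omitted) of how a map from table to first-modification undo number lets a partial rollback drop the tables that the surviving part of the transaction no longer modifies:

#include <cstdint>
#include <iostream>
#include <map>

typedef uint64_t undo_no_t;
struct table_t { const char* name; };

/* Map each modified table to the undo number of its first modification,
mirroring the idea of trx_mod_tables_t above. */
typedef std::map<const table_t*, undo_no_t> mod_tables_t;

/* On ROLLBACK TO SAVEPOINT, undo records with undo_no >= limit are undone;
a table first modified at or after the limit is no longer modified by the
surviving part of the transaction, so its entry can be erased. */
static void rollback_modified(mod_tables_t& mod_tables, undo_no_t limit)
{
	for (mod_tables_t::iterator i = mod_tables.begin();
	     i != mod_tables.end(); ) {
		if (i->second >= limit) {
			mod_tables.erase(i++);
		} else {
			++i;
		}
	}
}

int main()
{
	table_t		t1 = {"t1"};
	table_t		t2 = {"t2"};
	mod_tables_t	mod_tables;
	mod_tables[&t1] = 0;	/* first row modified by the transaction */
	mod_tables[&t2] = 5;	/* modified later */

	rollback_modified(mod_tables, 3);	/* partial rollback to undo_no 3 */
	std::cout << mod_tables.size() << std::endl;	/* prints 1: only t1 survives */
	return 0;
}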
*/ /** Represents an instance of rollback segment along with its state variables.*/ struct trx_undo_ptr_t { trx_rseg_t* rseg; /*!< rollback segment assigned to the transaction, or NULL if not assigned yet */ - trx_undo_t* insert_undo; /*!< pointer to the insert undo log, or - NULL if no inserts performed yet */ - trx_undo_t* update_undo; /*!< pointer to the update undo log, or - NULL if no update performed yet */ + trx_undo_t* undo; /*!< pointer to the undo log, or + NULL if nothing logged yet */ + trx_undo_t* old_insert; /*!< pointer to recovered + insert undo log, or NULL if no + INSERT transactions were + recovered from old-format undo logs */ }; /** An instance of temporary rollback segment. */ @@ -796,7 +741,7 @@ public: TrxMutex mutex; /*!< Mutex protecting the fields state and lock (except some fields of lock, which are protected by - lock_sys->mutex) */ + lock_sys.mutex) */ trx_id_t id; /*!< transaction id */ @@ -805,7 +750,7 @@ public: transaction is moved to COMMITTED_IN_MEMORY state. Protected by trx_sys_t::mutex - when trx->in_rw_trx_list. Initially + when trx is in rw_trx_hash. Initially set to TRX_ID_MAX. */ /** State of the trx from the point of view of concurrency control @@ -833,6 +778,9 @@ public: Recovered XA: * NOT_STARTED -> PREPARED -> COMMITTED -> (freed) + Recovered XA followed by XA ROLLBACK: + * NOT_STARTED -> PREPARED -> ACTIVE -> COMMITTED -> (freed) + XA (2PC) (shutdown or disconnect before ROLLBACK or COMMIT): * NOT_STARTED -> PREPARED -> (freed) @@ -843,11 +791,11 @@ public: XA (2PC) transactions are always treated as non-autocommit. - Transitions to ACTIVE or NOT_STARTED occur when - !in_rw_trx_list (no trx_sys->mutex needed). + Transitions to ACTIVE or NOT_STARTED occur when transaction + is not in rw_trx_hash (no trx_sys.mutex needed). Autocommit non-locking read-only transactions move between states - without holding any mutex. They are !in_rw_trx_list. + without holding any mutex. They are not in rw_trx_hash. All transactions, unless they are determined to be ac-nl-ro, explicitly tagged as read-only or read-write, will first be put @@ -856,21 +804,17 @@ public: do we remove it from the read-only list and put it on the read-write list. During this switch we assign it a rollback segment. - When a transaction is NOT_STARTED, it can be in_mysql_trx_list if - it is a user transaction. It cannot be in rw_trx_list. + When a transaction is NOT_STARTED, it can be in trx_list. It cannot be + in rw_trx_hash. - ACTIVE->PREPARED->COMMITTED is only possible when trx->in_rw_trx_list. - The transition ACTIVE->PREPARED is protected by trx_sys->mutex. + ACTIVE->PREPARED->COMMITTED is only possible when trx is in rw_trx_hash. + The transition ACTIVE->PREPARED is protected by trx_sys.mutex. ACTIVE->COMMITTED is possible when the transaction is in - rw_trx_list. + rw_trx_hash. Transitions to COMMITTED are protected by trx_t::mutex. */ trx_state_t state; - /** whether this is a recovered transaction that should be - rolled back by trx_rollback_or_clean_recovered(). - Protected by trx_t::mutex for transactions that are in trx_sys. 
*/ - bool is_recovered; #ifdef WITH_WSREP /** whether wsrep_on(mysql_thd) held at the start of transaction */ bool wsrep; @@ -882,23 +826,23 @@ public: bool is_wsrep() const { return false; } #endif /* WITH_WSREP */ - ReadView* read_view; /*!< consistent read view used in the + ReadView read_view; /*!< consistent read view used in the transaction, or NULL if not yet set */ - - UT_LIST_NODE_T(trx_t) - trx_list; /*!< list of transactions; - protected by trx_sys->mutex. */ - UT_LIST_NODE_T(trx_t) - no_list; /*!< Required during view creation - to check for the view limit for - transactions that are committing */ - trx_lock_t lock; /*!< Information about the transaction locks and state. Protected by - lock_sys->mutex (insertions also + lock_sys.mutex (insertions also by trx_t::mutex). */ /* These fields are not protected by any mutex. */ + + /** false=normal transaction, true=recovered (must be rolled back) + or disconnected transaction in XA PREPARE STATE. + + This field is accessed by the thread that owns the transaction, + without holding any mutex. + There is only one foreign-thread access in trx_print_low() + and a possible race condition with trx_disconnect_prepared(). */ + bool is_recovered; const char* op_info; /*!< English text describing the current operation, or an empty string */ @@ -976,7 +920,7 @@ public: contains a pointer to the latest file name; this is NULL if binlog is not used */ - int64_t mysql_log_offset; + ulonglong mysql_log_offset; /*!< if MySQL binlog is used, this field contains the end offset of the binlog entry */ @@ -989,21 +933,8 @@ public: statement uses, except those in consistent read */ /*------------------------------*/ -#ifdef UNIV_DEBUG - /** The following two fields are mutually exclusive. */ - /* @{ */ - - bool in_rw_trx_list; /*!< true if in trx_sys->rw_trx_list */ - /* @} */ -#endif /* UNIV_DEBUG */ - UT_LIST_NODE_T(trx_t) - mysql_trx_list; /*!< list of transactions created for - MySQL; protected by trx_sys->mutex */ -#ifdef UNIV_DEBUG - bool in_mysql_trx_list; - /*!< true if in - trx_sys->mysql_trx_list */ -#endif /* UNIV_DEBUG */ + UT_LIST_NODE_T(trx_t) trx_list; /*!< list of all transactions; + protected by trx_sys.mutex */ /*------------------------------*/ dberr_t error_state; /*!< 0 if no error, otherwise error number; NOTE That ONLY the thread @@ -1027,12 +958,6 @@ public: trx_savepoints; /*!< savepoints set with SAVEPOINT ..., oldest first */ /*------------------------------*/ - UndoMutex undo_mutex; /*!< mutex protecting the fields in this - section (down to undo_no_arr), EXCEPT - last_sql_stat_start, which can be - accessed only when we know that there - cannot be any activity in the undo - logs! */ undo_no_t undo_no; /*!< next undo log record number to assign; since the undo log is private for a transaction, this @@ -1040,21 +965,15 @@ public: with no gaps; thus it represents the number of modified/inserted rows in a transaction */ - ulint undo_rseg_space; - /*!< space id where last undo record - was written */ trx_savept_t last_sql_stat_start; /*!< undo_no when the last sql statement was started: in case of an error, trx - is rolled back down to this undo - number; see note at undo_mutex! 
*/ + is rolled back down to this number */ trx_rsegs_t rsegs; /* rollback segments for undo logging */ undo_no_t roll_limit; /*!< least undo number to undo during a partial rollback; 0 otherwise */ -#ifdef UNIV_DEBUG bool in_rollback; /*!< true when the transaction is executing a partial or full rollback */ -#endif /* UNIV_DEBUG */ ulint pages_undone; /*!< number of undo log pages undone since the last undo log truncation */ /*------------------------------*/ @@ -1066,7 +985,7 @@ public: also in the lock list trx_locks. This vector needs to be freed explicitly when the trx instance is destroyed. - Protected by lock_sys->mutex. */ + Protected by lock_sys.mutex. */ /*------------------------------*/ bool read_only; /*!< true if transaction is flagged as a READ-ONLY transaction. @@ -1120,12 +1039,14 @@ public: os_event_t wsrep_event; /* event waited for in srv_conc_slot */ #endif /* WITH_WSREP */ + rw_trx_hash_element_t *rw_trx_hash_element; + LF_PINS *rw_trx_hash_pins; ulint magic_n; /** @return whether any persistent undo log has been generated */ bool has_logged_persistent() const { - return(rsegs.m_redo.insert_undo || rsegs.m_redo.update_undo); + return(rsegs.m_redo.undo); } /** @return whether any undo log has been generated */ @@ -1134,6 +1055,13 @@ public: return(has_logged_persistent() || rsegs.m_noredo.undo); } + /** @return whether any undo log has been generated or + recovered */ + bool has_logged_or_recovered() const + { + return(has_logged() || rsegs.m_redo.old_insert); + } + /** @return rollback segment for modifying temporary tables */ trx_rseg_t* get_temp_rseg() { @@ -1146,9 +1074,9 @@ public: } /** Set the innodb_log_optimize_ddl page flush observer - @param[in] space_id tablespace id - @param[in,out] stage performance_schema accounting */ - void set_flush_observer(ulint space_id, ut_stage_alter_t* stage); + @param[in,out] space tablespace + @param[in,out] stage performance_schema accounting */ + void set_flush_observer(fil_space_t* space, ut_stage_alter_t* stage); /** Remove the flush observer */ void remove_flush_observer(); @@ -1192,7 +1120,7 @@ public: } /** Free the memory to trx_pools */ - inline void free(); + void free(); private: diff --git a/storage/innobase/include/trx0trx.ic b/storage/innobase/include/trx0trx.ic index 4a5b1ba717f..f51d5368022 100644 --- a/storage/innobase/include/trx0trx.ic +++ b/storage/innobase/include/trx0trx.ic @@ -24,13 +24,11 @@ The transaction Created 3/26/1996 Heikki Tuuri *******************************************************/ -#include "read0read.h" - /**********************************************************************//** Determines if a transaction is in the given state. -The caller must hold trx_sys->mutex, or it must be the thread +The caller must hold trx_sys.mutex, or it must be the thread that is serving a running transaction. -A running RW transaction must be in trx_sys->rw_trx_list. +A running RW transaction must be in trx_sys.rw_trx_hash. @return TRUE if trx->state == state */ UNIV_INLINE bool @@ -50,27 +48,20 @@ trx_state_eq( switch (trx->state) { case TRX_STATE_PREPARED: case TRX_STATE_PREPARED_RECOVERED: + case TRX_STATE_COMMITTED_IN_MEMORY: ut_ad(!trx_is_autocommit_non_locking(trx)); return(trx->state == state); case TRX_STATE_ACTIVE: - assert_trx_nonlocking_or_in_list(trx); return(state == trx->state); - case TRX_STATE_COMMITTED_IN_MEMORY: - - check_trx_state(trx); - return(state == trx->state); - case TRX_STATE_NOT_STARTED: /* These states are not allowed for running transactions. 
*/ ut_a(state == TRX_STATE_NOT_STARTED || (relaxed && thd_get_error_number(trx->mysql_thd))); - ut_ad(!trx->in_rw_trx_list); - return(true); } ut_error; @@ -209,25 +200,3 @@ ok: trx->ddl = true; trx->dict_operation = op; } - -/** -@param trx Get the active view for this transaction, if one exists -@return the transaction's read view or NULL if one not assigned. */ -UNIV_INLINE -ReadView* -trx_get_read_view( - trx_t* trx) -{ - return(!MVCC::is_view_active(trx->read_view) ? NULL : trx->read_view); -} - -/** -@param trx Get the active view for this transaction, if one exists -@return the transaction's read view or NULL if one not assigned. */ -UNIV_INLINE -const ReadView* -trx_get_read_view( - const trx_t* trx) -{ - return(!MVCC::is_view_active(trx->read_view) ? NULL : trx->read_view); -} diff --git a/storage/innobase/include/trx0types.h b/storage/innobase/include/trx0types.h index 097aea519a9..2aaec580d65 100644 --- a/storage/innobase/include/trx0types.h +++ b/storage/innobase/include/trx0types.h @@ -30,11 +30,8 @@ Created 3/26/1996 Heikki Tuuri #include "ut0byte.h" #include "ut0mutex.h" -#include <set> #include <vector> -//#include <unordered_set> - /** printf(3) format used for printing DB_TRX_ID and other system fields */ #define TRX_ID_FMT IB_ID_FMT @@ -94,8 +91,6 @@ enum trx_dict_op_t { struct trx_t; /** The locks and state of an active transaction */ struct trx_lock_t; -/** Transaction system */ -struct trx_sys_t; /** Signal */ struct trx_sig_t; /** Rollback segment */ @@ -119,9 +114,6 @@ typedef ib_id_t roll_ptr_t; /** Undo number */ typedef ib_id_t undo_no_t; -/** Maximum transaction identifier */ -#define TRX_ID_MAX IB_ID_MAX - /** Transaction savepoint */ struct trx_savept_t{ undo_no_t least_undo_no; /*!< least undo number to undo */ @@ -129,8 +121,6 @@ struct trx_savept_t{ /** File objects */ /* @{ */ -/** Transaction system header */ -typedef byte trx_sysf_t; /** Rollback segment header */ typedef byte trx_rsegf_t; /** Undo segment header */ @@ -147,56 +137,8 @@ typedef byte trx_undo_rec_t; typedef ib_mutex_t RsegMutex; typedef ib_mutex_t TrxMutex; -typedef ib_mutex_t UndoMutex; typedef ib_mutex_t PQMutex; typedef ib_mutex_t TrxSysMutex; typedef std::vector<trx_id_t, ut_allocator<trx_id_t> > trx_ids_t; - -/** Mapping read-write transactions from id to transaction instance, for -creating read views and during trx id lookup for MVCC and locking. 
*/ -struct TrxTrack { - explicit TrxTrack(trx_id_t id, trx_t* trx = NULL) - : - m_id(id), - m_trx(trx) - { - // Do nothing - } - - trx_id_t m_id; - trx_t* m_trx; -}; - -struct TrxTrackHash { - size_t operator()(const TrxTrack& key) const - { - return(size_t(key.m_id)); - } -}; - -/** -Comparator for TrxMap */ -struct TrxTrackHashCmp { - - bool operator() (const TrxTrack& lhs, const TrxTrack& rhs) const - { - return(lhs.m_id == rhs.m_id); - } -}; - -/** -Comparator for TrxMap */ -struct TrxTrackCmp { - - bool operator() (const TrxTrack& lhs, const TrxTrack& rhs) const - { - return(lhs.m_id < rhs.m_id); - } -}; - -//typedef std::unordered_set<TrxTrack, TrxTrackHash, TrxTrackHashCmp> TrxIdSet; -typedef std::set<TrxTrack, TrxTrackCmp, ut_allocator<TrxTrack> > - TrxIdSet; - #endif /* trx0types_h */ diff --git a/storage/innobase/include/trx0undo.h b/storage/innobase/include/trx0undo.h index 99330453c33..22420f111b5 100644 --- a/storage/innobase/include/trx0undo.h +++ b/storage/innobase/include/trx0undo.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2018, MariaDB Corporation. +Copyright (c) 2017, 2020, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -118,17 +118,6 @@ page_t* trx_undo_page_get_s_latched(const page_id_t page_id, mtr_t* mtr); /******************************************************************//** -Returns the previous undo record on the page in the specified log, or -NULL if none exists. -@return pointer to record, NULL if none */ -UNIV_INLINE -trx_undo_rec_t* -trx_undo_page_get_prev_rec( -/*=======================*/ - trx_undo_rec_t* rec, /*!< in: undo log record */ - ulint page_no,/*!< in: undo log header page number */ - ulint offset);/*!< in: undo log header offset on page */ -/******************************************************************//** Returns the next undo log record on the page in the specified log, or NULL if none exists. @return pointer to record, NULL if none */ @@ -139,28 +128,6 @@ trx_undo_page_get_next_rec( trx_undo_rec_t* rec, /*!< in: undo log record */ ulint page_no,/*!< in: undo log header page number */ ulint offset);/*!< in: undo log header offset on page */ -/******************************************************************//** -Returns the last undo record on the page in the specified undo log, or -NULL if none exists. -@return pointer to record, NULL if none */ -UNIV_INLINE -trx_undo_rec_t* -trx_undo_page_get_last_rec( -/*=======================*/ - page_t* undo_page,/*!< in: undo log page */ - ulint page_no,/*!< in: undo log header page number */ - ulint offset); /*!< in: undo log header offset on page */ -/******************************************************************//** -Returns the first undo record on the page in the specified undo log, or -NULL if none exists. -@return pointer to record, NULL if none */ -UNIV_INLINE -trx_undo_rec_t* -trx_undo_page_get_first_rec( -/*========================*/ - page_t* undo_page,/*!< in: undo log page */ - ulint page_no,/*!< in: undo log header page number */ - ulint offset);/*!< in: undo log header offset on page */ /***********************************************************************//** Gets the previous record in an undo log. 
@return undo log record, the page s-latched, NULL if none */ @@ -192,20 +159,18 @@ trx_undo_get_next_rec( @return undo log record, the page latched, NULL if none */ trx_undo_rec_t* trx_undo_get_first_rec( - ulint space, + fil_space_t* space, ulint page_no, ulint offset, ulint mode, mtr_t* mtr); /** Allocate an undo log page. -@param[in,out] trx transaction @param[in,out] undo undo log @param[in,out] mtr mini-transaction that does not hold any page latch @return X-latched block if success @retval NULL on failure */ -buf_block_t* -trx_undo_add_page(trx_t* trx, trx_undo_t* undo, mtr_t* mtr) +buf_block_t* trx_undo_add_page(trx_undo_t* undo, mtr_t* mtr) MY_ATTRIBUTE((nonnull, warn_unused_result)); /** Free the last undo log page. The caller must hold the rseg mutex. @@ -238,37 +203,33 @@ trx_undo_truncate_start( ulint hdr_page_no, ulint hdr_offset, undo_no_t limit); -/********************************************************************//** -Initializes the undo log lists for a rollback segment memory copy. -This function is only called when the database is started or a new -rollback segment created. -@return the combined size of undo log segments in pages */ -ulint -trx_undo_lists_init( -/*================*/ - trx_rseg_t* rseg); /*!< in: rollback segment memory object */ /** Mark that an undo log header belongs to a data dictionary transaction. @param[in] trx dictionary transaction @param[in,out] undo undo log @param[in,out] mtr mini-transaction */ void trx_undo_mark_as_dict(const trx_t* trx, trx_undo_t* undo, mtr_t* mtr); +/** Assign an undo log for a persistent transaction. +A new undo log is created or a cached undo log reused. +@param[in,out] trx transaction +@param[out] err error code +@param[in,out] mtr mini-transaction +@return the undo log block +@retval NULL on error */ +buf_block_t* +trx_undo_assign(trx_t* trx, dberr_t* err, mtr_t* mtr) + MY_ATTRIBUTE((nonnull)); /** Assign an undo log for a transaction. A new undo log is created or a cached undo log reused. @param[in,out] trx transaction @param[in] rseg rollback segment @param[out] undo the undo log -@param[in] type TRX_UNDO_INSERT or TRX_UNDO_UPDATE -@retval DB_SUCCESS on success -@retval DB_TOO_MANY_CONCURRENT_TRXS -@retval DB_OUT_OF_FILE_SPACE -@retval DB_READ_ONLY -@retval DB_OUT_OF_MEMORY */ -dberr_t -trx_undo_assign_undo( - trx_t* trx, - trx_rseg_t* rseg, - trx_undo_t** undo, - ulint type) +@param[out] err error code +@param[in,out] mtr mini-transaction +@return the undo log block +@retval NULL on error */ +buf_block_t* +trx_undo_assign_low(trx_t* trx, trx_rseg_t* rseg, trx_undo_t** undo, + dberr_t* err, mtr_t* mtr) MY_ATTRIBUTE((nonnull, warn_unused_result)); /******************************************************************//** Sets the state of the undo log segment at a transaction finish. @@ -281,7 +242,7 @@ trx_undo_set_state_at_finish( /** Set the state of the undo log segment at a XA PREPARE or XA ROLLBACK. @param[in,out] trx transaction -@param[in,out] undo insert_undo or update_undo log +@param[in,out] undo undo log @param[in] rollback false=XA PREPARE, true=XA ROLLBACK @param[in,out] mtr mini-transaction @return undo log segment header page, x-latched */ @@ -292,20 +253,7 @@ trx_undo_set_state_at_prepare( bool rollback, mtr_t* mtr); -/**********************************************************************//** -Adds the update undo log header as the first in the history list, and -frees the memory object, or puts it to the list of cached update undo log -segments. 
*/ -void -trx_undo_update_cleanup( -/*====================*/ - trx_t* trx, /*!< in: trx owning the update - undo log */ - page_t* undo_page, /*!< in: update undo log header page, - x-latched */ - mtr_t* mtr); /*!< in: mtr */ - -/** Free an insert or temporary undo log after commit or rollback. +/** Free an old insert or temporary undo log after commit or rollback. The information is not needed after a commit or rollback, therefore the data can be discarded. @param[in,out] undo undo log @@ -313,26 +261,31 @@ the data can be discarded. void trx_undo_commit_cleanup(trx_undo_t* undo, bool is_temp); -/********************************************************************//** -At shutdown, frees the undo logs of a PREPARED transaction. */ +/** At shutdown, frees the undo logs of a transaction. */ void -trx_undo_free_prepared( -/*===================*/ - trx_t* trx) /*!< in/out: PREPARED transaction */ - ATTRIBUTE_COLD __attribute__((nonnull)); - -/***********************************************************//** -Parses the redo log entry of an undo log page initialization. +trx_undo_free_at_shutdown(trx_t *trx); + +/** Parse MLOG_UNDO_INIT. +@param[in] ptr log record +@param[in] end_ptr end of log record buffer +@param[in,out] page page or NULL +@param[in,out] mtr mini-transaction +@return end of log record +@retval NULL if the log record is incomplete */ +byte* +trx_undo_parse_page_init(const byte* ptr, const byte* end_ptr, page_t* page); +/** Parse MLOG_UNDO_HDR_REUSE for crash-upgrade from MariaDB 10.2. +@param[in] ptr redo log record +@param[in] end_ptr end of log buffer +@param[in,out] page undo page or NULL @return end of log record or NULL */ byte* -trx_undo_parse_page_init( -/*=====================*/ - const byte* ptr, /*!< in: buffer */ - const byte* end_ptr,/*!< in: buffer end */ - page_t* page, /*!< in: page or NULL */ - mtr_t* mtr); /*!< in: mtr or NULL */ -/** Parse the redo log entry of an undo log page header create or reuse. -@param[in] type MLOG_UNDO_HDR_CREATE or MLOG_UNDO_HDR_REUSE +trx_undo_parse_page_header_reuse( + const byte* ptr, + const byte* end_ptr, + page_t* page); + +/** Parse the redo log entry of an undo log page header create. @param[in] ptr redo log record @param[in] end_ptr end of log buffer @param[in,out] page page frame or NULL @@ -340,17 +293,19 @@ trx_undo_parse_page_init( @return end of log record or NULL */ byte* trx_undo_parse_page_header( - mlog_id_t type, const byte* ptr, const byte* end_ptr, page_t* page, mtr_t* mtr); -/************************************************************************ -Frees an undo log memory copy. */ -void -trx_undo_mem_free( -/*==============*/ - trx_undo_t* undo); /* in: the undo object to be freed */ +/** Read an undo log when starting up the database. +@param[in,out] rseg rollback segment +@param[in] id rollback segment slot +@param[in] page_no undo log segment page number +@param[in,out] max_trx_id the largest observed transaction ID +@return size of the undo log in pages */ +ulint +trx_undo_mem_create_at_db_start(trx_rseg_t* rseg, ulint id, ulint page_no, + trx_id_t& max_trx_id); #endif /* !UNIV_INNOCHECKSUM */ @@ -373,25 +328,15 @@ trx_undo_mem_free( #ifndef UNIV_INNOCHECKSUM -/** Transaction undo log memory object; this is protected by the undo_mutex -in the corresponding transaction object */ +/** Transaction undo log memory object; modified by the thread associated +with the transaction. 
*/ struct trx_undo_t { /*-----------------------------*/ ulint id; /*!< undo log slot number within the rollback segment */ - ulint type; /*!< TRX_UNDO_INSERT or - TRX_UNDO_UPDATE */ ulint state; /*!< state of the corresponding undo log segment */ - ibool del_marks; /*!< relevant only in an update undo - log: this is TRUE if the transaction may - have delete marked records, because of - a delete of a row or an update of an - indexed field; purge is then - necessary; also TRUE if the transaction - has updated an externally stored - field */ trx_id_t trx_id; /*!< id of the trx assigned to the undo log */ XID xid; /*!< X/Open XA transaction @@ -401,8 +346,6 @@ struct trx_undo_t { id */ trx_rseg_t* rseg; /*!< rseg where the undo log belongs */ /*-----------------------------*/ - ulint space; /*!< space id where the undo log - placed */ ulint hdr_page_no; /*!< page number of the header page in the undo log */ ulint hdr_offset; /*!< header offset of the undo log on @@ -412,8 +355,6 @@ struct trx_undo_t { top_page_no during a rollback */ ulint size; /*!< current size in pages */ /*-----------------------------*/ - ulint empty; /*!< TRUE if the stack of undo log - records is currently empty */ ulint top_page_no; /*!< page number where the latest undo log record was catenated; during rollback the page from which the latest @@ -421,9 +362,14 @@ struct trx_undo_t { ulint top_offset; /*!< offset of the latest undo record, i.e., the topmost element in the undo log if we think of it as a stack */ - undo_no_t top_undo_no; /*!< undo number of the latest record */ + undo_no_t top_undo_no; /*!< undo number of the latest record + (IB_ID_MAX if the undo log is empty) */ buf_block_t* guess_block; /*!< guess for the buffer block where the top page might reside */ + + /** @return whether the undo log is empty */ + bool empty() const { return top_undo_no == IB_ID_MAX; } + /*-----------------------------*/ UT_LIST_NODE_T(trx_undo_t) undo_list; /*!< undo log objects in the rollback @@ -436,8 +382,8 @@ struct trx_undo_t { /*-------------------------------------------------------------*/ /** Transaction undo log page header offsets */ /* @{ */ -#define TRX_UNDO_PAGE_TYPE 0 /*!< TRX_UNDO_INSERT or - TRX_UNDO_UPDATE */ +#define TRX_UNDO_PAGE_TYPE 0 /*!< unused; 0 (before MariaDB 10.3.1: + TRX_UNDO_INSERT or TRX_UNDO_UPDATE) */ #define TRX_UNDO_PAGE_START 2 /*!< Byte offset where the undo log records for the LATEST transaction start on this page (remember that @@ -458,7 +404,7 @@ struct trx_undo_t { at most this many bytes used; we must leave space at least for one new undo log header on the page */ -#define TRX_UNDO_PAGE_REUSE_LIMIT (3 * UNIV_PAGE_SIZE / 4) +#define TRX_UNDO_PAGE_REUSE_LIMIT (3 << (srv_page_size_shift - 2)) /* An update undo log segment may contain several undo logs on its first page if the undo logs took so little space that the segment could be cached and @@ -498,14 +444,23 @@ log segment */ page of an update undo log segment. 
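The page reuse limit above was rewritten from 3 * UNIV_PAGE_SIZE / 4 to 3 << (srv_page_size_shift - 2). A quick standalone check (assuming the supported shift range 12..16 defined in univ.i later in this diff) confirms that the two forms agree:

#include <cassert>

int main()
{
	/* 3 << (s - 2) == 3 * 2^s / 4 == 3 * page_size / 4 for any shift s >= 2;
	e.g. for the default 16KiB page (shift 14): 3 << 12 == 12288 == 3 * 16384 / 4. */
	for (unsigned shift = 12; shift <= 16; shift++) {
		unsigned long page_size = 1UL << shift;
		assert((3UL << (shift - 2)) == 3 * page_size / 4);
	}
	return 0;
}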
*/ /* @{ */ /*-------------------------------------------------------------*/ -#define TRX_UNDO_TRX_ID 0 /*!< Transaction id */ -#define TRX_UNDO_TRX_NO 8 /*!< Transaction number of the - transaction; defined only if the log - is in a history list */ -#define TRX_UNDO_DEL_MARKS 16 /*!< Defined only in an update undo - log: TRUE if the transaction may have - done delete markings of records, and - thus purge is necessary */ +/** Transaction start identifier, or 0 if the undo log segment has been +completely purged and trx_purge_free_segment() has started freeing it */ +#define TRX_UNDO_TRX_ID 0 +/** Transaction end identifier (if the log is in a history list), +or 0 if the transaction has not been committed */ +#define TRX_UNDO_TRX_NO 8 +/** Before MariaDB 10.3.1, when purge did not reset DB_TRX_ID of +surviving user records, this used to be called TRX_UNDO_DEL_MARKS. + +The value 1 indicates that purge needs to process the undo log segment. +The value 0 indicates that all of it has been processed, and +trx_purge_free_segment() has been invoked, so the log is not safe to access. + +Before MariaDB 10.3.1, a log segment may carry the value 0 even before +trx_purge_free_segment() was called, for those undo log records for +which purge would not result in removing delete-marked records. */ +#define TRX_UNDO_NEEDS_PURGE 16 #define TRX_UNDO_LOG_START 18 /*!< Offset of the first undo log record of this log on the header page; purge may remove undo log record from the @@ -535,7 +490,7 @@ page of an update undo log segment. */ #define TRX_UNDO_LOG_OLD_HDR_SIZE (34 + FLST_NODE_SIZE) /* Note: the writing of the undo log old header is coded by a log record -MLOG_UNDO_HDR_CREATE or MLOG_UNDO_HDR_REUSE. The appending of an XID to the +MLOG_UNDO_HDR_CREATE. The appending of an XID to the header is logged separately. In this sense, the XID is not really a member of the undo log header. TODO: do not append the XID to the log header if XA is not needed by the user. The XID wastes about 150 bytes of space in every diff --git a/storage/innobase/include/trx0undo.ic b/storage/innobase/include/trx0undo.ic index 2e26e6547c3..19697c6054c 100644 --- a/storage/innobase/include/trx0undo.ic +++ b/storage/innobase/include/trx0undo.ic @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, MariaDB Corporation. +Copyright (c) 2017, 2018, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -40,9 +40,7 @@ trx_undo_build_roll_ptr( ulint offset) /*!< in: offset of the undo entry within page */ { roll_ptr_t roll_ptr; -#if DATA_ROLL_PTR_LEN != 7 -# error "DATA_ROLL_PTR_LEN != 7" -#endif + compile_time_assert(DATA_ROLL_PTR_LEN == 7); ut_ad(is_insert == 0 || is_insert == 1); ut_ad(rseg_id < TRX_SYS_N_RSEGS); ut_ad(offset < 65536); @@ -67,12 +65,7 @@ trx_undo_decode_roll_ptr( ulint* offset) /*!< out: offset of the undo entry within page */ { -#if DATA_ROLL_PTR_LEN != 7 -# error "DATA_ROLL_PTR_LEN != 7" -#endif -#if TRUE != 1 -# error "TRUE != 1" -#endif + compile_time_assert(DATA_ROLL_PTR_LEN == 7); ut_ad(roll_ptr < (1ULL << 56)); *offset = (ulint) roll_ptr & 0xFFFF; roll_ptr >>= 16; @@ -92,14 +85,9 @@ trx_undo_roll_ptr_is_insert( /*========================*/ roll_ptr_t roll_ptr) /*!< in: roll pointer */ { -#if DATA_ROLL_PTR_LEN != 7 -# error "DATA_ROLL_PTR_LEN != 7" -#endif -#if TRUE != 1 -# error "TRUE != 1" -#endif - ut_ad(roll_ptr < (1ULL << 56)); - return((ibool) (roll_ptr >> 55)); + compile_time_assert(DATA_ROLL_PTR_LEN == 7); + ut_ad(roll_ptr < (1ULL << (ROLL_PTR_INSERT_FLAG_POS + 1))); + return((ibool) (roll_ptr >> ROLL_PTR_INSERT_FLAG_POS)); } /***********************************************************************//** @@ -111,10 +99,8 @@ trx_undo_trx_id_is_insert( /*======================*/ const byte* trx_id) /*!< in: DB_TRX_ID, followed by DB_ROLL_PTR */ { -#if DATA_TRX_ID + 1 != DATA_ROLL_PTR -# error -#endif - return(static_cast<bool>(trx_id[DATA_TRX_ID_LEN] >> 7)); + compile_time_assert(DATA_TRX_ID + 1 == DATA_ROLL_PTR); + return bool(trx_id[DATA_TRX_ID_LEN] >> 7); } /*****************************************************************//** @@ -129,9 +115,7 @@ trx_write_roll_ptr( written */ roll_ptr_t roll_ptr) /*!< in: roll ptr */ { -#if DATA_ROLL_PTR_LEN != 7 -# error "DATA_ROLL_PTR_LEN != 7" -#endif + compile_time_assert(DATA_ROLL_PTR_LEN == 7); mach_write_to_7(ptr, roll_ptr); } @@ -146,9 +130,7 @@ trx_read_roll_ptr( /*==============*/ const byte* ptr) /*!< in: pointer to memory from where to read */ { -#if DATA_ROLL_PTR_LEN != 7 -# error "DATA_ROLL_PTR_LEN != 7" -#endif + compile_time_assert(DATA_ROLL_PTR_LEN == 7); return(mach_read_from_7(ptr)); } @@ -184,89 +166,24 @@ trx_undo_page_get_s_latched(const page_id_t page_id, mtr_t* mtr) return(buf_block_get_frame(block)); } -/******************************************************************//** -Returns the start offset of the undo log records of the specified undo -log on the page. -@return start offset */ -UNIV_INLINE -ulint -trx_undo_page_get_start( -/*====================*/ - page_t* undo_page,/*!< in: undo log page */ - ulint page_no,/*!< in: undo log header page number */ - ulint offset) /*!< in: undo log header offset on page */ -{ - ulint start; - - if (page_no == page_get_page_no(undo_page)) { - - start = mach_read_from_2(offset + undo_page - + TRX_UNDO_LOG_START); - } else { - start = TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE; - } - - return(start); -} - -/******************************************************************//** -Returns the end offset of the undo log records of the specified undo -log on the page. +/** Determine the end offset of undo log records of an undo log page. 
+@param[in] undo_page undo log page +@param[in] page_no undo log header page number +@param[in] offset undo log header offset @return end offset */ -UNIV_INLINE -ulint -trx_undo_page_get_end( -/*==================*/ - page_t* undo_page,/*!< in: undo log page */ - ulint page_no,/*!< in: undo log header page number */ - ulint offset) /*!< in: undo log header offset on page */ +inline +uint16_t +trx_undo_page_get_end(const page_t* undo_page, ulint page_no, ulint offset) { - trx_ulogf_t* log_hdr; - ulint end; - if (page_no == page_get_page_no(undo_page)) { - - log_hdr = undo_page + offset; - - end = mach_read_from_2(log_hdr + TRX_UNDO_NEXT_LOG); - - if (end == 0) { - end = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR - + TRX_UNDO_PAGE_FREE); + if (uint16_t end = mach_read_from_2(TRX_UNDO_NEXT_LOG + + offset + undo_page)) { + return end; } - } else { - end = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR - + TRX_UNDO_PAGE_FREE); - } - - return(end); -} - -/******************************************************************//** -Returns the previous undo record on the page in the specified log, or -NULL if none exists. -@return pointer to record, NULL if none */ -UNIV_INLINE -trx_undo_rec_t* -trx_undo_page_get_prev_rec( -/*=======================*/ - trx_undo_rec_t* rec, /*!< in: undo log record */ - ulint page_no,/*!< in: undo log header page number */ - ulint offset) /*!< in: undo log header offset on page */ -{ - page_t* undo_page; - ulint start; - - undo_page = (page_t*) ut_align_down(rec, UNIV_PAGE_SIZE); - - start = trx_undo_page_get_start(undo_page, page_no, offset); - - if (start + undo_page == rec) { - - return(NULL); } - return(undo_page + mach_read_from_2(rec - 2)); + return mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE + + undo_page); } /******************************************************************//** @@ -285,7 +202,7 @@ trx_undo_page_get_next_rec( ulint end; ulint next; - undo_page = (page_t*) ut_align_down(rec, UNIV_PAGE_SIZE); + undo_page = (page_t*) ut_align_down(rec, srv_page_size); end = trx_undo_page_get_end(undo_page, page_no, offset); @@ -298,55 +215,3 @@ trx_undo_page_get_next_rec( return(undo_page + next); } - -/******************************************************************//** -Returns the last undo record on the page in the specified undo log, or -NULL if none exists. -@return pointer to record, NULL if none */ -UNIV_INLINE -trx_undo_rec_t* -trx_undo_page_get_last_rec( -/*=======================*/ - page_t* undo_page,/*!< in: undo log page */ - ulint page_no,/*!< in: undo log header page number */ - ulint offset) /*!< in: undo log header offset on page */ -{ - ulint start; - ulint end; - - start = trx_undo_page_get_start(undo_page, page_no, offset); - end = trx_undo_page_get_end(undo_page, page_no, offset); - - if (start == end) { - - return(NULL); - } - - return(undo_page + mach_read_from_2(undo_page + end - 2)); -} - -/******************************************************************//** -Returns the first undo record on the page in the specified undo log, or -NULL if none exists. 
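For reference, the rewritten trx_undo_page_get_end() above reduces to a two-step lookup: use the log header's next-log offset if it is nonzero, otherwise fall back to the page header's free offset. A standalone sketch of that logic (simplified: made-up field offsets, a hand-rolled big-endian read standing in for mach_read_from_2(), and no check that the log header actually resides on the given page):

#include <cassert>
#include <cstdint>

/* InnoDB stores on-page integers in big-endian byte order. */
static uint16_t read_be16(const unsigned char* p)
{
	return uint16_t(p[0] << 8 | p[1]);
}

/* next_log_field: offset of the 2-byte "next log header" field of this log;
page_free_field: offset of the 2-byte "first free byte" field of the page. */
static uint16_t undo_records_end(const unsigned char* page,
				 unsigned next_log_field,
				 unsigned page_free_field)
{
	if (uint16_t end = read_be16(page + next_log_field)) {
		return end;	/* another log header follows on this page */
	}
	return read_be16(page + page_free_field);
}

int main()
{
	unsigned char page[32] = {0};
	page[4] = 0x01; page[5] = 0x00;	/* page free offset = 256 */
	assert(undo_records_end(page, 8, 4) == 256);	/* no next log header */
	page[8] = 0x00; page[9] = 0x80;	/* next log header at offset 128 */
	assert(undo_records_end(page, 8, 4) == 128);
	return 0;
}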
-@return pointer to record, NULL if none */ -UNIV_INLINE -trx_undo_rec_t* -trx_undo_page_get_first_rec( -/*========================*/ - page_t* undo_page,/*!< in: undo log page */ - ulint page_no,/*!< in: undo log header page number */ - ulint offset) /*!< in: undo log header offset on page */ -{ - ulint start; - ulint end; - - start = trx_undo_page_get_start(undo_page, page_no, offset); - end = trx_undo_page_get_end(undo_page, page_no, offset); - - if (start == end) { - - return(NULL); - } - - return(undo_page + start); -} diff --git a/storage/innobase/include/univ.i b/storage/innobase/include/univ.i index 0a7f745d45e..9f57380694b 100644 --- a/storage/innobase/include/univ.i +++ b/storage/innobase/include/univ.i @@ -39,10 +39,6 @@ Created 1/20/1994 Heikki Tuuri #define _IB_TO_STR(s) #s #define IB_TO_STR(s) _IB_TO_STR(s) -#define INNODB_VERSION_MAJOR 5 -#define INNODB_VERSION_MINOR 7 -#define INNODB_VERSION_BUGFIX 34 - /* The following is the InnoDB version as shown in SELECT plugin_version FROM information_schema.plugins; calculated in make_version_string() in sql/sql_show.cc like this: @@ -50,12 +46,12 @@ calculated in make_version_string() in sql/sql_show.cc like this: because the version is shown with only one dot, we skip the last component, i.e. we show M.N.P as M.N */ #define INNODB_VERSION_SHORT \ - (INNODB_VERSION_MAJOR << 8 | INNODB_VERSION_MINOR) + (MYSQL_VERSION_MAJOR << 8 | MYSQL_VERSION_MINOR) #define INNODB_VERSION_STR \ - IB_TO_STR(INNODB_VERSION_MAJOR) "." \ - IB_TO_STR(INNODB_VERSION_MINOR) "." \ - IB_TO_STR(INNODB_VERSION_BUGFIX) + IB_TO_STR(MYSQL_VERSION_MAJOR) "." \ + IB_TO_STR(MYSQL_VERSION_MINOR) "." \ + IB_TO_STR(MYSQL_VERSION_PATCH) /** How far ahead should we tell the service manager the timeout (time in seconds) */ @@ -96,7 +92,6 @@ support cross-platform development and expose comonly used SQL names. */ #include <sys/stat.h> #ifndef _WIN32 -# include <sys/mman.h> /* mmap() for os0proc.cc */ # include <sched.h> # include "my_config.h" #endif @@ -167,9 +162,8 @@ for all cases. This is used by ut0lst.h related code. */ /* When this macro is defined then additional test functions will be compiled. These functions live at the end of each relevant source file and have "test_" prefix. These functions can be called from the end of -innobase_init() or they can be called from gdb after -innobase_start_or_create_for_mysql() has executed using the call -command. */ +innodb_init() or they can be called from gdb after srv_start() has executed +using the call command. */ /* #define UNIV_COMPILE_TEST_FUNCS #define UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR @@ -261,33 +255,6 @@ management to ensure correct alignment for doubles etc. */ ======================== */ -/** There are currently two InnoDB file formats which are used to group -features with similar restrictions and dependencies. Using an enum allows -switch statements to give a compiler warning when a new one is introduced. */ -enum innodb_file_formats_enum { - /** Antelope File Format: InnoDB/MySQL up to 5.1. - This format includes REDUNDANT and COMPACT row formats */ - UNIV_FORMAT_A = 0, - - /** Barracuda File Format: Introduced in InnoDB plugin for 5.1: - This format includes COMPRESSED and DYNAMIC row formats. It - includes the ability to create secondary indexes from data that - is not on the clustered index page and the ability to store more - data off the clustered index page. 
*/ - UNIV_FORMAT_B = 1 -}; - -typedef enum innodb_file_formats_enum innodb_file_formats_t; - -/** Minimum supported file format */ -#define UNIV_FORMAT_MIN UNIV_FORMAT_A - -/** Maximum supported file format */ -#define UNIV_FORMAT_MAX UNIV_FORMAT_B - -/** The 2-logarithm of UNIV_PAGE_SIZE: */ -#define UNIV_PAGE_SIZE_SHIFT srv_page_size_shift - #ifdef HAVE_LZO #define IF_LZO(A,B) A #else @@ -324,32 +291,29 @@ typedef enum innodb_file_formats_enum innodb_file_formats_t; #define IF_PUNCH_HOLE(A,B) B #endif -/** The universal page size of the database */ -#define UNIV_PAGE_SIZE ((ulint) srv_page_size) - /** log2 of smallest compressed page size (1<<10 == 1024 bytes) Note: This must never change! */ -#define UNIV_ZIP_SIZE_SHIFT_MIN 10 +#define UNIV_ZIP_SIZE_SHIFT_MIN 10U /** log2 of largest compressed page size (1<<14 == 16384 bytes). A compressed page directory entry reserves 14 bits for the start offset and 2 bits for flags. This limits the uncompressed page size to 16k. */ -#define UNIV_ZIP_SIZE_SHIFT_MAX 14 +#define UNIV_ZIP_SIZE_SHIFT_MAX 14U /* Define the Min, Max, Default page sizes. */ /** Minimum Page Size Shift (power of 2) */ -#define UNIV_PAGE_SIZE_SHIFT_MIN 12 +#define UNIV_PAGE_SIZE_SHIFT_MIN 12U /** log2 of largest page size (1<<16 == 64436 bytes). */ /** Maximum Page Size Shift (power of 2) */ -#define UNIV_PAGE_SIZE_SHIFT_MAX 16 +#define UNIV_PAGE_SIZE_SHIFT_MAX 16U /** log2 of default page size (1<<14 == 16384 bytes). */ /** Default Page Size Shift (power of 2) */ -#define UNIV_PAGE_SIZE_SHIFT_DEF 14 +#define UNIV_PAGE_SIZE_SHIFT_DEF 14U /** Original 16k InnoDB Page Size Shift, in case the default changes */ -#define UNIV_PAGE_SIZE_SHIFT_ORIG 14 +#define UNIV_PAGE_SIZE_SHIFT_ORIG 14U /** Original 16k InnoDB Page Size as an ssize (log2 - 9) */ -#define UNIV_PAGE_SSIZE_ORIG (UNIV_PAGE_SIZE_SHIFT_ORIG - 9) +#define UNIV_PAGE_SSIZE_ORIG (UNIV_PAGE_SIZE_SHIFT_ORIG - 9U) /** Minimum page size InnoDB currently supports. */ #define UNIV_PAGE_SIZE_MIN (1U << UNIV_PAGE_SIZE_SHIFT_MIN) @@ -369,13 +333,13 @@ and 2 bits for flags. This limits the uncompressed page size to 16k. /** Largest possible ssize for an uncompressed page. (The convention 'ssize' is used for 'log2 minus 9' or the number of shifts starting with 512.) -This max number varies depending on UNIV_PAGE_SIZE. */ +This max number varies depending on srv_page_size. */ #define UNIV_PAGE_SSIZE_MAX \ - static_cast<ulint>(UNIV_PAGE_SIZE_SHIFT - UNIV_ZIP_SIZE_SHIFT_MIN + 1) + ulint(srv_page_size_shift - UNIV_ZIP_SIZE_SHIFT_MIN + 1U) /** Smallest possible ssize for an uncompressed page. 
*/ #define UNIV_PAGE_SSIZE_MIN \ - static_cast<ulint>(UNIV_PAGE_SIZE_SHIFT_MIN - UNIV_ZIP_SIZE_SHIFT_MIN + 1) + ulint(UNIV_PAGE_SIZE_SHIFT_MIN - UNIV_ZIP_SIZE_SHIFT_MIN + 1U) /** Maximum number of parallel threads in a parallelized operation */ #define UNIV_MAX_PARALLELISM 32 @@ -480,7 +444,7 @@ typedef ib_uint64_t lsn_t; #define UINT64_UNDEFINED ((ib_uint64_t)(-1)) /** The bitmask of 32-bit unsigned integer */ -#define ULINT32_MASK 0xFFFFFFFF +#define ULINT32_MASK 0xFFFFFFFFU /** The undefined 32-bit unsigned integer */ #define ULINT32_UNDEFINED ULINT32_MASK diff --git a/storage/innobase/include/ut0byte.ic b/storage/innobase/include/ut0byte.ic index b5b3d73fea8..e6e60f07886 100644 --- a/storage/innobase/include/ut0byte.ic +++ b/storage/innobase/include/ut0byte.ic @@ -144,9 +144,6 @@ ut_bit_get_nth( ulint n) /*!< in: nth bit requested */ { ut_ad(n < 8 * sizeof(ulint)); -#if TRUE != 1 -# error "TRUE != 1" -#endif return(1 & (a >> n)); } @@ -162,9 +159,6 @@ ut_bit_set_nth( ibool val) /*!< in: value for the bit to set */ { ut_ad(n < 8 * sizeof(ulint)); -#if TRUE != 1 -# error "TRUE != 1" -#endif if (val) { return(((ulint) 1 << n) | a); } else { diff --git a/storage/innobase/include/ut0crc32.h b/storage/innobase/include/ut0crc32.h index 178be98fee6..68af6882155 100644 --- a/storage/innobase/include/ut0crc32.h +++ b/storage/innobase/include/ut0crc32.h @@ -50,9 +50,10 @@ extern ut_crc32_func_t ut_crc32; #ifdef INNODB_BUG_ENDIAN_CRC32 /** Pointer to CRC32 calculation function, which uses big-endian byte order when converting byte strings to integers internally. */ -extern ut_crc32_func_t ut_crc32_legacy_big_endian; +extern uint32_t ut_crc32_legacy_big_endian(const byte* buf, ulint len); #endif /* INNODB_BUG_ENDIAN_CRC32 */ +/** Text description of CRC32 implementation */ extern const char* ut_crc32_implementation; #endif /* ut0crc32_h */ diff --git a/storage/innobase/include/ut0dbg.h b/storage/innobase/include/ut0dbg.h index 5d3fa1cf987..85856660494 100644 --- a/storage/innobase/include/ut0dbg.h +++ b/storage/innobase/include/ut0dbg.h @@ -59,8 +59,8 @@ ut_dbg_assertion_failed( ut_dbg_assertion_failed(0, __FILE__, __LINE__) /** Debug assertion */ -#define ut_ad DBUG_ASSERT -#ifdef UNIV_DEBUG +#define ut_ad DBUG_SLOW_ASSERT +#if defined(UNIV_DEBUG) || !defined(DBUG_OFF) /** Debug statement. Does nothing unless UNIV_DEBUG is defined. */ #define ut_d(EXPR) EXPR #else diff --git a/storage/innobase/include/ut0new.h b/storage/innobase/include/ut0new.h index 3bd9ce3045e..a190b872549 100644 --- a/storage/innobase/include/ut0new.h +++ b/storage/innobase/include/ut0new.h @@ -129,6 +129,7 @@ InnoDB: #include <string.h> /* strlen(), strrchr(), strncmp() */ #include "my_global.h" /* needed for headers from mysql/psi/ */ + /* JAN: TODO: missing 5.7 header */ #ifdef HAVE_MYSQL_MEMORY_H #include "mysql/psi/mysql_memory.h" /* PSI_MEMORY_CALL() */ @@ -170,7 +171,6 @@ extern PSI_memory_key mem_key_other; extern PSI_memory_key mem_key_row_log_buf; extern PSI_memory_key mem_key_row_merge_sort; extern PSI_memory_key mem_key_std; -extern PSI_memory_key mem_key_trx_sys_t_rw_trx_ids; /** Setup the internal objects needed for UT_NEW() to operate. This must be called before the first call to UT_NEW(). 
*/ @@ -232,6 +232,51 @@ struct ut_new_pfx_t { #endif }; +static inline void ut_allocate_trace_dontdump(void *ptr, size_t bytes, + bool +#if defined(DBUG_OFF) && defined(HAVE_MADVISE) && defined(MADV_DONTDUMP) + dontdump +#endif + , ut_new_pfx_t* pfx, + const char* +#ifdef UNIV_PFS_MEMORY + file +#endif + + ) +{ + ut_a(ptr != NULL); + +#if defined(DBUG_OFF) && defined(HAVE_MADVISE) && defined(MADV_DONTDUMP) + if (dontdump && madvise(ptr, bytes, MADV_DONTDUMP)) { + ib::warn() << "Failed to set memory to " DONTDUMP_STR ": " + << strerror(errno) + << " ptr " << ptr + << " size " << bytes; + } +#endif + if (pfx != NULL) { +#ifdef UNIV_PFS_MEMORY + allocate_trace(bytes, file, pfx); +#endif /* UNIV_PFS_MEMORY */ + pfx->m_size = bytes; + } +} + +#if defined(DBUG_OFF) && defined(HAVE_MADVISE) && defined(MADV_DODUMP) +static inline void ut_dodump(void* ptr, size_t m_size) +{ + if (ptr && madvise(ptr, m_size, MADV_DODUMP)) { + ib::warn() << "Failed to set memory to " DODUMP_STR ": " + << strerror(errno) + << " ptr " << ptr + << " size " << m_size; + } +} +#else +static inline void ut_dodump(void*, size_t) {} +#endif + /** Allocator class for allocating memory from inside std::* containers. @tparam T type of allocated object @tparam oom_fatal whether to commit suicide when running out of memory */ @@ -246,19 +291,25 @@ public: typedef size_t size_type; typedef ptrdiff_t difference_type; +#ifdef UNIV_PFS_MEMORY /** Default constructor. */ explicit ut_allocator(PSI_memory_key key = PSI_NOT_INSTRUMENTED) -#ifdef UNIV_PFS_MEMORY : m_key(key) -#endif /* UNIV_PFS_MEMORY */ { } +#else + ut_allocator() {} + ut_allocator(PSI_memory_key) {} +#endif /* UNIV_PFS_MEMORY */ /** Constructor from allocator of another type. */ template <class U> - ut_allocator( - const ut_allocator<U>& other) + ut_allocator(const ut_allocator<U>& +#ifdef UNIV_PFS_MEMORY + other +#endif + ) #ifdef UNIV_PFS_MEMORY : m_key(other.m_key) #endif /* UNIV_PFS_MEMORY */ @@ -279,6 +330,8 @@ public: #endif /* UNIV_PFS_MEMORY */ } + pointer allocate(size_type n) { return allocate(n, NULL, NULL); } + /** Allocate a chunk of memory that can hold 'n_elements' objects of type 'T' and trace the allocation. If the allocation fails this method may throw an exception. This @@ -287,17 +340,19 @@ public: After successfull allocation the returned pointer must be passed to ut_allocator::deallocate() when no longer needed. @param[in] n_elements number of elements - @param[in] hint pointer to a nearby memory location, - unused by this implementation - @param[in] file file name of the caller @param[in] set_to_zero if true, then the returned memory is initialized with 0x0 bytes. + @param[in] throw_on_error if true, raize exception if too big @return pointer to the allocated memory */ pointer allocate( size_type n_elements, - const_pointer hint = NULL, - const char* file = NULL, + const_pointer, + const char* +#ifdef UNIV_PFS_MEMORY + file /*!< file name of the caller */ +#endif + , bool set_to_zero = false, bool throw_on_error = true) { @@ -564,6 +619,8 @@ public: /** Allocate a large chunk of memory that can hold 'n_elements' objects of type 'T' and trace the allocation. @param[in] n_elements number of elements + @param[in] dontdump if true, advise the OS is not to core + dump this memory. @param[out] pfx storage for the description of the allocated memory. 
The caller must provide space for this one and keep it until the memory is no longer needed and then pass it to @@ -572,7 +629,8 @@ public: pointer allocate_large( size_type n_elements, - ut_new_pfx_t* pfx) + ut_new_pfx_t* pfx, + bool dontdump = false) { if (n_elements == 0 || n_elements > max_size()) { return(NULL); @@ -583,17 +641,22 @@ public: pointer ptr = reinterpret_cast<pointer>( os_mem_alloc_large(&n_bytes)); -#ifdef UNIV_PFS_MEMORY - if (ptr != NULL) { - allocate_trace(n_bytes, NULL, pfx); + if (ptr == NULL) { + return NULL; } -#else - pfx->m_size = n_bytes; -#endif /* UNIV_PFS_MEMORY */ + + ut_allocate_trace_dontdump(ptr, n_bytes, dontdump, pfx, NULL); return(ptr); } + pointer + allocate_large_dontdump( + size_type n_elements, + ut_new_pfx_t* pfx) + { + return allocate_large(n_elements, pfx, true); + } /** Free a memory allocated by allocate_large() and trace the deallocation. @param[in,out] ptr pointer to memory to free @@ -602,17 +665,43 @@ public: void deallocate_large( pointer ptr, - const ut_new_pfx_t* pfx) + const ut_new_pfx_t* +#ifdef UNIV_PFS_MEMORY + pfx +#endif + , + size_t size) { #ifdef UNIV_PFS_MEMORY - deallocate_trace(pfx); + if (pfx) { + deallocate_trace(pfx); + } #endif /* UNIV_PFS_MEMORY */ - os_mem_free_large(ptr, pfx->m_size); + os_mem_free_large(ptr, size); } + void + deallocate_large_dodump( + pointer ptr, + const ut_new_pfx_t* +#ifdef UNIV_PFS_MEMORY + pfx +#endif + , + size_t size) + { + ut_dodump(ptr, size); + deallocate_large(ptr, #ifdef UNIV_PFS_MEMORY + pfx, +#else + NULL, +#endif + size); + } +#ifdef UNIV_PFS_MEMORY /** Get the performance schema key to use for tracing allocations. @param[in] file file name of the caller or NULL if unknown @return performance schema key */ @@ -722,12 +811,7 @@ could be freed by A2 even if the pfs mem key is different. */ template <typename T> inline bool -operator==( - const ut_allocator<T>& lhs, - const ut_allocator<T>& rhs) -{ - return(true); -} +operator==(const ut_allocator<T>&, const ut_allocator<T>&) { return(true); } /** Compare two allocators of the same type. */ template <typename T> @@ -840,6 +924,10 @@ ut_delete_array( ut_allocator<byte>(key).allocate( \ n_bytes, NULL, __FILE__, false, false)) +#define ut_malloc_dontdump(n_bytes) static_cast<void*>( \ + ut_allocator<byte>(PSI_NOT_INSTRUMENTED).allocate_large( \ + n_bytes, true)) + #define ut_zalloc(n_bytes, key) static_cast<void*>( \ ut_allocator<byte>(key).allocate( \ n_bytes, NULL, __FILE__, true, false)) @@ -863,6 +951,10 @@ ut_delete_array( #define ut_free(ptr) ut_allocator<byte>(PSI_NOT_INSTRUMENTED).deallocate( \ reinterpret_cast<byte*>(ptr)) +#define ut_free_dodump(ptr, size) static_cast<void*>( \ + ut_allocator<byte>(PSI_NOT_INSTRUMENTED).deallocate_large( \ + ptr, NULL, size, true)) + #else /* UNIV_PFS_MEMORY */ /* Fallbacks when memory tracing is disabled at compile time. 
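The dontdump/dodump helpers added above come down to a pair of madvise() calls around a large allocation. A minimal Linux-only sketch of that pattern (a hypothetical 16 MiB mmap() region instead of os_mem_alloc_large(), and only where MADV_DONTDUMP is available, as the preprocessor guards above require):

#include <sys/mman.h>
#include <cstdio>

int main()
{
	const size_t size = 16 << 20;	/* 16 MiB, for illustration only */
	void* p = mmap(NULL, size, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) {
		return 1;
	}
#ifdef MADV_DONTDUMP
	/* Exclude the region from core dumps; failure is only a warning,
	much like the ib::warn() calls above. */
	if (madvise(p, size, MADV_DONTDUMP)) {
		perror("madvise(MADV_DONTDUMP)");
	}
	/* ... use the buffer; a core dump written now skips it ... */

	/* Make it dumpable again before handing the memory back. */
	if (madvise(p, size, MADV_DODUMP)) {
		perror("madvise(MADV_DODUMP)");
	}
#endif
	munmap(p, size);
	return 0;
}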
*/ @@ -885,6 +977,14 @@ ut_delete_array( #define ut_malloc_nokey(n_bytes) ::malloc(n_bytes) +static inline void *ut_malloc_dontdump(size_t n_bytes) +{ + void *ptr = os_mem_alloc_large(&n_bytes); + + ut_allocate_trace_dontdump(ptr, n_bytes, true, NULL, NULL); + return ptr; +} + #define ut_zalloc_nokey(n_bytes) ::calloc(1, n_bytes) #define ut_zalloc_nokey_nofatal(n_bytes) ::calloc(1, n_bytes) @@ -893,6 +993,12 @@ ut_delete_array( #define ut_free(ptr) ::free(ptr) +static inline void ut_free_dodump(void *ptr, size_t size) +{ + ut_dodump(ptr, size); + os_mem_free_large(ptr, size); +} + #endif /* UNIV_PFS_MEMORY */ #endif /* ut0new_h */ diff --git a/storage/innobase/include/ut0pool.h b/storage/innobase/include/ut0pool.h index 749c4188edf..e0a1f7c04ca 100644 --- a/storage/innobase/include/ut0pool.h +++ b/storage/innobase/include/ut0pool.h @@ -111,7 +111,7 @@ struct Pool { } else if (m_last < m_end) { /* Initialise the remaining elements. */ - init(m_end - m_last); + init(size_t(m_end - m_last)); ut_ad(!m_pqueue.empty()); diff --git a/storage/innobase/include/ut0stage.h b/storage/innobase/include/ut0stage.h index a369daa8bb3..1eb7810a1bb 100644 --- a/storage/innobase/include/ut0stage.h +++ b/storage/innobase/include/ut0stage.h @@ -527,65 +527,28 @@ ut_stage_alter_t::change_phase( class ut_stage_alter_t { public: - explicit - ut_stage_alter_t( - const dict_index_t* pk) - { - } + explicit ut_stage_alter_t(const dict_index_t*) {} - void - begin_phase_read_pk( - ulint n_sort_indexes) - { - } + void begin_phase_read_pk(ulint) {} - void - n_pk_recs_inc() - { - } + void n_pk_recs_inc() {} - void - inc( - ulint inc_val = 1) - { - } + void inc() {} + void inc(ulint) {} - void - end_phase_read_pk() - { - } + void end_phase_read_pk() {} - void - begin_phase_sort( - double sort_multi_factor) - { - } + void begin_phase_sort(double) {} - void - begin_phase_insert() - { - } + void begin_phase_insert() {} - void - begin_phase_flush( - ulint n_flush_pages) - { - } + void begin_phase_flush(ulint) {} - void - begin_phase_log_index() - { - } + void begin_phase_log_index() {} - void - begin_phase_log_table() - { - } + void begin_phase_log_table() {} - void - begin_phase_end() - { - } + void begin_phase_end() {} }; #endif /* HAVE_PSI_STAGE_INTERFACE */ diff --git a/storage/innobase/include/ut0ut.h b/storage/innobase/include/ut0ut.h index a19f3db188d..a6a70c99ecf 100644 --- a/storage/innobase/include/ut0ut.h +++ b/storage/innobase/include/ut0ut.h @@ -46,56 +46,11 @@ Created 1/20/1994 Heikki Tuuri #include <stdarg.h> #include <string> +#include <my_atomic.h> /** Index name prefix in fast index creation, as a string constant */ #define TEMP_INDEX_PREFIX_STR "\377" -#ifdef HAVE_PAUSE_INSTRUCTION - /* According to the gcc info page, asm volatile means that the - instruction has important side-effects and must not be removed. - Also asm volatile may trigger a memory barrier (spilling all registers - to memory). */ -# ifdef __SUNPRO_CC -# define UT_RELAX_CPU() asm ("pause" ) -# else -# define UT_RELAX_CPU() __asm__ __volatile__ ("pause") -# endif /* __SUNPRO_CC */ - -#elif defined(HAVE_FAKE_PAUSE_INSTRUCTION) -# define UT_RELAX_CPU() __asm__ __volatile__ ("rep; nop") -#elif defined _WIN32 - /* In the Win32 API, the x86 PAUSE instruction is executed by calling - the YieldProcessor macro defined in WinNT.h. It is a CPU architecture- - independent way by using YieldProcessor. 
*/ -# define UT_RELAX_CPU() YieldProcessor() -#elif defined(__powerpc__) && defined __GLIBC__ -# include <sys/platform/ppc.h> -# define UT_RELAX_CPU() __ppc_get_timebase() -#else -# define UT_RELAX_CPU() do { \ - volatile int32 volatile_var; \ - int32 oldval= 0; \ - my_atomic_cas32(&volatile_var, &oldval, 1); \ - } while (0) -#endif - -#if defined (__GNUC__) -# define UT_COMPILER_BARRIER() __asm__ __volatile__ ("":::"memory") -#elif defined (_MSC_VER) -# define UT_COMPILER_BARRIER() _ReadWriteBarrier() -#else -# define UT_COMPILER_BARRIER() -#endif - -#if defined(HAVE_HMT_PRIORITY_INSTRUCTION) -# include <sys/platform/ppc.h> -# define UT_LOW_PRIORITY_CPU() __ppc_set_ppr_low() -# define UT_RESUME_PRIORITY_CPU() __ppc_set_ppr_med() -#else -# define UT_LOW_PRIORITY_CPU() ((void)0) -# define UT_RESUME_PRIORITY_CPU() ((void)0) -#endif - #define ut_max std::max #define ut_min std::min @@ -240,14 +195,7 @@ void ut_sprintf_timestamp( /*=================*/ char* buf); /*!< in: buffer where to sprintf */ -/*************************************************************//** -Runs an idle loop on CPU. The argument gives the desired delay -in microseconds on 100 MHz Pentium + Visual C++. -@return dummy value */ -void -ut_delay( -/*=====*/ - ulint delay); /*!< in: delay in microseconds on 100 MHz Pentium */ + /*************************************************************//** Prints the contents of a memory buffer in hex and ascii. */ void diff --git a/storage/innobase/innodb.cmake b/storage/innobase/innodb.cmake index 523176b4530..8bfca3a614b 100644 --- a/storage/innobase/innodb.cmake +++ b/storage/innobase/innodb.cmake @@ -35,6 +35,8 @@ MYSQL_CHECK_SNAPPY() MYSQL_CHECK_NUMA() TEST_BIG_ENDIAN(IS_BIG_ENDIAN) +INCLUDE(${MYSQL_CMAKE_SCRIPT_DIR}/compile_flags.cmake) + IF(CMAKE_CROSSCOMPILING) # Use CHECK_C_SOURCE_COMPILES instead of CHECK_C_SOURCE_RUNS when # cross-compiling. Not as precise, but usually good enough. 
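The ut0ut.h hunk above removes the UT_RELAX_CPU()/UT_COMPILER_BARRIER() spin-wait macros and the ut_delay() declaration from this header. As a minimal sketch of what such a "relax" hint does while busy-waiting — the helper names below are hypothetical, chosen only for illustration, and are not introduced by this patch:

#include <atomic>
#if defined(__x86_64__) || defined(__i386__)
# include <immintrin.h>	/* _mm_pause() */
#endif

/* Hypothetical helper: issue a CPU "relax" hint inside a busy-wait
loop, as the removed UT_RELAX_CPU() macro did on x86 via PAUSE. */
static inline void cpu_relax()
{
#if defined(__x86_64__) || defined(__i386__)
	_mm_pause();
#else
	/* Portable fallback: at least keep the compiler from folding
	the polling loop into a tight, fence-free re-read. */
	std::atomic_signal_fence(std::memory_order_seq_cst);
#endif
}

/* Example use: poll a flag without saturating the core. */
void spin_until_set(const std::atomic<bool>& flag)
{
	while (!flag.load(std::memory_order_acquire))
		cpu_relax();
}

The hint only reduces power use and pipeline contention while spinning; it is not a synchronization primitive, which is why the real code still pairs such loops with proper mutexes and events.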
@@ -49,12 +51,6 @@ ELSE() ENDMACRO() ENDIF() -## MySQL 5.7 LZ4 (not needed) -##IF(LZ4_INCLUDE_DIR AND LZ4_LIBRARY) -## ADD_DEFINITIONS(-DHAVE_LZ4=1) -## INCLUDE_DIRECTORIES(${LZ4_INCLUDE_DIR}) -##ENDIF() - # OS tests IF(UNIX) IF(CMAKE_SYSTEM_NAME STREQUAL "Linux") @@ -133,20 +129,7 @@ ENDIF() OPTION(WITH_INNODB_EXTRA_DEBUG "Enable extra InnoDB debug checks" OFF) IF(WITH_INNODB_EXTRA_DEBUG) - IF(NOT CMAKE_BUILD_TYPE STREQUAL "Debug") - MESSAGE(FATAL_ERROR "WITH_INNODB_EXTRA_DEBUG can be enabled only in debug builds") - ENDIF() - - SET(EXTRA_DEBUG_FLAGS "") - IF(WITH_INNODB_AHI) - SET(EXTRA_DEBUG_FLAGS "${EXTRA_DEBUG_FLAGS} -DUNIV_AHI_DEBUG") - ENDIF() - SET(EXTRA_DEBUG_FLAGS "${EXTRA_DEBUG_FLAGS} -DUNIV_DDL_DEBUG") - SET(EXTRA_DEBUG_FLAGS "${EXTRA_DEBUG_FLAGS} -DUNIV_DEBUG_FILE_ACCESSES") - SET(EXTRA_DEBUG_FLAGS "${EXTRA_DEBUG_FLAGS} -DUNIV_ZIP_DEBUG") - - SET(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} ${EXTRA_DEBUG_FLAGS}") - SET(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} ${EXTRA_DEBUG_FLAGS}") + ADD_DEFINITIONS(-DUNIV_ZIP_DEBUG) ENDIF() CHECK_FUNCTION_EXISTS(sched_getcpu HAVE_SCHED_GETCPU) @@ -170,13 +153,6 @@ IF(NOT MSVC) SET_SOURCE_FILES_PROPERTIES(trx/trx0rec.cc PROPERTIES COMPILE_FLAGS -O1) ENDIF() - # workaround for old gcc on x86, gcc atomic ops only work under -march=i686 - IF(CMAKE_SYSTEM_PROCESSOR STREQUAL "i686" AND CMAKE_COMPILER_IS_GNUCC AND - CMAKE_C_COMPILER_VERSION VERSION_LESS "4.4.0") - SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=i686") - SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=i686") - ENDIF() - # Only use futexes on Linux if GCC atomics are available IF(NOT MSVC AND NOT CMAKE_CROSSCOMPILING) CHECK_C_SOURCE_RUNS( @@ -255,13 +231,6 @@ IF(CMAKE_CXX_COMPILER_ID MATCHES "SunPro" PROPERTIES COMPILE_FLAGS -xO3) ENDIF() -# Removing compiler optimizations for innodb/mem/* files on 64-bit Windows -# due to 64-bit compiler error, See MySQL Bug #19424, #36366, #34297 -IF (MSVC AND CMAKE_SIZEOF_VOID_P EQUAL 8) - SET_SOURCE_FILES_PROPERTIES(mem/mem0mem.cc mem/mem0pool.cc - PROPERTIES COMPILE_FLAGS -Od) -ENDIF() - # Avoid generating Hardware Capabilities due to crc32 instructions IF(CMAKE_SYSTEM_NAME MATCHES "SunOS" AND CMAKE_SYSTEM_PROCESSOR MATCHES "i386") MY_CHECK_CXX_COMPILER_FLAG("-Wa,-nH") diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc index 506106a2269..b3086842624 100644 --- a/storage/innobase/lock/lock0lock.cc +++ b/storage/innobase/lock/lock0lock.cc @@ -72,8 +72,6 @@ extern "C" void thd_rpl_deadlock_check(MYSQL_THD thd, MYSQL_THD other_thd); extern "C" int thd_need_wait_reports(const MYSQL_THD thd); extern "C" int thd_need_ordering_with(const MYSQL_THD thd, const MYSQL_THD other_thd); -extern "C" int thd_deadlock_victim_preference(const MYSQL_THD thd1, const MYSQL_THD thd2); - /** Pretty-print a table lock. @param[in,out] file output stream @param[in] lock table lock */ @@ -253,7 +251,7 @@ private: ulint m_heap_no; /*!< heap number if rec lock */ }; - /** Used in deadlock tracking. Protected by lock_sys->mutex. */ + /** Used in deadlock tracking. Protected by lock_sys.mutex. */ static ib_uint64_t s_lock_mark_counter; /** Calculation steps thus far. It is the count of the nodes visited. */ @@ -309,7 +307,7 @@ lock_rec_validate_page( #endif /* UNIV_DEBUG */ /* The lock system */ -lock_sys_t* lock_sys = NULL; +lock_sys_t lock_sys; /** We store info on the latest deadlock error to this buffer. 
InnoDB Monitor will then fetch it and print */ @@ -328,8 +326,11 @@ lock_report_trx_id_insanity( const rec_t* rec, /*!< in: user record */ dict_index_t* index, /*!< in: index */ const rec_offs* offsets, /*!< in: rec_get_offsets(rec, index) */ - trx_id_t max_trx_id) /*!< in: trx_sys_get_max_trx_id() */ + trx_id_t max_trx_id) /*!< in: trx_sys.get_max_trx_id() */ { + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(!rec_is_metadata(rec, index)); + ib::error() << "Transaction id " << ib::hex(trx_id) << " associated with record" << rec_offsets_print(rec, offsets) @@ -342,11 +343,6 @@ lock_report_trx_id_insanity( /*********************************************************************//** Checks that a transaction id is sensible, i.e., not in the future. @return true if ok */ -#ifdef UNIV_DEBUG - -#else -static MY_ATTRIBUTE((warn_unused_result)) -#endif bool lock_check_trx_id_sanity( /*=====================*/ @@ -355,17 +351,18 @@ lock_check_trx_id_sanity( dict_index_t* index, /*!< in: index */ const rec_offs* offsets) /*!< in: rec_get_offsets(rec, index) */ { - ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(!rec_is_metadata(rec, index)); - trx_id_t max_trx_id = trx_sys_get_max_trx_id(); - bool is_ok = trx_id < max_trx_id; + trx_id_t max_trx_id= trx_sys.get_max_trx_id(); + ut_ad(max_trx_id || srv_force_recovery >= SRV_FORCE_NO_UNDO_LOG_SCAN); - if (UNIV_UNLIKELY(!is_ok)) { - lock_report_trx_id_insanity( - trx_id, rec, index, offsets, max_trx_id); - } - - return(is_ok); + if (UNIV_LIKELY(max_trx_id != 0) && UNIV_UNLIKELY(trx_id >= max_trx_id)) + { + lock_report_trx_id_insanity(trx_id, rec, index, offsets, max_trx_id); + return false; + } + return true; } /*********************************************************************//** @@ -384,13 +381,13 @@ lock_clust_rec_cons_read_sees( ut_ad(dict_index_is_clust(index)); ut_ad(page_rec_is_user_rec(rec)); ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(!rec_is_metadata(rec, index)); /* Temp-tables are not shared across connections and multiple transactions from different connections cannot simultaneously operate on same temp-table and so read of temp-table is always consistent read. */ - if (srv_read_only_mode || dict_table_is_temporary(index->table)) { - ut_ad(view == 0 || dict_table_is_temporary(index->table)); + if (index->table->is_temporary()) { return(true); } @@ -422,15 +419,13 @@ lock_sec_rec_cons_read_sees( const ReadView* view) /*!< in: consistent read view */ { ut_ad(page_rec_is_user_rec(rec)); + ut_ad(!index->is_primary()); + ut_ad(!rec_is_metadata(rec, index)); /* NOTE that we might call this function while holding the search system latch. */ - if (recv_recovery_is_on()) { - - return(false); - - } else if (dict_table_is_temporary(index->table)) { + if (index->table->is_temporary()) { /* Temp-tables are not shared across connections and multiple transactions from different connections cannot simultaneously @@ -447,37 +442,34 @@ lock_sec_rec_cons_read_sees( return(view->sees(max_trx_id)); } -/*********************************************************************//** -Creates the lock system at database start. */ -void -lock_sys_create( -/*============*/ - ulint n_cells) /*!< in: number of slots in lock hash table */ -{ - ulint lock_sys_sz; - - lock_sys_sz = sizeof(*lock_sys) + OS_THREAD_MAX_N * sizeof(srv_slot_t); - lock_sys = static_cast<lock_sys_t*>(ut_zalloc_nokey(lock_sys_sz)); +/** + Creates the lock system at database start. 
- void* ptr = &lock_sys[1]; + @param[in] n_cells number of slots in lock hash table +*/ +void lock_sys_t::create(ulint n_cells) +{ + ut_ad(this == &lock_sys); - lock_sys->waiting_threads = static_cast<srv_slot_t*>(ptr); + m_initialised= true; - lock_sys->last_slot = lock_sys->waiting_threads; + waiting_threads = static_cast<srv_slot_t*> + (ut_zalloc_nokey(srv_max_n_threads * sizeof *waiting_threads)); + last_slot = waiting_threads; - mutex_create(LATCH_ID_LOCK_SYS, &lock_sys->mutex); + mutex_create(LATCH_ID_LOCK_SYS, &mutex); - mutex_create(LATCH_ID_LOCK_SYS_WAIT, &lock_sys->wait_mutex); + mutex_create(LATCH_ID_LOCK_SYS_WAIT, &wait_mutex); - lock_sys->timeout_event = os_event_create(0); + timeout_event = os_event_create(0); - lock_sys->rec_hash = hash_create(n_cells); - lock_sys->prdt_hash = hash_create(n_cells); - lock_sys->prdt_page_hash = hash_create(n_cells); + rec_hash = hash_create(n_cells); + prdt_hash = hash_create(n_cells); + prdt_page_hash = hash_create(n_cells); if (!srv_read_only_mode) { - lock_latest_err_file = os_file_create_tmpfile(NULL); + lock_latest_err_file = os_file_create_tmpfile(); ut_a(lock_latest_err_file); } } @@ -494,31 +486,33 @@ lock_rec_lock_fold( lock->un_member.rec_lock.page_no)); } -/** Resize the lock hash tables. -@param[in] n_cells number of slots in lock hash table */ -void -lock_sys_resize( - ulint n_cells) + +/** + Resize the lock hash table. + + @param[in] n_cells number of slots in lock hash table +*/ +void lock_sys_t::resize(ulint n_cells) { - hash_table_t* old_hash; + ut_ad(this == &lock_sys); - lock_mutex_enter(); + mutex_enter(&mutex); - old_hash = lock_sys->rec_hash; - lock_sys->rec_hash = hash_create(n_cells); - HASH_MIGRATE(old_hash, lock_sys->rec_hash, lock_t, hash, + hash_table_t* old_hash = rec_hash; + rec_hash = hash_create(n_cells); + HASH_MIGRATE(old_hash, rec_hash, lock_t, hash, lock_rec_lock_fold); hash_table_free(old_hash); - old_hash = lock_sys->prdt_hash; - lock_sys->prdt_hash = hash_create(n_cells); - HASH_MIGRATE(old_hash, lock_sys->prdt_hash, lock_t, hash, + old_hash = prdt_hash; + prdt_hash = hash_create(n_cells); + HASH_MIGRATE(old_hash, prdt_hash, lock_t, hash, lock_rec_lock_fold); hash_table_free(old_hash); - old_hash = lock_sys->prdt_page_hash; - lock_sys->prdt_page_hash = hash_create(n_cells); - HASH_MIGRATE(old_hash, lock_sys->prdt_page_hash, lock_t, hash, + old_hash = prdt_page_hash; + prdt_page_hash = hash_create(n_cells); + HASH_MIGRATE(old_hash, prdt_page_hash, lock_t, hash, lock_rec_lock_fold); hash_table_free(old_hash); @@ -547,40 +541,39 @@ lock_sys_resize( buf_pool_mutex_exit(buf_pool); } - lock_mutex_exit(); + mutex_exit(&mutex); } -/*********************************************************************//** -Closes the lock system at database shutdown. */ -void -lock_sys_close(void) -/*================*/ + +/** Closes the lock system at database shutdown. 
*/ +void lock_sys_t::close() { + ut_ad(this == &lock_sys); + + if (!m_initialised) return; + if (lock_latest_err_file != NULL) { fclose(lock_latest_err_file); lock_latest_err_file = NULL; } - hash_table_free(lock_sys->rec_hash); - hash_table_free(lock_sys->prdt_hash); - hash_table_free(lock_sys->prdt_page_hash); + hash_table_free(rec_hash); + hash_table_free(prdt_hash); + hash_table_free(prdt_page_hash); - os_event_destroy(lock_sys->timeout_event); + os_event_destroy(timeout_event); - mutex_destroy(&lock_sys->mutex); - mutex_destroy(&lock_sys->wait_mutex); + mutex_destroy(&mutex); + mutex_destroy(&wait_mutex); - srv_slot_t* slot = lock_sys->waiting_threads; - - for (ulint i = 0; i < OS_THREAD_MAX_N; i++, ++slot) { - if (slot->event != NULL) { - os_event_destroy(slot->event); + for (ulint i = srv_max_n_threads; i--; ) { + if (os_event_t& event = waiting_threads[i].event) { + os_event_destroy(event); } } - ut_free(lock_sys); - - lock_sys = NULL; + ut_free(waiting_threads); + m_initialised= false; } /*********************************************************************//** @@ -724,7 +717,7 @@ static void wsrep_assert_no_bf_bf_wait(const lock_t *lock, const trx_t *trx) Checks if a lock request for a new lock has to wait for request lock2. @return TRUE if new lock has to wait for lock2 to be removed */ UNIV_INLINE -ibool +bool lock_rec_has_to_wait( /*=================*/ bool for_locking, @@ -748,84 +741,83 @@ lock_rec_has_to_wait( ut_ad(lock_get_type_low(lock2) == LOCK_REC); ut_ad(lock_mutex_own()); - if (trx != lock2->trx - && !lock_mode_compatible(static_cast<lock_mode>( - LOCK_MODE_MASK & type_mode), - lock_get_mode(lock2))) { + if (trx == lock2->trx + || lock_mode_compatible( + static_cast<lock_mode>(LOCK_MODE_MASK & type_mode), + lock_get_mode(lock2))) { + return false; + } - /* We have somewhat complex rules when gap type record locks - cause waits */ + /* We have somewhat complex rules when gap type record locks + cause waits */ - if ((lock_is_on_supremum || (type_mode & LOCK_GAP)) - && !(type_mode & LOCK_INSERT_INTENTION)) { + if ((lock_is_on_supremum || (type_mode & LOCK_GAP)) + && !(type_mode & LOCK_INSERT_INTENTION)) { - /* Gap type locks without LOCK_INSERT_INTENTION flag - do not need to wait for anything. This is because - different users can have conflicting lock types - on gaps. */ + /* Gap type locks without LOCK_INSERT_INTENTION flag + do not need to wait for anything. This is because + different users can have conflicting lock types + on gaps. */ - return(FALSE); - } + return false; + } - if (!(type_mode & LOCK_INSERT_INTENTION) - && lock_rec_get_gap(lock2)) { + if (!(type_mode & LOCK_INSERT_INTENTION) && lock_rec_get_gap(lock2)) { - /* Record lock (LOCK_ORDINARY or LOCK_REC_NOT_GAP - does not need to wait for a gap type lock */ + /* Record lock (LOCK_ORDINARY or LOCK_REC_NOT_GAP + does not need to wait for a gap type lock */ - return(FALSE); - } + return false; + } - if ((type_mode & LOCK_GAP) - && lock_rec_get_rec_not_gap(lock2)) { + if ((type_mode & LOCK_GAP) && lock_rec_get_rec_not_gap(lock2)) { - /* Lock on gap does not need to wait for - a LOCK_REC_NOT_GAP type lock */ + /* Lock on gap does not need to wait for + a LOCK_REC_NOT_GAP type lock */ - return(FALSE); - } + return false; + } - if (lock_rec_get_insert_intention(lock2)) { + if (lock_rec_get_insert_intention(lock2)) { - /* No lock request needs to wait for an insert - intention lock to be removed. This is ok since our - rules allow conflicting locks on gaps. 
This eliminates - a spurious deadlock caused by a next-key lock waiting - for an insert intention lock; when the insert - intention lock was granted, the insert deadlocked on - the waiting next-key lock. + /* No lock request needs to wait for an insert + intention lock to be removed. This is ok since our + rules allow conflicting locks on gaps. This eliminates + a spurious deadlock caused by a next-key lock waiting + for an insert intention lock; when the insert + intention lock was granted, the insert deadlocked on + the waiting next-key lock. - Also, insert intention locks do not disturb each - other. */ + Also, insert intention locks do not disturb each + other. */ - return(FALSE); - } + return false; + } - if ((type_mode & LOCK_GAP || lock_rec_get_gap(lock2)) && - !thd_need_ordering_with(trx->mysql_thd, - lock2->trx->mysql_thd)) { - /* If the upper server layer has already decided on the - commit order between the transaction requesting the - lock and the transaction owning the lock, we do not - need to wait for gap locks. Such ordeering by the upper - server layer happens in parallel replication, where the - commit order is fixed to match the original order on the - master. - - Such gap locks are mainly needed to get serialisability - between transactions so that they will be binlogged in - the correct order so that statement-based replication - will give the correct results. Since the right order - was already determined on the master, we do not need - to enforce it again here. - - Skipping the locks is not essential for correctness, - since in case of deadlock we will just kill the later - transaction and retry it. But it can save some - unnecessary rollbacks and retries. */ - - return (FALSE); - } + if ((type_mode & LOCK_GAP || lock_rec_get_gap(lock2)) + && !thd_need_ordering_with(trx->mysql_thd, lock2->trx->mysql_thd)) { + /* If the upper server layer has already decided on the + commit order between the transaction requesting the + lock and the transaction owning the lock, we do not + need to wait for gap locks. Such ordeering by the upper + server layer happens in parallel replication, where the + commit order is fixed to match the original order on the + master. + + Such gap locks are mainly needed to get serialisability + between transactions so that they will be binlogged in + the correct order so that statement-based replication + will give the correct results. Since the right order + was already determined on the master, we do not need + to enforce it again here. + + Skipping the locks is not essential for correctness, + since in case of deadlock we will just kill the later + transaction and retry it. But it can save some + unnecessary rollbacks and retries. */ + + return false; + } #ifdef WITH_WSREP /* New lock request from a transaction is using unique key @@ -839,7 +831,7 @@ lock_rec_has_to_wait( lock_sys->mutex. */ if (trx->is_wsrep_UK_scan() && wsrep_thd_is_BF(lock2->trx->mysql_thd, false)) { - return (FALSE); + return false; } /* We very well can let bf to wait normally as other @@ -849,16 +841,13 @@ lock_rec_has_to_wait( ut_d(wsrep_assert_no_bf_bf_wait(lock2, trx)); #endif /* WITH_WSREP */ - return(TRUE); - } - - return(FALSE); + return true; } /*********************************************************************//** Checks if a lock request lock1 has to wait for request lock2. 
@return TRUE if lock1 has to wait for lock2 to be removed */ -ibool +bool lock_has_to_wait( /*=============*/ const lock_t* lock1, /*!< in: waiting lock */ @@ -869,32 +858,27 @@ lock_has_to_wait( { ut_ad(lock1 && lock2); - if (lock1->trx != lock2->trx - && !lock_mode_compatible(lock_get_mode(lock1), - lock_get_mode(lock2))) { - if (lock_get_type_low(lock1) == LOCK_REC) { - ut_ad(lock_get_type_low(lock2) == LOCK_REC); - - /* If this lock request is for a supremum record - then the second bit on the lock bitmap is set */ - - if (lock1->type_mode - & (LOCK_PREDICATE | LOCK_PRDT_PAGE)) { - return(lock_prdt_has_to_wait( - lock1->trx, lock1->type_mode, - lock_get_prdt_from_lock(lock1), - lock2)); - } else { - return(lock_rec_has_to_wait(false, - lock1->trx, lock1->type_mode, lock2, - lock_rec_get_nth_bit(lock1, true))); - } - } + if (lock1->trx == lock2->trx + || lock_mode_compatible(lock_get_mode(lock1), + lock_get_mode(lock2))) { + return false; + } + + if (lock_get_type_low(lock1) != LOCK_REC) { + return true; + } - return(TRUE); + ut_ad(lock_get_type_low(lock2) == LOCK_REC); + + if (lock1->type_mode & (LOCK_PREDICATE | LOCK_PRDT_PAGE)) { + return lock_prdt_has_to_wait(lock1->trx, lock1->type_mode, + lock_get_prdt_from_lock(lock1), + lock2); } - return(FALSE); + return lock_rec_has_to_wait( + false, lock1->trx, lock1->type_mode, lock2, + lock_rec_get_nth_bit(lock1, PAGE_HEAP_NO_SUPREMUM)); } /*============== RECORD LOCK BASIC FUNCTIONS ============================*/ @@ -933,7 +917,7 @@ lock_rec_expl_exist_on_page( lock_mutex_enter(); /* Only used in ibuf pages, so rec_hash is good enough */ - lock = lock_rec_get_first_on_page_addr(lock_sys->rec_hash, + lock = lock_rec_get_first_on_page_addr(lock_sys.rec_hash, space, page_no); lock_mutex_exit(); @@ -1051,7 +1035,7 @@ lock_rec_has_expl( || (precise_mode & LOCK_MODE_MASK) == LOCK_X); ut_ad(!(precise_mode & LOCK_INSERT_INTENTION)); - for (lock = lock_rec_get_first(lock_sys->rec_hash, block, heap_no); + for (lock = lock_rec_get_first(lock_sys.rec_hash, block, heap_no); lock != NULL; lock = lock_rec_get_next(heap_no, lock)) { @@ -1104,7 +1088,7 @@ lock_rec_other_has_expl_req( return(NULL); } - for (lock_t* lock = lock_rec_get_first(lock_sys->rec_hash, + for (lock_t* lock = lock_rec_get_first(lock_sys.rec_hash, block, heap_no); lock != NULL; lock = lock_rec_get_next(heap_no, lock)) { @@ -1180,7 +1164,7 @@ lock_rec_other_has_conflicting( bool is_supremum = (heap_no == PAGE_HEAP_NO_SUPREMUM); - for (lock = lock_rec_get_first(lock_sys->rec_hash, block, heap_no); + for (lock = lock_rec_get_first(lock_sys.rec_hash, block, heap_no); lock != NULL; lock = lock_rec_get_next(heap_no, lock)) { @@ -1213,6 +1197,7 @@ static trx_t* lock_sec_rec_some_has_impl( /*=======================*/ + trx_t* caller_trx,/*!<in/out: trx of current thread */ const rec_t* rec, /*!< in: user record */ dict_index_t* index, /*!< in: secondary index */ const rec_offs* offsets)/*!< in: rec_get_offsets(rec, index) */ @@ -1222,10 +1207,10 @@ lock_sec_rec_some_has_impl( const page_t* page = page_align(rec); ut_ad(!lock_mutex_own()); - ut_ad(!trx_sys_mutex_own()); ut_ad(!dict_index_is_clust(index)); ut_ad(page_rec_is_user_rec(rec)); ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(!rec_is_metadata(rec, index)); max_trx_id = page_get_max_trx_id(page); @@ -1233,7 +1218,7 @@ lock_sec_rec_some_has_impl( if the max trx id for the page >= min trx id for the trx list, or database recovery is running. 
*/ - if (max_trx_id < trx_rw_min_trx_id() && !recv_recovery_is_on()) { + if (max_trx_id < trx_sys.get_min_trx_id()) { trx = 0; @@ -1246,69 +1231,17 @@ lock_sec_rec_some_has_impl( x-lock. We have to look in the clustered index. */ } else { - trx = row_vers_impl_x_locked(rec, index, offsets); + trx = row_vers_impl_x_locked(caller_trx, rec, index, offsets); } return(trx); } -#ifdef UNIV_DEBUG -/*********************************************************************//** -Checks if some transaction, other than given trx_id, has an explicit -lock on the given rec, in the given precise_mode. -@return the transaction, whose id is not equal to trx_id, that has an -explicit lock on the given rec, in the given precise_mode or NULL.*/ -static -trx_t* -lock_rec_other_trx_holds_expl( -/*==========================*/ - ulint precise_mode, /*!< in: LOCK_S or LOCK_X - possibly ORed to LOCK_GAP or - LOCK_REC_NOT_GAP. */ - trx_t* trx, /*!< in: trx holding implicit - lock on rec */ - const rec_t* rec, /*!< in: user record */ - const buf_block_t* block) /*!< in: buffer block - containing the record */ -{ - trx_t* holds = NULL; - - lock_mutex_enter(); - mutex_enter(&trx_sys->mutex); - trx_mutex_enter(trx); - - ut_ad(!trx_state_eq(trx, TRX_STATE_NOT_STARTED)); - - if (!trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY)) { - const ulint heap_no = page_rec_get_heap_no(rec); - for (trx_t* t = UT_LIST_GET_FIRST(trx_sys->rw_trx_list); - t != NULL; - t = UT_LIST_GET_NEXT(trx_list, t)) { - - lock_t* expl_lock = lock_rec_has_expl( - precise_mode, block, heap_no, t); - if (expl_lock && expl_lock->trx != trx) { - /* An explicit lock is held by trx other than - the trx holding the implicit lock. */ - holds = expl_lock->trx; - break; - } - } - } - - lock_mutex_exit(); - mutex_exit(&trx_sys->mutex); - trx_mutex_exit(trx); - - return(holds); -} -#endif /* UNIV_DEBUG */ - /*********************************************************************//** Return approximate number or record locks (bits set in the bitmap) for this transaction. Since delete-marked records may be removed, the record count will not be precise. -The caller must be holding lock_sys->mutex. */ +The caller must be holding lock_sys.mutex. */ ulint lock_number_of_rows_locked( /*=======================*/ @@ -1321,7 +1254,7 @@ lock_number_of_rows_locked( /*********************************************************************//** Return the number of table locks for a transaction. -The caller must be holding lock_sys->mutex. */ +The caller must be holding lock_sys.mutex. */ ulint lock_number_of_tables_locked( /*=========================*/ @@ -1543,7 +1476,7 @@ lock_rec_create_low( && innodb_lock_schedule_algorithm == INNODB_LOCK_SCHEDULE_ALGORITHM_VATS && !thd_is_replication_slave_thread(trx->mysql_thd)) { - HASH_PREPEND(lock_t, hash, lock_sys->rec_hash, + HASH_PREPEND(lock_t, hash, lock_sys.rec_hash, lock_rec_fold(space, page_no), lock); } else { HASH_INSERT(lock_t, hash, lock_hash_get(type_mode), @@ -1663,7 +1596,7 @@ lock_queue_validate( hash_table_t* hash; hash_cell_t* cell; lock_t* next; - bool wait_lock = false; + bool wait_lock __attribute__((unused))= false; if (in_lock == NULL) { return true; @@ -1766,6 +1699,11 @@ lock_rec_enqueue_waiting( ut_ad(0); } + if (trx->mysql_thd && thd_lock_wait_timeout(trx->mysql_thd) == 0) { + trx->error_state = DB_LOCK_WAIT_TIMEOUT; + return DB_LOCK_WAIT_TIMEOUT; + } + /* Enqueue the lock request that will wait to be granted, note that we already own the trx mutex. 
*/ lock_t* lock = lock_rec_create( @@ -1816,7 +1754,7 @@ lock_rec_enqueue_waiting( == INNODB_LOCK_SCHEDULE_ALGORITHM_VATS && !prdt && !thd_is_replication_slave_thread(lock->trx->mysql_thd)) { - HASH_DELETE(lock_t, hash, lock_sys->rec_hash, + HASH_DELETE(lock_t, hash, lock_sys.rec_hash, lock_rec_lock_fold(lock), lock); dberr_t res = lock_rec_insert_by_trx_age(lock); if (res != DB_SUCCESS) { @@ -1946,166 +1884,6 @@ lock_rec_add_to_queue( } /*********************************************************************//** -This is a fast routine for locking a record in the most common cases: -there are no explicit locks on the page, or there is just one lock, owned -by this transaction, and of the right type_mode. This is a low-level function -which does NOT look at implicit locks! Checks lock compatibility within -explicit locks. This function sets a normal next-key lock, or in the case of -a page supremum record, a gap type lock. -@return whether the locking succeeded */ -UNIV_INLINE -lock_rec_req_status -lock_rec_lock_fast( -/*===============*/ - bool impl, /*!< in: if TRUE, no lock is set - if no wait is necessary: we - assume that the caller will - set an implicit lock */ - ulint mode, /*!< in: lock mode: LOCK_X or - LOCK_S possibly ORed to either - LOCK_GAP or LOCK_REC_NOT_GAP */ - const buf_block_t* block, /*!< in: buffer block containing - the record */ - ulint heap_no,/*!< in: heap number of record */ - dict_index_t* index, /*!< in: index of record */ - que_thr_t* thr) /*!< in: query thread */ -{ - ut_ad(lock_mutex_own()); - ut_ad(!srv_read_only_mode); - ut_ad((LOCK_MODE_MASK & mode) != LOCK_S - || lock_table_has(thr_get_trx(thr), index->table, LOCK_IS)); - ut_ad((LOCK_MODE_MASK & mode) != LOCK_X - || lock_table_has(thr_get_trx(thr), index->table, LOCK_IX) - || srv_read_only_mode); - ut_ad((LOCK_MODE_MASK & mode) == LOCK_S - || (LOCK_MODE_MASK & mode) == LOCK_X); - ut_ad(mode - (LOCK_MODE_MASK & mode) == LOCK_GAP - || mode - (LOCK_MODE_MASK & mode) == 0 - || mode - (LOCK_MODE_MASK & mode) == LOCK_REC_NOT_GAP); - ut_ad(dict_index_is_clust(index) || !dict_index_is_online_ddl(index)); - - DBUG_EXECUTE_IF("innodb_report_deadlock", return(LOCK_REC_FAIL);); - - lock_t* lock = lock_rec_get_first_on_page(lock_sys->rec_hash, block); - - trx_t* trx = thr_get_trx(thr); - - lock_rec_req_status status = LOCK_REC_SUCCESS; - - if (lock == NULL) { - if (!impl) { - /* Note that we don't own the trx mutex. */ - lock = lock_rec_create( -#ifdef WITH_WSREP - NULL, NULL, -#endif - mode, block, heap_no, index, trx, false); - } - - status = LOCK_REC_SUCCESS_CREATED; - } else { - trx_mutex_enter(trx); - - if (lock_rec_get_next_on_page(lock) - || lock->trx != trx - || lock->type_mode != (mode | LOCK_REC) - || lock_rec_get_n_bits(lock) <= heap_no) { - - status = LOCK_REC_FAIL; - } else if (!impl) { - /* If the nth bit of the record lock is already set - then we do not set a new lock bit, otherwise we do - set */ - if (!lock_rec_get_nth_bit(lock, heap_no)) { - lock_rec_set_nth_bit(lock, heap_no); - status = LOCK_REC_SUCCESS_CREATED; - } - } - - trx_mutex_exit(trx); - } - - return(status); -} - -/*********************************************************************//** -This is the general, and slower, routine for locking a record. This is a -low-level function which does NOT look at implicit locks! Checks lock -compatibility within explicit locks. This function sets a normal next-key -lock, or in the case of a page supremum record, a gap type lock. 
-@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, or DB_DEADLOCK */ -static -dberr_t -lock_rec_lock_slow( -/*===============*/ - ibool impl, /*!< in: if TRUE, no lock is set - if no wait is necessary: we - assume that the caller will - set an implicit lock */ - ulint mode, /*!< in: lock mode: LOCK_X or - LOCK_S possibly ORed to either - LOCK_GAP or LOCK_REC_NOT_GAP */ - const buf_block_t* block, /*!< in: buffer block containing - the record */ - ulint heap_no,/*!< in: heap number of record */ - dict_index_t* index, /*!< in: index of record */ - que_thr_t* thr) /*!< in: query thread */ -{ - ut_ad(lock_mutex_own()); - ut_ad(!srv_read_only_mode); - ut_ad((LOCK_MODE_MASK & mode) != LOCK_S - || lock_table_has(thr_get_trx(thr), index->table, LOCK_IS)); - ut_ad((LOCK_MODE_MASK & mode) != LOCK_X - || lock_table_has(thr_get_trx(thr), index->table, LOCK_IX)); - ut_ad((LOCK_MODE_MASK & mode) == LOCK_S - || (LOCK_MODE_MASK & mode) == LOCK_X); - ut_ad(mode - (LOCK_MODE_MASK & mode) == LOCK_GAP - || mode - (LOCK_MODE_MASK & mode) == 0 - || mode - (LOCK_MODE_MASK & mode) == LOCK_REC_NOT_GAP); - ut_ad(dict_index_is_clust(index) || !dict_index_is_online_ddl(index)); - - DBUG_EXECUTE_IF("innodb_report_deadlock", return(DB_DEADLOCK);); - - dberr_t err; - trx_t* trx = thr_get_trx(thr); - - trx_mutex_enter(trx); - - if (lock_rec_has_expl(mode, block, heap_no, trx)) { - /* The trx already has a strong enough lock: do nothing */ - err = DB_SUCCESS; - } else if ( -#ifdef WITH_WSREP - lock_t* c_lock = -#endif /* WITH_WSREP */ - lock_rec_other_has_conflicting( - static_cast<enum lock_mode>(mode), - block, heap_no, trx)) { - /* If another transaction has a non-gap conflicting - request in the queue, as this transaction does not - have a lock strong enough already granted on the - record, we have to wait. */ - err = lock_rec_enqueue_waiting( -#ifdef WITH_WSREP - c_lock, -#endif /* WITH_WSREP */ - mode, block, heap_no, index, thr, NULL); - } else if (!impl) { - /* Set the requested lock on the record, note that - we already own the transaction mutex. */ - lock_rec_add_to_queue( - LOCK_REC | mode, block, heap_no, index, trx, TRUE); - err = DB_SUCCESS_LOCKED_REC; - } else { - err = DB_SUCCESS; - } - - trx_mutex_exit(trx); - - return(err); -} - -/*********************************************************************//** Tries to lock the specified record in the mode requested. If not immediately possible, enqueues a waiting lock request. This is a low-level function which does NOT look at implicit locks! 
Checks lock compatibility within @@ -2129,33 +1907,93 @@ lock_rec_lock( dict_index_t* index, /*!< in: index of record */ que_thr_t* thr) /*!< in: query thread */ { - ut_ad(lock_mutex_own()); - ut_ad(!srv_read_only_mode); - ut_ad((LOCK_MODE_MASK & mode) != LOCK_S - || lock_table_has(thr_get_trx(thr), index->table, LOCK_IS)); - ut_ad((LOCK_MODE_MASK & mode) != LOCK_X - || lock_table_has(thr_get_trx(thr), index->table, LOCK_IX)); - ut_ad((LOCK_MODE_MASK & mode) == LOCK_S - || (LOCK_MODE_MASK & mode) == LOCK_X); - ut_ad(mode - (LOCK_MODE_MASK & mode) == LOCK_GAP - || mode - (LOCK_MODE_MASK & mode) == LOCK_REC_NOT_GAP - || mode - (LOCK_MODE_MASK & mode) == 0); - ut_ad(dict_index_is_clust(index) || !dict_index_is_online_ddl(index)); - - /* We try a simplified and faster subroutine for the most - common cases */ - switch (lock_rec_lock_fast(impl, mode, block, heap_no, index, thr)) { - case LOCK_REC_SUCCESS: - return(DB_SUCCESS); - case LOCK_REC_SUCCESS_CREATED: - return(DB_SUCCESS_LOCKED_REC); - case LOCK_REC_FAIL: - return(lock_rec_lock_slow(impl, mode, block, - heap_no, index, thr)); - } + trx_t *trx= thr_get_trx(thr); + dberr_t err= DB_SUCCESS; + + ut_ad(!srv_read_only_mode); + ut_ad((LOCK_MODE_MASK & mode) == LOCK_S || + (LOCK_MODE_MASK & mode) == LOCK_X); + ut_ad((mode & LOCK_TYPE_MASK) == LOCK_GAP || + (mode & LOCK_TYPE_MASK) == LOCK_REC_NOT_GAP || + (mode & LOCK_TYPE_MASK) == 0); + ut_ad(dict_index_is_clust(index) || !dict_index_is_online_ddl(index)); + DBUG_EXECUTE_IF("innodb_report_deadlock", return DB_DEADLOCK;); + + lock_mutex_enter(); + ut_ad((LOCK_MODE_MASK & mode) != LOCK_S || + lock_table_has(trx, index->table, LOCK_IS)); + ut_ad((LOCK_MODE_MASK & mode) != LOCK_X || + lock_table_has(trx, index->table, LOCK_IX)); + + if (lock_t *lock= lock_rec_get_first_on_page(lock_sys.rec_hash, block)) + { + trx_mutex_enter(trx); + if (lock_rec_get_next_on_page(lock) || + lock->trx != trx || + lock->type_mode != (ulint(mode) | LOCK_REC) || + lock_rec_get_n_bits(lock) <= heap_no) + { + /* Do nothing if the trx already has a strong enough lock on rec */ + if (!lock_rec_has_expl(mode, block, heap_no, trx)) + { + if ( +#ifdef WITH_WSREP + lock_t *c_lock= +#endif + lock_rec_other_has_conflicting(mode, block, heap_no, trx)) + { + /* + If another transaction has a non-gap conflicting + request in the queue, as this transaction does not + have a lock strong enough already granted on the + record, we have to wait. */ + err = lock_rec_enqueue_waiting( +#ifdef WITH_WSREP + c_lock, +#endif /* WITH_WSREP */ + mode, block, heap_no, index, thr, NULL); + } + else if (!impl) + { + /* Set the requested lock on the record. */ + lock_rec_add_to_queue(LOCK_REC | mode, block, heap_no, index, trx, + true); + err= DB_SUCCESS_LOCKED_REC; + } + } + } + else if (!impl) + { + /* + If the nth bit of the record lock is already set then we do not set + a new lock bit, otherwise we do set + */ + if (!lock_rec_get_nth_bit(lock, heap_no)) + { + lock_rec_set_nth_bit(lock, heap_no); + err= DB_SUCCESS_LOCKED_REC; + } + } + trx_mutex_exit(trx); + } + else + { + /* + Simplified and faster path for the most common cases + Note that we don't own the trx mutex. 
+ */ + if (!impl) + lock_rec_create( +#ifdef WITH_WSREP + NULL, NULL, +#endif + mode, block, heap_no, index, trx, false); - ut_error; - return(DB_ERROR); + err= DB_SUCCESS_LOCKED_REC; + } + lock_mutex_exit(); + MONITOR_ATOMIC_INC(MONITOR_NUM_RECLOCK_REQ); + return err; } /*********************************************************************//** @@ -2293,8 +2131,8 @@ lock_grant_and_move_on_page(ulint rec_fold, ulint space, ulint page_no) { lock_t* lock; lock_t* previous = static_cast<lock_t*>( - hash_get_nth_cell(lock_sys->rec_hash, - hash_calc_hash(rec_fold, lock_sys->rec_hash)) + hash_get_nth_cell(lock_sys.rec_hash, + hash_calc_hash(rec_fold, lock_sys.rec_hash)) ->node); if (previous == NULL) { return; @@ -2372,7 +2210,7 @@ static void lock_rec_dequeue_from_page(lock_t* in_lock) if (innodb_lock_schedule_algorithm == INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS - || lock_hash != lock_sys->rec_hash + || lock_hash != lock_sys.rec_hash || thd_is_replication_slave_thread(in_lock->trx->mysql_thd)) { /* Check if waiting locks in the queue can now be granted: grant locks if there are no conflicting locks ahead. Stop at @@ -2477,11 +2315,11 @@ lock_rec_free_all_from_discard_page( page_no = block->page.id.page_no(); lock_rec_free_all_from_discard_page_low( - space, page_no, lock_sys->rec_hash); + space, page_no, lock_sys.rec_hash); lock_rec_free_all_from_discard_page_low( - space, page_no, lock_sys->prdt_hash); + space, page_no, lock_sys.prdt_hash); lock_rec_free_all_from_discard_page_low( - space, page_no, lock_sys->prdt_page_hash); + space, page_no, lock_sys.prdt_page_hash); } /*============= RECORD LOCK MOVING AND INHERITING ===================*/ @@ -2526,12 +2364,12 @@ lock_rec_reset_and_release_wait( ulint heap_no)/*!< in: heap number of record */ { lock_rec_reset_and_release_wait_low( - lock_sys->rec_hash, block, heap_no); + lock_sys.rec_hash, block, heap_no); lock_rec_reset_and_release_wait_low( - lock_sys->prdt_hash, block, PAGE_HEAP_NO_INFIMUM); + lock_sys.prdt_hash, block, PAGE_HEAP_NO_INFIMUM); lock_rec_reset_and_release_wait_low( - lock_sys->prdt_page_hash, block, PAGE_HEAP_NO_INFIMUM); + lock_sys.prdt_page_hash, block, PAGE_HEAP_NO_INFIMUM); } /*************************************************************//** @@ -2564,7 +2402,7 @@ lock_rec_inherit_to_gap( DO want S-locks/X-locks(taken for replace) set by a consistency constraint to be inherited also then. */ - for (lock = lock_rec_get_first(lock_sys->rec_hash, block, heap_no); + for (lock = lock_rec_get_first(lock_sys.rec_hash, block, heap_no); lock != NULL; lock = lock_rec_get_next(heap_no, lock)) { @@ -2575,7 +2413,8 @@ lock_rec_inherit_to_gap( && lock_get_mode(lock) == (lock->trx->duplicates ? 
LOCK_S : LOCK_X))) { lock_rec_add_to_queue( - LOCK_REC | LOCK_GAP | lock_get_mode(lock), + LOCK_REC | LOCK_GAP + | ulint(lock_get_mode(lock)), heir_block, heir_heap_no, lock->index, lock->trx, FALSE); } @@ -2602,7 +2441,7 @@ lock_rec_inherit_to_gap_if_gap_lock( lock_mutex_enter(); - for (lock = lock_rec_get_first(lock_sys->rec_hash, block, heap_no); + for (lock = lock_rec_get_first(lock_sys.rec_hash, block, heap_no); lock != NULL; lock = lock_rec_get_next(heap_no, lock)) { @@ -2611,7 +2450,8 @@ lock_rec_inherit_to_gap_if_gap_lock( || !lock_rec_get_rec_not_gap(lock))) { lock_rec_add_to_queue( - LOCK_REC | LOCK_GAP | lock_get_mode(lock), + LOCK_REC | LOCK_GAP + | ulint(lock_get_mode(lock)), block, heir_heap_no, lock->index, lock->trx, FALSE); } @@ -2646,8 +2486,8 @@ lock_rec_move_low( /* If the lock is predicate lock, it resides on INFIMUM record */ ut_ad(lock_rec_get_first( lock_hash, receiver, receiver_heap_no) == NULL - || lock_hash == lock_sys->prdt_hash - || lock_hash == lock_sys->prdt_page_hash); + || lock_hash == lock_sys.prdt_hash + || lock_hash == lock_sys.prdt_page_hash); for (lock = lock_rec_get_first(lock_hash, donator, donator_heap_no); @@ -2670,7 +2510,7 @@ lock_rec_move_low( lock->index, lock->trx, FALSE); } - ut_ad(lock_rec_get_first(lock_sys->rec_hash, + ut_ad(lock_rec_get_first(lock_sys.rec_hash, donator, donator_heap_no) == NULL); } @@ -2725,7 +2565,7 @@ lock_rec_move( ulint donator_heap_no)/*!< in: heap_no of the record which gives the locks */ { - lock_rec_move_low(lock_sys->rec_hash, receiver, donator, + lock_rec_move_low(lock_sys.rec_hash, receiver, donator, receiver_heap_no, donator_heap_no); } @@ -2750,7 +2590,7 @@ lock_move_reorganize_page( lock_mutex_enter(); /* FIXME: This needs to deal with predicate lock too */ - lock = lock_rec_get_first_on_page(lock_sys->rec_hash, block); + lock = lock_rec_get_first_on_page(lock_sys.rec_hash, block); if (lock == NULL) { lock_mutex_exit(); @@ -2807,6 +2647,9 @@ lock_move_reorganize_page( for (;;) { ulint old_heap_no; ulint new_heap_no; + ut_d(const rec_t* const orec = rec1); + ut_ad(page_rec_is_metadata(rec1) + == page_rec_is_metadata(rec2)); if (comp) { old_heap_no = rec_get_heap_no_new(rec2); @@ -2827,6 +2670,8 @@ lock_move_reorganize_page( /* Clear the bit in old_lock. */ if (old_heap_no < lock->un_member.rec_lock.n_bits && lock_rec_reset_nth_bit(lock, old_heap_no)) { + ut_ad(!page_rec_is_metadata(orec)); + /* NOTE that the old lock bitmap could be too small for the new heap number! */ @@ -2878,7 +2723,7 @@ lock_move_rec_list_end( table to the end of the hash chain, and lock_rec_add_to_queue does not reuse locks if there are waiters in the queue. 
*/ - for (lock = lock_rec_get_first_on_page(lock_sys->rec_hash, block); lock; + for (lock = lock_rec_get_first_on_page(lock_sys.rec_hash, block); lock; lock = lock_rec_get_next_on_page(lock)) { const rec_t* rec1 = rec; const rec_t* rec2; @@ -2906,6 +2751,10 @@ lock_move_rec_list_end( reset the lock bits on the old */ for (;;) { + ut_ad(page_rec_is_metadata(rec1) + == page_rec_is_metadata(rec2)); + ut_d(const rec_t* const orec = rec1); + ulint rec1_heap_no; ulint rec2_heap_no; @@ -2928,8 +2777,11 @@ lock_move_rec_list_end( rec2_heap_no = rec_get_heap_no_old(rec2); + ut_ad(rec_get_data_size_old(rec1) + == rec_get_data_size_old(rec2)); + ut_ad(!memcmp(rec1, rec2, - rec_get_data_size_old(rec2))); + rec_get_data_size_old(rec1))); rec1 = page_rec_get_next_low(rec1, FALSE); rec2 = page_rec_get_next_low(rec2, FALSE); @@ -2937,6 +2789,8 @@ lock_move_rec_list_end( if (rec1_heap_no < lock->un_member.rec_lock.n_bits && lock_rec_reset_nth_bit(lock, rec1_heap_no)) { + ut_ad(!page_rec_is_metadata(orec)); + if (type_mode & LOCK_WAIT) { lock_reset_lock_and_trx_wait(lock); } @@ -2980,10 +2834,11 @@ lock_move_rec_list_start( ut_ad(block->frame == page_align(rec)); ut_ad(new_block->frame == page_align(old_end)); ut_ad(comp == page_rec_is_comp(old_end)); + ut_ad(!page_rec_is_metadata(rec)); lock_mutex_enter(); - for (lock = lock_rec_get_first_on_page(lock_sys->rec_hash, block); lock; + for (lock = lock_rec_get_first_on_page(lock_sys.rec_hash, block); lock; lock = lock_rec_get_next_on_page(lock)) { const rec_t* rec1; const rec_t* rec2; @@ -3005,6 +2860,10 @@ lock_move_rec_list_start( reset the lock bits on the old */ while (rec1 != rec) { + ut_ad(page_rec_is_metadata(rec1) + == page_rec_is_metadata(rec2)); + ut_d(const rec_t* const prev = rec1); + ulint rec1_heap_no; ulint rec2_heap_no; @@ -3027,6 +2886,8 @@ lock_move_rec_list_start( if (rec1_heap_no < lock->un_member.rec_lock.n_bits && lock_rec_reset_nth_bit(lock, rec1_heap_no)) { + ut_ad(!page_rec_is_metadata(prev)); + if (type_mode & LOCK_WAIT) { lock_reset_lock_and_trx_wait(lock); } @@ -3089,7 +2950,7 @@ lock_rtr_move_rec_list( lock_mutex_enter(); - for (lock = lock_rec_get_first_on_page(lock_sys->rec_hash, block); lock; + for (lock = lock_rec_get_first_on_page(lock_sys.rec_hash, block); lock; lock = lock_rec_get_next_on_page(lock)) { ulint moved = 0; const rec_t* rec1; @@ -3105,6 +2966,8 @@ lock_rtr_move_rec_list( rec1 = rec_move[moved].old_rec; rec2 = rec_move[moved].new_rec; + ut_ad(!page_rec_is_metadata(rec1)); + ut_ad(!page_rec_is_metadata(rec2)); if (comp) { rec1_heap_no = rec_get_heap_no_new(rec1); @@ -3183,6 +3046,8 @@ lock_update_merge_right( page which will be discarded */ { + ut_ad(!page_rec_is_metadata(orig_succ)); + lock_mutex_enter(); /* Inherit the locks from the supremum of the left page to the @@ -3197,21 +3062,17 @@ lock_update_merge_right( waiting transactions */ lock_rec_reset_and_release_wait_low( - lock_sys->rec_hash, left_block, PAGE_HEAP_NO_SUPREMUM); + lock_sys.rec_hash, left_block, PAGE_HEAP_NO_SUPREMUM); -#ifdef UNIV_DEBUG /* there should exist no page lock on the left page, otherwise, it will be blocked from merge */ - ulint space = left_block->page.id.space(); - ulint page_no = left_block->page.id.page_no(); - ut_ad(lock_rec_get_first_on_page_addr( - lock_sys->prdt_page_hash, space, page_no) == NULL); -#endif /* UNIV_DEBUG */ + ut_ad(!lock_rec_get_first_on_page_addr(lock_sys.prdt_page_hash, + left_block->page.id.space(), + left_block->page.id.page_no())); lock_rec_free_all_from_discard_page(left_block); lock_mutex_exit(); - } 
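Most of the lock0lock.cc hunks in this change follow one mechanical pattern, visible in lock_sys_t::create()/resize()/close() earlier and in the lock-moving functions above: the global "lock_sys_t* lock_sys" pointer becomes a statically allocated lock_sys object, so every lock_sys->rec_hash access turns into lock_sys.rec_hash. A simplified sketch of that pattern, with a hypothetical type standing in for the real lock_sys_t:

#include <cstddef>

/* Hypothetical demo type, not the real lock_sys_t: a singleton whose
lifetime is controlled by explicit create()/close() calls guarded by
an m_initialised flag instead of heap allocation behind a pointer. */
struct demo_sys_t
{
	bool		m_initialised = false;
	int*		rec_hash = nullptr;	/* stands in for hash_create(n_cells) */

	void create(std::size_t n_cells)
	{
		m_initialised = true;
		rec_hash = new int[n_cells]();
	}

	void close()
	{
		if (!m_initialised) return;	/* safe even if create() never ran */
		delete[] rec_hash;
		rec_hash = nullptr;
		m_initialised = false;
	}
};

/* One statically constructed instance replaces the global pointer. */
demo_sys_t demo_sys;

One plausible motivation is that the singleton no longer needs a startup heap allocation or NULL checks, and the m_initialised guard makes shutdown safe even when startup was never completed, which matches the "if (!m_initialised) return;" check in lock_sys_t::close() in the hunk above.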
/*************************************************************//** @@ -3315,7 +3176,7 @@ lock_update_merge_left( releasing waiting transactions */ lock_rec_reset_and_release_wait_low( - lock_sys->rec_hash, left_block, PAGE_HEAP_NO_SUPREMUM); + lock_sys.rec_hash, left_block, PAGE_HEAP_NO_SUPREMUM); } /* Move the locks from the supremum of right page to the supremum @@ -3324,15 +3185,12 @@ lock_update_merge_left( lock_rec_move(left_block, right_block, PAGE_HEAP_NO_SUPREMUM, PAGE_HEAP_NO_SUPREMUM); -#ifdef UNIV_DEBUG /* there should exist no page lock on the right page, otherwise, it will be blocked from merge */ - ulint space = right_block->page.id.space(); - ulint page_no = right_block->page.id.page_no(); - lock_t* lock_test = lock_rec_get_first_on_page_addr( - lock_sys->prdt_page_hash, space, page_no); - ut_ad(!lock_test); -#endif /* UNIV_DEBUG */ + ut_ad(!lock_rec_get_first_on_page_addr( + lock_sys.prdt_page_hash, + right_block->page.id.space(), + right_block->page.id.page_no())); lock_rec_free_all_from_discard_page(right_block); @@ -3383,9 +3241,9 @@ lock_update_discard( lock_mutex_enter(); - if (lock_rec_get_first_on_page(lock_sys->rec_hash, block)) { - ut_ad(!lock_rec_get_first_on_page(lock_sys->prdt_hash, block)); - ut_ad(!lock_rec_get_first_on_page(lock_sys->prdt_page_hash, + if (lock_rec_get_first_on_page(lock_sys.rec_hash, block)) { + ut_ad(!lock_rec_get_first_on_page(lock_sys.prdt_hash, block)); + ut_ad(!lock_rec_get_first_on_page(lock_sys.prdt_page_hash, block)); /* Inherit all the locks on the page to the record and reset all the locks on the page */ @@ -3422,14 +3280,14 @@ lock_update_discard( lock_rec_free_all_from_discard_page_low( block->page.id.space(), block->page.id.page_no(), - lock_sys->rec_hash); + lock_sys.rec_hash); } else { lock_rec_free_all_from_discard_page_low( block->page.id.space(), block->page.id.page_no(), - lock_sys->prdt_hash); + lock_sys.prdt_hash); lock_rec_free_all_from_discard_page_low( block->page.id.space(), block->page.id.page_no(), - lock_sys->prdt_page_hash); + lock_sys.prdt_page_hash); } lock_mutex_exit(); @@ -3447,6 +3305,7 @@ lock_update_insert( ulint donator_heap_no; ut_ad(block->frame == page_align(rec)); + ut_ad(!page_rec_is_metadata(rec)); /* Inherit the gap-locking locks for rec, in gap mode, from the next record */ @@ -3478,6 +3337,7 @@ lock_update_delete( ulint next_heap_no; ut_ad(page == page_align(rec)); + ut_ad(!page_rec_is_metadata(rec)); if (page_is_comp(page)) { heap_no = rec_get_heap_no_new(rec); @@ -3854,7 +3714,7 @@ lock_table_enqueue_waiting( #endif /* WITH_WSREP */ /* Enqueue the lock request that will wait to be granted */ - lock = lock_table_create(table, mode | LOCK_WAIT, trx + lock = lock_table_create(table, ulint(mode) | LOCK_WAIT, trx #ifdef WITH_WSREP , c_lock #endif @@ -3965,7 +3825,7 @@ lock_table( locking overhead */ if ((flags & BTR_NO_LOCKING_FLAG) || srv_read_only_mode - || dict_table_is_temporary(table)) { + || table->is_temporary()) { return(DB_SUCCESS); } @@ -4013,13 +3873,14 @@ lock_table( mode: this trx may have to wait */ if (wait_for != NULL) { - err = lock_table_enqueue_waiting(mode | flags, table, thr + err = lock_table_enqueue_waiting(ulint(mode) | flags, table, + thr #ifdef WITH_WSREP , wait_for #endif ); } else { - lock_table_create(table, mode | flags, trx); + lock_table_create(table, ulint(mode) | flags, trx); ut_a(!flags || mode == LOCK_S || mode == LOCK_X); @@ -4265,13 +4126,14 @@ lock_rec_unlock( ut_ad(block->frame == page_align(rec)); ut_ad(!trx->lock.wait_lock); ut_ad(trx_state_eq(trx, 
TRX_STATE_ACTIVE)); + ut_ad(!page_rec_is_metadata(rec)); heap_no = page_rec_get_heap_no(rec); lock_mutex_enter(); trx_mutex_enter(trx); - first_lock = lock_rec_get_first(lock_sys->rec_hash, block, heap_no); + first_lock = lock_rec_get_first(lock_sys.rec_hash, block, heap_no); /* Find the last lock with the same lock_mode and transaction on the record. */ @@ -4322,7 +4184,7 @@ released: } } } else { - lock_grant_and_move_on_rec(lock_sys->rec_hash, first_lock, heap_no); + lock_grant_and_move_on_rec(lock_sys.rec_hash, first_lock, heap_no); } lock_mutex_exit(); @@ -4370,23 +4232,30 @@ lock_check_dict_lock( } #endif /* UNIV_DEBUG */ -/*********************************************************************//** -Releases transaction locks, and releases possible other transactions waiting -because of these locks. */ -static -void -lock_release( -/*=========*/ - trx_t* trx) /*!< in/out: transaction */ +/** Release the explicit locks of a committing transaction, +and release possible other transactions waiting because of these locks. */ +void lock_release(trx_t* trx) { - lock_t* lock; +#ifdef UNIV_DEBUG + std::set<table_id_t> to_evict; + if (innodb_evict_tables_on_commit_debug && !trx->is_recovered) +# if 1 /* if dict_stats_exec_sql() were not playing dirty tricks */ + if (!mutex_own(&dict_sys->mutex)) +# else /* this would be more proper way to do it */ + if (!trx->dict_operation_lock_mode && !trx->dict_operation) +# endif + for (trx_mod_tables_t::const_iterator it= trx->mod_tables.begin(); + it != trx->mod_tables.end(); ++it) + if (!it->first->is_temporary()) + to_evict.insert(it->first->id); +#endif ulint count = 0; - trx_id_t max_trx_id = trx_sys_get_max_trx_id(); + trx_id_t max_trx_id = trx_sys.get_max_trx_id(); - ut_ad(lock_mutex_own()); + lock_mutex_enter(); ut_ad(!trx_mutex_own(trx)); - for (lock = UT_LIST_GET_LAST(trx->lock.trx_locks); + for (lock_t* lock = UT_LIST_GET_LAST(trx->lock.trx_locks); lock != NULL; lock = UT_LIST_GET_LAST(trx->lock.trx_locks)) { @@ -4426,6 +4295,28 @@ lock_release( ++count; } + + lock_mutex_exit(); + +#ifdef UNIV_DEBUG + if (to_evict.empty()) { + return; + } + mutex_enter(&dict_sys->mutex); + lock_mutex_enter(); + for (std::set<table_id_t>::const_iterator i = to_evict.begin(); + i != to_evict.end(); ++i) { + if (dict_table_t *table = dict_table_open_on_id( + *i, TRUE, DICT_TABLE_OP_OPEN_ONLY_IF_CACHED)) { + if (!table->get_ref_count() + && !UT_LIST_GET_LEN(table->locks)) { + dict_table_remove_from_cache_low(table, true); + } + } + } + lock_mutex_exit(); + mutex_exit(&dict_sys->mutex); +#endif } /* True if a lock mode is S or X */ @@ -4479,201 +4370,14 @@ lock_trx_table_locks_remove( ut_error; } -/*********************************************************************//** -Removes locks of a transaction on a table to be dropped. -If remove_also_table_sx_locks is TRUE then table-level S and X locks are -also removed in addition to other table-level and record-level locks. -No lock that is going to be removed is allowed to be a wait lock. 
*/ -static -void -lock_remove_all_on_table_for_trx( -/*=============================*/ - dict_table_t* table, /*!< in: table to be dropped */ - trx_t* trx, /*!< in: a transaction */ - ibool remove_also_table_sx_locks)/*!< in: also removes - table S and X locks */ -{ - lock_t* lock; - lock_t* prev_lock; - - ut_ad(lock_mutex_own()); - - for (lock = UT_LIST_GET_LAST(trx->lock.trx_locks); - lock != NULL; - lock = prev_lock) { - - prev_lock = UT_LIST_GET_PREV(trx_locks, lock); - - if (lock_get_type_low(lock) == LOCK_REC - && lock->index->table == table) { - ut_a(!lock_get_wait(lock)); - - lock_rec_discard(lock); - } else if (lock_get_type_low(lock) & LOCK_TABLE - && lock->un_member.tab_lock.table == table - && (remove_also_table_sx_locks - || !IS_LOCK_S_OR_X(lock))) { - - ut_a(!lock_get_wait(lock)); - - lock_trx_table_locks_remove(lock); - lock_table_remove_low(lock); - } - } -} - -/*******************************************************************//** -Remove any explicit record locks held by recovering transactions on -the table. -@return number of recovered transactions examined */ -static -ulint -lock_remove_recovered_trx_record_locks( -/*===================================*/ - dict_table_t* table) /*!< in: check if there are any locks - held on records in this table or on the - table itself */ -{ - ut_a(table != NULL); - ut_ad(lock_mutex_own()); - - ulint n_recovered_trx = 0; - - mutex_enter(&trx_sys->mutex); - - for (trx_t* trx = UT_LIST_GET_FIRST(trx_sys->rw_trx_list); - trx != NULL; - trx = UT_LIST_GET_NEXT(trx_list, trx)) { - - assert_trx_in_rw_list(trx); - - if (!trx->is_recovered) { - continue; - } - - /* Because we are holding the lock_sys->mutex, - implicit locks cannot be converted to explicit ones - while we are scanning the explicit locks. */ - - lock_t* next_lock; - - for (lock_t* lock = UT_LIST_GET_FIRST(trx->lock.trx_locks); - lock != NULL; - lock = next_lock) { - - ut_a(lock->trx == trx); - - /* Recovered transactions can't wait on a lock. */ - - ut_a(!lock_get_wait(lock)); - - next_lock = UT_LIST_GET_NEXT(trx_locks, lock); - - switch (lock_get_type_low(lock)) { - default: - ut_error; - case LOCK_TABLE: - if (lock->un_member.tab_lock.table == table) { - lock_trx_table_locks_remove(lock); - lock_table_remove_low(lock); - } - break; - case LOCK_REC: - if (lock->index->table == table) { - lock_rec_discard(lock); - } - } - } - - ++n_recovered_trx; - } - - mutex_exit(&trx_sys->mutex); - - return(n_recovered_trx); -} - -/*********************************************************************//** -Removes locks on a table to be dropped or discarded. -If remove_also_table_sx_locks is TRUE then table-level S and X locks are -also removed in addition to other table-level and record-level locks. -No lock, that is going to be removed, is allowed to be a wait lock. */ -void -lock_remove_all_on_table( -/*=====================*/ - dict_table_t* table, /*!< in: table to be dropped - or discarded */ - ibool remove_also_table_sx_locks)/*!< in: also removes - table S and X locks */ -{ - lock_t* lock; - - lock_mutex_enter(); - - for (lock = UT_LIST_GET_FIRST(table->locks); - lock != NULL; - /* No op */) { - - lock_t* prev_lock; - - prev_lock = UT_LIST_GET_PREV(un_member.tab_lock.locks, lock); - - /* If we should remove all locks (remove_also_table_sx_locks - is TRUE), or if the lock is not table-level S or X lock, - then check we are not going to remove a wait lock. 
*/ - if (remove_also_table_sx_locks - || !(lock_get_type(lock) == LOCK_TABLE - && IS_LOCK_S_OR_X(lock))) { - - ut_a(!lock_get_wait(lock)); - } - - lock_remove_all_on_table_for_trx( - table, lock->trx, remove_also_table_sx_locks); - - if (prev_lock == NULL) { - if (lock == UT_LIST_GET_FIRST(table->locks)) { - /* lock was not removed, pick its successor */ - lock = UT_LIST_GET_NEXT( - un_member.tab_lock.locks, lock); - } else { - /* lock was removed, pick the first one */ - lock = UT_LIST_GET_FIRST(table->locks); - } - } else if (UT_LIST_GET_NEXT(un_member.tab_lock.locks, - prev_lock) != lock) { - /* If lock was removed by - lock_remove_all_on_table_for_trx() then pick the - successor of prev_lock ... */ - lock = UT_LIST_GET_NEXT( - un_member.tab_lock.locks, prev_lock); - } else { - /* ... otherwise pick the successor of lock. */ - lock = UT_LIST_GET_NEXT( - un_member.tab_lock.locks, lock); - } - } - - /* Note: Recovered transactions don't have table level IX or IS locks - but can have implicit record locks that have been converted to explicit - record locks. Such record locks cannot be freed by traversing the - transaction lock list in dict_table_t (as above). */ - - if (!lock_sys->rollback_complete - && lock_remove_recovered_trx_record_locks(table) == 0) { - - lock_sys->rollback_complete = TRUE; - } - - lock_mutex_exit(); -} - /*===================== VALIDATION AND DEBUGGING ====================*/ -/** Pretty-print a table lock. +/** Print info of a table lock. @param[in,out] file output stream @param[in] lock table lock */ -static void lock_table_print(FILE* file, const lock_t* lock) +static +void +lock_table_print(FILE* file, const lock_t* lock) { ut_ad(lock_mutex_own()); ut_a(lock_get_type_low(lock) == LOCK_TABLE); @@ -4727,7 +4431,7 @@ static void lock_rec_print(FILE* file, const lock_t* lock, mtr_t& mtr) (ulong) space, (ulong) page_no, (ulong) lock_rec_get_n_bits(lock), lock->index->name()); - ut_print_name(file, lock->trx, lock->index->table_name); + ut_print_name(file, lock->trx, lock->index->table->name.m_name); fprintf(file, " trx id " TRX_ID_FMT, trx_get_id_for_print(lock->trx)); if (lock_get_mode(lock) == LOCK_S) { @@ -4779,9 +4483,11 @@ static void lock_rec_print(FILE* file, const lock_t* lock, mtr_t& mtr) rec = page_find_rec_with_heap_no( buf_block_get_frame(block), i); + ut_ad(!page_rec_is_metadata(rec)); offsets = rec_get_offsets( - rec, lock->index, offsets, true, + rec, lock->index, offsets, + lock->index->n_core_fields, ULINT_UNDEFINED, &heap); putc(' ', file); @@ -4819,11 +4525,11 @@ lock_get_n_rec_locks(void) ut_ad(lock_mutex_own()); - for (i = 0; i < hash_get_n_cells(lock_sys->rec_hash); i++) { + for (i = 0; i < hash_get_n_cells(lock_sys.rec_hash); i++) { const lock_t* lock; for (lock = static_cast<const lock_t*>( - HASH_GET_FIRST(lock_sys->rec_hash, i)); + HASH_GET_FIRST(lock_sys.rec_hash, i)); lock != 0; lock = static_cast<const lock_t*>( HASH_GET_NEXT(hash, lock))) { @@ -4872,49 +4578,19 @@ lock_print_info_summary( "------------\n", file); fprintf(file, "Trx id counter " TRX_ID_FMT "\n", - trx_sys_get_max_trx_id()); + trx_sys.get_max_trx_id()); fprintf(file, "Purge done for trx's n:o < " TRX_ID_FMT - " undo n:o < " TRX_ID_FMT " state: ", - purge_sys->iter.trx_no, - purge_sys->iter.undo_no); - - /* Note: We are reading the state without the latch. One because it - will violate the latching order and two because we are merely querying - the state of the variable for display. 
*/ - - switch (purge_sys->state){ - case PURGE_STATE_INIT: - /* Should never be in this state while the system is running. */ - ut_error; - - case PURGE_STATE_EXIT: - fprintf(file, "exited"); - break; - - case PURGE_STATE_DISABLED: - fprintf(file, "disabled"); - break; - - case PURGE_STATE_RUN: - fprintf(file, "running"); - /* Check if it is waiting for more data to arrive. */ - if (!purge_sys->running) { - fprintf(file, " but idle"); - } - break; - - case PURGE_STATE_STOP: - fprintf(file, "stopped"); - break; - } - - fprintf(file, "\n"); - - fprintf(file, - "History list length %lu\n", - (ulong) trx_sys->rseg_history_len); + " undo n:o < " TRX_ID_FMT " state: %s\n" + "History list length %u\n", + purge_sys.tail.trx_no(), + purge_sys.tail.undo_no, + purge_sys.enabled() + ? (purge_sys.running() ? "running" + : purge_sys.paused() ? "stopped" : "running but idle") + : "disabled", + trx_sys.history_size()); #ifdef PRINT_NUM_OF_LOCK_STRUCTS fprintf(file, @@ -4935,10 +4611,13 @@ lock_trx_print_wait_and_mvcc_state(FILE* file, const trx_t* trx, time_t now) trx_print_latched(file, trx, 600); - const ReadView* read_view = trx_get_read_view(trx); + /* Note: read_view->get_state() check is race condition. But it + should "kind of work" because read_view is freed only at shutdown. + Worst thing that may happen is that it'll get transferred to + another thread and print wrong values. */ - if (read_view != NULL) { - read_view->print_limits(file); + if (trx->read_view.get_state() == READ_VIEW_STATE_OPEN) { + trx->read_view.print_limits(file); } if (trx->lock.que_state == TRX_QUE_LOCK_WAIT) { @@ -4994,38 +4673,19 @@ lock_trx_print_locks( } } -/** Functor to display all transactions (except recovered ones) */ +/** Functor to display all transactions */ struct lock_print_info { - lock_print_info(FILE* file, time_t now) : file(file), now(now) {} + lock_print_info(FILE* file, time_t now) : + file(file), now(now), + purge_trx(purge_sys.query ? 
purge_sys.query->trx : NULL) + {} void operator()(const trx_t* trx) const { - ut_ad(mutex_own(&trx_sys->mutex)); - ut_ad(trx->in_mysql_trx_list); - lock_trx_print_wait_and_mvcc_state(file, trx, now); - - if (trx->will_lock && srv_print_innodb_lock_monitor) - lock_trx_print_locks(file, trx); - } - - FILE* const file; - const time_t now; -}; - -/** Functor to display recovered read-write transactions */ -struct lock_print_info_rw_recovered -{ - lock_print_info_rw_recovered(FILE* file, time_t now) : file(file),now(now) {} - - void operator()(const trx_t* trx) const - { - ut_ad(mutex_own(&trx_sys->mutex)); - ut_ad(trx->in_rw_trx_list); - if (trx->mysql_thd) + ut_ad(mutex_own(&trx_sys.mutex)); + if (UNIV_UNLIKELY(trx == purge_trx)) return; - ut_ad(!trx->in_mysql_trx_list); - lock_trx_print_wait_and_mvcc_state(file, trx, now); if (trx->will_lock && srv_print_innodb_lock_monitor) @@ -5034,6 +4694,7 @@ struct lock_print_info_rw_recovered FILE* const file; const time_t now; + const trx_t* const purge_trx; }; /*********************************************************************//** @@ -5050,12 +4711,11 @@ lock_print_info_all_transactions( fprintf(file, "LIST OF TRANSACTIONS FOR EACH SESSION:\n"); const time_t now = time(NULL); - mutex_enter(&trx_sys->mutex); - ut_list_map(trx_sys->mysql_trx_list, lock_print_info(file, now)); - ut_list_map(trx_sys->rw_trx_list, - lock_print_info_rw_recovered(file, now)); - mutex_exit(&trx_sys->mutex); + mutex_enter(&trx_sys.mutex); + ut_list_map(trx_sys.trx_list, lock_print_info(file, now)); + mutex_exit(&trx_sys.mutex); lock_mutex_exit(); + ut_ad(lock_validate()); } @@ -5110,18 +4770,18 @@ lock_table_queue_validate( const lock_t* lock; ut_ad(lock_mutex_own()); - ut_ad(trx_sys_mutex_own()); for (lock = UT_LIST_GET_FIRST(table->locks); lock != NULL; lock = UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock)) { /* lock->trx->state cannot change from or to NOT_STARTED - while we are holding the trx_sys->mutex. It may change + while we are holding the lock_sys.mutex. It may change from ACTIVE or PREPARED to PREPARED or COMMITTED. */ trx_mutex_enter(lock->trx); - ut_ad(trx_assert_started(lock->trx)); - if (trx_state_eq(lock->trx, TRX_STATE_COMMITTED_IN_MEMORY)) { + check_trx_state(lock->trx); + + if (lock->trx->state == TRX_STATE_COMMITTED_IN_MEMORY) { } else if (!lock_get_wait(lock)) { ut_a(!lock_table_other_has_incompatible( lock->trx, 0, table, @@ -5141,10 +4801,10 @@ lock_table_queue_validate( Validates the lock queue on a single record. @return TRUE if ok */ static -ibool +bool lock_rec_queue_validate( /*====================*/ - ibool locked_lock_trx_sys, + bool locked_lock_trx_sys, /*!< in: if the caller holds both the lock mutex and trx_sys_t->lock. 
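The reworked lock_print_info functor above captures the purge system's own transaction once at construction and skips it while ut_list_map() applies the functor to every transaction under trx_sys.mutex. A simplified standalone version of that shape, assuming std::for_each over a std::vector instead of ut_list_map() and trx_sys.trx_list:

#include <algorithm>
#include <cstdio>
#include <vector>

struct Trx { int id; };                      // toy transaction descriptor

// Functor in the spirit of lock_print_info: skip one designated
// transaction (the purge coordinator's) and report all the others.
struct PrintTrx {
    explicit PrintTrx(const Trx* purge_trx) : purge_trx(purge_trx) {}

    void operator()(const Trx& trx) const {
        if (&trx == purge_trx)
            return;                          // do not report the purge trx
        std::printf("TRANSACTION %d\n", trx.id);
    }

    const Trx* const purge_trx;
};

int main()
{
    std::vector<Trx> trx_list{{1}, {2}, {3}};
    const Trx* purge_trx = &trx_list[0];     // pretend trx 1 belongs to purge
    std::for_each(trx_list.begin(), trx_list.end(), PrintTrx(purge_trx));
}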
*/ @@ -5160,6 +4820,7 @@ lock_rec_queue_validate( ut_a(block->frame == page_align(rec)); ut_ad(rec_offs_validate(rec, index, offsets)); ut_ad(!page_rec_is_comp(rec) == !rec_offs_comp(offsets)); + ut_ad(page_rec_is_leaf(rec)); ut_ad(lock_mutex_own() == locked_lock_trx_sys); ut_ad(!index || dict_index_is_clust(index) || !dict_index_is_online_ddl(index)); @@ -5168,12 +4829,11 @@ lock_rec_queue_validate( if (!locked_lock_trx_sys) { lock_mutex_enter(); - mutex_enter(&trx_sys->mutex); } if (!page_rec_is_user_rec(rec)) { - for (lock = lock_rec_get_first(lock_sys->rec_hash, + for (lock = lock_rec_get_first(lock_sys.rec_hash, block, heap_no); lock != NULL; lock = lock_rec_get_next_const(heap_no, lock)) { @@ -5189,17 +4849,24 @@ lock_rec_queue_validate( trx_mutex_exit(lock->trx); } - goto func_exit; +func_exit: + if (!locked_lock_trx_sys) { + lock_mutex_exit(); + } + + return true; } ut_ad(page_rec_is_leaf(rec)); ut_ad(lock_mutex_own()); - if (!index || !index->is_primary()) { - /* Nothing we can do */ - } else if (trx_t* impl_trx = trx_rw_is_active_low( - lock_clust_rec_some_has_impl(rec, index, offsets), - NULL)) { + const trx_id_t impl_trx_id = index && index->is_primary() + ? lock_clust_rec_some_has_impl(rec, index, offsets) + : 0; + + if (trx_t *impl_trx = impl_trx_id + ? trx_sys.find(current_trx(), impl_trx_id, false) + : 0) { /* impl_trx could have been committed before we acquire its mutex, but not thereafter. */ @@ -5257,11 +4924,12 @@ lock_rec_queue_validate( mutex_exit(&impl_trx->mutex); } - for (lock = lock_rec_get_first(lock_sys->rec_hash, block, heap_no); + for (lock = lock_rec_get_first(lock_sys.rec_hash, block, heap_no); lock != NULL; lock = lock_rec_get_next_const(heap_no, lock)) { ut_ad(!trx_is_ac_nl_ro(lock->trx)); + ut_ad(!page_rec_is_metadata(rec)); if (index) { ut_a(lock->index == index); @@ -5305,13 +4973,7 @@ lock_rec_queue_validate( ut_ad(innodb_lock_schedule_algorithm == INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS || lock_queue_validate(lock)); -func_exit: - if (!locked_lock_trx_sys) { - lock_mutex_exit(); - mutex_exit(&trx_sys->mutex); - } - - return(TRUE); + goto func_exit; } /*********************************************************************//** @@ -5336,10 +4998,9 @@ lock_rec_validate_page( ut_ad(!lock_mutex_own()); lock_mutex_enter(); - mutex_enter(&trx_sys->mutex); loop: lock = lock_rec_get_first_on_page_addr( - lock_sys->rec_hash, + lock_sys.rec_hash, block->page.id.space(), block->page.id.page_no()); if (!lock) { @@ -5372,8 +5033,8 @@ loop: ut_ad(!lock_rec_get_nth_bit(lock, i) || page_rec_is_leaf(rec)); offsets = rec_get_offsets(rec, lock->index, offsets, - true, ULINT_UNDEFINED, - &heap); + lock->index->n_core_fields, + ULINT_UNDEFINED, &heap); /* If this thread is holding the file space latch (fil_space_t::latch), the following @@ -5396,7 +5057,6 @@ loop: function_exit: lock_mutex_exit(); - mutex_exit(&trx_sys->mutex); if (heap != NULL) { mem_heap_free(heap); @@ -5405,61 +5065,21 @@ function_exit: } /*********************************************************************//** -Validates the table locks. 
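lock_rec_queue_validate() above latches lock_sys.mutex only when the caller did not already hold it (the locked_lock_trx_sys argument) and funnels every exit through func_exit so the latch is released exactly when it was taken. The same contract can be expressed with RAII; a minimal sketch, assuming std::mutex as a stand-in for lock_sys.mutex:

#include <cassert>
#include <mutex>

std::mutex lock_mutex;                        // stand-in for lock_sys.mutex

// Latch the mutex only when the caller does not already hold it; the
// unique_lock releases it on every return path, which is what the
// func_exit label achieves in lock_rec_queue_validate().
static bool validate_queue(bool caller_holds_mutex)
{
    std::unique_lock<std::mutex> guard(lock_mutex, std::defer_lock);
    if (!caller_holds_mutex)
        guard.lock();

    // ... the actual queue checks would go here ...
    return true;
}

int main()
{
    assert(validate_queue(false));            // latches and releases inside

    std::lock_guard<std::mutex> outer(lock_mutex);
    assert(validate_queue(true));             // relies on the caller's latch
}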
-@return TRUE if ok */ -static -ibool -lock_validate_table_locks( -/*======================*/ - const trx_ut_list_t* trx_list) /*!< in: trx list */ -{ - const trx_t* trx; - - ut_ad(lock_mutex_own()); - ut_ad(trx_sys_mutex_own()); - - ut_ad(trx_list == &trx_sys->rw_trx_list); - - for (trx = UT_LIST_GET_FIRST(*trx_list); - trx != NULL; - trx = UT_LIST_GET_NEXT(trx_list, trx)) { - - const lock_t* lock; - - check_trx_state(trx); - - for (lock = UT_LIST_GET_FIRST(trx->lock.trx_locks); - lock != NULL; - lock = UT_LIST_GET_NEXT(trx_locks, lock)) { - - if (lock_get_type_low(lock) & LOCK_TABLE) { - - lock_table_queue_validate( - lock->un_member.tab_lock.table); - } - } - } - - return(TRUE); -} - -/*********************************************************************//** Validate record locks up to a limit. @return lock at limit or NULL if no more locks in the hash bucket */ static MY_ATTRIBUTE((warn_unused_result)) const lock_t* lock_rec_validate( /*==============*/ - ulint start, /*!< in: lock_sys->rec_hash + ulint start, /*!< in: lock_sys.rec_hash bucket */ ib_uint64_t* limit) /*!< in/out: upper limit of (space, page_no) */ { ut_ad(lock_mutex_own()); - ut_ad(trx_sys_mutex_own()); for (const lock_t* lock = static_cast<const lock_t*>( - HASH_GET_FIRST(lock_sys->rec_hash, start)); + HASH_GET_FIRST(lock_sys.rec_hash, start)); lock != NULL; lock = static_cast<const lock_t*>(HASH_GET_NEXT(hash, lock))) { @@ -5498,9 +5118,12 @@ lock_rec_block_validate( buf_block_t* block; mtr_t mtr; - /* Make sure that the tablespace is not deleted while we are - trying to access the page. */ - if (fil_space_t* space = fil_space_acquire_silent(space_id)) { + /* Transactional locks should never refer to dropped + tablespaces, because all DDL operations that would drop or + discard or rebuild a tablespace do hold an exclusive table + lock, which would conflict with any locks referring to the + tablespace from other transactions. */ + if (fil_space_t* space = fil_space_acquire(space_id)) { dberr_t err = DB_SUCCESS; mtr_start(&mtr); @@ -5526,10 +5149,31 @@ lock_rec_block_validate( mtr_commit(&mtr); - fil_space_release(space); + space->release(); } } + +static my_bool lock_validate_table_locks(rw_trx_hash_element_t *element, void*) +{ + ut_ad(lock_mutex_own()); + mutex_enter(&element->mutex); + if (element->trx) + { + check_trx_state(element->trx); + for (const lock_t *lock= UT_LIST_GET_FIRST(element->trx->lock.trx_locks); + lock != NULL; + lock= UT_LIST_GET_NEXT(trx_locks, lock)) + { + if (lock_get_type_low(lock) & LOCK_TABLE) + lock_table_queue_validate(lock->un_member.tab_lock.table); + } + } + mutex_exit(&element->mutex); + return 0; +} + + /*********************************************************************//** Validates the lock system. @return TRUE if ok */ @@ -5547,15 +5191,16 @@ lock_validate() page_addr_set pages; lock_mutex_enter(); - mutex_enter(&trx_sys->mutex); - ut_a(lock_validate_table_locks(&trx_sys->rw_trx_list)); + /* Validate table locks */ + trx_sys.rw_trx_hash.iterate(reinterpret_cast<my_hash_walk_action> + (lock_validate_table_locks), 0); /* Iterate over all the record locks and validate the locks. We don't want to hog the lock_sys_t::mutex and the trx_sys_t::mutex. Release both mutexes during the validation check. 
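The new lock_validate_table_locks() above is a callback handed to trx_sys.rw_trx_hash.iterate(): it pins one hash element with element->mutex, checks whether a transaction is still attached to the slot, and only then inspects that transaction's lock list. A toy model of that iterate-with-callback pattern, assuming std::unordered_map plus std::mutex in place of the real rw_trx_hash:

#include <cstdio>
#include <mutex>
#include <unordered_map>

struct Trx { unsigned long long id; int n_table_locks; };

// One hash element: the element mutex pins the trx pointer while the
// callback looks at it, the role played by rw_trx_hash_element_t::mutex.
struct TrxHashElement { std::mutex m; Trx* trx = nullptr; };

using TrxHash = std::unordered_map<unsigned long long, TrxHashElement>;

// iterate() simply applies the visitor to every element; the visitor
// decides what to do when the slot no longer holds a transaction.
template <typename Visitor>
static void trx_hash_iterate(TrxHash& hash, Visitor visit)
{
    for (auto& bucket : hash)
        visit(bucket.second);
}

int main()
{
    Trx t1{10, 2}, t2{11, 0};
    TrxHash hash;
    hash[10].trx = &t1;
    hash[11].trx = &t2;

    trx_hash_iterate(hash, [](TrxHashElement& e) {
        std::lock_guard<std::mutex> g(e.m);   // pin the element
        if (e.trx)                            // the slot may be empty
            std::printf("trx %llu holds %d table locks\n",
                        e.trx->id, e.trx->n_table_locks);
    });
}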
*/ - for (ulint i = 0; i < hash_get_n_cells(lock_sys->rec_hash); i++) { + for (ulint i = 0; i < hash_get_n_cells(lock_sys.rec_hash); i++) { ib_uint64_t limit = 0; while (const lock_t* lock = lock_rec_validate(i, &limit)) { @@ -5568,7 +5213,6 @@ lock_validate() } } - mutex_exit(&trx_sys->mutex); lock_mutex_exit(); for (page_addr_set::const_iterator it = pages.begin(); @@ -5599,7 +5243,7 @@ lock_rec_insert_check_and_lock( dict_index_t* index, /*!< in: index */ que_thr_t* thr, /*!< in: query thread */ mtr_t* mtr, /*!< in/out: mini-transaction */ - ibool* inherit)/*!< out: set to TRUE if the new + bool* inherit)/*!< out: set to true if the new inserted record maybe should inherit LOCK_GAP type locks from the successor record */ @@ -5608,7 +5252,8 @@ lock_rec_insert_check_and_lock( ut_ad(!dict_index_is_online_ddl(index) || index->is_primary() || (flags & BTR_CREATE_FLAG)); - ut_ad(mtr->is_named_space(index->space)); + ut_ad(mtr->is_named_space(index->table->space)); + ut_ad(page_rec_is_leaf(rec)); if (flags & BTR_NO_LOCKING_FLAG) { @@ -5620,10 +5265,11 @@ lock_rec_insert_check_and_lock( dberr_t err; lock_t* lock; - ibool inherit_in = *inherit; + bool inherit_in = *inherit; trx_t* trx = thr_get_trx(thr); const rec_t* next_rec = page_rec_get_next_const(rec); ulint heap_no = page_rec_get_heap_no(next_rec); + ut_ad(!rec_is_metadata(next_rec, index)); lock_mutex_enter(); /* Because this code is invoked for a running transaction by @@ -5635,7 +5281,7 @@ lock_rec_insert_check_and_lock( BTR_NO_LOCKING_FLAG and skip the locking altogether. */ ut_ad(lock_table_has(trx, index->table, LOCK_IX)); - lock = lock_rec_get_first(lock_sys->rec_hash, block, heap_no); + lock = lock_rec_get_first(lock_sys.rec_hash, block, heap_no); if (lock == NULL) { /* We optimize CPU time usage in the simplest case */ @@ -5649,7 +5295,7 @@ lock_rec_insert_check_and_lock( trx->id, mtr); } - *inherit = FALSE; + *inherit = false; return(DB_SUCCESS); } @@ -5660,7 +5306,7 @@ lock_rec_insert_check_and_lock( return(DB_SUCCESS); } - *inherit = TRUE; + *inherit = true; /* If another transaction has an explicit lock request which locks the gap, waiting or granted, on the successor, the insert has to wait. @@ -5719,7 +5365,8 @@ lock_rec_insert_check_and_lock( const rec_offs* offsets; rec_offs_init(offsets_); - offsets = rec_get_offsets(next_rec, index, offsets_, true, + offsets = rec_get_offsets(next_rec, index, offsets_, + index->n_core_fields, ULINT_UNDEFINED, &heap); ut_ad(lock_rec_queue_validate( @@ -5749,12 +5396,13 @@ lock_rec_convert_impl_to_expl_for_trx( trx_t* trx, /*!< in/out: active transaction */ ulint heap_no)/*!< in: rec heap number to lock */ { + ut_ad(trx->is_referenced()); ut_ad(page_rec_is_leaf(rec)); + ut_ad(!rec_is_metadata(rec, index)); DEBUG_SYNC_C("before_lock_rec_convert_impl_to_expl_for_trx"); lock_mutex_enter(); trx_mutex_enter(trx); - ut_ad(trx->is_referenced()); ut_ad(!trx_state_eq(trx, TRX_STATE_NOT_STARTED)); if (!trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY) @@ -5771,17 +5419,110 @@ lock_rec_convert_impl_to_expl_for_trx( DEBUG_SYNC_C("after_lock_rec_convert_impl_to_expl_for_trx"); } -/*********************************************************************//** -If a transaction has an implicit x-lock on a record, but no explicit x-lock -set on the record, sets one for it. 
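lock_rec_insert_check_and_lock() above inspects only the lock queue of the record that follows the insert position: an empty queue means nothing can conflict and no gap lock needs to be inherited, otherwise *inherit is set and the queue is searched for a conflicting gap lock. A heavily simplified sketch of that decision (the real test also distinguishes lock modes and insert-intention waits):

#include <cstdio>
#include <vector>

// Minimal model of one entry in the successor record's lock queue.
struct RecLock { bool gap; unsigned trx_id; };

struct InsertCheck { bool must_wait; bool inherit_gap_locks; };

static InsertCheck check_insert(const std::vector<RecLock>& successor_queue,
                                unsigned trx_id)
{
    InsertCheck r{false, false};

    if (successor_queue.empty())
        return r;                    // fast path: nothing to conflict with

    r.inherit_gap_locks = true;      // corresponds to *inherit = true

    for (const RecLock& lock : successor_queue) {
        // A gap lock held by another transaction forces the insert
        // to wait (with an insert-intention lock in the real code).
        if (lock.gap && lock.trx_id != trx_id) {
            r.must_wait = true;
            break;
        }
    }
    return r;
}

int main()
{
    std::vector<RecLock> queue{{true, 7}};
    InsertCheck r = check_insert(queue, 8);
    std::printf("wait=%d inherit=%d\n", r.must_wait, r.inherit_gap_locks);
}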
*/ + +#ifdef UNIV_DEBUG +struct lock_rec_other_trx_holds_expl_arg +{ + const ulint heap_no; + const buf_block_t * const block; + const trx_t *impl_trx; +}; + + +static my_bool lock_rec_other_trx_holds_expl_callback( + rw_trx_hash_element_t *element, + lock_rec_other_trx_holds_expl_arg *arg) +{ + mutex_enter(&element->mutex); + if (element->trx) + { + trx_mutex_enter(element->trx); + ut_ad(element->trx->state != TRX_STATE_NOT_STARTED); + lock_t *expl_lock= element->trx->state == TRX_STATE_COMMITTED_IN_MEMORY + ? NULL : lock_rec_has_expl(LOCK_S | LOCK_REC_NOT_GAP, arg->block, + arg->heap_no, element->trx); + /* + An explicit lock is held by trx other than the trx holding the implicit + lock. + */ + ut_ad(!expl_lock || expl_lock->trx == arg->impl_trx); + trx_mutex_exit(element->trx); + } + mutex_exit(&element->mutex); + return 0; +} + + +/** + Checks if some transaction, other than given trx_id, has an explicit + lock on the given rec. + + FIXME: if the current transaction holds implicit lock from INSERT, a + subsequent locking read should not convert it to explicit. See also + MDEV-11215. + + @param caller_trx trx of current thread + @param[in] trx trx holding implicit lock on rec + @param[in] rec user record + @param[in] block buffer block containing the record +*/ + +static void lock_rec_other_trx_holds_expl(trx_t *caller_trx, trx_t *trx, + const rec_t *rec, + const buf_block_t *block) +{ + if (trx) + { + ut_ad(!page_rec_is_metadata(rec)); + lock_mutex_enter(); + ut_ad(trx->is_referenced()); + trx_mutex_enter(trx); + const trx_state_t state = trx->state; + trx_mutex_exit(trx); + ut_ad(state != TRX_STATE_NOT_STARTED); + if (state == TRX_STATE_COMMITTED_IN_MEMORY) + { + /* The transaction was committed before our lock_mutex_enter(). */ + lock_mutex_exit(); + return; + } + lock_rec_other_trx_holds_expl_arg arg= { page_rec_get_heap_no(rec), block, + trx }; + trx_sys.rw_trx_hash.iterate(caller_trx, + reinterpret_cast<my_hash_walk_action> + (lock_rec_other_trx_holds_expl_callback), + &arg); + lock_mutex_exit(); + } +} +#endif /* UNIV_DEBUG */ + + +/** If an implicit x-lock exists on a record, convert it to an explicit one. + +Often, this is called by a transaction that is about to enter a lock wait +due to the lock conflict. Two explicit locks would be created: first the +exclusive lock on behalf of the lock-holder transaction in this function, +and then a wait request on behalf of caller_trx, in the calling function. + +This may also be called by the same transaction that is already holding +an implicit exclusive lock on the record. In this case, no explicit lock +should be created. 
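The debug helper above bundles its parameters into lock_rec_other_trx_holds_expl_arg and receives them back through the generic hash-walk callback. A self-contained illustration of passing a context struct through a void* callback of that shape; the types and field names here are stand-ins, not the real my_hash_walk_action interface:

#include <cassert>
#include <vector>

// Context handed to the callback through void*.
struct WalkArg { int heap_no; const void* block; int impl_trx_id; };

// One "hash element" with just enough state for the check.
struct Element { int trx_id; bool holds_expl_s_lock; };

typedef bool (*walk_action)(Element*, void*);

static void walk(std::vector<Element>& elements, walk_action action,
                 void* arg)
{
    for (Element& e : elements)
        if (action(&e, arg))
            return;                  // a nonzero return stops the walk
}

// Debug check: an explicit S lock may only be held by the transaction
// that also owns the implicit lock.
static bool check_other_holder(Element* e, void* arg_)
{
    const WalkArg* arg = static_cast<const WalkArg*>(arg_);
    if (e->holds_expl_s_lock)
        assert(e->trx_id == arg->impl_trx_id);
    return false;                    // keep scanning
}

int main()
{
    std::vector<Element> elements{{7, true}, {8, false}};
    WalkArg arg{2, nullptr, 7};
    walk(elements, check_other_holder, &arg);
}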
+ +@param[in,out] caller_trx current transaction +@param[in] block index tree leaf page +@param[in] rec record on the leaf page +@param[in] index the index of the record +@param[in] offsets rec_get_offsets(rec,index) +@return whether caller_trx already holds an exclusive lock on rec */ static -void +bool lock_rec_convert_impl_to_expl( -/*==========================*/ - const buf_block_t* block, /*!< in: buffer block of rec */ - const rec_t* rec, /*!< in: user record on page */ - dict_index_t* index, /*!< in: index of record */ - const rec_offs* offsets)/*!< in: rec_get_offsets(rec, index) */ + trx_t* caller_trx, + const buf_block_t* block, + const rec_t* rec, + dict_index_t* index, + const rec_offs* offsets) { trx_t* trx; @@ -5789,20 +5530,34 @@ lock_rec_convert_impl_to_expl( ut_ad(page_rec_is_user_rec(rec)); ut_ad(rec_offs_validate(rec, index, offsets)); ut_ad(!page_rec_is_comp(rec) == !rec_offs_comp(offsets)); + ut_ad(page_rec_is_leaf(rec)); + ut_ad(!rec_is_metadata(rec, index)); if (dict_index_is_clust(index)) { trx_id_t trx_id; trx_id = lock_clust_rec_some_has_impl(rec, index, offsets); - trx = trx_rw_is_active(trx_id, NULL, true); + if (trx_id == 0) { + return false; + } + if (UNIV_UNLIKELY(trx_id == caller_trx->id)) { + return true; + } + + trx = trx_sys.find(caller_trx, trx_id); } else { ut_ad(!dict_index_is_online_ddl(index)); - trx = lock_sec_rec_some_has_impl(rec, index, offsets); + trx = lock_sec_rec_some_has_impl(caller_trx, rec, index, + offsets); + if (trx == caller_trx) { + trx->release_reference(); + return true; + } - ut_ad(!trx || !lock_rec_other_trx_holds_expl( - LOCK_S | LOCK_REC_NOT_GAP, trx, rec, block)); + ut_d(lock_rec_other_trx_holds_expl(caller_trx, trx, rec, + block)); } if (trx != 0) { @@ -5817,6 +5572,8 @@ lock_rec_convert_impl_to_expl( lock_rec_convert_impl_to_expl_for_trx( block, rec, index, trx, heap_no); } + + return false; } /*********************************************************************//** @@ -5843,6 +5600,7 @@ lock_clust_rec_modify_check_and_lock( ulint heap_no; ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(page_rec_is_leaf(rec)); ut_ad(dict_index_is_clust(index)); ut_ad(block->frame == page_align(rec)); @@ -5850,7 +5608,8 @@ lock_clust_rec_modify_check_and_lock( return(DB_SUCCESS); } - ut_ad(!dict_table_is_temporary(index->table)); + ut_ad(!rec_is_metadata(rec, index)); + ut_ad(!index->table->is_temporary()); heap_no = rec_offs_comp(offsets) ? rec_get_heap_no_new(rec) @@ -5859,19 +5618,15 @@ lock_clust_rec_modify_check_and_lock( /* If a transaction has no explicit x-lock set on the record, set one for it */ - lock_rec_convert_impl_to_expl(block, rec, index, offsets); - - lock_mutex_enter(); - - ut_ad(lock_table_has(thr_get_trx(thr), index->table, LOCK_IX)); + if (lock_rec_convert_impl_to_expl(thr_get_trx(thr), block, rec, index, + offsets)) { + /* We already hold an implicit exclusive lock. 
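lock_rec_convert_impl_to_expl() above now tells the caller, through its return value, that the caller itself is the implicit lock holder, so the calling code can skip creating an explicit lock altogether. A reduced model of that contract for the clustered-index case, assuming a plain set of active transaction ids and a multimap of explicit locks in place of trx_sys.find() and lock_sys:

#include <cassert>
#include <unordered_map>
#include <unordered_set>

// A clustered-index record carries the id of the transaction that
// last modified it: its implicit lock holder, if that trx is active.
struct Record { unsigned long long trx_id; };

using ActiveTrxSet = std::unordered_set<unsigned long long>;
using ExplicitLocks =
    std::unordered_multimap<const Record*, unsigned long long>;

// Returns true when the caller already owns the implicit exclusive
// lock; otherwise creates an explicit lock for the active holder.
static bool convert_impl_to_expl(unsigned long long caller_trx_id,
                                 const Record& rec,
                                 const ActiveTrxSet& active,
                                 ExplicitLocks& locks)
{
    const unsigned long long impl_id = rec.trx_id;
    if (impl_id == 0)
        return false;                 // no implicit lock at all
    if (impl_id == caller_trx_id)
        return true;                  // caller holds it implicitly
    if (active.count(impl_id))
        locks.emplace(&rec, impl_id); // grant LOCK_X | LOCK_REC_NOT_GAP
    return false;
}

int main()
{
    Record rec{42};
    ActiveTrxSet active{42};
    ExplicitLocks locks;

    assert(convert_impl_to_expl(42, rec, active, locks));  // same trx
    assert(!convert_impl_to_expl(99, rec, active, locks)); // other trx
    assert(locks.size() == 1);
}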
*/ + return DB_SUCCESS; + } err = lock_rec_lock(TRUE, LOCK_X | LOCK_REC_NOT_GAP, block, heap_no, index, thr); - MONITOR_INC(MONITOR_NUM_RECLOCK_REQ); - - lock_mutex_exit(); - ut_ad(lock_rec_queue_validate(FALSE, block, rec, index, offsets)); if (err == DB_SUCCESS_LOCKED_REC) { @@ -5907,13 +5662,15 @@ lock_sec_rec_modify_check_and_lock( ut_ad(!dict_index_is_clust(index)); ut_ad(!dict_index_is_online_ddl(index) || (flags & BTR_CREATE_FLAG)); ut_ad(block->frame == page_align(rec)); - ut_ad(mtr->is_named_space(index->space)); + ut_ad(mtr->is_named_space(index->table->space)); + ut_ad(page_rec_is_leaf(rec)); + ut_ad(!rec_is_metadata(rec, index)); if (flags & BTR_NO_LOCKING_FLAG) { return(DB_SUCCESS); } - ut_ad(!dict_table_is_temporary(index->table)); + ut_ad(!index->table->is_temporary()); heap_no = page_rec_get_heap_no(rec); @@ -5935,16 +5692,9 @@ lock_sec_rec_modify_check_and_lock( index record, and this would not have been possible if another active transaction had modified this secondary index record. */ - lock_mutex_enter(); - - ut_ad(lock_table_has(thr_get_trx(thr), index->table, LOCK_IX)); - err = lock_rec_lock(TRUE, LOCK_X | LOCK_REC_NOT_GAP, block, heap_no, index, thr); - MONITOR_INC(MONITOR_NUM_RECLOCK_REQ); - - lock_mutex_exit(); #ifdef WITH_WSREP trx->wsrep_UK_scan= false; #endif /* WITH_WSREP */ @@ -5956,7 +5706,8 @@ lock_sec_rec_modify_check_and_lock( const rec_offs* offsets; rec_offs_init(offsets_); - offsets = rec_get_offsets(rec, index, offsets_, true, + offsets = rec_get_offsets(rec, index, offsets_, + index->n_core_fields, ULINT_UNDEFINED, &heap); ut_ad(lock_rec_queue_validate( @@ -6015,26 +5766,29 @@ lock_sec_rec_read_check_and_lock( ut_ad(block->frame == page_align(rec)); ut_ad(page_rec_is_user_rec(rec) || page_rec_is_supremum(rec)); ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(page_rec_is_leaf(rec)); ut_ad(mode == LOCK_X || mode == LOCK_S); if ((flags & BTR_NO_LOCKING_FLAG) || srv_read_only_mode - || dict_table_is_temporary(index->table)) { + || index->table->is_temporary()) { return(DB_SUCCESS); } + ut_ad(!rec_is_metadata(rec, index)); heap_no = page_rec_get_heap_no(rec); /* Some transaction may have an implicit x-lock on the record only if the max trx id for the page >= min trx id for the trx list or a database recovery is running. */ - if ((page_get_max_trx_id(block->frame) >= trx_rw_min_trx_id() - || recv_recovery_is_on()) - && !page_rec_is_supremum(rec)) { - - lock_rec_convert_impl_to_expl(block, rec, index, offsets); + if (!page_rec_is_supremum(rec) + && page_get_max_trx_id(block->frame) >= trx_sys.get_min_trx_id() + && lock_rec_convert_impl_to_expl(thr_get_trx(thr), block, rec, + index, offsets)) { + /* We already hold an implicit exclusive lock. 
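In lock_sec_rec_read_check_and_lock() above, the implicit-to-explicit conversion is attempted only when the page's PAGE_MAX_TRX_ID is not older than the smallest active transaction id, because an older page cannot carry an implicit lock. The filter in isolation; the struct around it is invented for the example:

#include <cassert>

// PAGE_MAX_TRX_ID on a secondary index leaf page: an upper bound for
// the id of any transaction that modified a record on that page.
struct SecIndexPage { unsigned long long max_trx_id; };

// If every transaction below min_active_trx_id has committed, a page
// whose max_trx_id is older cannot hold an implicit lock, so the
// expensive conversion check can be skipped.
static bool may_have_implicit_lock(const SecIndexPage& page,
                                   unsigned long long min_active_trx_id)
{
    return page.max_trx_id >= min_active_trx_id;
}

int main()
{
    assert(!may_have_implicit_lock({100}, 101)); // all writers committed
    assert(may_have_implicit_lock({100}, 90));   // a writer may be active
}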
*/ + return DB_SUCCESS; } #ifdef WITH_WSREP @@ -6049,19 +5803,10 @@ lock_sec_rec_read_check_and_lock( if (trx->is_wsrep() && wsrep_thd_is_BF(trx->mysql_thd, false)) trx->wsrep_UK_scan= true; #endif /* WITH_WSREP */ - lock_mutex_enter(); - - ut_ad(mode != LOCK_X - || lock_table_has(thr_get_trx(thr), index->table, LOCK_IX)); - ut_ad(mode != LOCK_S - || lock_table_has(thr_get_trx(thr), index->table, LOCK_IS)); - err = lock_rec_lock(FALSE, mode | gap_mode, + err = lock_rec_lock(FALSE, ulint(mode) | gap_mode, block, heap_no, index, thr); - MONITOR_INC(MONITOR_NUM_RECLOCK_REQ); - - lock_mutex_exit(); #ifdef WITH_WSREP trx->wsrep_UK_scan= false; #endif /* WITH_WSREP */ @@ -6109,33 +5854,27 @@ lock_clust_rec_read_check_and_lock( ut_ad(gap_mode == LOCK_ORDINARY || gap_mode == LOCK_GAP || gap_mode == LOCK_REC_NOT_GAP); ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(page_rec_is_leaf(rec)); + ut_ad(!rec_is_metadata(rec, index)); if ((flags & BTR_NO_LOCKING_FLAG) || srv_read_only_mode - || dict_table_is_temporary(index->table)) { + || index->table->is_temporary()) { return(DB_SUCCESS); } heap_no = page_rec_get_heap_no(rec); - if (heap_no != PAGE_HEAP_NO_SUPREMUM) { - - lock_rec_convert_impl_to_expl(block, rec, index, offsets); + if (heap_no != PAGE_HEAP_NO_SUPREMUM + && lock_rec_convert_impl_to_expl(thr_get_trx(thr), block, rec, + index, offsets)) { + /* We already hold an implicit exclusive lock. */ + return DB_SUCCESS; } - lock_mutex_enter(); - - ut_ad(mode != LOCK_X - || lock_table_has(thr_get_trx(thr), index->table, LOCK_IX)); - ut_ad(mode != LOCK_S - || lock_table_has(thr_get_trx(thr), index->table, LOCK_IS)); - - err = lock_rec_lock(FALSE, mode | gap_mode, block, heap_no, index, thr); - - MONITOR_INC(MONITOR_NUM_RECLOCK_REQ); - - lock_mutex_exit(); + err = lock_rec_lock(FALSE, ulint(mode) | gap_mode, + block, heap_no, index, thr); ut_ad(lock_rec_queue_validate(FALSE, block, rec, index, offsets)); @@ -6180,7 +5919,7 @@ lock_clust_rec_read_check_and_lock_alt( rec_offs_init(offsets_); ut_ad(page_rec_is_leaf(rec)); - offsets = rec_get_offsets(rec, index, offsets, true, + offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields, ULINT_UNDEFINED, &tmp_heap); err = lock_clust_rec_read_check_and_lock(flags, block, rec, index, offsets, mode, gap_mode, thr); @@ -6540,27 +6279,6 @@ lock_unlock_table_autoinc( } } -/** Release the explicit locks of a committing transaction, -and release possible other transactions waiting because of these locks. */ -void lock_trx_release_locks(trx_t* trx) -{ - ut_ad(UT_LIST_GET_LEN(trx->lock.trx_locks)); - - lock_mutex_enter(); - lock_release(trx); - trx->lock.n_rec_locks = 0; - /* We don't remove the locks one by one from the vector for - efficiency reasons. We simply reset it because we would have - released all the locks anyway. */ - - trx->lock.table_locks.clear(); - - ut_ad(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0); - ut_ad(ib_vector_is_empty(trx->autoinc_locks)); - lock_mutex_exit(); - mem_heap_empty(trx->lock.lock_heap); -} - static inline dberr_t lock_trx_handle_wait_low(trx_t* trx) { ut_ad(lock_mutex_own()); @@ -6616,57 +6334,43 @@ lock_table_get_n_locks( } #ifdef UNIV_DEBUG -/*******************************************************************//** -Do an exhaustive check for any locks (table or rec) against the table. 
-@return lock if found */ -static -const lock_t* -lock_table_locks_lookup( -/*====================*/ - const dict_table_t* table, /*!< in: check if there are - any locks held on records in - this table or on the table - itself */ - const trx_ut_list_t* trx_list) /*!< in: trx list to check */ -{ - ut_a(table != NULL); - ut_ad(lock_mutex_own()); - ut_ad(trx_sys_mutex_own()); - - for (trx_t* trx = UT_LIST_GET_FIRST(*trx_list); - trx != NULL; - trx = UT_LIST_GET_NEXT(trx_list, trx)) { - const lock_t* lock; - - trx_mutex_enter(trx); - check_trx_state(trx); - - for (lock = UT_LIST_GET_FIRST(trx->lock.trx_locks); - lock != NULL; - lock = UT_LIST_GET_NEXT(trx_locks, lock)) { - - ut_a(lock->trx == trx); - - if (lock_get_type_low(lock) == LOCK_REC) { - ut_ad(lock->index->online_status - != ONLINE_INDEX_CREATION - || dict_index_is_clust(lock->index)); - if (lock->index->table == table) { - break; - } - } else if (lock->un_member.tab_lock.table == table) { - break; - } - } - - trx_mutex_exit(trx); +/** + Do an exhaustive check for any locks (table or rec) against the table. - if (lock) { - return lock; - } - } + @param[in] table check if there are any locks held on records in this table + or on the table itself +*/ - return NULL; +static my_bool lock_table_locks_lookup(rw_trx_hash_element_t *element, + const dict_table_t *table) +{ + ut_ad(lock_mutex_own()); + mutex_enter(&element->mutex); + if (element->trx) + { + trx_mutex_enter(element->trx); + check_trx_state(element->trx); + if (element->trx->state != TRX_STATE_COMMITTED_IN_MEMORY) + { + for (const lock_t *lock= UT_LIST_GET_FIRST(element->trx->lock.trx_locks); + lock != NULL; + lock= UT_LIST_GET_NEXT(trx_locks, lock)) + { + ut_ad(lock->trx == element->trx); + if (lock_get_type_low(lock) == LOCK_REC) + { + ut_ad(lock->index->online_status != ONLINE_INDEX_CREATION || + lock->index->is_primary()); + ut_ad(lock->index->table != table); + } + else + ut_ad(lock->un_member.tab_lock.table != table); + } + } + trx_mutex_exit(element->trx); + } + mutex_exit(&element->mutex); + return 0; } #endif /* UNIV_DEBUG */ @@ -6682,17 +6386,17 @@ lock_table_has_locks( { ibool has_locks; + ut_ad(table != NULL); lock_mutex_enter(); has_locks = UT_LIST_GET_LEN(table->locks) > 0 || table->n_rec_locks > 0; #ifdef UNIV_DEBUG if (!has_locks) { - mutex_enter(&trx_sys->mutex); - - ut_ad(!lock_table_locks_lookup(table, &trx_sys->rw_trx_list)); - - mutex_exit(&trx_sys->mutex); + trx_sys.rw_trx_hash.iterate( + reinterpret_cast<my_hash_walk_action> + (lock_table_locks_lookup), + const_cast<dict_table_t*>(table)); } #endif /* UNIV_DEBUG */ @@ -6727,7 +6431,7 @@ void lock_set_timeout_event() /*====================*/ { - os_event_set(lock_sys->timeout_event); + os_event_set(lock_sys.timeout_event); } #ifdef UNIV_DEBUG @@ -6794,12 +6498,14 @@ lock_trx_has_sys_table_locks( return(strongest_lock); } -/*******************************************************************//** -Check if the transaction holds an exclusive lock on a record. -@return whether the locks are held */ +/** Check if the transaction holds an explicit exclusive lock on a record. 
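lock_table_has_locks() above answers from cheap per-table counters and, only in debug builds, cross-checks a negative answer by sweeping every transaction's lock list through the rw_trx_hash. A small standalone sketch of that fast-path-plus-debug-sweep shape, assuming a flat std::vector as the authoritative lock registry:

#include <cassert>
#include <vector>

struct Lock { int table_id; };
struct Table { int id; unsigned n_locks; };       // redundant summary counter

// Debug-only cross-check: when the summary says "no locks", walk the
// authoritative registry and verify that nothing refers to the table
// (the UNIV_DEBUG-only sweep in lock_table_has_locks()).
static void debug_assert_no_locks(const Table& t,
                                  const std::vector<Lock>& all_locks)
{
#ifndef NDEBUG
    for (const Lock& lock : all_locks)
        assert(lock.table_id != t.id);
#else
    (void)t;
    (void)all_locks;
#endif
}

static bool table_has_locks(const Table& t,
                            const std::vector<Lock>& all_locks)
{
    const bool has = t.n_locks > 0;               // O(1) answer
    if (!has)
        debug_assert_no_locks(t, all_locks);
    return has;
}

int main()
{
    std::vector<Lock> all_locks{{7}};
    Table locked{7, 1}, idle{8, 0};
    assert(table_has_locks(locked, all_locks));
    assert(!table_has_locks(idle, all_locks));
}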
+@param[in] trx transaction +@param[in] table table +@param[in] block leaf page +@param[in] heap_no heap number identifying the record +@return whether an explicit X-lock is held */ bool -lock_trx_has_rec_x_lock( -/*====================*/ +lock_trx_has_expl_x_lock( const trx_t* trx, /*!< in: transaction to check */ const dict_table_t* table, /*!< in: table to check */ const buf_block_t* block, /*!< in: buffer block of the record */ @@ -6808,11 +6514,9 @@ lock_trx_has_rec_x_lock( ut_ad(heap_no > PAGE_HEAP_NO_SUPREMUM); lock_mutex_enter(); - ut_a(lock_table_has(trx, table, LOCK_IX) - || dict_table_is_temporary(table)); - ut_a(lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, - block, heap_no, trx) - || dict_table_is_temporary(table)); + ut_ad(lock_table_has(trx, table, LOCK_IX)); + ut_ad(lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, block, heap_no, + trx)); lock_mutex_exit(); return(true); } @@ -6859,8 +6563,6 @@ DeadlockChecker::print(const trx_t* trx, ulint max_query_len) ulint n_trx_locks = UT_LIST_GET_LEN(trx->lock.trx_locks); ulint heap_size = mem_heap_get_size(trx->lock.lock_heap); - mutex_enter(&trx_sys->mutex); - trx_print_low(lock_latest_err_file, trx, max_query_len, n_rec_locks, n_trx_locks, heap_size); @@ -6868,8 +6570,6 @@ DeadlockChecker::print(const trx_t* trx, ulint max_query_len) trx_print_low(stderr, trx, max_query_len, n_rec_locks, n_trx_locks, heap_size); } - - mutex_exit(&trx_sys->mutex); } /** Print lock data to the deadlock file and possibly to stderr. @@ -6954,8 +6654,8 @@ DeadlockChecker::get_first_lock(ulint* heap_no) const hash_table_t* lock_hash; lock_hash = lock->type_mode & LOCK_PREDICATE - ? lock_sys->prdt_hash - : lock_sys->rec_hash; + ? lock_sys.prdt_hash + : lock_sys.rec_hash; /* We are only interested in records that match the heap_no. 
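DeadlockChecker::get_first_lock() above selects the hash table that stores a lock's queue from the LOCK_PREDICATE bit of type_mode, and the predicate-lock code dispatches on LOCK_PRDT_PAGE the same way. A minimal dispatch sketch; the flag values and the HashTable type are placeholders for the example:

#include <cassert>
#include <cstdint>

// Illustrative type_mode bits (placeholders, not authoritative values).
static const std::uint32_t LOCK_PREDICATE_BIT = 0x2000;
static const std::uint32_t LOCK_PRDT_PAGE_BIT = 0x4000;

struct HashTable { const char* name; };

static HashTable rec_hash{"rec_hash"};
static HashTable prdt_hash{"prdt_hash"};
static HashTable prdt_page_hash{"prdt_page_hash"};

// Pick the queue's hash table from the lock's type_mode bits.
static HashTable* hash_for(std::uint32_t type_mode)
{
    if (type_mode & LOCK_PRDT_PAGE_BIT)
        return &prdt_page_hash;
    if (type_mode & LOCK_PREDICATE_BIT)
        return &prdt_hash;
    return &rec_hash;
}

int main()
{
    assert(hash_for(0) == &rec_hash);
    assert(hash_for(LOCK_PREDICATE_BIT) == &prdt_hash);
    assert(hash_for(LOCK_PRDT_PAGE_BIT) == &prdt_page_hash);
}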
*/ *heap_no = lock_rec_find_set_bit(lock); @@ -7317,12 +7017,14 @@ lock_update_split_and_merge( { const rec_t* left_next_rec; - ut_a(left_block && right_block); - ut_a(orig_pred); + ut_ad(page_is_leaf(left_block->frame)); + ut_ad(page_is_leaf(right_block->frame)); + ut_ad(page_align(orig_pred) == left_block->frame); lock_mutex_enter(); left_next_rec = page_rec_get_next_const(orig_pred); + ut_ad(!page_rec_is_metadata(left_next_rec)); /* Inherit the locks on the supremum of the left page to the first record which was moved from the right page */ diff --git a/storage/innobase/lock/lock0prdt.cc b/storage/innobase/lock/lock0prdt.cc index 5100388c5e0..9827243177d 100644 --- a/storage/innobase/lock/lock0prdt.cc +++ b/storage/innobase/lock/lock0prdt.cc @@ -525,7 +525,7 @@ lock_prdt_insert_check_and_lock( return(DB_SUCCESS); } - ut_ad(!dict_table_is_temporary(index->table)); + ut_ad(!index->table->is_temporary()); ut_ad(!dict_index_is_clust(index)); trx_t* trx = thr_get_trx(thr); @@ -541,7 +541,7 @@ lock_prdt_insert_check_and_lock( lock_t* lock; /* Only need to check locks on prdt_hash */ - lock = lock_rec_get_first(lock_sys->prdt_hash, block, PRDT_HEAPNO); + lock = lock_rec_get_first(lock_sys.prdt_hash, block, PRDT_HEAPNO); if (lock == NULL) { lock_mutex_exit(); @@ -619,7 +619,6 @@ lock_prdt_update_parent( buf_block_t* right_block, /*!< in/out: the new half page */ lock_prdt_t* left_prdt, /*!< in: MBR on the old page */ lock_prdt_t* right_prdt, /*!< in: MBR on the new page */ - lock_prdt_t* parent_prdt, /*!< in: original parent MBR */ ulint space, /*!< in: parent space id */ ulint page_no) /*!< in: parent page number */ { @@ -629,7 +628,7 @@ lock_prdt_update_parent( /* Get all locks in parent */ for (lock = lock_rec_get_first_on_page_addr( - lock_sys->prdt_hash, space, page_no); + lock_sys.prdt_hash, space, page_no); lock; lock = lock_rec_get_next_on_page(lock)) { lock_prdt_t* lock_prdt; @@ -673,7 +672,6 @@ static void lock_prdt_update_split_low( /*=======================*/ - buf_block_t* block, /*!< in/out: page to be split */ buf_block_t* new_block, /*!< in/out: the new half page */ lock_prdt_t* prdt, /*!< in: MBR on the old page */ lock_prdt_t* new_prdt, /*!< in: MBR on the new page */ @@ -750,17 +748,16 @@ Update predicate lock when page splits */ void lock_prdt_update_split( /*===================*/ - buf_block_t* block, /*!< in/out: page to be split */ buf_block_t* new_block, /*!< in/out: the new half page */ lock_prdt_t* prdt, /*!< in: MBR on the old page */ lock_prdt_t* new_prdt, /*!< in: MBR on the new page */ ulint space, /*!< in: space id */ ulint page_no) /*!< in: page number */ { - lock_prdt_update_split_low(block, new_block, prdt, new_prdt, + lock_prdt_update_split_low(new_block, prdt, new_prdt, space, page_no, LOCK_PREDICATE); - lock_prdt_update_split_low(block, new_block, NULL, NULL, + lock_prdt_update_split_low(new_block, NULL, NULL, space, page_no, LOCK_PRDT_PAGE); } @@ -802,15 +799,14 @@ lock_prdt_lock( SELECT FOR UPDATE */ ulint type_mode, /*!< in: LOCK_PREDICATE or LOCK_PRDT_PAGE */ - que_thr_t* thr, /*!< in: query thread + que_thr_t* thr) /*!< in: query thread (can be NULL if BTR_NO_LOCKING_FLAG) */ - mtr_t* mtr) /*!< in/out: mini-transaction */ { trx_t* trx = thr_get_trx(thr); dberr_t err = DB_SUCCESS; lock_rec_req_status status = LOCK_REC_SUCCESS; - if (trx->read_only || dict_table_is_temporary(index->table)) { + if (trx->read_only || index->table->is_temporary()) { return(DB_SUCCESS); } @@ -819,8 +815,8 @@ lock_prdt_lock( ut_ad(type_mode & (LOCK_PREDICATE | 
LOCK_PRDT_PAGE)); hash_table_t* hash = type_mode == LOCK_PREDICATE - ? lock_sys->prdt_hash - : lock_sys->prdt_page_hash; + ? lock_sys.prdt_hash + : lock_sys.prdt_page_hash; /* Another transaction cannot have an implicit lock on the record, because when we come here, we already have modified the clustered @@ -829,7 +825,7 @@ lock_prdt_lock( lock_mutex_enter(); - const ulint prdt_mode = mode | type_mode; + const ulint prdt_mode = ulint(mode) | type_mode; lock_t* lock = lock_rec_get_first_on_page(hash, block); if (lock == NULL) { @@ -837,7 +833,7 @@ lock_prdt_lock( #ifdef WITH_WSREP NULL, NULL, /* FIXME: replicate SPATIAL INDEX locks */ #endif - mode | type_mode, block, PRDT_HEAPNO, + ulint(mode) | type_mode, block, PRDT_HEAPNO, index, trx, FALSE); status = LOCK_REC_SUCCESS_CREATED; @@ -869,7 +865,7 @@ lock_prdt_lock( NULL, /* FIXME: replicate SPATIAL INDEX locks */ #endif - mode | type_mode, + ulint(mode) | type_mode, block, PRDT_HEAPNO, index, thr, prdt); } else { @@ -929,7 +925,7 @@ lock_place_prdt_page_lock( lock_mutex_enter(); const lock_t* lock = lock_rec_get_first_on_page_addr( - lock_sys->prdt_page_hash, space, page_no); + lock_sys.prdt_page_hash, space, page_no); const ulint mode = LOCK_S | LOCK_PRDT_PAGE; trx_t* trx = thr_get_trx(thr); @@ -985,7 +981,7 @@ lock_test_prdt_page_lock( lock_mutex_enter(); lock = lock_rec_get_first_on_page_addr( - lock_sys->prdt_page_hash, space, page_no); + lock_sys.prdt_page_hash, space, page_no); lock_mutex_exit(); @@ -1005,13 +1001,13 @@ lock_prdt_rec_move( { lock_t* lock; - if (!lock_sys->prdt_hash) { + if (!lock_sys.prdt_hash) { return; } lock_mutex_enter(); - for (lock = lock_rec_get_first(lock_sys->prdt_hash, + for (lock = lock_rec_get_first(lock_sys.prdt_hash, donator, PRDT_HEAPNO); lock != NULL; lock = lock_rec_get_next(PRDT_HEAPNO, lock)) { diff --git a/storage/innobase/lock/lock0wait.cc b/storage/innobase/lock/lock0wait.cc index 5d0d41ef494..90fe1ccd626 100644 --- a/storage/innobase/lock/lock0wait.cc +++ b/storage/innobase/lock/lock0wait.cc @@ -46,9 +46,9 @@ lock_wait_table_print(void) { ut_ad(lock_wait_mutex_own()); - const srv_slot_t* slot = lock_sys->waiting_threads; + const srv_slot_t* slot = lock_sys.waiting_threads; - for (ulint i = 0; i < OS_THREAD_MAX_N; i++, ++slot) { + for (ulint i = 0; i < srv_max_n_threads; i++, ++slot) { fprintf(stderr, "Slot %lu: thread type %lu," @@ -72,7 +72,7 @@ lock_wait_table_release_slot( srv_slot_t* slot) /*!< in: slot to release */ { #ifdef UNIV_DEBUG - srv_slot_t* upper = lock_sys->waiting_threads + OS_THREAD_MAX_N; + srv_slot_t* upper = lock_sys.waiting_threads + srv_max_n_threads; #endif /* UNIV_DEBUG */ lock_wait_mutex_enter(); @@ -83,7 +83,7 @@ lock_wait_table_release_slot( ut_ad(slot->thr->slot == slot); /* Must be within the array boundaries. */ - ut_ad(slot >= lock_sys->waiting_threads); + ut_ad(slot >= lock_sys.waiting_threads); ut_ad(slot < upper); /* Note: When we reserve the slot we use the trx_t::mutex to update @@ -102,23 +102,23 @@ lock_wait_table_release_slot( lock_mutex_exit(); /* Scan backwards and adjust the last free slot pointer. */ - for (slot = lock_sys->last_slot; - slot > lock_sys->waiting_threads && !slot->in_use; + for (slot = lock_sys.last_slot; + slot > lock_sys.waiting_threads && !slot->in_use; --slot) { /* No op */ } /* Either the array is empty or the last scanned slot is in use. 
*/ - ut_ad(slot->in_use || slot == lock_sys->waiting_threads); + ut_ad(slot->in_use || slot == lock_sys.waiting_threads); - lock_sys->last_slot = slot + 1; + lock_sys.last_slot = slot + 1; /* The last slot is either outside of the array boundary or it's on an empty slot. */ - ut_ad(lock_sys->last_slot == upper || !lock_sys->last_slot->in_use); + ut_ad(lock_sys.last_slot == upper || !lock_sys.last_slot->in_use); - ut_ad(lock_sys->last_slot >= lock_sys->waiting_threads); - ut_ad(lock_sys->last_slot <= upper); + ut_ad(lock_sys.last_slot >= lock_sys.waiting_threads); + ut_ad(lock_sys.last_slot <= upper); lock_wait_mutex_exit(); } @@ -140,9 +140,9 @@ lock_wait_table_reserve_slot( ut_ad(lock_wait_mutex_own()); ut_ad(trx_mutex_own(thr_get_trx(thr))); - slot = lock_sys->waiting_threads; + slot = lock_sys.waiting_threads; - for (i = OS_THREAD_MAX_N; i--; ++slot) { + for (i = srv_max_n_threads; i--; ++slot) { if (!slot->in_use) { slot->in_use = TRUE; slot->thr = thr; @@ -158,18 +158,18 @@ lock_wait_table_reserve_slot( slot->suspend_time = time(NULL); slot->wait_timeout = wait_timeout; - if (slot == lock_sys->last_slot) { - ++lock_sys->last_slot; + if (slot == lock_sys.last_slot) { + ++lock_sys.last_slot; } - ut_ad(lock_sys->last_slot - <= lock_sys->waiting_threads + OS_THREAD_MAX_N); + ut_ad(lock_sys.last_slot + <= lock_sys.waiting_threads + srv_max_n_threads); return(slot); } } - ib::error() << "There appear to be " << OS_THREAD_MAX_N << " user" + ib::error() << "There appear to be " << srv_max_n_threads << " user" " threads currently waiting inside InnoDB, which is the upper" " limit. Cannot continue operation. Before aborting, we print" " a list of waiting threads."; @@ -184,7 +184,7 @@ lock_wait_table_reserve_slot( check if lock timeout was for priority thread, as a side effect trigger lock monitor @param[in] trx transaction owning the lock -@param[in] locked true if trx and lock_sys_mutex is ownd +@param[in] locked true if trx and lock_sys.mutex is ownd @return false for regular lock timeout */ static bool @@ -202,7 +202,7 @@ wsrep_is_BF_lock_timeout( ut_ad(lock_mutex_own()); - wsrep_trx_print_locking(stderr, trx, 3000); + trx_print_latched(stderr, trx, 3000); if (!locked) { lock_mutex_exit(); @@ -369,25 +369,22 @@ lock_wait_suspend_thread( lock_wait_table_release_slot(slot); if (thr->lock_state == QUE_THR_LOCK_ROW) { - srv_stats.n_lock_wait_current_count.dec(); - const ulonglong finish_time = my_interval_timer(); - ulint diff_time; - if (finish_time < start_time) { - diff_time = 0; - } else { - diff_time = ulint((finish_time - start_time) / 1000); + if (finish_time >= start_time) { + const ulint diff_time = static_cast<ulint> + ((finish_time - start_time) / 1000); srv_stats.n_lock_wait_time.add(diff_time); /* Only update the variable if we successfully retrieved the start and finish times. See Bug#36819. 
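lock_wait_table_reserve_slot() and lock_wait_table_release_slot() above manage a fixed array of waiter slots and keep lock_sys.last_slot pointing just past the highest slot still in use, so later scans stop early. A simplified, single-threaded sketch of that bookkeeping, assuming no mutexes and a tiny array size:

#include <cassert>
#include <cstddef>

const std::size_t MAX_WAITERS = 8;       // stand-in for srv_max_n_threads

struct Slot { bool in_use = false; };

struct WaitTable {
    Slot slots[MAX_WAITERS];
    Slot* last_slot = slots;             // one past the used region

    Slot* reserve() {
        for (Slot* s = slots; s != slots + MAX_WAITERS; ++s) {
            if (!s->in_use) {
                s->in_use = true;
                if (s == last_slot)
                    ++last_slot;         // grow the scanned region
                return s;
            }
        }
        return nullptr;                  // every waiter slot is taken
    }

    void release(Slot* s) {
        s->in_use = false;
        // Scan backwards so last_slot again points one past the last
        // slot that is still in use (or to the start of the array).
        Slot* p = last_slot;
        while (p > slots && !p[-1].in_use)
            --p;
        last_slot = p;
    }
};

int main()
{
    WaitTable t;
    Slot* a = t.reserve();
    Slot* b = t.reserve();
    assert(t.last_slot == t.slots + 2);
    t.release(b);
    assert(t.last_slot == t.slots + 1);
    t.release(a);
    assert(t.last_slot == t.slots);
}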
*/ - if (diff_time > lock_sys->n_lock_max_wait_time) { - lock_sys->n_lock_max_wait_time = diff_time; + if (diff_time > lock_sys.n_lock_max_wait_time) { + lock_sys.n_lock_max_wait_time = diff_time; } + /* Record the lock wait time for this thread */ + thd_storage_lock_wait(trx->mysql_thd, diff_time); } - /* Record the lock wait time for this thread */ - thd_set_lock_wait_time(trx->mysql_thd, diff_time); + srv_stats.n_lock_wait_current_count.dec(); DBUG_EXECUTE_IF("lock_instrument_slow_query_log", os_thread_sleep(1000);); @@ -508,7 +505,7 @@ os_thread_ret_t DECLARE_THREAD(lock_wait_timeout_thread)(void*) { int64_t sig_count = 0; - os_event_t event = lock_sys->timeout_event; + os_event_t event = lock_sys.timeout_event; ut_ad(!srv_read_only_mode); @@ -534,8 +531,8 @@ DECLARE_THREAD(lock_wait_timeout_thread)(void*) /* Check all slots for user threads that are waiting on locks, and if they have exceeded the time limit. */ - for (slot = lock_sys->waiting_threads; - slot < lock_sys->last_slot; + for (slot = lock_sys.waiting_threads; + slot < lock_sys.last_slot; ++slot) { /* We are doing a read without the lock mutex @@ -554,7 +551,7 @@ DECLARE_THREAD(lock_wait_timeout_thread)(void*) } while (srv_shutdown_state < SRV_SHUTDOWN_CLEANUP); - lock_sys->timeout_thread_active = false; + lock_sys.timeout_thread_active = false; /* We count the number of threads in os_thread_exit(). A created thread should always use that to exit and not use return() to exit. */ diff --git a/storage/innobase/log/log0crypt.cc b/storage/innobase/log/log0crypt.cc index f1297921839..a2e4ac1dd40 100644 --- a/storage/innobase/log/log0crypt.cc +++ b/storage/innobase/log/log0crypt.cc @@ -24,6 +24,7 @@ Created 11/25/2013 Minli Zhu Google Modified Jan Lindström jan.lindstrom@mariadb.com MDEV-11782: Rewritten for MariaDB 10.2 by Marko Mäkelä, MariaDB Corporation. *******************************************************/ +#include <my_global.h> #include "m_string.h" #include "log0crypt.h" #include <mysql/service_my_crypt.h> @@ -390,9 +391,10 @@ log_tmp_block_encrypt( memcpy(iv + 1, tmp_iv, sizeof iv - sizeof *iv); int rc = encryption_crypt( - src, size, dst, &dst_len, - const_cast<byte*>(info.crypt_key.bytes), sizeof info.crypt_key, - reinterpret_cast<byte*>(iv), sizeof iv, + src, uint(size), dst, &dst_len, + const_cast<byte*>(info.crypt_key.bytes), + uint(sizeof info.crypt_key), + reinterpret_cast<byte*>(iv), uint(sizeof iv), encrypt ? ENCRYPTION_FLAG_ENCRYPT|ENCRYPTION_FLAG_NOPAD : ENCRYPTION_FLAG_DECRYPT|ENCRYPTION_FLAG_NOPAD, diff --git a/storage/innobase/log/log0log.cc b/storage/innobase/log/log0log.cc index 4c68f3743e9..997430497bc 100644 --- a/storage/innobase/log/log0log.cc +++ b/storage/innobase/log/log0log.cc @@ -69,7 +69,7 @@ c-function and its parameters are written to the log to reduce the size of the log. 3a) You should not add parameters to these kind of functions - (e.g. trx_undo_header_create(), trx_undo_insert_header_reuse()) + (e.g. trx_undo_header_create()) 3b) You should not add such functionality which either change working when compared with the old or are dependent on data @@ -81,7 +81,7 @@ reduce the size of the log. 
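The reworked timing code above publishes a lock wait time only when the monotonic finish timestamp is not earlier than the start, and converts the nanosecond difference from my_interval_timer() to microseconds before updating the statistics. The same guard with std::chrono::steady_clock, purely as an illustration of the arithmetic:

#include <chrono>
#include <cstdint>
#include <cstdio>
#include <thread>

int main()
{
    using clock = std::chrono::steady_clock;

    const clock::time_point start = clock::now();
    std::this_thread::sleep_for(std::chrono::milliseconds(2)); // "the wait"
    const clock::time_point finish = clock::now();

    if (finish >= start) {
        // Nanoseconds -> microseconds, as in (finish_time - start_time) / 1000.
        const std::uint64_t diff_us = static_cast<std::uint64_t>(
            std::chrono::duration_cast<std::chrono::nanoseconds>(
                finish - start).count() / 1000);
        std::printf("lock wait took %llu us\n",
                    static_cast<unsigned long long>(diff_us));
        // ...update the max-wait statistic and the per-thread wait time...
    }
}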
*/ /** Redo log system */ -log_t* log_sys = NULL; +log_t log_sys; /** Whether to require checksums on the redo log pages */ my_bool innodb_log_checksums; @@ -103,7 +103,8 @@ static time_t log_last_margine_warning_time; /* Margins for free space in the log buffer after a log entry is catenated */ #define LOG_BUF_FLUSH_RATIO 2 -#define LOG_BUF_FLUSH_MARGIN (LOG_BUF_WRITE_MARGIN + 4 * UNIV_PAGE_SIZE) +#define LOG_BUF_FLUSH_MARGIN (LOG_BUF_WRITE_MARGIN \ + + (4U << srv_page_size_shift)) /* This parameter controls asynchronous making of a new checkpoint; the value should be bigger than LOG_POOL_PREFLUSH_RATIO_SYNC */ @@ -130,15 +131,8 @@ extern "C" UNIV_INTERN os_thread_ret_t DECLARE_THREAD(log_scrub_thread)(void*); -/******************************************************//** -Completes a checkpoint write i/o to a log file. */ -static -void -log_io_complete_checkpoint(void); -/*============================*/ - /****************************************************************//** -Returns the oldest modified block lsn in the pool, or log_sys->lsn if none +Returns the oldest modified block lsn in the pool, or log_sys.lsn if none exists. @return LSN of oldest modification */ static @@ -154,7 +148,7 @@ log_buf_pool_get_oldest_modification(void) if (!lsn) { - lsn = log_sys->lsn; + lsn = log_sys.lsn; } return(lsn); @@ -162,56 +156,48 @@ log_buf_pool_get_oldest_modification(void) /** Extends the log buffer. @param[in] len requested minimum size in bytes */ -void -log_buffer_extend( - ulint len) +void log_buffer_extend(ulong len) { - const ulint new_log_buffer_size = (len >> srv_page_size_shift) + 1; - const ulint new_buf_size = (new_log_buffer_size - << (srv_page_size_shift + 1)) - + OS_FILE_LOG_BLOCK_SIZE; - byte* new_buf_ptr = static_cast<byte*>(ut_malloc_nokey(new_buf_size)); + const ulong new_buf_size = ut_calc_align(len, srv_page_size); + byte* new_buf = static_cast<byte*>(ut_malloc_dontdump(new_buf_size)); + TRASH_ALLOC(new_buf, new_buf_size); + byte* new_flush_buf = + static_cast<byte*>(ut_malloc_dontdump(new_buf_size)); + TRASH_ALLOC(new_flush_buf, new_buf_size); log_mutex_enter(); - const ulint size = srv_log_buffer_size << srv_page_size_shift; - - if (len <= size) { + if (len <= srv_log_buffer_size) { /* Already extended enough by the others */ log_mutex_exit(); - ut_free(new_buf_ptr); + ut_free_dodump(new_buf, new_buf_size); + ut_free_dodump(new_flush_buf, new_buf_size); return; } - ib::warn() << "The transaction log size is too large" - " for innodb_log_buffer_size (" << len - << " >= " << size << " / 2). Trying to extend it."; - - byte* old_buf_ptr = log_sys->buf_ptr; - const byte* begin = log_sys->buf; - const byte* end = begin + log_sys->buf_free; + ib::warn() << "The redo log transaction size " << len << + " exceeds innodb_log_buffer_size=" + << srv_log_buffer_size << " / 2). 
Trying to extend it."; - log_sys->buf_ptr = new_buf_ptr; - srv_log_buffer_size = new_log_buffer_size; - log_sys->buf_size = size; - log_sys->buf - = static_cast<byte*>(ut_align(new_buf_ptr, OS_FILE_LOG_BLOCK_SIZE)); + byte* old_buf = log_sys.buf; + byte* old_flush_buf = log_sys.flush_buf; + const ulong old_buf_size = srv_log_buffer_size; - if (!log_sys->first_in_use) { - log_sys->buf += size; - } - - memcpy(log_sys->buf, begin, end - begin); + srv_log_buffer_size = new_buf_size; + log_sys.buf = new_buf; + log_sys.flush_buf = new_flush_buf; + memcpy(new_buf, old_buf, log_sys.buf_free); - log_sys->max_buf_free = size / LOG_BUF_FLUSH_RATIO + log_sys.max_buf_free = new_buf_size / LOG_BUF_FLUSH_RATIO - LOG_BUF_FLUSH_MARGIN; log_mutex_exit(); - ut_free(old_buf_ptr); + ut_free_dodump(old_buf, old_buf_size); + ut_free_dodump(old_flush_buf, old_buf_size); ib::info() << "innodb_log_buffer_size was extended to " - << size << "."; + << new_buf_size << "."; } /** Calculate actual length in redo buffer and file including @@ -230,7 +216,7 @@ log_calculate_actual_len( - (LOG_BLOCK_HDR_SIZE + LOG_BLOCK_TRL_SIZE); /* actual data length in last block already written */ - ulint extra_len = (log_sys->buf_free % OS_FILE_LOG_BLOCK_SIZE); + ulint extra_len = (log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE); ut_ad(extra_len >= LOG_BLOCK_HDR_SIZE); extra_len -= LOG_BLOCK_HDR_SIZE; @@ -255,7 +241,7 @@ log_margin_checkpoint_age( ut_ad(log_mutex_own()); - if (margin > log_sys->log_group_capacity) { + if (margin > log_sys.log_group_capacity) { /* return with warning output to avoid deadlock */ if (!log_has_printed_chkp_margine_warning || difftime(time(NULL), @@ -267,7 +253,7 @@ log_margin_checkpoint_age( " small for the single transaction log (size=" << len << "). So, the last checkpoint age" " might exceed the log group capacity " - << log_sys->log_group_capacity << "."; + << log_sys.log_group_capacity << "."; } return; @@ -276,20 +262,20 @@ log_margin_checkpoint_age( /* Our margin check should ensure that we never reach this condition. Try to do checkpoint once. We cannot keep waiting here as it might result in hang in case the current mtr has latch on oldest lsn */ - if (log_sys->lsn - log_sys->last_checkpoint_lsn + margin - > log_sys->log_group_capacity) { + if (log_sys.lsn - log_sys.last_checkpoint_lsn + margin + > log_sys.log_group_capacity) { /* The log write of 'len' might overwrite the transaction log after the last checkpoint. Makes checkpoint. 
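log_buffer_extend() above allocates the replacement buffers before taking log_sys.mutex, re-checks the requested size under the mutex in case a concurrent caller already extended the buffer, and frees the superseded memory only after the mutex is released. A condensed model of that resize protocol, assuming a single buffer and plain new/delete instead of ut_malloc_dontdump():

#include <cstddef>
#include <cstring>
#include <mutex>

struct LogBuffer {
    std::mutex mutex;                   // stand-in for log_sys.mutex
    char* buf = nullptr;
    std::size_t size = 0;
    std::size_t used = 0;               // bytes not yet written to disk
};

static void log_buffer_extend(LogBuffer& log, std::size_t len)
{
    char* new_buf = new char[len];      // allocate outside the mutex

    log.mutex.lock();
    if (len <= log.size) {              // someone else already extended
        log.mutex.unlock();
        delete[] new_buf;
        return;
    }
    std::memcpy(new_buf, log.buf, log.used);   // carry over pending bytes
    char* old_buf = log.buf;
    log.buf = new_buf;
    log.size = len;
    log.mutex.unlock();

    delete[] old_buf;                   // free outside the mutex
}

int main()
{
    LogBuffer log;
    log.buf = new char[16];
    log.size = 16;
    log.used = 4;
    std::memcpy(log.buf, "mtr1", 4);

    log_buffer_extend(log, 64);
    delete[] log.buf;
}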
*/ bool flushed_enough = false; - if (log_sys->lsn - log_buf_pool_get_oldest_modification() + if (log_sys.lsn - log_buf_pool_get_oldest_modification() + margin - <= log_sys->log_group_capacity) { + <= log_sys.log_group_capacity) { flushed_enough = true; } - log_sys->check_flush_or_checkpoint = true; + log_sys.check_flush_or_checkpoint = true; log_mutex_exit(); DEBUG_SYNC_C("margin_checkpoint_age_rescue"); @@ -326,7 +312,7 @@ loop: len_upper_limit = LOG_BUF_WRITE_MARGIN + srv_log_write_ahead_size + (5 * len) / 4; - if (log_sys->buf_free + len_upper_limit > log_sys->buf_size) { + if (log_sys.buf_free + len_upper_limit > srv_log_buffer_size) { log_mutex_exit(); DEBUG_SYNC_C("log_buf_size_exceeded"); @@ -342,7 +328,7 @@ loop: goto loop; } - return(log_sys->lsn); + return(log_sys.lsn); } /************************************************************//** @@ -354,7 +340,6 @@ log_write_low( const byte* str, /*!< in: string */ ulint str_len) /*!< in: string length */ { - log_t* log = log_sys; ulint len; ulint data_len; byte* log_block; @@ -363,7 +348,7 @@ log_write_low( part_loop: /* Calculate a part length */ - data_len = (log->buf_free % OS_FILE_LOG_BLOCK_SIZE) + str_len; + data_len = (log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE) + str_len; if (data_len <= OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE) { @@ -374,18 +359,18 @@ part_loop: data_len = OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE; len = OS_FILE_LOG_BLOCK_SIZE - - (log->buf_free % OS_FILE_LOG_BLOCK_SIZE) + - (log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE) - LOG_BLOCK_TRL_SIZE; } - ut_memcpy(log->buf + log->buf_free, str, len); + memcpy(log_sys.buf + log_sys.buf_free, str, len); str_len -= len; str = str + len; log_block = static_cast<byte*>( - ut_align_down( - log->buf + log->buf_free, OS_FILE_LOG_BLOCK_SIZE)); + ut_align_down(log_sys.buf + log_sys.buf_free, + OS_FILE_LOG_BLOCK_SIZE)); log_block_set_data_len(log_block, data_len); @@ -393,20 +378,21 @@ part_loop: /* This block became full */ log_block_set_data_len(log_block, OS_FILE_LOG_BLOCK_SIZE); log_block_set_checkpoint_no(log_block, - log_sys->next_checkpoint_no); + log_sys.next_checkpoint_no); len += LOG_BLOCK_HDR_SIZE + LOG_BLOCK_TRL_SIZE; - log->lsn += len; + log_sys.lsn += len; /* Initialize the next block header */ - log_block_init(log_block + OS_FILE_LOG_BLOCK_SIZE, log->lsn); + log_block_init(log_block + OS_FILE_LOG_BLOCK_SIZE, + log_sys.lsn); } else { - log->lsn += len; + log_sys.lsn += len; } - log->buf_free += len; + log_sys.buf_free += ulong(len); - ut_ad(log->buf_free <= log->buf_size); + ut_ad(log_sys.buf_free <= srv_log_buffer_size); if (str_len > 0) { goto part_loop; @@ -426,16 +412,15 @@ log_close(void) ulint first_rec_group; lsn_t oldest_lsn; lsn_t lsn; - log_t* log = log_sys; lsn_t checkpoint_age; ut_ad(log_mutex_own()); - lsn = log->lsn; + lsn = log_sys.lsn; log_block = static_cast<byte*>( - ut_align_down( - log->buf + log->buf_free, OS_FILE_LOG_BLOCK_SIZE)); + ut_align_down(log_sys.buf + log_sys.buf_free, + OS_FILE_LOG_BLOCK_SIZE)); first_rec_group = log_block_get_first_rec_group(log_block); @@ -448,14 +433,13 @@ log_close(void) log_block, log_block_get_data_len(log_block)); } - if (log->buf_free > log->max_buf_free) { - - log->check_flush_or_checkpoint = true; + if (log_sys.buf_free > log_sys.max_buf_free) { + log_sys.check_flush_or_checkpoint = true; } - checkpoint_age = lsn - log->last_checkpoint_lsn; + checkpoint_age = lsn - log_sys.last_checkpoint_lsn; - if (checkpoint_age >= log->log_group_capacity) { + if (checkpoint_age >= log_sys.log_group_capacity) { 
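log_write_low() above copies a record into the log buffer in pieces, keeping the 12-byte header and 4-byte trailer of every 512-byte block reserved and re-entering part_loop until the whole string is consumed. A standalone approximation of that splitting; the lsn bookkeeping and the initialization of the next block header are left out:

#include <cassert>
#include <cstddef>
#include <cstring>
#include <vector>

// Block geometry matching InnoDB's 512-byte redo log blocks.
const std::size_t BLOCK_SIZE = 512;
const std::size_t HDR_SIZE = 12;
const std::size_t TRL_SIZE = 4;
const std::size_t PAYLOAD = BLOCK_SIZE - HDR_SIZE - TRL_SIZE;

// Append a record, splitting it at block payload boundaries the way
// log_write_low()'s part_loop does.
static void write_low(std::vector<char>& buf, std::size_t& buf_free,
                      const char* str, std::size_t len)
{
    while (len > 0) {
        std::size_t in_block = buf_free % BLOCK_SIZE;
        assert(in_block >= HDR_SIZE);
        std::size_t room = BLOCK_SIZE - TRL_SIZE - in_block;
        std::size_t part = len <= room ? len : room;

        std::memcpy(&buf[buf_free], str, part);
        buf_free += part;
        str += part;
        len -= part;

        if (part == room && len > 0) {
            // The block is full: skip its trailer and the header of
            // the next block before writing the remainder.
            buf_free += TRL_SIZE + HDR_SIZE;
        }
    }
}

int main()
{
    std::vector<char> buf(4 * BLOCK_SIZE);
    std::size_t buf_free = HDR_SIZE;                // after 1st block header
    std::vector<char> rec(PAYLOAD + 100, 'x');      // spills into block 2
    write_low(buf, buf_free, rec.data(), rec.size());
    assert(buf_free == BLOCK_SIZE + HDR_SIZE + 100);
}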
DBUG_EXECUTE_IF( "print_all_chkp_warnings", log_has_printed_chkp_warning = false;); @@ -468,131 +452,28 @@ log_close(void) ib::error() << "The age of the last checkpoint is " << checkpoint_age << ", which exceeds the log" - " group capacity " << log->log_group_capacity + " group capacity " + << log_sys.log_group_capacity << "."; } } - if (checkpoint_age <= log->max_modified_age_sync) { - + if (checkpoint_age <= log_sys.max_modified_age_sync) { goto function_exit; } oldest_lsn = buf_pool_get_oldest_modification(); if (!oldest_lsn - || lsn - oldest_lsn > log->max_modified_age_sync - || checkpoint_age > log->max_checkpoint_age_async) { - - log->check_flush_or_checkpoint = true; + || lsn - oldest_lsn > log_sys.max_modified_age_sync + || checkpoint_age > log_sys.max_checkpoint_age_async) { + log_sys.check_flush_or_checkpoint = true; } function_exit: return(lsn); } -/******************************************************//** -Calculates the offset within a log group, when the log file headers are not -included. -@return size offset (<= offset) */ -UNIV_INLINE -lsn_t -log_group_calc_size_offset( -/*=======================*/ - lsn_t offset, /*!< in: real offset within the - log group */ - const log_group_t* group) /*!< in: log group */ -{ - /* The lsn parameters are updated while holding both the mutexes - and it is ok to have either of them while reading */ - ut_ad(log_mutex_own() || log_write_mutex_own()); - - return(offset - LOG_FILE_HDR_SIZE * (1 + offset / group->file_size)); -} - -/******************************************************//** -Calculates the offset within a log group, when the log file headers are -included. -@return real offset (>= offset) */ -UNIV_INLINE -lsn_t -log_group_calc_real_offset( -/*=======================*/ - lsn_t offset, /*!< in: size offset within the - log group */ - const log_group_t* group) /*!< in: log group */ -{ - /* The lsn parameters are updated while holding both the mutexes - and it is ok to have either of them while reading */ - ut_ad(log_mutex_own() || log_write_mutex_own()); - - return(offset + LOG_FILE_HDR_SIZE - * (1 + offset / (group->file_size - LOG_FILE_HDR_SIZE))); -} - -/** Calculate the offset of an lsn within a log group. -@param[in] lsn log sequence number -@param[in] group log group -@return offset within the log group */ -lsn_t -log_group_calc_lsn_offset( - lsn_t lsn, - const log_group_t* group) -{ - lsn_t gr_lsn; - lsn_t gr_lsn_size_offset; - lsn_t difference; - lsn_t group_size; - lsn_t offset; - - /* The lsn parameters are updated while holding both the mutexes - and it is ok to have either of them while reading */ - ut_ad(log_mutex_own() || log_write_mutex_own()); - - gr_lsn = group->lsn; - - gr_lsn_size_offset = log_group_calc_size_offset( - group->lsn_offset, group); - - group_size = group->capacity(); - - if (lsn >= gr_lsn) { - - difference = lsn - gr_lsn; - } else { - difference = gr_lsn - lsn; - - difference = difference % group_size; - - difference = group_size - difference; - } - - offset = (gr_lsn_size_offset + difference) % group_size; - - /* fprintf(stderr, - "Offset is " LSN_PF " gr_lsn_offset is " LSN_PF - " difference is " LSN_PF "\n", - offset, gr_lsn_size_offset, difference); - */ - - return(log_group_calc_real_offset(offset, group)); -} - -/********************************************************//** -Sets the field values in group to correspond to a given lsn. For this function -to work, the values must already be correctly initialized to correspond to -some lsn, for instance, a checkpoint lsn. 
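log_close() above decides whether background flushing or a checkpoint must be signalled by comparing the checkpoint age and the age of the oldest dirty page against the precomputed max_modified_age_sync and max_checkpoint_age_async limits. The decision extracted into one predicate; the limit values used in main() are arbitrary test numbers, not InnoDB's ratios:

#include <cstdio>

typedef unsigned long long lsn_type;    // stand-in for lsn_t

struct LogLimits {
    lsn_type max_modified_age_sync;
    lsn_type max_checkpoint_age_async;
};

// Mirrors the tail of log_close(): below the sync limit nothing needs
// to happen; otherwise flushing/checkpointing is requested when the
// oldest modification is unknown or too old, or when the checkpoint
// age passed the asynchronous limit.
static bool need_flush_or_checkpoint(lsn_type lsn,
                                     lsn_type oldest_modified_lsn,
                                     lsn_type last_checkpoint_lsn,
                                     const LogLimits& limits)
{
    const lsn_type checkpoint_age = lsn - last_checkpoint_lsn;
    if (checkpoint_age <= limits.max_modified_age_sync)
        return false;
    return oldest_modified_lsn == 0
        || lsn - oldest_modified_lsn > limits.max_modified_age_sync
        || checkpoint_age > limits.max_checkpoint_age_async;
}

int main()
{
    LogLimits limits{1000, 1500};
    std::printf("%d\n", need_flush_or_checkpoint(2000, 1900, 1800, limits)); // 0
    std::printf("%d\n", need_flush_or_checkpoint(3000, 1200, 1000, limits)); // 1
}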
*/ -void -log_group_set_fields( -/*=================*/ - log_group_t* group, /*!< in/out: group */ - lsn_t lsn) /*!< in: lsn for which the values should be - set */ -{ - group->lsn_offset = log_group_calc_lsn_offset(lsn, group); - group->lsn = lsn; -} - /** Calculate the recommended highest values for lsn - last_checkpoint_lsn and lsn - buf_get_oldest_modification(). @param[in] file_size requested innodb_log_file_size @@ -632,168 +513,112 @@ log_set_capacity(ulonglong file_size) log_mutex_enter(); - log_sys->log_group_capacity = smallest_capacity; + log_sys.log_group_capacity = smallest_capacity; - log_sys->max_modified_age_async = margin + log_sys.max_modified_age_async = margin - margin / LOG_POOL_PREFLUSH_RATIO_ASYNC; - log_sys->max_modified_age_sync = margin + log_sys.max_modified_age_sync = margin - margin / LOG_POOL_PREFLUSH_RATIO_SYNC; - log_sys->max_checkpoint_age_async = margin - margin + log_sys.max_checkpoint_age_async = margin - margin / LOG_POOL_CHECKPOINT_RATIO_ASYNC; - log_sys->max_checkpoint_age = margin; + log_sys.max_checkpoint_age = margin; log_mutex_exit(); return(true); } -/** Initializes the redo logging subsystem. */ -void -log_sys_init() +/** Initialize the redo log subsystem. */ +void log_t::create() { - log_sys = static_cast<log_t*>(ut_zalloc_nokey(sizeof(log_t))); - - mutex_create(LATCH_ID_LOG_SYS, &log_sys->mutex); - mutex_create(LATCH_ID_LOG_WRITE, &log_sys->write_mutex); - - mutex_create(LATCH_ID_LOG_FLUSH_ORDER, &log_sys->log_flush_order_mutex); - - /* Start the lsn from one log block from zero: this way every - log record has a start lsn != zero, a fact which we will use */ - - log_sys->lsn = LOG_START_LSN; - - ut_ad(srv_log_buffer_size >= 4); - - log_sys->buf_size = srv_log_buffer_size << srv_page_size_shift; - - log_sys->buf_ptr = static_cast<byte*>( - ut_zalloc_nokey(log_sys->buf_size * 2 + OS_FILE_LOG_BLOCK_SIZE)); - TRASH_ALLOC(log_sys->buf_ptr, - log_sys->buf_size * 2 + OS_FILE_LOG_BLOCK_SIZE); - log_sys->buf = static_cast<byte*>( - ut_align(log_sys->buf_ptr, OS_FILE_LOG_BLOCK_SIZE)); - - log_sys->first_in_use = true; - - log_sys->max_buf_free = log_sys->buf_size / LOG_BUF_FLUSH_RATIO - - LOG_BUF_FLUSH_MARGIN; - log_sys->check_flush_or_checkpoint = true; - - log_sys->n_log_ios_old = log_sys->n_log_ios; - log_sys->last_printout_time = time(NULL); - /*----------------------------*/ - - log_sys->write_lsn = log_sys->lsn; - - log_sys->flush_event = os_event_create(0); - - os_event_set(log_sys->flush_event); - - /*----------------------------*/ - - log_sys->last_checkpoint_lsn = log_sys->lsn; - - rw_lock_create( - checkpoint_lock_key, &log_sys->checkpoint_lock, - SYNC_NO_ORDER_CHECK); - - log_sys->checkpoint_buf_ptr = static_cast<byte*>( - ut_zalloc_nokey(2 * OS_FILE_LOG_BLOCK_SIZE)); - - log_sys->checkpoint_buf = static_cast<byte*>( - ut_align(log_sys->checkpoint_buf_ptr, OS_FILE_LOG_BLOCK_SIZE)); - - /*----------------------------*/ - - log_block_init(log_sys->buf, log_sys->lsn); - log_block_set_first_rec_group(log_sys->buf, LOG_BLOCK_HDR_SIZE); - - log_sys->buf_free = LOG_BLOCK_HDR_SIZE; - log_sys->lsn = LOG_START_LSN + LOG_BLOCK_HDR_SIZE; // TODO(minliz): ensure various LOG_START_LSN? 
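The deleted helpers above translate between a byte offset that counts the per-file LOG_FILE_HDR_SIZE headers of the log group and one that counts payload bytes only. The two mappings in isolation, assuming example constants for the header and file size:

#include <cassert>
#include <cstdint>

const std::uint64_t HDR = 2048;            // example per-file header size
const std::uint64_t FILE_SIZE = 1 << 20;   // example file size, headers included

// Strip the headers of every file that precedes (or contains) a real
// byte offset within the log group (log_group_calc_size_offset).
static std::uint64_t size_offset(std::uint64_t real)
{
    return real - HDR * (1 + real / FILE_SIZE);
}

// Inverse mapping: add the headers back to a payload-only offset
// (log_group_calc_real_offset).
static std::uint64_t real_offset(std::uint64_t size)
{
    return size + HDR * (1 + size / (FILE_SIZE - HDR));
}

int main()
{
    // Offsets that land inside a file's payload area round-trip exactly.
    const std::uint64_t samples[] = {HDR, HDR + 4096, FILE_SIZE + HDR + 512};
    for (std::uint64_t real : samples)
        assert(real_offset(size_offset(real)) == real);
}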
- - MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE, - log_sys->lsn - log_sys->last_checkpoint_lsn); - - log_scrub_thread_active = !srv_read_only_mode && srv_scrub_log; - if (log_scrub_thread_active) { - log_scrub_event = os_event_create("log_scrub_event"); - os_thread_create(log_scrub_thread, NULL, NULL); - } + ut_ad(this == &log_sys); + ut_ad(!is_initialised()); + m_initialised= true; + + mutex_create(LATCH_ID_LOG_SYS, &mutex); + mutex_create(LATCH_ID_LOG_WRITE, &write_mutex); + mutex_create(LATCH_ID_LOG_FLUSH_ORDER, &log_flush_order_mutex); + + /* Start the lsn from one log block from zero: this way every + log record has a non-zero start lsn, a fact which we will use */ + + lsn= LOG_START_LSN; + + ut_ad(srv_log_buffer_size >= 16 * OS_FILE_LOG_BLOCK_SIZE); + ut_ad(srv_log_buffer_size >= 4U << srv_page_size_shift); + + buf= static_cast<byte*>(ut_malloc_dontdump(srv_log_buffer_size)); + TRASH_ALLOC(buf, srv_log_buffer_size); + flush_buf= static_cast<byte*>(ut_malloc_dontdump(srv_log_buffer_size)); + TRASH_ALLOC(flush_buf, srv_log_buffer_size); + + max_buf_free= srv_log_buffer_size / LOG_BUF_FLUSH_RATIO - + LOG_BUF_FLUSH_MARGIN; + check_flush_or_checkpoint= true; + + n_log_ios_old= n_log_ios; + last_printout_time= time(NULL); + + buf_next_to_write= 0; + write_lsn= lsn; + flushed_to_disk_lsn= 0; + n_pending_flushes= 0; + flush_event = os_event_create("log_flush_event"); + os_event_set(flush_event); + n_log_ios= 0; + n_log_ios_old= 0; + log_group_capacity= 0; + max_modified_age_async= 0; + max_modified_age_sync= 0; + max_checkpoint_age_async= 0; + max_checkpoint_age= 0; + next_checkpoint_no= 0; + next_checkpoint_lsn= 0; + append_on_checkpoint= NULL; + n_pending_checkpoint_writes= 0; + + last_checkpoint_lsn= lsn; + rw_lock_create(checkpoint_lock_key, &checkpoint_lock, SYNC_NO_ORDER_CHECK); + + log_block_init(buf, lsn); + log_block_set_first_rec_group(buf, LOG_BLOCK_HDR_SIZE); + + buf_free= LOG_BLOCK_HDR_SIZE; + lsn= LOG_START_LSN + LOG_BLOCK_HDR_SIZE; + + MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE, lsn - last_checkpoint_lsn); + + log_scrub_thread_active= !srv_read_only_mode && srv_scrub_log; + if (log_scrub_thread_active) { + log_scrub_event= os_event_create("log_scrub_event"); + os_thread_create(log_scrub_thread, NULL, NULL); + } } /** Initialize the redo log. @param[in] n_files number of files */ -void -log_init(ulint n_files) -{ - log_group_t* group = &log_sys->log; - - group->n_files = n_files; - group->subformat = srv_safe_truncate; - if (srv_safe_truncate) { - group->format = srv_encrypt_log - ? LOG_HEADER_FORMAT_10_3 | LOG_HEADER_FORMAT_ENCRYPTED - : LOG_HEADER_FORMAT_10_3; - } else { - group->format = srv_encrypt_log - ? LOG_HEADER_FORMAT_10_2 | LOG_HEADER_FORMAT_ENCRYPTED - : LOG_HEADER_FORMAT_10_2; - } - group->file_size = srv_log_file_size; - group->state = LOG_GROUP_OK; - group->lsn = LOG_START_LSN; - group->lsn_offset = LOG_FILE_HDR_SIZE; - - group->checkpoint_buf_ptr = static_cast<byte*>( - ut_zalloc_nokey(2 * OS_FILE_LOG_BLOCK_SIZE)); - - group->checkpoint_buf = static_cast<byte*>( - ut_align(group->checkpoint_buf_ptr,OS_FILE_LOG_BLOCK_SIZE)); -} - -/******************************************************//** -Completes an i/o to a log file. 
*/ -void -log_io_complete( -/*============*/ - log_group_t* group) /*!< in: log group or a dummy pointer */ +void log_t::files::create(ulint n_files) { - if ((ulint) group & 0x1UL) { - /* It was a checkpoint write */ - group = (log_group_t*)((ulint) group - 1); - - switch (srv_file_flush_method) { - case SRV_O_DSYNC: - case SRV_NOSYNC: - break; - case SRV_FSYNC: - case SRV_LITTLESYNC: - case SRV_O_DIRECT: - case SRV_O_DIRECT_NO_FSYNC: - case SRV_ALL_O_DIRECT_FSYNC: - fil_flush(SRV_LOG_SPACE_FIRST_ID); - } - - - DBUG_PRINT("ib_log", ("checkpoint info written")); - log_io_complete_checkpoint(); - - return; - } - - ut_error; /*!< We currently use synchronous writing of the - logs and cannot end up here! */ + ut_ad(n_files <= SRV_N_LOG_FILES_MAX); + ut_ad(this == &log_sys.log); + ut_ad(log_sys.is_initialised()); + + this->n_files= n_files; + format= srv_encrypt_log + ? LOG_HEADER_FORMAT_CURRENT | LOG_HEADER_FORMAT_ENCRYPTED + : LOG_HEADER_FORMAT_CURRENT; + subformat= 2; + file_size= srv_log_file_size; + lsn= LOG_START_LSN; + lsn_offset= LOG_FILE_HDR_SIZE; } /******************************************************//** Writes a log file header to a log file space. */ static void -log_group_file_header_flush( -/*========================*/ - log_group_t* group, /*!< in: log group */ +log_file_header_flush( ulint nth_file, /*!< in: header to the nth file in the log file space */ lsn_t start_lsn) /*!< in: log file data starts at this @@ -803,18 +628,16 @@ log_group_file_header_flush( ut_ad(log_write_mutex_own()); ut_ad(!recv_no_log_write); - ut_a(nth_file < group->n_files); - ut_ad((group->format & ~LOG_HEADER_FORMAT_ENCRYPTED) - == (srv_safe_truncate - ? LOG_HEADER_FORMAT_10_3 - : LOG_HEADER_FORMAT_10_2)); + ut_a(nth_file < log_sys.log.n_files); + ut_ad((log_sys.log.format & ~LOG_HEADER_FORMAT_ENCRYPTED) + == LOG_HEADER_FORMAT_CURRENT); // man 2 open suggests this buffer to be aligned by 512 for O_DIRECT MY_ALIGNED(OS_FILE_LOG_BLOCK_SIZE) byte buf[OS_FILE_LOG_BLOCK_SIZE] = {0}; - mach_write_to_4(buf + LOG_HEADER_FORMAT, group->format); - mach_write_to_4(buf + LOG_HEADER_SUBFORMAT, srv_safe_truncate); + mach_write_to_4(buf + LOG_HEADER_FORMAT, log_sys.log.format); + mach_write_to_4(buf + LOG_HEADER_SUBFORMAT, log_sys.log.subformat); mach_write_to_8(buf + LOG_HEADER_START_LSN, start_lsn); strcpy(reinterpret_cast<char*>(buf) + LOG_HEADER_CREATOR, LOG_HEADER_CREATOR_CURRENT); @@ -822,24 +645,23 @@ log_group_file_header_flush( >= sizeof LOG_HEADER_CREATOR_CURRENT); log_block_set_checksum(buf, log_block_calc_checksum_crc32(buf)); - dest_offset = nth_file * group->file_size; + dest_offset = nth_file * log_sys.log.file_size; DBUG_PRINT("ib_log", ("write " LSN_PF " file " ULINTPF " header", start_lsn, nth_file)); - log_sys->n_log_ios++; + log_sys.n_log_ios++; srv_stats.os_log_pending_writes.inc(); - const ulint page_no - = (ulint) (dest_offset / univ_page_size.physical()); + const ulint page_no = ulint(dest_offset >> srv_page_size_shift); fil_io(IORequestLogWrite, true, page_id_t(SRV_LOG_SPACE_FIRST_ID, page_no), univ_page_size, - (ulint) (dest_offset % univ_page_size.physical()), - OS_FILE_LOG_BLOCK_SIZE, buf, group); + ulint(dest_offset & (srv_page_size - 1)), + OS_FILE_LOG_BLOCK_SIZE, buf, NULL); srv_stats.os_log_pending_writes.dec(); } @@ -858,12 +680,10 @@ log_block_store_checksum( } /******************************************************//** -Writes a buffer to a log file group. */ +Writes a buffer to a log file. 
*/ static void -log_group_write_buf( -/*================*/ - log_group_t* group, /*!< in: log group */ +log_write_buf( byte* buf, /*!< in: buffer */ ulint len, /*!< in: buffer len; must be divisible by OS_FILE_LOG_BLOCK_SIZE */ @@ -894,28 +714,27 @@ loop: return; } - next_offset = log_group_calc_lsn_offset(start_lsn, group); + next_offset = log_sys.log.calc_lsn_offset(start_lsn); if (write_header - && next_offset % group->file_size == LOG_FILE_HDR_SIZE) { + && next_offset % log_sys.log.file_size == LOG_FILE_HDR_SIZE) { /* We start to write a new log file instance in the group */ - ut_a(next_offset / group->file_size <= ULINT_MAX); + ut_a(next_offset / log_sys.log.file_size <= ULINT_MAX); - log_group_file_header_flush(group, (ulint) - (next_offset / group->file_size), - start_lsn); + log_file_header_flush( + ulint(next_offset / log_sys.log.file_size), start_lsn); srv_stats.os_log_written.add(OS_FILE_LOG_BLOCK_SIZE); srv_stats.log_writes.inc(); } - if ((next_offset % group->file_size) + len > group->file_size) { - + if ((next_offset % log_sys.log.file_size) + len + > log_sys.log.file_size) { /* if the above condition holds, then the below expression is < len which is ulint, so the typecast is ok */ - write_len = (ulint) - (group->file_size - (next_offset % group->file_size)); + write_len = ulint(log_sys.log.file_size + - (next_offset % log_sys.log.file_size)); } else { write_len = len; } @@ -950,20 +769,18 @@ loop: log_block_store_checksum(buf + i * OS_FILE_LOG_BLOCK_SIZE); } - log_sys->n_log_ios++; + log_sys.n_log_ios++; srv_stats.os_log_pending_writes.inc(); - ut_a(next_offset / UNIV_PAGE_SIZE <= ULINT_MAX); + ut_a((next_offset >> srv_page_size_shift) <= ULINT_MAX); - const ulint page_no - = (ulint) (next_offset / univ_page_size.physical()); + const ulint page_no = ulint(next_offset >> srv_page_size_shift); fil_io(IORequestLogWrite, true, page_id_t(SRV_LOG_SPACE_FIRST_ID, page_no), univ_page_size, - (ulint) (next_offset % UNIV_PAGE_SIZE), write_len, buf, - group); + ulint(next_offset & (srv_page_size - 1)), write_len, buf, NULL); srv_stats.os_log_pending_writes.dec(); @@ -987,9 +804,9 @@ static void log_write_flush_to_disk_low() { - /* FIXME: This is not holding log_sys->mutex while + /* FIXME: This is not holding log_sys.mutex while calling os_event_set()! */ - ut_a(log_sys->n_pending_flushes == 1); /* No other threads here */ + ut_a(log_sys.n_pending_flushes == 1); /* No other threads here */ bool do_flush = srv_file_flush_method != SRV_O_DSYNC; @@ -1000,16 +817,16 @@ log_write_flush_to_disk_low() log_mutex_enter(); if (do_flush) { - log_sys->flushed_to_disk_lsn = log_sys->current_flush_lsn; + log_sys.flushed_to_disk_lsn = log_sys.current_flush_lsn; } - log_sys->n_pending_flushes--; + log_sys.n_pending_flushes--; - os_event_set(log_sys->flush_event); + os_event_set(log_sys.flush_event); } -/** Switch the log buffer in use, and copy the content of last block -from old log buffer to the head of the to be used one. Thus, buf_free and +/** Swap log buffers, and copy the content of last block +from old buf to the head of the new buf. 
Thus, buf_free and buf_next_to_write would be changed accordingly */ static inline void @@ -1018,29 +835,18 @@ log_buffer_switch() ut_ad(log_mutex_own()); ut_ad(log_write_mutex_own()); - const byte* old_buf = log_sys->buf; - ulint area_end = ut_calc_align( - log_sys->buf_free, ulint(OS_FILE_LOG_BLOCK_SIZE)); - - if (log_sys->first_in_use) { - log_sys->first_in_use = false; - ut_ad(log_sys->buf == ut_align(log_sys->buf_ptr, - OS_FILE_LOG_BLOCK_SIZE)); - log_sys->buf += log_sys->buf_size; - } else { - log_sys->first_in_use = true; - log_sys->buf -= log_sys->buf_size; - ut_ad(log_sys->buf == ut_align(log_sys->buf_ptr, - OS_FILE_LOG_BLOCK_SIZE)); - } + ulong area_end = ut_calc_align( + log_sys.buf_free, ulong(OS_FILE_LOG_BLOCK_SIZE)); /* Copy the last block to new buf */ - ut_memcpy(log_sys->buf, - old_buf + area_end - OS_FILE_LOG_BLOCK_SIZE, + ut_memcpy(log_sys.flush_buf, + log_sys.buf + area_end - OS_FILE_LOG_BLOCK_SIZE, OS_FILE_LOG_BLOCK_SIZE); - log_sys->buf_free %= OS_FILE_LOG_BLOCK_SIZE; - log_sys->buf_next_to_write = log_sys->buf_free; + std::swap(log_sys.buf, log_sys.flush_buf); + + log_sys.buf_free %= OS_FILE_LOG_BLOCK_SIZE; + log_sys.buf_next_to_write = log_sys.buf_free; } /** Ensure that the log has been written to the log file up to a given @@ -1079,7 +885,7 @@ loop: (flush_to_disk == true) case, because the log_mutex contention also works as the arbitrator for write-IO (fsync) bandwidth between log files and data files. */ - if (!flush_to_disk && log_sys->write_lsn >= lsn) { + if (!flush_to_disk && log_sys.write_lsn >= lsn) { return; } #endif @@ -1088,8 +894,8 @@ loop: ut_ad(!recv_no_log_write); lsn_t limit_lsn = flush_to_disk - ? log_sys->flushed_to_disk_lsn - : log_sys->write_lsn; + ? log_sys.flushed_to_disk_lsn + : log_sys.write_lsn; if (limit_lsn >= lsn) { log_write_mutex_exit(); @@ -1102,15 +908,15 @@ loop: pending flush and based on that we wait for it to finish before proceeding further. */ if (flush_to_disk - && (log_sys->n_pending_flushes > 0 - || !os_event_is_set(log_sys->flush_event))) { + && (log_sys.n_pending_flushes > 0 + || !os_event_is_set(log_sys.flush_event))) { /* Figure out if the current flush will do the job for us. 
*/ - bool work_done = log_sys->current_flush_lsn >= lsn; + bool work_done = log_sys.current_flush_lsn >= lsn; log_write_mutex_exit(); - os_event_wait(log_sys->flush_event); + os_event_wait(log_sys.flush_event); if (work_done) { return; @@ -1121,7 +927,7 @@ loop: log_mutex_enter(); if (!flush_to_disk - && log_sys->buf_free == log_sys->buf_next_to_write) { + && log_sys.buf_free == log_sys.buf_next_to_write) { /* Nothing to write and no flush to disk requested */ log_mutex_exit_all(); return; @@ -1135,14 +941,14 @@ loop: ulint pad_size; DBUG_PRINT("ib_log", ("write " LSN_PF " to " LSN_PF, - log_sys->write_lsn, - log_sys->lsn)); + log_sys.write_lsn, + log_sys.lsn)); if (flush_to_disk) { - log_sys->n_pending_flushes++; - log_sys->current_flush_lsn = log_sys->lsn; - os_event_reset(log_sys->flush_event); + log_sys.n_pending_flushes++; + log_sys.current_flush_lsn = log_sys.lsn; + os_event_reset(log_sys.flush_event); - if (log_sys->buf_free == log_sys->buf_next_to_write) { + if (log_sys.buf_free == log_sys.buf_next_to_write) { /* Nothing to write, flush only */ log_mutex_exit_all(); log_write_flush_to_disk_low(); @@ -1151,8 +957,8 @@ loop: } } - start_offset = log_sys->buf_next_to_write; - end_offset = log_sys->buf_free; + start_offset = log_sys.buf_next_to_write; + end_offset = log_sys.buf_free; area_start = ut_2pow_round(start_offset, ulint(OS_FILE_LOG_BLOCK_SIZE)); @@ -1160,17 +966,17 @@ loop: ut_ad(area_end - area_start > 0); - log_block_set_flush_bit(log_sys->buf + area_start, TRUE); + log_block_set_flush_bit(log_sys.buf + area_start, TRUE); log_block_set_checkpoint_no( - log_sys->buf + area_end - OS_FILE_LOG_BLOCK_SIZE, - log_sys->next_checkpoint_no); + log_sys.buf + area_end - OS_FILE_LOG_BLOCK_SIZE, + log_sys.next_checkpoint_no); - write_lsn = log_sys->lsn; - write_buf = log_sys->buf; + write_lsn = log_sys.lsn; + write_buf = log_sys.buf; log_buffer_switch(); - log_group_set_fields(&log_sys->log, log_sys->write_lsn); + log_sys.log.set_fields(log_sys.write_lsn); log_mutex_exit(); /* Erase the end of the last log block. */ @@ -1180,12 +986,9 @@ loop: /* Calculate pad_size if needed. */ pad_size = 0; if (write_ahead_size > OS_FILE_LOG_BLOCK_SIZE) { - lsn_t end_offset; ulint end_offset_in_unit; - end_offset = log_group_calc_lsn_offset( - ut_uint64_align_up(write_lsn, - OS_FILE_LOG_BLOCK_SIZE), - &log_sys->log); + lsn_t end_offset = log_sys.log.calc_lsn_offset( + ut_uint64_align_up(write_lsn, OS_FILE_LOG_BLOCK_SIZE)); end_offset_in_unit = (ulint) (end_offset % write_ahead_size); if (end_offset_in_unit > 0 @@ -1193,9 +996,9 @@ loop: /* The first block in the unit was initialized after the last writing. Needs to be written padded data once. 
*/ - pad_size = std::min( + pad_size = std::min<ulint>( ulint(write_ahead_size) - end_offset_in_unit, - log_sys->buf_size - area_end); + srv_log_buffer_size - area_end); ::memset(write_buf + area_end, 0, pad_size); } } @@ -1204,43 +1007,41 @@ loop: service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL, "InnoDB log write: " LSN_PF "," LSN_PF, - log_sys->write_lsn, lsn); + log_sys.write_lsn, lsn); } - if (log_sys->is_encrypted()) { - log_crypt(write_buf + area_start, log_sys->write_lsn, + if (log_sys.is_encrypted()) { + log_crypt(write_buf + area_start, log_sys.write_lsn, area_end - area_start); } /* Do the write to the log files */ - log_group_write_buf( - &log_sys->log, write_buf + area_start, - area_end - area_start + pad_size, + log_write_buf( + write_buf + area_start, area_end - area_start + pad_size, #ifdef UNIV_DEBUG pad_size, #endif /* UNIV_DEBUG */ - ut_uint64_align_down(log_sys->write_lsn, + ut_uint64_align_down(log_sys.write_lsn, OS_FILE_LOG_BLOCK_SIZE), start_offset - area_start); srv_stats.log_padded.add(pad_size); - log_sys->write_lsn = write_lsn; + log_sys.write_lsn = write_lsn; if (srv_file_flush_method == SRV_O_DSYNC) { /* O_SYNC means the OS did not buffer the log file at all: so we have also flushed to disk what we have written */ - log_sys->flushed_to_disk_lsn = log_sys->write_lsn; + log_sys.flushed_to_disk_lsn = log_sys.write_lsn; } log_write_mutex_exit(); if (flush_to_disk) { log_write_flush_to_disk_low(); - ib_uint64_t write_lsn = log_sys->write_lsn; - ib_uint64_t flush_lsn = log_sys->flushed_to_disk_lsn; + ib_uint64_t flush_lsn = log_sys.flushed_to_disk_lsn; log_mutex_exit(); - innobase_mysql_log_notify(write_lsn, flush_lsn); + innobase_mysql_log_notify(flush_lsn); } } @@ -1269,11 +1070,11 @@ log_buffer_sync_in_background( log_mutex_enter(); - lsn = log_sys->lsn; + lsn = log_sys.lsn; if (flush - && log_sys->n_pending_flushes > 0 - && log_sys->current_flush_lsn >= lsn) { + && log_sys.n_pending_flushes > 0 + && log_sys.current_flush_lsn >= lsn) { /* The write + flush will write enough */ log_mutex_exit(); return; @@ -1293,14 +1094,13 @@ void log_flush_margin(void) /*==================*/ { - log_t* log = log_sys; lsn_t lsn = 0; log_mutex_enter(); - if (log->buf_free > log->max_buf_free) { + if (log_sys.buf_free > log_sys.max_buf_free) { /* We can write during flush */ - lsn = log->lsn; + lsn = log_sys.lsn; } log_mutex_exit(); @@ -1377,36 +1177,33 @@ log_complete_checkpoint(void) /*=========================*/ { ut_ad(log_mutex_own()); - ut_ad(log_sys->n_pending_checkpoint_writes == 0); + ut_ad(log_sys.n_pending_checkpoint_writes == 0); - log_sys->next_checkpoint_no++; + log_sys.next_checkpoint_no++; - log_sys->last_checkpoint_lsn = log_sys->next_checkpoint_lsn; + log_sys.last_checkpoint_lsn = log_sys.next_checkpoint_lsn; MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE, - log_sys->lsn - log_sys->last_checkpoint_lsn); + log_sys.lsn - log_sys.last_checkpoint_lsn); DBUG_PRINT("ib_log", ("checkpoint ended at " LSN_PF ", flushed to " LSN_PF, - log_sys->last_checkpoint_lsn, - log_sys->flushed_to_disk_lsn)); + log_sys.last_checkpoint_lsn, + log_sys.flushed_to_disk_lsn)); - rw_lock_x_unlock_gen(&(log_sys->checkpoint_lock), LOG_CHECKPOINT); + rw_lock_x_unlock_gen(&(log_sys.checkpoint_lock), LOG_CHECKPOINT); } -/******************************************************//** -Completes an asynchronous checkpoint info write i/o to a log file. */ -static -void -log_io_complete_checkpoint(void) -/*============================*/ +/** Complete an asynchronous checkpoint write. 
*/ +void log_t::complete_checkpoint() { + ut_ad(this == &log_sys); MONITOR_DEC(MONITOR_PENDING_CHECKPOINT_WRITE); log_mutex_enter(); - ut_ad(log_sys->n_pending_checkpoint_writes > 0); + ut_ad(n_pending_checkpoint_writes > 0); - if (--log_sys->n_pending_checkpoint_writes == 0) { + if (!--n_pending_checkpoint_writes) { log_complete_checkpoint(); } @@ -1420,91 +1217,78 @@ void log_group_checkpoint(lsn_t end_lsn) { lsn_t lsn_offset; - byte* buf; ut_ad(!srv_read_only_mode); ut_ad(log_mutex_own()); - ut_ad(end_lsn == 0 || end_lsn >= log_sys->next_checkpoint_lsn); - ut_ad(end_lsn <= log_sys->lsn); - ut_ad(end_lsn + SIZE_OF_MLOG_CHECKPOINT <= log_sys->lsn + ut_ad(end_lsn == 0 || end_lsn >= log_sys.next_checkpoint_lsn); + ut_ad(end_lsn <= log_sys.lsn); + ut_ad(end_lsn + SIZE_OF_MLOG_CHECKPOINT <= log_sys.lsn || srv_shutdown_state > SRV_SHUTDOWN_INITIATED); DBUG_PRINT("ib_log", ("checkpoint " UINT64PF " at " LSN_PF " written", - log_sys->next_checkpoint_no, - log_sys->next_checkpoint_lsn)); - - log_group_t* group = &log_sys->log; + log_sys.next_checkpoint_no, + log_sys.next_checkpoint_lsn)); - buf = group->checkpoint_buf; + byte* buf = log_sys.checkpoint_buf; memset(buf, 0, OS_FILE_LOG_BLOCK_SIZE); - mach_write_to_8(buf + LOG_CHECKPOINT_NO, log_sys->next_checkpoint_no); - mach_write_to_8(buf + LOG_CHECKPOINT_LSN, log_sys->next_checkpoint_lsn); + mach_write_to_8(buf + LOG_CHECKPOINT_NO, log_sys.next_checkpoint_no); + mach_write_to_8(buf + LOG_CHECKPOINT_LSN, log_sys.next_checkpoint_lsn); - if (log_sys->is_encrypted()) { + if (log_sys.is_encrypted()) { log_crypt_write_checkpoint_buf(buf); } - lsn_offset = log_group_calc_lsn_offset(log_sys->next_checkpoint_lsn, - group); + lsn_offset = log_sys.log.calc_lsn_offset(log_sys.next_checkpoint_lsn); mach_write_to_8(buf + LOG_CHECKPOINT_OFFSET, lsn_offset); - mach_write_to_8(buf + LOG_CHECKPOINT_LOG_BUF_SIZE, log_sys->buf_size); + mach_write_to_8(buf + LOG_CHECKPOINT_LOG_BUF_SIZE, + srv_log_buffer_size); mach_write_to_8(buf + LOG_CHECKPOINT_END_LSN, end_lsn); log_block_set_checksum(buf, log_block_calc_checksum_crc32(buf)); MONITOR_INC(MONITOR_PENDING_CHECKPOINT_WRITE); - log_sys->n_log_ios++; + log_sys.n_log_ios++; MONITOR_INC(MONITOR_LOG_IO); - ut_ad(LOG_CHECKPOINT_1 < univ_page_size.physical()); - ut_ad(LOG_CHECKPOINT_2 < univ_page_size.physical()); + ut_ad(LOG_CHECKPOINT_1 < srv_page_size); + ut_ad(LOG_CHECKPOINT_2 < srv_page_size); - if (log_sys->n_pending_checkpoint_writes++ == 0) { - rw_lock_x_lock_gen(&log_sys->checkpoint_lock, + if (log_sys.n_pending_checkpoint_writes++ == 0) { + rw_lock_x_lock_gen(&log_sys.checkpoint_lock, LOG_CHECKPOINT); } /* Note: We alternate the physical place of the checkpoint info. See the (next_checkpoint_no & 1) below. */ - /* We send as the last parameter the group machine address - added with 1, as we want to distinguish between a normal log - file write and a checkpoint field write */ - fil_io(IORequestLogWrite, false, page_id_t(SRV_LOG_SPACE_FIRST_ID, 0), univ_page_size, - (log_sys->next_checkpoint_no & 1) + (log_sys.next_checkpoint_no & 1) ? LOG_CHECKPOINT_2 : LOG_CHECKPOINT_1, OS_FILE_LOG_BLOCK_SIZE, - buf, (byte*) group + 1); - - ut_ad(((ulint) group & 0x1UL) == 0); + buf, reinterpret_cast<void*>(1) /* checkpoint write */); } -/** Read a log group header page to log_sys->checkpoint_buf. -@param[in] group log group -@param[in] header 0 or LOG_CHEKCPOINT_1 or LOG_CHECKPOINT2 */ -void -log_group_header_read( - const log_group_t* group, - ulint header) +/** Read a log group header page to log_sys.checkpoint_buf. 
+@param[in] header 0 or LOG_CHECKPOINT_1 or LOG_CHECKPOINT2 */ +void log_header_read(ulint header) { ut_ad(log_mutex_own()); - log_sys->n_log_ios++; + log_sys.n_log_ios++; MONITOR_INC(MONITOR_LOG_IO); fil_io(IORequestLogRead, true, page_id_t(SRV_LOG_SPACE_FIRST_ID, - header / univ_page_size.physical()), - univ_page_size, header % univ_page_size.physical(), - OS_FILE_LOG_BLOCK_SIZE, log_sys->checkpoint_buf, NULL); + header >> srv_page_size_shift), + univ_page_size, header & (srv_page_size - 1), + OS_FILE_LOG_BLOCK_SIZE, log_sys.checkpoint_buf, NULL); } /** Write checkpoint info to the log header and invoke log_mutex_exit(). @@ -1524,8 +1308,8 @@ log_write_checkpoint_info(bool sync, lsn_t end_lsn) if (sync) { /* Wait for the checkpoint write to complete */ - rw_lock_s_lock(&log_sys->checkpoint_lock); - rw_lock_s_unlock(&log_sys->checkpoint_lock); + rw_lock_s_lock(&log_sys.checkpoint_lock); + rw_lock_s_unlock(&log_sys.checkpoint_lock); DBUG_EXECUTE_IF( "crash_after_checkpoint", @@ -1541,8 +1325,8 @@ log_append_on_checkpoint( mtr_buf_t* buf) { log_mutex_enter(); - mtr_buf_t* old = log_sys->append_on_checkpoint; - log_sys->append_on_checkpoint = buf; + mtr_buf_t* old = log_sys.append_on_checkpoint; + log_sys.append_on_checkpoint = buf; log_mutex_exit(); return(old); } @@ -1576,7 +1360,9 @@ bool log_checkpoint(bool sync) case SRV_LITTLESYNC: case SRV_O_DIRECT: case SRV_O_DIRECT_NO_FSYNC: +#ifdef _WIN32 case SRV_ALL_O_DIRECT_FSYNC: +#endif fil_flush_file_spaces(FIL_TYPE_TABLESPACE); } @@ -1586,16 +1372,16 @@ bool log_checkpoint(bool sync) oldest_lsn = log_buf_pool_get_oldest_modification(); /* Because log also contains headers and dummy log records, - log_buf_pool_get_oldest_modification() will return log_sys->lsn + log_buf_pool_get_oldest_modification() will return log_sys.lsn if the buffer pool contains no dirty buffers. We must make sure that the log is flushed up to that lsn. If there are dirty buffers in the buffer pool, then our write-ahead-logging algorithm ensures that the log has been flushed up to oldest_lsn. */ - ut_ad(oldest_lsn >= log_sys->last_checkpoint_lsn); + ut_ad(oldest_lsn >= log_sys.last_checkpoint_lsn); if (oldest_lsn - > log_sys->last_checkpoint_lsn + SIZE_OF_MLOG_CHECKPOINT) { + > log_sys.last_checkpoint_lsn + SIZE_OF_MLOG_CHECKPOINT) { /* Some log has been written since the previous checkpoint. */ } else if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED) { /* MariaDB 10.3 startup expects the redo log file to be @@ -1609,7 +1395,7 @@ bool log_checkpoint(bool sync) return(true); } /* Repeat the MLOG_FILE_NAME records after the checkpoint, in - case some log records between the checkpoint and log_sys->lsn + case some log records between the checkpoint and log_sys.lsn need them. Finally, write a MLOG_CHECKPOINT marker. Redo log apply expects to see a MLOG_CHECKPOINT after the checkpoint, except on clean shutdown, where the log will be empty after @@ -1620,14 +1406,14 @@ bool log_checkpoint(bool sync) threads will be blocked, and no pages can be added to the flush lists. 
*/ lsn_t flush_lsn = oldest_lsn; - const lsn_t end_lsn = log_sys->lsn; + const lsn_t end_lsn = log_sys.lsn; const bool do_write = srv_shutdown_state <= SRV_SHUTDOWN_INITIATED || flush_lsn != end_lsn; if (fil_names_clear(flush_lsn, do_write)) { - ut_ad(log_sys->lsn >= end_lsn + SIZE_OF_MLOG_CHECKPOINT); - flush_lsn = log_sys->lsn; + ut_ad(log_sys.lsn >= end_lsn + SIZE_OF_MLOG_CHECKPOINT); + flush_lsn = log_sys.lsn; } log_mutex_exit(); @@ -1636,28 +1422,28 @@ bool log_checkpoint(bool sync) log_mutex_enter(); - ut_ad(log_sys->flushed_to_disk_lsn >= flush_lsn); + ut_ad(log_sys.flushed_to_disk_lsn >= flush_lsn); ut_ad(flush_lsn >= oldest_lsn); - if (log_sys->last_checkpoint_lsn >= oldest_lsn) { + if (log_sys.last_checkpoint_lsn >= oldest_lsn) { log_mutex_exit(); return(true); } - if (log_sys->n_pending_checkpoint_writes > 0) { + if (log_sys.n_pending_checkpoint_writes > 0) { /* A checkpoint write is running */ log_mutex_exit(); if (sync) { /* Wait for the checkpoint write to complete */ - rw_lock_s_lock(&log_sys->checkpoint_lock); - rw_lock_s_unlock(&log_sys->checkpoint_lock); + rw_lock_s_lock(&log_sys.checkpoint_lock); + rw_lock_s_unlock(&log_sys.checkpoint_lock); } return(false); } - log_sys->next_checkpoint_lsn = oldest_lsn; + log_sys.next_checkpoint_lsn = oldest_lsn; log_write_checkpoint_info(sync, end_lsn); ut_ad(!log_mutex_own()); @@ -1688,7 +1474,6 @@ void log_checkpoint_margin(void) /*=======================*/ { - log_t* log = log_sys; lsn_t age; lsn_t checkpoint_age; ib_uint64_t advance; @@ -1700,39 +1485,39 @@ loop: log_mutex_enter(); ut_ad(!recv_no_log_write); - if (!log->check_flush_or_checkpoint) { + if (!log_sys.check_flush_or_checkpoint) { log_mutex_exit(); return; } oldest_lsn = log_buf_pool_get_oldest_modification(); - age = log->lsn - oldest_lsn; + age = log_sys.lsn - oldest_lsn; - if (age > log->max_modified_age_sync) { + if (age > log_sys.max_modified_age_sync) { /* A flush is urgent: we have to do a synchronous preflush */ - advance = age - log->max_modified_age_sync; + advance = age - log_sys.max_modified_age_sync; } - checkpoint_age = log->lsn - log->last_checkpoint_lsn; + checkpoint_age = log_sys.lsn - log_sys.last_checkpoint_lsn; bool checkpoint_sync; bool do_checkpoint; - if (checkpoint_age > log->max_checkpoint_age) { + if (checkpoint_age > log_sys.max_checkpoint_age) { /* A checkpoint is urgent: we do it synchronously */ checkpoint_sync = true; do_checkpoint = true; - } else if (checkpoint_age > log->max_checkpoint_age_async) { + } else if (checkpoint_age > log_sys.max_checkpoint_age_async) { /* A checkpoint is not urgent: do it asynchronously */ do_checkpoint = true; checkpoint_sync = false; - log->check_flush_or_checkpoint = false; + log_sys.check_flush_or_checkpoint = false; } else { do_checkpoint = false; checkpoint_sync = false; - log->check_flush_or_checkpoint = false; + log_sys.check_flush_or_checkpoint = false; } log_mutex_exit(); @@ -1747,9 +1532,7 @@ loop: thread doing a flush at the same time. 
*/ if (!success) { log_mutex_enter(); - - log->check_flush_or_checkpoint = true; - + log_sys.check_flush_or_checkpoint = true; log_mutex_exit(); goto loop; } @@ -1780,7 +1563,7 @@ log_check_margins(void) log_checkpoint_margin(); log_mutex_enter(); ut_ad(!recv_no_log_write); - check = log_sys->check_flush_or_checkpoint; + check = log_sys.check_flush_or_checkpoint; log_mutex_exit(); } while (check); } @@ -1804,17 +1587,17 @@ logs_empty_and_mark_files_at_shutdown(void) srv_shutdown_state = SRV_SHUTDOWN_CLEANUP; loop: - ut_ad(lock_sys || !srv_was_started); - ut_ad(log_sys || !srv_was_started); - ut_ad(fil_system || !srv_was_started); + ut_ad(lock_sys.is_initialised() || !srv_was_started); + ut_ad(log_sys.is_initialised() || !srv_was_started); + ut_ad(fil_system.is_initialised() || !srv_was_started); os_event_set(srv_buf_resize_event); if (!srv_read_only_mode) { os_event_set(srv_error_event); os_event_set(srv_monitor_event); os_event_set(srv_buf_dump_event); - if (lock_sys) { - os_event_set(lock_sys->timeout_event); + if (lock_sys.timeout_thread_active) { + os_event_set(lock_sys.timeout_event); } if (dict_stats_event) { os_event_set(dict_stats_event); @@ -1841,7 +1624,7 @@ loop: if (ulint total_trx = srv_was_started && !srv_read_only_mode && srv_force_recovery < SRV_FORCE_NO_TRX_UNDO - ? trx_sys_any_active_transactions() : 0) { + ? trx_sys.any_active_transactions() : 0) { if (srv_print_verbose_log && count > COUNT_INTERVAL) { service_manager_extend_timeout( @@ -1869,14 +1652,14 @@ loop: goto wait_suspend_loop; } else if (srv_dict_stats_thread_active) { thread_name = "dict_stats_thread"; - } else if (lock_sys && lock_sys->timeout_thread_active) { + } else if (lock_sys.timeout_thread_active) { thread_name = "lock_wait_timeout_thread"; } else if (srv_buf_dump_thread_active) { thread_name = "buf_dump_thread"; goto wait_suspend_loop; } else if (btr_defragment_thread_active) { thread_name = "btr_defragment_thread"; - } else if (srv_fast_shutdown != 2 && trx_rollback_or_clean_is_active) { + } else if (srv_fast_shutdown != 2 && trx_rollback_is_active) { thread_name = "rollback of recovered transactions"; } else { thread_name = NULL; @@ -1945,10 +1728,10 @@ wait_suspend_loop: os_event_set(log_scrub_event); } - if (log_sys) { + if (log_sys.is_initialised()) { log_mutex_enter(); - const ulint n_write = log_sys->n_pending_checkpoint_writes; - const ulint n_flush = log_sys->n_pending_flushes; + const ulint n_write = log_sys.n_pending_checkpoint_writes; + const ulint n_flush = log_sys.n_pending_flushes; log_mutex_exit(); if (log_scrub_thread_active || n_write || n_flush) { @@ -1999,7 +1782,7 @@ wait_suspend_loop: srv_shutdown_state = SRV_SHUTDOWN_LAST_PHASE; - if (fil_system) { + if (fil_system.is_initialised()) { fil_close_all_files(); } return; @@ -2012,10 +1795,10 @@ wait_suspend_loop: log_mutex_enter(); - lsn = log_sys->lsn; + lsn = log_sys.lsn; - const bool lsn_changed = lsn != log_sys->last_checkpoint_lsn; - ut_ad(lsn >= log_sys->last_checkpoint_lsn); + const bool lsn_changed = lsn != log_sys.last_checkpoint_lsn; + ut_ad(lsn >= log_sys.last_checkpoint_lsn); log_mutex_exit(); @@ -2039,7 +1822,7 @@ wait_suspend_loop: "Free innodb buffer pool"); buf_all_freed(); - ut_a(lsn == log_sys->lsn + ut_a(lsn == log_sys.lsn || srv_force_recovery == SRV_FORCE_NO_LOG_REDO); if (lsn < srv_start_lsn) { @@ -2063,7 +1846,7 @@ wait_suspend_loop: /* Make some checks that the server really is quiet */ ut_a(srv_get_active_thread_type() == SRV_NONE); - ut_a(lsn == log_sys->lsn + ut_a(lsn == log_sys.lsn || srv_force_recovery 
== SRV_FORCE_NO_LOG_REDO); } @@ -2075,8 +1858,8 @@ log_peek_lsn( /*=========*/ lsn_t* lsn) /*!< out: if returns TRUE, current lsn is here */ { - if (0 == mutex_enter_nowait(&(log_sys->mutex))) { - *lsn = log_sys->lsn; + if (0 == mutex_enter_nowait(&(log_sys.mutex))) { + *lsn = log_sys.lsn; log_mutex_exit(); @@ -2103,15 +1886,15 @@ log_print( "Log flushed up to " LSN_PF "\n" "Pages flushed up to " LSN_PF "\n" "Last checkpoint at " LSN_PF "\n", - log_sys->lsn, - log_sys->flushed_to_disk_lsn, + log_sys.lsn, + log_sys.flushed_to_disk_lsn, log_buf_pool_get_oldest_modification(), - log_sys->last_checkpoint_lsn); + log_sys.last_checkpoint_lsn); current_time = time(NULL); time_elapsed = difftime(current_time, - log_sys->last_printout_time); + log_sys.last_printout_time); if (time_elapsed <= 0) { time_elapsed = 1; @@ -2121,15 +1904,15 @@ log_print( ULINTPF " pending log flushes, " ULINTPF " pending chkp writes\n" ULINTPF " log i/o's done, %.2f log i/o's/second\n", - log_sys->n_pending_flushes, - log_sys->n_pending_checkpoint_writes, - log_sys->n_log_ios, + log_sys.n_pending_flushes, + log_sys.n_pending_checkpoint_writes, + log_sys.n_log_ios, static_cast<double>( - log_sys->n_log_ios - log_sys->n_log_ios_old) + log_sys.n_log_ios - log_sys.n_log_ios_old) / time_elapsed); - log_sys->n_log_ios_old = log_sys->n_log_ios; - log_sys->last_printout_time = current_time; + log_sys.n_log_ios_old = log_sys.n_log_ios; + log_sys.last_printout_time = current_time; log_mutex_exit(); } @@ -2140,58 +1923,33 @@ void log_refresh_stats(void) /*===================*/ { - log_sys->n_log_ios_old = log_sys->n_log_ios; - log_sys->last_printout_time = time(NULL); -} - -/** Close a log group. -@param[in,out] group log group to close */ -static -void -log_group_close(log_group_t* group) -{ - ut_free(group->checkpoint_buf_ptr); - group->n_files = 0; - group->checkpoint_buf_ptr = NULL; -} - -/********************************************************//** -Closes all log groups. */ -void -log_group_close_all(void) -/*=====================*/ -{ - log_group_close(&log_sys->log); + log_sys.n_log_ios_old = log_sys.n_log_ios; + log_sys.last_printout_time = time(NULL); } /** Shut down the redo log subsystem. 
*/ -void -log_shutdown() +void log_t::close() { - log_group_close_all(); - - ut_free(log_sys->buf_ptr); - log_sys->buf_ptr = NULL; - log_sys->buf = NULL; - ut_free(log_sys->checkpoint_buf_ptr); - log_sys->checkpoint_buf_ptr = NULL; - log_sys->checkpoint_buf = NULL; - - os_event_destroy(log_sys->flush_event); - - rw_lock_free(&log_sys->checkpoint_lock); - - mutex_free(&log_sys->mutex); - mutex_free(&log_sys->write_mutex); - mutex_free(&log_sys->log_flush_order_mutex); - - if (!srv_read_only_mode && srv_scrub_log) { - os_event_destroy(log_scrub_event); - } - - recv_sys_close(); - ut_free(log_sys); - log_sys = NULL; + ut_ad(this == &log_sys); + if (!is_initialised()) return; + m_initialised = false; + log.close(); + + ut_free_dodump(buf, srv_log_buffer_size); + buf = NULL; + ut_free_dodump(flush_buf, srv_log_buffer_size); + flush_buf = NULL; + + os_event_destroy(flush_event); + rw_lock_free(&checkpoint_lock); + mutex_free(&mutex); + mutex_free(&write_mutex); + mutex_free(&log_flush_order_mutex); + + if (!srv_read_only_mode && srv_scrub_log) + os_event_destroy(log_scrub_event); + + recv_sys_close(); } /******************************************************//** @@ -2212,7 +1970,7 @@ log_pad_current_log_block(void) lsn = log_reserve_and_open(OS_FILE_LOG_BLOCK_SIZE); pad_length = OS_FILE_LOG_BLOCK_SIZE - - (log_sys->buf_free % OS_FILE_LOG_BLOCK_SIZE) + - (log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE) - LOG_BLOCK_TRL_SIZE; if (pad_length == (OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_HDR_SIZE @@ -2229,7 +1987,7 @@ log_pad_current_log_block(void) log_write_low(&b, 1); } - lsn = log_sys->lsn; + lsn = log_sys.lsn; log_close(); @@ -2245,14 +2003,14 @@ log_scrub() /*=========*/ { log_mutex_enter(); - ulint cur_lbn = log_block_convert_lsn_to_no(log_sys->lsn); + ulint cur_lbn = log_block_convert_lsn_to_no(log_sys.lsn); if (next_lbn_to_pad == cur_lbn) { log_pad_current_log_block(); } - next_lbn_to_pad = log_block_convert_lsn_to_no(log_sys->lsn); + next_lbn_to_pad = log_block_convert_lsn_to_no(log_sys.lsn); log_mutex_exit(); } diff --git a/storage/innobase/log/log0recv.cc b/storage/innobase/log/log0recv.cc index 42b8e4dee88..59087c76e37 100644 --- a/storage/innobase/log/log0recv.cc +++ b/storage/innobase/log/log0recv.cc @@ -60,7 +60,7 @@ Created 9/20/1997 Heikki Tuuri #include "fil0pagecompress.h" /** Log records are stored in the hash table in chunks at most of this size; -this must be less than UNIV_PAGE_SIZE as it is stored in the buffer pool */ +this must be less than srv_page_size as it is stored in the buffer pool */ #define RECV_DATA_BLOCK_SIZE (MEM_MAX_ALLOC_IN_BUF - sizeof(recv_data_t) - REDZONE_SIZE) /** Read-ahead area in applying log records to file pages */ @@ -77,7 +77,7 @@ volatile bool recv_recovery_on; bool recv_needed_recovery; #ifdef UNIV_DEBUG /** TRUE if writing to the redo log (mtr_commit) is forbidden. -Protected by log_sys->mutex. */ +Protected by log_sys.mutex. */ bool recv_no_log_write = false; #endif /* UNIV_DEBUG */ @@ -724,7 +724,9 @@ recv_sys_close() os_event_destroy(recv_sys->flush_end); } - ut_free(recv_sys->buf); + if (recv_sys->buf != NULL) { + ut_free_dodump(recv_sys->buf, recv_sys->buf_size); + } ut_ad(!recv_writer_thread_active); mutex_free(&recv_sys->writer_mutex); @@ -783,7 +785,7 @@ DECLARE_THREAD(recv_writer_thread)( /* Wait till we get a signal to clean the LRU list. Bounded by max wait time of 100ms. 
*/ - ib_uint64_t sig_count = os_event_reset(buf_flush_event); + int64_t sig_count = os_event_reset(buf_flush_event); os_event_wait_time_low(buf_flush_event, 100000, sig_count); mutex_enter(&recv_sys->writer_mutex); @@ -832,7 +834,8 @@ recv_sys_init() } recv_sys->buf = static_cast<byte*>( - ut_malloc_nokey(RECV_PARSING_BUF_SIZE)); + ut_malloc_dontdump(RECV_PARSING_BUF_SIZE)); + recv_sys->buf_size = RECV_PARSING_BUF_SIZE; recv_sys->addr_hash = hash_create(buf_pool_get_curr_size() / 512); recv_sys->progress_time = time(NULL); @@ -866,8 +869,9 @@ recv_sys_debug_free(void) hash_table_free(recv_sys->addr_hash); mem_heap_free(recv_sys->heap); - ut_free(recv_sys->buf); + ut_free_dodump(recv_sys->buf, recv_sys->buf_size); + recv_sys->buf_size = 0; recv_sys->buf = NULL; recv_sys->heap = NULL; recv_sys->addr_hash = NULL; @@ -883,57 +887,46 @@ recv_sys_debug_free(void) mutex_exit(&(recv_sys->mutex)); } -/** Read a log segment to a buffer. -@param[out] buf buffer -@param[in] group redo log files -@param[in, out] start_lsn in : read area start, out: the last read valid lsn +/** Read a log segment to log_sys.buf. +@param[in,out] start_lsn in: read area start, +out: the last read valid lsn @param[in] end_lsn read area end -@param[out] invalid_block - invalid, (maybe incompletely written) block encountered -@return false, if invalid block encountered (e.g checksum mismatch), true otherwise */ -bool -log_group_read_log_seg( - byte* buf, - const log_group_t* group, - lsn_t *start_lsn, - lsn_t end_lsn) +@return whether no invalid blocks (e.g checksum mismatch) were found */ +bool log_t::files::read_log_seg(lsn_t* start_lsn, lsn_t end_lsn) { ulint len; - lsn_t source_offset; bool success = true; - ut_ad(log_mutex_own()); + ut_ad(log_sys.mutex.is_owned()); ut_ad(!(*start_lsn % OS_FILE_LOG_BLOCK_SIZE)); ut_ad(!(end_lsn % OS_FILE_LOG_BLOCK_SIZE)); - + byte* buf = log_sys.buf; loop: - source_offset = log_group_calc_lsn_offset(*start_lsn, group); + lsn_t source_offset = calc_lsn_offset(*start_lsn); ut_a(end_lsn - *start_lsn <= ULINT_MAX); len = (ulint) (end_lsn - *start_lsn); ut_ad(len != 0); - const bool at_eof = (source_offset % group->file_size) + len - > group->file_size; + const bool at_eof = (source_offset % file_size) + len > file_size; if (at_eof) { /* If the above condition is true then len (which is ulint) is > the expression below, so the typecast is ok */ - len = (ulint) (group->file_size - - (source_offset % group->file_size)); + len = ulint(file_size - (source_offset % file_size)); } - log_sys->n_log_ios++; + log_sys.n_log_ios++; MONITOR_INC(MONITOR_LOG_IO); - ut_a(source_offset / UNIV_PAGE_SIZE <= ULINT_MAX); + ut_a((source_offset >> srv_page_size_shift) <= ULINT_MAX); - const ulint page_no - = (ulint) (source_offset / univ_page_size.physical()); + const ulint page_no = ulint(source_offset >> srv_page_size_shift); fil_io(IORequestLogRead, true, page_id_t(SRV_LOG_SPACE_FIRST_ID, page_no), univ_page_size, - (ulint) (source_offset % univ_page_size.physical()), + ulint(source_offset & (srv_page_size - 1)), len, buf, NULL); for (ulint l = 0; l < len; l += OS_FILE_LOG_BLOCK_SIZE, @@ -953,7 +946,7 @@ fail: break; } - if (innodb_log_checksums || group->is_encrypted()) { + if (innodb_log_checksums || is_encrypted()) { ulint crc = log_block_calc_checksum_crc32(buf); ulint cksum = log_block_get_checksum(buf); @@ -976,7 +969,7 @@ fail: goto fail; } - if (group->is_encrypted()) { + if (is_encrypted()) { log_crypt(buf, *start_lsn, OS_FILE_LOG_BLOCK_SIZE, true); } @@ -1022,14 +1015,10 @@ recv_synchronize_groups() 
the block is always incomplete */ lsn_t start_lsn = ut_uint64_align_down(recovered_lsn, - OS_FILE_LOG_BLOCK_SIZE); - log_group_read_log_seg(log_sys->buf, &log_sys->log, - &start_lsn, start_lsn + OS_FILE_LOG_BLOCK_SIZE); - - /* Update the fields in the group struct to correspond to - recovered_lsn */ - - log_group_set_fields(&log_sys->log, recovered_lsn); + OS_FILE_LOG_BLOCK_SIZE); + log_sys.log.read_log_seg(&start_lsn, + start_lsn + OS_FILE_LOG_BLOCK_SIZE); + log_sys.log.set_fields(recovered_lsn); /* Copy the checkpoint info to the log; remember that we have incremented checkpoint_no by one, and the info will not be written @@ -1055,19 +1044,17 @@ recv_check_log_header_checksum( } /** Find the latest checkpoint in the format-0 log header. -@param[out] max_group log group, or NULL @param[out] max_field LOG_CHECKPOINT_1 or LOG_CHECKPOINT_2 @return error code or DB_SUCCESS */ static MY_ATTRIBUTE((warn_unused_result)) dberr_t -recv_find_max_checkpoint_0(log_group_t** max_group, ulint* max_field) +recv_find_max_checkpoint_0(ulint* max_field) { - log_group_t* group = &log_sys->log; ib_uint64_t max_no = 0; ib_uint64_t checkpoint_no; - byte* buf = log_sys->checkpoint_buf; + byte* buf = log_sys.checkpoint_buf; - ut_ad(group->format == 0); + ut_ad(log_sys.log.format == 0); /** Offset of the first checkpoint checksum */ static const uint CHECKSUM_1 = 288; @@ -1078,11 +1065,11 @@ recv_find_max_checkpoint_0(log_group_t** max_group, ulint* max_field) /** Least significant bits of the checkpoint offset */ static const uint OFFSET_LOW32 = 16; - *max_group = NULL; + bool found = false; for (ulint field = LOG_CHECKPOINT_1; field <= LOG_CHECKPOINT_2; field += LOG_CHECKPOINT_2 - LOG_CHECKPOINT_1) { - log_group_header_read(group, field); + log_header_read(field); if (static_cast<uint32_t>(ut_fold_binary(buf, CHECKSUM_1)) != mach_read_from_4(buf + CHECKSUM_1) @@ -1109,21 +1096,20 @@ recv_find_max_checkpoint_0(log_group_t** max_group, ulint* max_field) mach_read_from_8(buf + LOG_CHECKPOINT_LSN))); if (checkpoint_no >= max_no) { - *max_group = group; + found = true; *max_field = field; max_no = checkpoint_no; - group->state = LOG_GROUP_OK; - - group->lsn = mach_read_from_8( - buf + LOG_CHECKPOINT_LSN); - group->lsn_offset = static_cast<ib_uint64_t>( - mach_read_from_4(buf + OFFSET_HIGH32)) << 32 - | mach_read_from_4(buf + OFFSET_LOW32); + log_sys.log.set_lsn(mach_read_from_8( + buf + LOG_CHECKPOINT_LSN)); + log_sys.log.set_lsn_offset( + lsn_t(mach_read_from_4(buf + OFFSET_HIGH32)) + << 32 + | mach_read_from_4(buf + OFFSET_LOW32)); } } - if (*max_group != NULL) { + if (found) { return(DB_SUCCESS); } @@ -1144,34 +1130,27 @@ recv_find_max_checkpoint_0(log_group_t** max_group, ulint* max_field) static dberr_t recv_log_format_0_recover(lsn_t lsn, bool crypt) { log_mutex_enter(); - log_group_t* group = &log_sys->log; - const lsn_t source_offset - = log_group_calc_lsn_offset(lsn, group); + const lsn_t source_offset = log_sys.log.calc_lsn_offset(lsn); log_mutex_exit(); - const ulint page_no - = (ulint) (source_offset / univ_page_size.physical()); - byte* buf = log_sys->buf; + const ulint page_no = ulint(source_offset >> srv_page_size_shift); + byte* buf = log_sys.buf; static const char* NO_UPGRADE_RECOVERY_MSG = "Upgrade after a crash is not supported." " This redo log was created before MariaDB 10.2.2"; - static const char* NO_UPGRADE_RTFM_MSG = - ". 
Please follow the instructions at " - "https://mariadb.com/kb/en/library/upgrading/"; fil_io(IORequestLogRead, true, page_id_t(SRV_LOG_SPACE_FIRST_ID, page_no), univ_page_size, - (ulint) ((source_offset & ~(OS_FILE_LOG_BLOCK_SIZE - 1)) - % univ_page_size.physical()), - OS_FILE_LOG_BLOCK_SIZE, buf, NULL); + ulint((source_offset & ~(OS_FILE_LOG_BLOCK_SIZE - 1)) + & (srv_page_size - 1)), + OS_FILE_LOG_BLOCK_SIZE, buf, NULL); if (log_block_calc_checksum_format_0(buf) != log_block_get_checksum(buf) && !log_crypt_101_read_block(buf)) { ib::error() << NO_UPGRADE_RECOVERY_MSG - << ", and it appears corrupted" - << NO_UPGRADE_RTFM_MSG; + << ", and it appears corrupted."; return(DB_CORRUPTION); } @@ -1179,12 +1158,11 @@ static dberr_t recv_log_format_0_recover(lsn_t lsn, bool crypt) == (source_offset & (OS_FILE_LOG_BLOCK_SIZE - 1))) { } else if (crypt) { ib::error() << "Cannot decrypt log for upgrading." - " The encrypted log was created before MariaDB 10.2.2" - << NO_UPGRADE_RTFM_MSG; + " The encrypted log was created" + " before MariaDB 10.2.2."; return DB_ERROR; } else { - ib::error() << NO_UPGRADE_RECOVERY_MSG - << NO_UPGRADE_RTFM_MSG; + ib::error() << NO_UPGRADE_RECOVERY_MSG << "."; return(DB_ERROR); } @@ -1193,29 +1171,27 @@ static dberr_t recv_log_format_0_recover(lsn_t lsn, bool crypt) recv_sys->parse_start_lsn = recv_sys->recovered_lsn = recv_sys->scanned_lsn = recv_sys->mlog_checkpoint_lsn = lsn; - log_sys->last_checkpoint_lsn = log_sys->next_checkpoint_lsn - = log_sys->lsn = log_sys->write_lsn - = log_sys->current_flush_lsn = log_sys->flushed_to_disk_lsn + log_sys.last_checkpoint_lsn = log_sys.next_checkpoint_lsn + = log_sys.lsn = log_sys.write_lsn + = log_sys.current_flush_lsn = log_sys.flushed_to_disk_lsn = lsn; - log_sys->next_checkpoint_no = 0; + log_sys.next_checkpoint_no = 0; return(DB_SUCCESS); } -/** Determine if a redo log from MariaDB 10.3 is clean. +/** Determine if a redo log from MariaDB 10.4 is clean. @return error code @retval DB_SUCCESS if the redo log is clean @retval DB_CORRUPTION if the redo log is corrupted @retval DB_ERROR if the redo log is not empty */ -static -dberr_t -recv_log_recover_10_3() +static dberr_t recv_log_recover_10_4() { - log_group_t* group = &log_sys->log; - const lsn_t lsn = group->lsn; - const lsn_t source_offset = log_group_calc_lsn_offset(lsn, group); + ut_ad(!log_sys.is_encrypted()); + const lsn_t lsn = log_sys.log.get_lsn(); + const lsn_t source_offset = log_sys.log.calc_lsn_offset(lsn); const ulint page_no = (ulint) (source_offset / univ_page_size.physical()); - byte* buf = log_sys->buf; + byte* buf = log_sys.buf; fil_io(IORequestLogRead, true, page_id_t(SRV_LOG_SPACE_FIRST_ID, page_no), @@ -1228,11 +1204,7 @@ recv_log_recover_10_3() if (cksum != LOG_NO_CHECKSUM_MAGIC && cksum != log_block_calc_checksum_crc32(buf)) { - return(DB_CORRUPTION); - } - - if (group->is_encrypted()) { - log_crypt(buf, lsn, OS_FILE_LOG_BLOCK_SIZE, true); + return DB_CORRUPTION; } /* On a clean shutdown, the redo log will be logically empty @@ -1240,7 +1212,7 @@ recv_log_recover_10_3() if (log_block_get_data_len(buf) != (source_offset & (OS_FILE_LOG_BLOCK_SIZE - 1))) { - return(DB_ERROR); + return DB_ERROR; } /* Mark the redo log for downgrading. 
*/ @@ -1248,12 +1220,12 @@ recv_log_recover_10_3() recv_sys->parse_start_lsn = recv_sys->recovered_lsn = recv_sys->scanned_lsn = recv_sys->mlog_checkpoint_lsn = lsn; - log_sys->last_checkpoint_lsn = log_sys->next_checkpoint_lsn - = log_sys->lsn = log_sys->write_lsn - = log_sys->current_flush_lsn = log_sys->flushed_to_disk_lsn + log_sys.last_checkpoint_lsn = log_sys.next_checkpoint_lsn + = log_sys.lsn = log_sys.write_lsn + = log_sys.current_flush_lsn = log_sys.flushed_to_disk_lsn = lsn; - log_sys->next_checkpoint_no = 0; - return(DB_SUCCESS); + log_sys.next_checkpoint_no = 0; + return DB_SUCCESS; } /** Find the latest checkpoint in the log header. @@ -1262,29 +1234,24 @@ recv_log_recover_10_3() dberr_t recv_find_max_checkpoint(ulint* max_field) { - log_group_t* group; ib_uint64_t max_no; ib_uint64_t checkpoint_no; ulint field; byte* buf; - group = &log_sys->log; - max_no = 0; *max_field = 0; - buf = log_sys->checkpoint_buf; + buf = log_sys.checkpoint_buf; - group->state = LOG_GROUP_CORRUPTED; - - log_group_header_read(group, 0); + log_header_read(0); /* Check the header page checksum. There was no checksum in the first redo log format (version 0). */ - group->format = mach_read_from_4(buf + LOG_HEADER_FORMAT); - group->subformat = group->format + log_sys.log.format = mach_read_from_4(buf + LOG_HEADER_FORMAT); + log_sys.log.subformat = log_sys.log.format != LOG_HEADER_FORMAT_3_23 ? mach_read_from_4(buf + LOG_HEADER_SUBFORMAT) : 0; - if (group->format != 0 + if (log_sys.log.format != LOG_HEADER_FORMAT_3_23 && !recv_check_log_header_checksum(buf)) { ib::error() << "Invalid redo log header checksum."; return(DB_CORRUPTION); @@ -1296,35 +1263,27 @@ recv_find_max_checkpoint(ulint* max_field) /* Ensure that the string is NUL-terminated. */ creator[LOG_HEADER_CREATOR_END - LOG_HEADER_CREATOR] = 0; - switch (group->format) { - case 0: - return(recv_find_max_checkpoint_0(&group, max_field)); + switch (log_sys.log.format) { + case LOG_HEADER_FORMAT_3_23: + return(recv_find_max_checkpoint_0(max_field)); case LOG_HEADER_FORMAT_10_2: case LOG_HEADER_FORMAT_10_2 | LOG_HEADER_FORMAT_ENCRYPTED: - case LOG_HEADER_FORMAT_10_3: - case LOG_HEADER_FORMAT_10_3 | LOG_HEADER_FORMAT_ENCRYPTED: + case LOG_HEADER_FORMAT_CURRENT: + case LOG_HEADER_FORMAT_CURRENT | LOG_HEADER_FORMAT_ENCRYPTED: case LOG_HEADER_FORMAT_10_4: /* We can only parse the unencrypted LOG_HEADER_FORMAT_10_4. The encrypted format uses a larger redo log block trailer. */ break; default: ib::error() << "Unsupported redo log format." - " The redo log was created" - " with " << creator << - ". Please follow the instructions at " - "https://mariadb.com/kb/en/library/upgrading/"; - /* Do not issue a message about a possibility - to cleanly shut down the newer server version - and to remove the redo logs, because the - format of the system data structures may - radically change after MySQL 5.7. 
*/ + " The redo log was created with " << creator << "."; return(DB_ERROR); } for (field = LOG_CHECKPOINT_1; field <= LOG_CHECKPOINT_2; field += LOG_CHECKPOINT_2 - LOG_CHECKPOINT_1) { - log_group_header_read(group, field); + log_header_read(field); const ulint crc32 = log_block_calc_checksum_crc32(buf); const ulint cksum = log_block_get_checksum(buf); @@ -1339,7 +1298,7 @@ recv_find_max_checkpoint(ulint* max_field) continue; } - if (group->is_encrypted() + if (log_sys.is_encrypted() && !log_crypt_read_checkpoint_buf(buf)) { ib::error() << "Reading checkpoint" " encryption info failed."; @@ -1357,12 +1316,11 @@ recv_find_max_checkpoint(ulint* max_field) if (checkpoint_no >= max_no) { *max_field = field; max_no = checkpoint_no; - group->state = LOG_GROUP_OK; - group->lsn = mach_read_from_8( - buf + LOG_CHECKPOINT_LSN); - group->lsn_offset = mach_read_from_8( - buf + LOG_CHECKPOINT_OFFSET); - log_sys->next_checkpoint_no = checkpoint_no; + log_sys.log.set_lsn(mach_read_from_8( + buf + LOG_CHECKPOINT_LSN)); + log_sys.log.set_lsn_offset(mach_read_from_8( + buf + LOG_CHECKPOINT_OFFSET)); + log_sys.next_checkpoint_no = checkpoint_no; } } @@ -1379,22 +1337,8 @@ recv_find_max_checkpoint(ulint* max_field) return(DB_ERROR); } - switch (group->format) { - case LOG_HEADER_FORMAT_10_3: - case LOG_HEADER_FORMAT_10_3 | LOG_HEADER_FORMAT_ENCRYPTED: - if (group->subformat == 1) { - /* 10.2 with new crash-safe TRUNCATE */ - break; - } - /* fall through */ - case LOG_HEADER_FORMAT_10_4: - if (srv_operation == SRV_OPERATION_BACKUP) { - ib::error() - << "Incompatible redo log format." - " The redo log was created with " << creator; - return DB_ERROR; - } - dberr_t err = recv_log_recover_10_3(); + if (log_sys.log.format == LOG_HEADER_FORMAT_10_4) { + dberr_t err = recv_log_recover_10_4(); if (err != DB_SUCCESS) { ib::error() << "Downgrade after a crash is not supported." @@ -1402,10 +1346,10 @@ recv_find_max_checkpoint(ulint* max_field) << (err == DB_ERROR ? "." : ", and it appears corrupted."); } - return(err); + return err; } - return(DB_SUCCESS); + return DB_SUCCESS; } /** Try to parse a single log record body and also applies it if @@ -1737,18 +1681,22 @@ parse_log: ptr = trx_undo_parse_add_undo_rec(ptr, end_ptr, page); break; case MLOG_UNDO_ERASE_END: - ut_ad(!page || page_type == FIL_PAGE_UNDO_LOG); - ptr = trx_undo_parse_erase_page_end(ptr, end_ptr, page, mtr); + if (page) { + ut_ad(page_type == FIL_PAGE_UNDO_LOG); + trx_undo_erase_page_end(page); + } break; case MLOG_UNDO_INIT: /* Allow anything in page_type when creating a page. */ - ptr = trx_undo_parse_page_init(ptr, end_ptr, page, mtr); + ptr = trx_undo_parse_page_init(ptr, end_ptr, page); break; - case MLOG_UNDO_HDR_CREATE: case MLOG_UNDO_HDR_REUSE: ut_ad(!page || page_type == FIL_PAGE_UNDO_LOG); - ptr = trx_undo_parse_page_header(type, ptr, end_ptr, - page, mtr); + ptr = trx_undo_parse_page_header_reuse(ptr, end_ptr, page); + break; + case MLOG_UNDO_HDR_CREATE: + ut_ad(!page || page_type == FIL_PAGE_UNDO_LOG); + ptr = trx_undo_parse_page_header(ptr, end_ptr, page, mtr); break; case MLOG_REC_MIN_MARK: case MLOG_COMP_REC_MIN_MARK: ut_ad(!page || fil_page_type_is_index(page_type)); @@ -1815,9 +1763,15 @@ parse_log: ptr, end_ptr, page, page_zip, index); } break; + case MLOG_ZIP_WRITE_TRX_ID: + /* This must be a clustered index leaf page. 
*/ + ut_ad(!page || page_type == FIL_PAGE_INDEX); + ptr = page_zip_parse_write_trx_id(ptr, end_ptr, + page, page_zip); + break; case MLOG_FILE_WRITE_CRYPT_DATA: dberr_t err; - ptr = const_cast<byte*>(fil_parse_write_crypt_data(ptr, end_ptr, block, &err)); + ptr = const_cast<byte*>(fil_parse_write_crypt_data(ptr, end_ptr, &err)); if (err != DB_SUCCESS) { recv_sys->found_corrupt_log = TRUE; @@ -1929,13 +1883,13 @@ recv_add_to_hash_table( ut_ad(type != MLOG_INDEX_LOAD); ut_ad(type != MLOG_TRUNCATE); - len = rec_end - body; + len = ulint(rec_end - body); recv = static_cast<recv_t*>( mem_heap_alloc(recv_sys->heap, sizeof(recv_t))); recv->type = type; - recv->len = rec_end - body; + recv->len = ulint(rec_end - body); recv->start_lsn = start_lsn; recv->end_lsn = end_lsn; @@ -1972,13 +1926,13 @@ recv_add_to_hash_table( prev_field = &(recv->data); - /* Store the log record body in chunks of less than UNIV_PAGE_SIZE: + /* Store the log record body in chunks of less than srv_page_size: recv_sys->heap grows into the buffer pool, and bigger chunks could not be allocated */ while (rec_end > body) { - len = rec_end - body; + len = ulint(rec_end - body); if (len > RECV_DATA_BLOCK_SIZE) { len = RECV_DATA_BLOCK_SIZE; @@ -2071,6 +2025,7 @@ static void recv_recover_page(buf_block_t* block, mtr_t& mtr, } lsn_t start_lsn = 0, end_lsn = 0; + fil_space_t* space; if (srv_is_tablespace_truncated(recv_addr->space)) { /* The table will be truncated after applying @@ -2078,11 +2033,16 @@ static void recv_recover_page(buf_block_t* block, mtr_t& mtr, goto skip_log; } + space = fil_space_acquire(recv_addr->space); + if (!space) { + goto skip_log; + } + for (recv_t* recv = UT_LIST_GET_FIRST(recv_addr->rec_list); recv; recv = UT_LIST_GET_NEXT(rec_list, recv)) { ut_ad(recv->start_lsn); end_lsn = recv->end_lsn; - ut_ad(end_lsn <= log_sys->log.scanned_lsn); + ut_ad(end_lsn <= log_sys.log.scanned_lsn); if (recv->start_lsn < page_lsn) { /* Ignore this record, because there are later changes @@ -2096,8 +2056,7 @@ static void recv_recover_page(buf_block_t* block, mtr_t& mtr, << get_mlog_string(recv->type) << " LSN " << recv->start_lsn << " < " << init_lsn); - } else if (srv_was_tablespace_truncated( - fil_space_get(recv_addr->space)) + } else if (srv_was_tablespace_truncated(space) && recv->start_lsn < truncate_t::get_truncated_tablespace_init_lsn( recv_addr->space)) { @@ -2159,6 +2118,8 @@ static void recv_recover_page(buf_block_t* block, mtr_t& mtr, } } + space->release(); + skip_log: #ifdef UNIV_ZIP_DEBUG ut_ad(!fil_page_index_page_check(page) @@ -2305,35 +2266,35 @@ to create a page which has buffered page intialization redo log records. 
static buf_block_t* recv_recovery_create_page_low(const page_id_t page_id, recv_addr_t* recv_addr) { - mtr_t mtr; - mlog_init_t::init& i = mlog_init.last(page_id); - const lsn_t end_lsn = UT_LIST_GET_LAST(recv_addr->rec_list)->end_lsn; + mtr_t mtr; + mlog_init_t::init &i= mlog_init.last(page_id); + const lsn_t end_lsn= UT_LIST_GET_LAST(recv_addr->rec_list)->end_lsn; if (end_lsn < i.lsn) { DBUG_LOG("ib_log", "skip log for page " - << page_id - << " LSN " << end_lsn - << " < " << i.lsn); - recv_addr->state = RECV_PROCESSED; + << page_id + << " LSN " << end_lsn + << " < " << i.lsn); + recv_addr->state= RECV_PROCESSED; ignore: ut_a(recv_sys->n_addrs); recv_sys->n_addrs--; return NULL; } - fil_space_t* space = fil_space_acquire(recv_addr->space); + fil_space_t *space= fil_space_acquire(recv_addr->space); if (!space) { - recv_addr->state = RECV_PROCESSED; + recv_addr->state= RECV_PROCESSED; goto ignore; } if (space->enable_lsn) { init_fail: - fil_space_release(space); - recv_addr->state = RECV_NOT_PROCESSED; + space->release(); + recv_addr->state= RECV_NOT_PROCESSED; return NULL; } @@ -2352,7 +2313,7 @@ init_fail: mtr.start(); mtr.set_log_mode(MTR_LOG_NONE); - buf_block_t* block = buf_page_create(page_id, page_size_t(space->flags), + buf_block_t *block= buf_page_create(page_id, page_size_t(space->flags), &mtr); if (recv_addr->state == RECV_PROCESSED) /* The page happened to exist in the buffer pool, or it was @@ -2361,13 +2322,13 @@ init_fail: mtr.commit(); else { - i.created = true; + i.created= true; buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK); recv_recover_page(block, mtr, recv_addr, i.lsn); ut_ad(mtr.has_committed()); } - fil_space_release(space); + space->release(); return block; } @@ -2665,7 +2626,7 @@ recv_parse_log_rec( fil_space_set_recv_size(*space, size); } - return(new_ptr - ptr); + return ulint(new_ptr - ptr); } /*******************************************************//** @@ -2893,9 +2854,7 @@ loop: /* Do nothing */ break; case MLOG_CHECKPOINT: -#if SIZE_OF_MLOG_CHECKPOINT != 1 + 8 -# error SIZE_OF_MLOG_CHECKPOINT != 1 + 8 -#endif + compile_time_assert(SIZE_OF_MLOG_CHECKPOINT == 1 + 8); lsn = mach_read_from_8(ptr + 1); if (UNIV_UNLIKELY(srv_print_verbose_log == 2)) { @@ -3433,7 +3392,6 @@ func_exit: /** Scans log from a buffer and stores new log data to the parsing buffer. Parses and hashes the log records if new data found. -@param[in,out] group log group @param[in] checkpoint_lsn latest checkpoint log sequence number @param[in,out] contiguous_lsn log sequence number until which all redo log has been scanned @@ -3443,7 +3401,6 @@ can be applied to the tablespaces static bool recv_group_scan_log_recs( - log_group_t* group, lsn_t checkpoint_lsn, lsn_t* contiguous_lsn, bool last_phase) @@ -3473,10 +3430,10 @@ recv_group_scan_log_recs( store_t store_to_hash = recv_sys->mlog_checkpoint_lsn == 0 ? STORE_NO : (last_phase ? 
STORE_IF_EXISTS : STORE_YES); ulint available_mem = (buf_pool_get_n_pages() * 2 / 3) - << srv_page_size_shift; + << srv_page_size_shift; - group->scanned_lsn = end_lsn = *contiguous_lsn = ut_uint64_align_down( - *contiguous_lsn, OS_FILE_LOG_BLOCK_SIZE); + log_sys.log.scanned_lsn = end_lsn = *contiguous_lsn = + ut_uint64_align_down(*contiguous_lsn, OS_FILE_LOG_BLOCK_SIZE); do { if (last_phase && store_to_hash == STORE_NO) { @@ -3493,14 +3450,13 @@ recv_group_scan_log_recs( start_lsn = ut_uint64_align_down(end_lsn, OS_FILE_LOG_BLOCK_SIZE); end_lsn = start_lsn; - log_group_read_log_seg( - log_sys->buf, group, &end_lsn, - start_lsn + RECV_SCAN_SIZE); + log_sys.log.read_log_seg(&end_lsn, start_lsn + RECV_SCAN_SIZE); } while (end_lsn != start_lsn && !recv_scan_log_recs( - available_mem, &store_to_hash, log_sys->buf, - checkpoint_lsn, start_lsn, end_lsn, - contiguous_lsn, &group->scanned_lsn)); + available_mem, &store_to_hash, log_sys.buf, + checkpoint_lsn, + start_lsn, end_lsn, + contiguous_lsn, &log_sys.log.scanned_lsn)); if (recv_sys->found_corrupt_log || recv_sys->found_corrupt_fs) { DBUG_RETURN(false); @@ -3508,7 +3464,7 @@ recv_group_scan_log_recs( DBUG_PRINT("ib_log", ("%s " LSN_PF " completed", last_phase ? "rescan" : "scan", - group->scanned_lsn)); + log_sys.log.scanned_lsn)); DBUG_RETURN(store_to_hash == STORE_NO); } @@ -3715,55 +3671,35 @@ recv_recovery_from_checkpoint_start(lsn_t flush_lsn) log_mutex_enter(); - /* Look for the latest checkpoint from any of the log groups */ - err = recv_find_max_checkpoint(&max_cp_field); if (err != DB_SUCCESS) { -skip_apply: + + srv_start_lsn = recv_sys->recovered_lsn = log_sys.lsn; log_mutex_exit(); return(err); } - switch (log_sys->log.format) { - case 0: - break; - case LOG_HEADER_FORMAT_10_2: - case LOG_HEADER_FORMAT_10_2 | LOG_HEADER_FORMAT_ENCRYPTED: - break; - case LOG_HEADER_FORMAT_10_3: - case LOG_HEADER_FORMAT_10_3 | LOG_HEADER_FORMAT_ENCRYPTED: - if (log_sys->log.subformat == 1) { - /* 10.2 with new crash-safe TRUNCATE */ - break; - } - /* fall through */ - default: - /* This must be a clean log from a newer version. */ - goto skip_apply; - } - - log_group_header_read(&log_sys->log, max_cp_field); + log_header_read(max_cp_field); - buf = log_sys->checkpoint_buf; + buf = log_sys.checkpoint_buf; checkpoint_lsn = mach_read_from_8(buf + LOG_CHECKPOINT_LSN); checkpoint_no = mach_read_from_8(buf + LOG_CHECKPOINT_NO); - /* Start reading the log groups from the checkpoint lsn up. The - variable contiguous_lsn contains an lsn up to which the log is - known to be contiguously written to all log groups. */ - + /* Start reading the log from the checkpoint lsn. The variable + contiguous_lsn contains an lsn up to which the log is known to + be contiguously written. */ recv_sys->mlog_checkpoint_lsn = 0; - ut_ad(RECV_SCAN_SIZE <= log_sys->buf_size); + ut_ad(RECV_SCAN_SIZE <= srv_log_buffer_size); const lsn_t end_lsn = mach_read_from_8( buf + LOG_CHECKPOINT_END_LSN); ut_ad(recv_sys->n_addrs == 0); contiguous_lsn = checkpoint_lsn; - switch (log_sys->log.format) { + switch (log_sys.log.format) { case 0: log_mutex_exit(); return recv_log_format_0_recover(checkpoint_lsn, @@ -3782,9 +3718,7 @@ skip_apply: } /* Look for MLOG_CHECKPOINT. */ - log_group_t* group = &log_sys->log; - recv_group_scan_log_recs(group, checkpoint_lsn, &contiguous_lsn, - false); + recv_group_scan_log_recs(checkpoint_lsn, &contiguous_lsn, false); /* The first scan should not have stored or applied any records. 
*/ ut_ad(recv_sys->n_addrs == 0); ut_ad(!recv_sys->found_corrupt_fs); @@ -3801,7 +3735,7 @@ skip_apply: } if (recv_sys->mlog_checkpoint_lsn == 0) { - lsn_t scan_lsn = group->scanned_lsn; + lsn_t scan_lsn = log_sys.log.scanned_lsn; if (!srv_read_only_mode && scan_lsn != checkpoint_lsn) { log_mutex_exit(); ib::error err; @@ -3814,12 +3748,12 @@ skip_apply: return(DB_ERROR); } - group->scanned_lsn = checkpoint_lsn; + log_sys.log.scanned_lsn = checkpoint_lsn; rescan = false; } else { contiguous_lsn = checkpoint_lsn; rescan = recv_group_scan_log_recs( - group, checkpoint_lsn, &contiguous_lsn, false); + checkpoint_lsn, &contiguous_lsn, false); if ((recv_sys->found_corrupt_log && !srv_force_recovery) || recv_sys->found_corrupt_fs) { @@ -3865,7 +3799,7 @@ skip_apply: } } - log_sys->lsn = recv_sys->recovered_lsn; + log_sys.lsn = recv_sys->recovered_lsn; if (recv_needed_recovery) { bool missing_tablespace = false; @@ -3892,8 +3826,7 @@ skip_apply: lsn_t recent_stored_lsn = recv_sys->last_stored_lsn; rescan = recv_group_scan_log_recs( - group, checkpoint_lsn, - &recent_stored_lsn, false); + checkpoint_lsn, &recent_stored_lsn, false); ut_ad(!recv_sys->found_corrupt_fs); @@ -3928,8 +3861,8 @@ skip_apply: if (rescan) { contiguous_lsn = checkpoint_lsn; - recv_group_scan_log_recs(group, checkpoint_lsn, - &contiguous_lsn, true); + recv_group_scan_log_recs( + checkpoint_lsn, &contiguous_lsn, true); if ((recv_sys->found_corrupt_log && !srv_force_recovery) @@ -3942,12 +3875,11 @@ skip_apply: ut_ad(!rescan || recv_sys->n_addrs == 0); } - /* We currently have only one log group */ - - if (group->scanned_lsn < checkpoint_lsn - || group->scanned_lsn < recv_max_page_lsn) { + if (log_sys.log.scanned_lsn < checkpoint_lsn + || log_sys.log.scanned_lsn < recv_max_page_lsn) { - ib::error() << "We scanned the log up to " << group->scanned_lsn + ib::error() << "We scanned the log up to " + << log_sys.log.scanned_lsn << ". A checkpoint was at " << checkpoint_lsn << " and" " the maximum LSN on a database page was " << recv_max_page_lsn << ". It is possible that the" @@ -3963,11 +3895,8 @@ skip_apply: return(DB_ERROR); } - /* Synchronize the uncorrupted log groups to the most up-to-date log - group; we also copy checkpoint info to groups */ - - log_sys->next_checkpoint_lsn = checkpoint_lsn; - log_sys->next_checkpoint_no = checkpoint_no + 1; + log_sys.next_checkpoint_lsn = checkpoint_lsn; + log_sys.next_checkpoint_no = checkpoint_no + 1; recv_synchronize_groups(); @@ -3977,24 +3906,24 @@ skip_apply: srv_start_lsn = recv_sys->recovered_lsn; } - log_sys->buf_free = (ulint) log_sys->lsn % OS_FILE_LOG_BLOCK_SIZE; - log_sys->buf_next_to_write = log_sys->buf_free; - log_sys->write_lsn = log_sys->lsn; + log_sys.buf_free = ulong(log_sys.lsn % OS_FILE_LOG_BLOCK_SIZE); + log_sys.buf_next_to_write = log_sys.buf_free; + log_sys.write_lsn = log_sys.lsn; - log_sys->last_checkpoint_lsn = checkpoint_lsn; + log_sys.last_checkpoint_lsn = checkpoint_lsn; if (!srv_read_only_mode && srv_operation == SRV_OPERATION_NORMAL) { /* Write a MLOG_CHECKPOINT marker as the first thing, before generating any other redo log. This ensures that subsequent crash recovery will be possible even if the server were killed soon after this. 
*/ - fil_names_clear(log_sys->last_checkpoint_lsn, true); + fil_names_clear(log_sys.last_checkpoint_lsn, true); } MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE, - log_sys->lsn - log_sys->last_checkpoint_lsn); + log_sys.lsn - log_sys.last_checkpoint_lsn); - log_sys->next_checkpoint_no = ++checkpoint_no; + log_sys.next_checkpoint_no = ++checkpoint_no; mutex_enter(&recv_sys->mutex); @@ -4073,7 +4002,6 @@ recv_recovery_rollback_active(void) /* Drop partially created indexes. */ row_merge_drop_temp_indexes(); /* Drop garbage tables. */ - if (srv_safe_truncate) row_mysql_drop_garbage_tables(); /* Drop any auxiliary tables that were not dropped when the @@ -4085,8 +4013,8 @@ recv_recovery_rollback_active(void) /* Rollback the uncommitted transactions which have no user session */ - trx_rollback_or_clean_is_active = true; - os_thread_create(trx_rollback_or_clean_all_recovered, 0, 0); + trx_rollback_is_active = true; + os_thread_create(trx_rollback_all_recovered, 0, 0); } } @@ -4323,6 +4251,9 @@ static const char* get_mlog_string(mlog_id_t type) case MLOG_ZIP_PAGE_REORGANIZE: return("MLOG_ZIP_PAGE_REORGANIZE"); + case MLOG_ZIP_WRITE_TRX_ID: + return("MLOG_ZIP_WRITE_TRX_ID"); + case MLOG_FILE_RENAME2: return("MLOG_FILE_RENAME2"); diff --git a/storage/innobase/mem/mem0mem.cc b/storage/innobase/mem/mem0mem.cc index 783451abbf2..03ab4a89f77 100644 --- a/storage/innobase/mem/mem0mem.cc +++ b/storage/innobase/mem/mem0mem.cc @@ -29,18 +29,6 @@ Created 6/9/1994 Heikki Tuuri #include "srv0srv.h" #include <stdarg.h> -/** Duplicates a NUL-terminated string, allocated from a memory heap. -@param[in] heap, memory heap where string is allocated -@param[in] str) string to be copied -@return own: a copy of the string */ -char* -mem_heap_strdup( - mem_heap_t* heap, - const char* str) -{ - return(static_cast<char*>(mem_heap_dup(heap, str, strlen(str) + 1))); -} - /**********************************************************************//** Concatenate two strings and return the result, using a memory heap. @return own: the result */ @@ -136,7 +124,7 @@ mem_heap_printf_low( val = va_arg(ap, unsigned long); - plen = sprintf(tmp, "%lu", val); + plen = size_t(sprintf(tmp, "%lu", val)); len += plen; if (buf) { @@ -227,7 +215,7 @@ mem_heap_validate( break; case MEM_HEAP_BUFFER: case MEM_HEAP_BUFFER | MEM_HEAP_BTR_SEARCH: - ut_ad(block->len <= UNIV_PAGE_SIZE); + ut_ad(block->len <= srv_page_size); break; default: ut_error; @@ -282,13 +270,13 @@ mem_heap_create_block_func( /* In dynamic allocation, calculate the size: block header + data. 
*/ len = MEM_BLOCK_HEADER_SIZE + MEM_SPACE_NEEDED(n); - if (type == MEM_HEAP_DYNAMIC || len < UNIV_PAGE_SIZE / 2) { + if (type == MEM_HEAP_DYNAMIC || len < srv_page_size / 2) { ut_ad(type == MEM_HEAP_DYNAMIC || n <= MEM_MAX_ALLOC_IN_BUF); block = static_cast<mem_block_t*>(ut_malloc_nokey(len)); } else { - len = UNIV_PAGE_SIZE; + len = srv_page_size; if ((type & MEM_HEAP_BTR_SEARCH) && heap) { /* We cannot allocate the block from the @@ -423,7 +411,7 @@ mem_heap_block_free( type = heap->type; len = block->len; - if (type == MEM_HEAP_DYNAMIC || len < UNIV_PAGE_SIZE / 2) { + if (type == MEM_HEAP_DYNAMIC || len < srv_page_size / 2) { ut_ad(!buf_block); ut_free(block); } else { diff --git a/storage/innobase/mtr/mtr0log.cc b/storage/innobase/mtr/mtr0log.cc index 0e6a80cb363..714bd4435dc 100644 --- a/storage/innobase/mtr/mtr0log.cc +++ b/storage/innobase/mtr/mtr0log.cc @@ -148,7 +148,7 @@ mlog_parse_nbytes( offset = mach_read_from_2(ptr); ptr += 2; - if (offset >= UNIV_PAGE_SIZE) { + if (offset >= srv_page_size) { recv_sys->found_corrupt_log = TRUE; return(NULL); @@ -316,7 +316,7 @@ mlog_write_string( mtr_t* mtr) /*!< in: mini-transaction handle */ { ut_ad(ptr && mtr); - ut_a(len < UNIV_PAGE_SIZE); + ut_a(len < srv_page_size); memcpy(ptr, str, len); @@ -336,7 +336,7 @@ mlog_log_string( byte* log_ptr; ut_ad(ptr && mtr); - ut_ad(len <= UNIV_PAGE_SIZE); + ut_ad(len <= srv_page_size); log_ptr = mlog_open(mtr, 30); @@ -387,7 +387,7 @@ mlog_parse_string( len = mach_read_from_2(ptr); ptr += 2; - if (offset >= UNIV_PAGE_SIZE || len + offset > UNIV_PAGE_SIZE) { + if (offset >= srv_page_size || len + offset > srv_page_size) { recv_sys->found_corrupt_log = TRUE; return(NULL); @@ -430,23 +430,30 @@ mlog_open_and_write_index( ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table)); + mtr->set_modified(); + switch (mtr->get_log_mode()) { + case MTR_LOG_NONE: + case MTR_LOG_NO_REDO: + return NULL; + case MTR_LOG_SHORT_INSERTS: + ut_ad(0); + /* fall through */ + case MTR_LOG_ALL: + break; + } + if (!page_rec_is_comp(rec)) { - log_start = log_ptr = mlog_open(mtr, 11 + size); - if (!log_ptr) { - return(NULL); /* logging is disabled */ - } + log_start = log_ptr = mtr->get_log()->open(11 + size); log_ptr = mlog_write_initial_log_record_fast(rec, type, log_ptr, mtr); log_end = log_ptr + 11 + size; } else { ulint i; + bool is_instant = index->is_instant(); ulint n = dict_index_get_n_fields(index); - ulint total = 11 + size + (n + 2) * 2; - ulint alloc = total; - - if (alloc > mtr_buf_t::MAX_DATA_SIZE) { - alloc = mtr_buf_t::MAX_DATA_SIZE; - } + ulint total = 11 + (is_instant ? 
2 : 0) + size + (n + 2) * 2; + ulint alloc = std::min(total, + ulint(mtr_buf_t::MAX_DATA_SIZE)); const bool is_leaf = page_is_leaf(page_align(rec)); @@ -456,30 +463,30 @@ mlog_open_and_write_index( n = DICT_INDEX_SPATIAL_NODEPTR_SIZE; } - log_start = log_ptr = mlog_open(mtr, alloc); - - if (!log_ptr) { - return(NULL); /* logging is disabled */ - } - + log_start = log_ptr = mtr->get_log()->open(alloc); log_end = log_ptr + alloc; log_ptr = mlog_write_initial_log_record_fast( rec, type, log_ptr, mtr); - mach_write_to_2(log_ptr, n); - log_ptr += 2; + if (is_instant) { + // marked as instant index + mach_write_to_2(log_ptr, n | 0x8000); + + log_ptr += 2; - if (is_leaf) { - mach_write_to_2( - log_ptr, dict_index_get_n_unique_in_tree(index)); + // record the n_core_fields + mach_write_to_2(log_ptr, index->n_core_fields); } else { - mach_write_to_2( - log_ptr, - dict_index_get_n_unique_in_tree_nonleaf(index)); + mach_write_to_2(log_ptr, n); } log_ptr += 2; + mach_write_to_2( + log_ptr, is_leaf + ? dict_index_get_n_unique_in_tree(index) + : dict_index_get_n_unique_in_tree_nonleaf(index)); + log_ptr += 2; for (i = 0; i < n; i++) { dict_field_t* field; @@ -501,19 +508,14 @@ mlog_open_and_write_index( } if (log_ptr + 2 > log_end) { mlog_close(mtr, log_ptr); - ut_a(total > (ulint) (log_ptr - log_start)); - total -= log_ptr - log_start; - alloc = total; - - if (alloc > mtr_buf_t::MAX_DATA_SIZE) { - alloc = mtr_buf_t::MAX_DATA_SIZE; - } - - log_start = log_ptr = mlog_open(mtr, alloc); - - if (!log_ptr) { - return(NULL); /* logging is disabled */ - } + ut_a(total > ulint(log_ptr - log_start)); + total -= ulint(log_ptr - log_start); + alloc = std::min( + total, + ulint(mtr_buf_t::MAX_DATA_SIZE)); + + log_start = log_ptr = mtr->get_log()->open( + alloc); log_end = log_ptr + alloc; } mach_write_to_2(log_ptr, len); @@ -544,6 +546,7 @@ mlog_parse_index( ulint i, n, n_uniq; dict_table_t* table; dict_index_t* ind; + ulint n_core_fields = 0; ut_ad(comp == FALSE || comp == TRUE); @@ -553,6 +556,23 @@ mlog_parse_index( } n = mach_read_from_2(ptr); ptr += 2; + if (n & 0x8000) { /* record after instant ADD COLUMN */ + n &= 0x7FFF; + + n_core_fields = mach_read_from_2(ptr); + + if (!n_core_fields || n_core_fields > n) { + recv_sys->found_corrupt_log = TRUE; + return(NULL); + } + + ptr += 2; + + if (end_ptr < ptr + 2) { + return(NULL); + } + } + n_uniq = mach_read_from_2(ptr); ptr += 2; ut_ad(n_uniq <= n); @@ -562,11 +582,9 @@ mlog_parse_index( } else { n = n_uniq = 1; } - table = dict_mem_table_create("LOG_DUMMY", DICT_HDR_SPACE, n, 0, + table = dict_mem_table_create("LOG_DUMMY", NULL, n, 0, comp ? 
DICT_TF_COMPACT : 0, 0); - ind = dict_mem_index_create("LOG_DUMMY", "LOG_DUMMY", - DICT_HDR_SPACE, 0, n); - ind->table = table; + ind = dict_mem_index_create(table, "LOG_DUMMY", 0, n); ind->n_uniq = (unsigned int) n_uniq; if (n_uniq != n) { ut_a(n_uniq + DATA_ROLL_PTR <= n); @@ -604,6 +622,22 @@ mlog_parse_index( ind->fields[DATA_ROLL_PTR - 1 + n_uniq].col = &table->cols[n + DATA_ROLL_PTR]; } + + ut_ad(table->n_cols == table->n_def); + + if (n_core_fields) { + for (i = n_core_fields; i < n; i++) { + ind->fields[i].col->def_val.len + = UNIV_SQL_NULL; + } + ind->n_core_fields = n_core_fields; + ind->n_core_null_bytes = UT_BITS_IN_BYTES( + ind->get_n_nullable(n_core_fields)); + } else { + ind->n_core_null_bytes = UT_BITS_IN_BYTES( + unsigned(ind->n_nullable)); + ind->n_core_fields = ind->n_fields; + } } /* avoid ut_ad(index->cached) in dict_index_get_n_unique_in_tree */ ind->cached = TRUE; diff --git a/storage/innobase/mtr/mtr0mtr.cc b/storage/innobase/mtr/mtr0mtr.cc index fefc0687ddb..2e364ba4945 100644 --- a/storage/innobase/mtr/mtr0mtr.cc +++ b/storage/innobase/mtr/mtr0mtr.cc @@ -382,7 +382,7 @@ mtr_write_log( ut_ad(!recv_no_log_write); DBUG_PRINT("ib_log", (ULINTPF " extra bytes written at " LSN_PF, - len, log_sys->lsn)); + len, log_sys.lsn)); log_reserve_and_open(len); log->for_each_block(write_log); @@ -404,8 +404,6 @@ void mtr_t::start() m_log_mode= MTR_LOG_ALL; ut_d(m_user_space_id= TRX_SYS_SPACE); m_user_space= NULL; - m_undo_space= NULL; - m_sys_space= NULL; m_state= MTR_STATE_ACTIVE; m_flush_observer= NULL; m_commit_lsn= 0; @@ -510,7 +508,7 @@ mtr_t::commit_checkpoint( if (write_mlog_checkpoint) { DBUG_PRINT("ib_log", ("MLOG_CHECKPOINT(" LSN_PF ") written at " LSN_PF, - checkpoint_lsn, log_sys->lsn)); + checkpoint_lsn, log_sys.lsn)); } } @@ -522,12 +520,7 @@ mtr_t::commit_checkpoint( bool mtr_t::is_named_space(ulint space) const { - ut_ad(!m_sys_space || m_sys_space->id == TRX_SYS_SPACE); - ut_ad(!m_undo_space || m_undo_space->id != TRX_SYS_SPACE); ut_ad(!m_user_space || m_user_space->id != TRX_SYS_SPACE); - ut_ad(!m_sys_space || m_sys_space != m_user_space); - ut_ad(!m_sys_space || m_sys_space != m_undo_space); - ut_ad(!m_user_space || m_user_space != m_undo_space); switch (get_log_mode()) { case MTR_LOG_NONE: @@ -542,6 +535,26 @@ mtr_t::is_named_space(ulint space) const ut_error; return(false); } +/** Check if a tablespace is associated with the mini-transaction +(needed for generating a MLOG_FILE_NAME record) +@param[in] space tablespace +@return whether the mini-transaction is associated with the space */ +bool mtr_t::is_named_space(const fil_space_t* space) const +{ + ut_ad(!m_user_space || m_user_space->id != TRX_SYS_SPACE); + + switch (get_log_mode()) { + case MTR_LOG_NONE: + case MTR_LOG_NO_REDO: + return true; + case MTR_LOG_ALL: + case MTR_LOG_SHORT_INSERTS: + return m_user_space == space || is_predefined_tablespace(space->id); + } + + ut_error; + return false; +} #endif /* UNIV_DEBUG */ /** Acquire a tablespace X-latch. 
@@ -558,80 +571,23 @@ mtr_t::x_lock_space(ulint space_id, const char* file, unsigned line) ut_ad(is_active()); if (space_id == TRX_SYS_SPACE) { - space = m_sys_space; - - if (!space) { - space = m_sys_space = fil_space_get(space_id); - } + space = fil_system.sys_space; } else if ((space = m_user_space) && space_id == space->id) { - } else if ((space = m_undo_space) && space_id == space->id) { - } else if (get_log_mode() == MTR_LOG_NO_REDO) { + } else { space = fil_space_get(space_id); - ut_ad(space->purpose == FIL_TYPE_TEMPORARY + ut_ad(get_log_mode() != MTR_LOG_NO_REDO + || space->purpose == FIL_TYPE_TEMPORARY || space->purpose == FIL_TYPE_IMPORT - || space->redo_skipped_count > 0 + || my_atomic_loadlint(&space->redo_skipped_count) > 0 || srv_is_tablespace_truncated(space->id)); - } else { - /* called from trx_rseg_create() */ - space = m_undo_space = fil_space_get(space_id); } ut_ad(space); ut_ad(space->id == space_id); x_lock_space(space, file, line); - ut_ad(space->purpose == FIL_TYPE_TEMPORARY - || space->purpose == FIL_TYPE_IMPORT - || space->purpose == FIL_TYPE_TABLESPACE); return(space); } -/** Exclusively aqcuire a tablespace latch. -@param space tablespace -@param file source code file name of the caller -@param line source code line number */ -void mtr_t::x_lock_space(fil_space_t *space, const char *file, unsigned line) -{ - rw_lock_x_lock_inline(&space->latch, 0, file, line); - memo_push(space, MTR_MEMO_SPACE_X_LOCK); -} - -/** Look up the system tablespace. */ -void -mtr_t::lookup_sys_space() -{ - ut_ad(!m_sys_space); - m_sys_space = fil_space_get(TRX_SYS_SPACE); - ut_ad(m_sys_space); -} - -/** Look up the user tablespace. -@param[in] space_id tablespace ID */ -void -mtr_t::lookup_user_space(ulint space_id) -{ - ut_ad(space_id != TRX_SYS_SPACE); - ut_ad(m_user_space_id == space_id); - ut_ad(!m_user_space); - m_user_space = fil_space_get(space_id); - ut_ad(m_user_space); -} - -/** Set the tablespace associated with the mini-transaction -(needed for generating a MLOG_FILE_NAME record) -@param[in] space user or system tablespace */ -void -mtr_t::set_named_space(fil_space_t* space) -{ - ut_ad(m_user_space_id == TRX_SYS_SPACE); - ut_d(m_user_space_id = space->id); - if (space->id == TRX_SYS_SPACE) { - ut_ad(!m_sys_space || m_sys_space == space); - m_sys_space = space; - } else { - m_user_space = space; - } -} - /** Release an object in the memo stack. @return true if released */ bool @@ -686,7 +642,7 @@ inline ulint mtr_t::prepare_write() ut_ad(m_log_mode == MTR_LOG_NO_REDO); ut_ad(m_log.size() == 0); log_mutex_enter(); - m_commit_lsn = log_sys->lsn; + m_commit_lsn = log_sys.lsn; return 0; } @@ -695,8 +651,8 @@ inline ulint mtr_t::prepare_write() ut_ad(len > 0); ut_ad(n_recs > 0); - if (len > log_sys->buf_size / 2) { - log_buffer_extend((len + 1) * 2); + if (len > srv_log_buffer_size / 2) { + log_buffer_extend(ulong((len + 1) * 2)); } ut_ad(m_n_log_recs == n_recs); @@ -780,30 +736,6 @@ inline lsn_t mtr_t::finish_write(ulint len) return start_lsn; } -/** Release the free extents that was reserved using -fsp_reserve_free_extents(). This is equivalent to calling -fil_space_release_free_extents(). This is intended for use -with index pages. 
-@param[in] n_reserved number of reserved extents */ -void -mtr_t::release_free_extents(ulint n_reserved) -{ - fil_space_t *space= m_user_space; - - ut_ad(!m_undo_space); - - if (space) - ut_ad(m_user_space->id == m_user_space_id); - else - { - ut_ad(m_sys_space->id == TRX_SYS_SPACE); - space= m_sys_space; - } - - ut_ad(memo_contains(get_memo(), space, MTR_MEMO_SPACE_X_LOCK)); - space->release_free_extents(n_reserved); -} - /** Find out whether a block was not X-latched by the mini-transaction */ struct FindBlockX { diff --git a/storage/innobase/mysql-test/storage_engine/repair_table.rdiff b/storage/innobase/mysql-test/storage_engine/repair_table.rdiff index 717d437b2d1..e9c46b3a6c1 100644 --- a/storage/innobase/mysql-test/storage_engine/repair_table.rdiff +++ b/storage/innobase/mysql-test/storage_engine/repair_table.rdiff @@ -78,7 +78,7 @@ DROP TABLE t1, t2; call mtr.add_suppression("Got an error from thread_id=.*"); call mtr.add_suppression("MySQL thread id .*, query id .* localhost.*root Checking table"); -@@ -62,45 +63,32 @@ +@@ -63,46 +64,33 @@ CREATE TABLE t1 (a <INT_COLUMN>, b <CHAR_COLUMN>, <CUSTOM_INDEX> (a)) ENGINE=<STORAGE_ENGINE> <CUSTOM_TABLE_OPTIONS>; REPAIR TABLE t1; Table Op Msg_type Msg_text @@ -94,9 +94,10 @@ Table Op Msg_type Msg_text -test.t1 repair warning Number of rows changed from 0 to 3 -test.t1 repair status OK ++test.t1 repair note The storage engine for the table doesn't support repair + db.opt -t1.MYD -t1.MYI -+test.t1 repair note The storage engine for the table doesn't support repair t1.frm +t1.ibd INSERT INTO t1 (a,b) VALUES (14,'n'),(15,'o'); diff --git a/storage/innobase/os/os0event.cc b/storage/innobase/os/os0event.cc index 9b5f8a45180..f1d7b2ed337 100644 --- a/storage/innobase/os/os0event.cc +++ b/storage/innobase/os/os0event.cc @@ -40,7 +40,7 @@ typedef pthread_cond_t os_cond_t; /** InnoDB condition variable. */ struct os_event { - os_event(const char* name) UNIV_NOTHROW; + os_event() UNIV_NOTHROW; ~os_event() UNIV_NOTHROW; @@ -395,7 +395,7 @@ os_event::wait_time_low( } /** Constructor */ -os_event::os_event(const char* name) UNIV_NOTHROW +os_event::os_event() UNIV_NOTHROW { init(); @@ -424,14 +424,9 @@ Creates an event semaphore, i.e., a semaphore which may just have two states: signaled and nonsignaled. The created event is manual reset: it must be reset explicitly by calling sync_os_reset_event. @return the event handle */ -os_event_t -os_event_create( -/*============*/ - const char* name) /*!< in: the name of the - event, if NULL the event - is created without a name */ +os_event_t os_event_create(const char*) { - return(UT_NEW_NOKEY(os_event(name))); + return(UT_NEW_NOKEY(os_event())); } /** diff --git a/storage/innobase/os/os0file.cc b/storage/innobase/os/os0file.cc index d2d5769d85e..bba682689a6 100644 --- a/storage/innobase/os/os0file.cc +++ b/storage/innobase/os/os0file.cc @@ -692,10 +692,6 @@ bool os_has_said_disk_full; /** Default Zip compression level */ extern uint page_zip_level; -#if DATA_TRX_ID_LEN > 6 -#error "COMPRESSION_ALGORITHM will not fit" -#endif /* DATA_TRX_ID_LEN */ - /** Validates the consistency of the aio system. 
@return true if ok */ static @@ -852,7 +848,8 @@ os_file_get_block_size( #ifdef _WIN32 fblock_size = 0; - + BOOL result = false; + size_t len = 0; // Open volume for this file, find out it "physical bytes per sector" HANDLE volume_handle = INVALID_HANDLE_VALUE; @@ -863,7 +860,7 @@ os_file_get_block_size( goto end; } - size_t len = strlen(volume); + len = strlen(volume); if (volume[len - 1] == '\\') { // Trim trailing backslash from volume name. volume[len - 1] = 0; @@ -889,7 +886,7 @@ os_file_get_block_size( storage_query.PropertyId = StorageAccessAlignmentProperty; storage_query.QueryType = PropertyStandardQuery; - BOOL result = os_win32_device_io_control(volume_handle, + result = os_win32_device_io_control(volume_handle, IOCTL_STORAGE_QUERY_PROPERTY, &storage_query, sizeof(storage_query), @@ -1039,7 +1036,7 @@ AIOHandler::post_io_processing(Slot* slot) ut_ad(slot->is_reserved); /* Total bytes read so far */ - ulint n_bytes = (slot->ptr - slot->buf) + slot->n_bytes; + ulint n_bytes = ulint(slot->ptr - slot->buf) + slot->n_bytes; return(n_bytes == slot->original_len ? DB_SUCCESS : DB_FAIL); } @@ -1087,21 +1084,14 @@ os_aio_validate_skip() /** Try os_aio_validate() every this many times */ # define OS_AIO_VALIDATE_SKIP 13 - /** The os_aio_validate() call skip counter. - Use a signed type because of the race condition below. */ - static int os_aio_validate_count = OS_AIO_VALIDATE_SKIP; - - /* There is a race condition below, but it does not matter, - because this call is only for heuristic purposes. We want to - reduce the call frequency of the costly os_aio_validate() - check in debug builds. */ - --os_aio_validate_count; + static int os_aio_validate_count; - if (os_aio_validate_count > 0) { - return(true); + if (my_atomic_add32_explicit(&os_aio_validate_count, -1, + MY_MEMORY_ORDER_RELAXED) + % OS_AIO_VALIDATE_SKIP) { + return true; } - os_aio_validate_count = OS_AIO_VALIDATE_SKIP; return(os_aio_validate()); } #endif /* UNIV_DEBUG */ @@ -1245,22 +1235,32 @@ AIO::release_with_mutex(Slot* slot) release(); } -/** Creates a temporary file. This function is like tmpfile(3), but -the temporary file is created in the given parameter path. If the path -is NULL then it will create the file in the MySQL server configuration +/** Create a temporary file. This function is like tmpfile(3), but +the temporary file is created in the in the mysql server configuration parameter (--tmpdir). -@param[in] path location for creating temporary file -@@return temporary file handle, or NULL on error */ +@return temporary file handle, or NULL on error */ FILE* -os_file_create_tmpfile( - const char* path) +os_file_create_tmpfile() { FILE* file = NULL; WAIT_ALLOW_WRITES(); - int fd = innobase_mysql_tmpfile(path); + os_file_t fd = innobase_mysql_tmpfile(NULL); - if (fd >= 0) { + if (fd != OS_FILE_CLOSED) { +#ifdef _WIN32 + int crt_fd = _open_osfhandle((intptr_t)HANDLE(fd), 0); + if (crt_fd != -1) { + file = fdopen(crt_fd, "w+b"); + if (!file) { + close(crt_fd); + } + } +#else file = fdopen(fd, "w+b"); + if (!file) { + close(fd); + } +#endif } if (file == NULL) { @@ -1268,10 +1268,6 @@ os_file_create_tmpfile( ib::error() << "Unable to create temporary file; errno: " << errno; - - if (fd >= 0) { - close(fd); - } } return(file); @@ -1329,7 +1325,7 @@ os_file_make_new_pathname( /* Find the offset of the last slash. We will strip off the old basename.ibd which starts after that slash. */ last_slash = strrchr((char*) old_path, OS_PATH_SEPARATOR); - dir_len = last_slash ? 
last_slash - old_path : strlen(old_path); + dir_len = last_slash ? ulint(last_slash - old_path) : strlen(old_path); /* allocate a new path and move the old directory path to it. */ new_path_len = dir_len + strlen(base_name) + sizeof "/.ibd"; @@ -1482,7 +1478,7 @@ os_file_get_parent_dir( /* Non-trivial directory component */ - return(mem_strdupl(path, last_slash - path)); + return(mem_strdupl(path, ulint(last_slash - path))); } #ifdef UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR @@ -2344,20 +2340,20 @@ AIO::is_linux_native_aio_supported() memset(&io_event, 0x0, sizeof(io_event)); - byte* buf = static_cast<byte*>(ut_malloc_nokey(UNIV_PAGE_SIZE * 2)); - byte* ptr = static_cast<byte*>(ut_align(buf, UNIV_PAGE_SIZE)); + byte* buf = static_cast<byte*>(ut_malloc_nokey(srv_page_size * 2)); + byte* ptr = static_cast<byte*>(ut_align(buf, srv_page_size)); struct iocb iocb; /* Suppress valgrind warning. */ - memset(buf, 0x00, UNIV_PAGE_SIZE * 2); + memset(buf, 0x00, srv_page_size * 2); memset(&iocb, 0x0, sizeof(iocb)); struct iocb* p_iocb = &iocb; if (!srv_read_only_mode) { - io_prep_pwrite(p_iocb, fd, ptr, UNIV_PAGE_SIZE, 0); + io_prep_pwrite(p_iocb, fd, ptr, srv_page_size, 0); } else { ut_a(srv_page_size >= 4096); @@ -3323,7 +3319,7 @@ os_file_get_size( /* st_blocks is in 512 byte sized blocks */ file_size.m_alloc_size = s.st_blocks * 512; } else { - file_size.m_total_size = ~0; + file_size.m_total_size = ~0U; file_size.m_alloc_size = (os_offset_t) errno; } @@ -3465,16 +3461,6 @@ static void __stdcall win_free_syncio_event(void *data) { /* -Initialize tls index.for event handle used for synchronized IO on files that -might be opened with FILE_FLAG_OVERLAPPED. -*/ -static void win_init_syncio_event() { - fls_sync_io = FlsAlloc(win_free_syncio_event); - ut_a(fls_sync_io != FLS_OUT_OF_INDEXES); -} - - -/* Retrieve per-thread event for doing synchronous io on asyncronously opened files */ static HANDLE win_get_syncio_event() @@ -3575,46 +3561,6 @@ struct WinIoInit /* Ensures proper initialization and shutdown */ static WinIoInit win_io_init; -/** Check if the file system supports sparse files. -@param[in] name File name -@return true if the file system supports sparse files */ -static -bool -os_is_sparse_file_supported_win32(const char* filename) -{ - char volname[MAX_PATH]; - BOOL result = GetVolumePathName(filename, volname, MAX_PATH); - - if (!result) { - - ib::error() - << "os_is_sparse_file_supported: " - << "Failed to get the volume path name for: " - << filename - << "- OS error number " << GetLastError(); - - return(false); - } - - DWORD flags; - - result = GetVolumeInformation( - volname, NULL, MAX_PATH, NULL, NULL, - &flags, NULL, MAX_PATH); - - - if (!result) { - ib::error() - << "os_is_sparse_file_supported: " - << "Failed to get the volume info for: " - << volname - << "- OS error number " << GetLastError(); - - return(false); - } - - return(flags & FILE_SUPPORTS_SPARSE_FILES) ? true : false; -} /** Free storage space associated with a section of the file. @param[in] fh Open file handle @@ -3911,7 +3857,7 @@ os_file_create_simple_func( ib::info() << "Read only mode set. 
Unable to" " open file '" << name << "' in RW mode, " - << "trying RO mode", name; + << "trying RO mode"; access = GENERIC_READ; @@ -4652,7 +4598,7 @@ bool os_file_close_func( os_file_t file) { - ut_a(file > 0); + ut_a(file); if (CloseHandle(file)) { return(true); @@ -4978,7 +4924,7 @@ os_file_io( os_offset_t offset, dberr_t* err) { - ulint original_n = n; + ssize_t original_n = ssize_t(n); IORequest type = in_type; ssize_t bytes_returned = 0; @@ -4993,7 +4939,7 @@ os_file_io( break; - } else if ((ulint) n_bytes + bytes_returned == n) { + } else if (n_bytes + bytes_returned == ssize_t(n)) { bytes_returned += n_bytes; @@ -5012,9 +4958,9 @@ os_file_io( /* Handle partial read/write. */ - ut_ad((ulint) n_bytes + bytes_returned < n); + ut_ad(ulint(n_bytes + bytes_returned) < n); - bytes_returned += (ulint) n_bytes; + bytes_returned += n_bytes; if (!type.is_partial_io_warning_disabled()) { @@ -5339,7 +5285,7 @@ os_file_set_nocache( ib::error() << "Failed to set DIRECTIO_ON on file " - << file_name << ": " << operation_name + << file_name << "; " << operation_name << ": " << strerror(errno_save) << "," " continuing anyway."; } @@ -5353,9 +5299,9 @@ os_file_set_nocache( # ifdef UNIV_LINUX ib::warn() << "Failed to set O_DIRECT on file" - << file_name << ";" << operation_name + << file_name << "; " << operation_name << ": " << strerror(errno_save) << ", " - << "ccontinuing anyway. O_DIRECT is " + "continuing anyway. O_DIRECT is " "known to result in 'Invalid argument' " "on Linux on tmpfs, " "see MySQL Bug#26662."; @@ -5371,7 +5317,7 @@ short_warning: << "Failed to set O_DIRECT on file " << file_name << "; " << operation_name << " : " << strerror(errno_save) - << " continuing anyway."; + << ", continuing anyway."; } } #endif /* defined(UNIV_SOLARIS) && defined(DIRECTIO_ON) */ @@ -5460,18 +5406,16 @@ fallback: #endif /* _WIN32*/ /* Write up to 1 megabyte at a time. */ - ulint buf_size = ut_min( - static_cast<ulint>(64), - static_cast<ulint>(size / UNIV_PAGE_SIZE)); - - buf_size *= UNIV_PAGE_SIZE; + ulint buf_size = ut_min(ulint(64), + ulint(size >> srv_page_size_shift)) + << srv_page_size_shift; /* Align the buffer for possible raw i/o */ byte* buf2; - buf2 = static_cast<byte*>(ut_malloc_nokey(buf_size + UNIV_PAGE_SIZE)); + buf2 = static_cast<byte*>(ut_malloc_nokey(buf_size + srv_page_size)); - byte* buf = static_cast<byte*>(ut_align(buf2, UNIV_PAGE_SIZE)); + byte* buf = static_cast<byte*>(ut_align(buf2, srv_page_size)); /* Write buffer full of zeros */ memset(buf, 0, buf_size); @@ -5646,7 +5590,7 @@ IORequest::punch_hole(os_file_t fh, os_offset_t off, ulint len) /* Check does file system support punching holes for this tablespace. */ - if (!should_punch_hole() || !srv_use_trim) { + if (!should_punch_hole()) { return DB_IO_NO_PUNCH_HOLE; } @@ -5697,7 +5641,7 @@ os_is_sparse_file_supported(os_file_t fh) /* We don't know the FS block size, use the sector size. The FS will do the magic. 
*/ - err = os_file_punch_hole_posix(fh, 0, UNIV_PAGE_SIZE); + err = os_file_punch_hole_posix(fh, 0, srv_page_size); return(err == DB_SUCCESS); #endif /* _WIN32 */ @@ -6309,7 +6253,7 @@ AIO::reserve_slot( doing simulated AIO */ ulint local_seg; - local_seg = (offset >> (UNIV_PAGE_SIZE_SHIFT + 6)) % m_n_segments; + local_seg = (offset >> (srv_page_size_shift + 6)) % m_n_segments; for (;;) { @@ -6990,10 +6934,10 @@ public: } m_ptr = static_cast<byte*>( - ut_malloc_nokey(len + UNIV_PAGE_SIZE)); + ut_malloc_nokey(len + srv_page_size)); m_buf = static_cast<byte*>( - ut_align(m_ptr, UNIV_PAGE_SIZE)); + ut_align(m_ptr, srv_page_size)); } else { len = first_slot()->len; diff --git a/storage/innobase/os/os0proc.cc b/storage/innobase/os/os0proc.cc index 71cb88ae372..5d0e53bcd82 100644 --- a/storage/innobase/os/os0proc.cc +++ b/storage/innobase/os/os0proc.cc @@ -163,7 +163,7 @@ os_mem_free_large( // And we must unpoison it by ourself as specified in documentation // for __asan_poison_memory_region() in sanitizer/asan_interface.h // munmap() doesn't do it for us automatically. - MEM_UNDEFINED(ptr, size); + MEM_MAKE_ADDRESSABLE(ptr, size); #endif /* __SANITIZE_ADDRESS__ */ #ifdef HAVE_LINUX_LARGE_PAGES diff --git a/storage/innobase/os/os0thread.cc b/storage/innobase/os/os0thread.cc index 2aac53f572d..0c97ebccc1b 100644 --- a/storage/innobase/os/os0thread.cc +++ b/storage/innobase/os/os0thread.cc @@ -143,7 +143,7 @@ os_thread_create_func( #endif /* not _WIN32 */ - ut_a(os_thread_count <= OS_THREAD_MAX_N); + ut_a(os_thread_count <= srv_max_n_threads); /* Return the thread_id if the caller requests it. */ if (thread_id != NULL) { @@ -188,7 +188,7 @@ os_thread_exit(bool detach) pfs_delete_thread(); #endif - my_atomic_addlint(&os_thread_count, -1); + my_atomic_addlint(&os_thread_count, ulint(-1)); #ifdef _WIN32 ExitThread(0); diff --git a/storage/innobase/page/page0cur.cc b/storage/innobase/page/page0cur.cc index 6810edf6c33..e4007fca83b 100644 --- a/storage/innobase/page/page0cur.cc +++ b/storage/innobase/page/page0cur.cc @@ -2,7 +2,7 @@ Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. -Copyright (c) 2018, 2020, MariaDB Corporation. +Copyright (c) 2018, 2021, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -75,7 +75,7 @@ page_cur_try_search_shortcut( ut_ad(page_is_leaf(page)); rec = page_header_get_ptr(page, PAGE_LAST_INSERT); - offsets = rec_get_offsets(rec, index, offsets, true, + offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields, dtuple_get_n_fields(tuple), &heap); ut_ad(rec); @@ -90,7 +90,8 @@ page_cur_try_search_shortcut( next_rec = page_rec_get_next_const(rec); if (!page_rec_is_supremum(next_rec)) { - offsets = rec_get_offsets(next_rec, index, offsets, true, + offsets = rec_get_offsets(next_rec, index, offsets, + index->n_core_fields, dtuple_get_n_fields(tuple), &heap); if (cmp_dtuple_rec_with_match(tuple, next_rec, offsets, @@ -159,7 +160,7 @@ page_cur_try_search_shortcut_bytes( ut_ad(page_is_leaf(page)); rec = page_header_get_ptr(page, PAGE_LAST_INSERT); - offsets = rec_get_offsets(rec, index, offsets, true, + offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields, dtuple_get_n_fields(tuple), &heap); ut_ad(rec); @@ -180,7 +181,8 @@ page_cur_try_search_shortcut_bytes( next_rec = page_rec_get_next_const(rec); if (!page_rec_is_supremum(next_rec)) { - offsets = rec_get_offsets(next_rec, index, offsets, true, + offsets = rec_get_offsets(next_rec, index, offsets, + index->n_core_fields, dtuple_get_n_fields(tuple), &heap); if (cmp_dtuple_rec_with_match_bytes( @@ -321,23 +323,19 @@ page_cur_search_with_match( #endif /* UNIV_ZIP_DEBUG */ ut_d(page_check_dir(page)); - const bool is_leaf = page_is_leaf(page); + const ulint n_core = page_is_leaf(page) ? index->n_core_fields : 0; #ifdef BTR_CUR_HASH_ADAPT - if (is_leaf - && (mode == PAGE_CUR_LE) - && !dict_index_is_spatial(index) - && (page_header_get_field(page, PAGE_N_DIRECTION) > 3) - && (page_header_get_ptr(page, PAGE_LAST_INSERT)) - && (page_header_get_field(page, PAGE_DIRECTION) == PAGE_RIGHT)) { - - if (page_cur_try_search_shortcut( - block, index, tuple, - iup_matched_fields, - ilow_matched_fields, - cursor)) { - return; - } + if (n_core + && page_get_direction(page) == PAGE_RIGHT + && page_header_get_offs(page, PAGE_LAST_INSERT) + && mode == PAGE_CUR_LE + && !index->is_spatial() + && page_header_get_field(page, PAGE_N_DIRECTION) > 3 + && page_cur_try_search_shortcut( + block, index, tuple, + iup_matched_fields, ilow_matched_fields, cursor)) { + return; } # ifdef PAGE_CUR_DBG if (mode == PAGE_CUR_DBG) { @@ -348,10 +346,10 @@ page_cur_search_with_match( /* If the mode is for R-tree indexes, use the special MBR related compare functions */ - if (dict_index_is_spatial(index) && mode > PAGE_CUR_LE) { + if (index->is_spatial() && mode > PAGE_CUR_LE) { /* For leaf level insert, we still use the traditional compare function for now */ - if (mode == PAGE_CUR_RTREE_INSERT && is_leaf) { + if (mode == PAGE_CUR_RTREE_INSERT && n_core) { mode = PAGE_CUR_LE; } else { rtr_cur_search_with_match( @@ -381,7 +379,7 @@ page_cur_search_with_match( owned by the upper limit directory slot. 
*/ low = 0; - up = page_dir_get_n_slots(page) - 1; + up = ulint(page_dir_get_n_slots(page)) - 1; /* Perform binary search until the lower and upper limit directory slots come to the distance 1 of each other */ @@ -396,7 +394,7 @@ page_cur_search_with_match( offsets = offsets_; offsets = rec_get_offsets( - mid_rec, index, offsets, is_leaf, + mid_rec, index, offsets, n_core, dtuple_get_n_fields_cmp(tuple), &heap); cmp = cmp_dtuple_rec_with_match( @@ -450,7 +448,7 @@ up_slot_match: offsets = offsets_; offsets = rec_get_offsets( - mid_rec, index, offsets, is_leaf, + mid_rec, index, offsets, n_core, dtuple_get_n_fields_cmp(tuple), &heap); cmp = cmp_dtuple_rec_with_match( @@ -567,6 +565,7 @@ page_cur_search_with_match_bytes( rec_offs_init(offsets_); ut_ad(dtuple_validate(tuple)); + ut_ad(!(tuple->info_bits & REC_INFO_MIN_REC_FLAG)); #ifdef UNIV_DEBUG # ifdef PAGE_CUR_DBG if (mode != PAGE_CUR_DBG) @@ -586,18 +585,16 @@ page_cur_search_with_match_bytes( #ifdef BTR_CUR_HASH_ADAPT if (page_is_leaf(page) - && (mode == PAGE_CUR_LE) - && (page_header_get_field(page, PAGE_N_DIRECTION) > 3) - && (page_header_get_ptr(page, PAGE_LAST_INSERT)) - && (page_header_get_field(page, PAGE_DIRECTION) == PAGE_RIGHT)) { - - if (page_cur_try_search_shortcut_bytes( - block, index, tuple, - iup_matched_fields, iup_matched_bytes, - ilow_matched_fields, ilow_matched_bytes, - cursor)) { - return; - } + && page_get_direction(page) == PAGE_RIGHT + && page_header_get_offs(page, PAGE_LAST_INSERT) + && mode == PAGE_CUR_LE + && page_header_get_field(page, PAGE_N_DIRECTION) > 3 + && page_cur_try_search_shortcut_bytes( + block, index, tuple, + iup_matched_fields, iup_matched_bytes, + ilow_matched_fields, ilow_matched_bytes, + cursor)) { + return; } # ifdef PAGE_CUR_DBG if (mode == PAGE_CUR_DBG) { @@ -628,11 +625,11 @@ page_cur_search_with_match_bytes( owned by the upper limit directory slot. */ low = 0; - up = page_dir_get_n_slots(page) - 1; + up = ulint(page_dir_get_n_slots(page)) - 1; /* Perform binary search until the lower and upper limit directory slots come to the distance 1 of each other */ - ut_d(bool is_leaf = page_is_leaf(page)); + const ulint n_core = page_is_leaf(page) ? 
index->n_core_fields : 0; while (up - low > 1) { mid = (low + up) / 2; @@ -644,7 +641,7 @@ page_cur_search_with_match_bytes( up_matched_fields, up_matched_bytes); offsets = rec_get_offsets( - mid_rec, index, offsets_, is_leaf, + mid_rec, index, offsets_, n_core, dtuple_get_n_fields_cmp(tuple), &heap); cmp = cmp_dtuple_rec_with_match_bytes( @@ -700,8 +697,19 @@ up_slot_match: low_matched_fields, low_matched_bytes, up_matched_fields, up_matched_bytes); + if (UNIV_UNLIKELY(rec_get_info_bits( + mid_rec, + dict_table_is_comp(index->table)) + & REC_INFO_MIN_REC_FLAG)) { + ut_ad(!page_has_prev(page_align(mid_rec))); + ut_ad(!page_rec_is_leaf(mid_rec) + || rec_is_metadata(mid_rec, index)); + cmp = 1; + goto low_rec_match; + } + offsets = rec_get_offsets( - mid_rec, index, offsets_, is_leaf, + mid_rec, index, offsets_, n_core, dtuple_get_n_fields_cmp(tuple), &heap); cmp = cmp_dtuple_rec_with_match_bytes( @@ -733,23 +741,6 @@ up_rec_match: || mode == PAGE_CUR_LE_OR_EXTENDS #endif /* PAGE_CUR_LE_OR_EXTENDS */ ) { - if (!cmp && !cur_matched_fields) { -#ifdef UNIV_DEBUG - mtr_t mtr; - mtr_start(&mtr); - - /* We got a match, but cur_matched_fields is - 0, it must have REC_INFO_MIN_REC_FLAG */ - ulint rec_info = rec_get_info_bits(mid_rec, - rec_offs_comp(offsets)); - ut_ad(rec_info & REC_INFO_MIN_REC_FLAG); - ut_ad(!page_has_prev(page)); - mtr_commit(&mtr); -#endif - - cur_matched_fields = dtuple_get_n_fields_cmp(tuple); - } - goto low_rec_match; } else { @@ -816,19 +807,20 @@ page_cur_insert_rec_write_log( const byte* log_end; ulint i; - if (dict_table_is_temporary(index->table)) { + if (index->table->is_temporary()) { mtr->set_modified(); ut_ad(mtr->get_log_mode() == MTR_LOG_NO_REDO); return; } - ut_a(rec_size < UNIV_PAGE_SIZE); - ut_ad(mtr->is_named_space(index->space)); + ut_a(rec_size < srv_page_size); + ut_ad(mtr->is_named_space(index->table->space)); ut_ad(page_align(insert_rec) == page_align(cursor_rec)); ut_ad(!page_rec_is_comp(insert_rec) == !dict_table_is_comp(index->table)); - ut_d(const bool is_leaf = page_rec_is_leaf(cursor_rec)); + const ulint n_core = page_rec_is_leaf(cursor_rec) + ? 
index->n_core_fields : 0; { mem_heap_t* heap = NULL; @@ -842,9 +834,9 @@ page_cur_insert_rec_write_log( rec_offs_init(ins_offs_); cur_offs = rec_get_offsets(cursor_rec, index, cur_offs_, - is_leaf, ULINT_UNDEFINED, &heap); + n_core, ULINT_UNDEFINED, &heap); ins_offs = rec_get_offsets(insert_rec, index, ins_offs_, - is_leaf, ULINT_UNDEFINED, &heap); + n_core, ULINT_UNDEFINED, &heap); extra_size = rec_offs_extra_size(ins_offs); cur_extra_size = rec_offs_extra_size(cur_offs); @@ -965,8 +957,8 @@ need_extra_info: /* Write the mismatch index */ log_ptr += mach_write_compressed(log_ptr, i); - ut_a(i < UNIV_PAGE_SIZE); - ut_a(extra_size < UNIV_PAGE_SIZE); + ut_a(i < srv_page_size); + ut_a(extra_size < srv_page_size); } else { /* Write the record end segment length and the extra info storage flag */ @@ -983,7 +975,7 @@ need_extra_info: mlog_close(mtr, log_ptr + rec_size); } else { mlog_close(mtr, log_ptr); - ut_a(rec_size < UNIV_PAGE_SIZE); + ut_a(rec_size < srv_page_size); mlog_catenate_string(mtr, ins_ptr, rec_size); } } @@ -1035,7 +1027,7 @@ page_cur_parse_insert_rec( cursor_rec = page + offset; - if (offset >= UNIV_PAGE_SIZE) { + if (offset >= srv_page_size) { recv_sys->found_corrupt_log = TRUE; @@ -1050,7 +1042,7 @@ page_cur_parse_insert_rec( return(NULL); } - if (end_seg_len >= UNIV_PAGE_SIZE << 1) { + if (end_seg_len >= srv_page_size << 1) { recv_sys->found_corrupt_log = TRUE; return(NULL); @@ -1074,7 +1066,7 @@ page_cur_parse_insert_rec( return(NULL); } - ut_a(origin_offset < UNIV_PAGE_SIZE); + ut_a(origin_offset < srv_page_size); mismatch_index = mach_parse_compressed(&ptr, end_ptr); @@ -1083,7 +1075,7 @@ page_cur_parse_insert_rec( return(NULL); } - ut_a(mismatch_index < UNIV_PAGE_SIZE); + ut_a(mismatch_index < srv_page_size); } if (end_ptr < ptr + (end_seg_len >> 1)) { @@ -1102,9 +1094,9 @@ page_cur_parse_insert_rec( /* Read from the log the inserted index record end segment which differs from the cursor record */ - ut_d(bool is_leaf = page_is_leaf(page)); + const ulint n_core = page_is_leaf(page) ? index->n_core_fields : 0; - offsets = rec_get_offsets(cursor_rec, index, offsets, is_leaf, + offsets = rec_get_offsets(cursor_rec, index, offsets, n_core, ULINT_UNDEFINED, &heap); if (!(end_seg_len & 0x1UL)) { @@ -1125,7 +1117,7 @@ page_cur_parse_insert_rec( /* Build the inserted record to buf */ - if (UNIV_UNLIKELY(mismatch_index >= UNIV_PAGE_SIZE)) { + if (UNIV_UNLIKELY(mismatch_index >= srv_page_size)) { ib::fatal() << "is_short " << is_short << ", " << "info_and_status_bits " << info_and_status_bits @@ -1139,15 +1131,13 @@ page_cur_parse_insert_rec( ut_memcpy(buf + mismatch_index, ptr, end_seg_len); if (page_is_comp(page)) { - /* Make rec_get_offsets() and rec_offs_make_valid() happy. */ - ut_d(rec_set_heap_no_new(buf + origin_offset, - PAGE_HEAP_NO_USER_LOW)); + rec_set_heap_no_new(buf + origin_offset, + PAGE_HEAP_NO_USER_LOW); rec_set_info_and_status_bits(buf + origin_offset, info_and_status_bits); } else { - /* Make rec_get_offsets() and rec_offs_make_valid() happy. 
*/ - ut_d(rec_set_heap_no_old(buf + origin_offset, - PAGE_HEAP_NO_USER_LOW)); + rec_set_heap_no_old(buf + origin_offset, + PAGE_HEAP_NO_USER_LOW); rec_set_info_bits_old(buf + origin_offset, info_and_status_bits); } @@ -1155,7 +1145,7 @@ page_cur_parse_insert_rec( page_cur_position(cursor_rec, block, &cursor); offsets = rec_get_offsets(buf + origin_offset, index, offsets, - is_leaf, ULINT_UNDEFINED, &heap); + n_core, ULINT_UNDEFINED, &heap); if (UNIV_UNLIKELY(!page_cur_rec_insert(&cursor, buf + origin_offset, index, offsets, mtr))) { @@ -1176,6 +1166,50 @@ page_cur_parse_insert_rec( return(const_cast<byte*>(ptr + end_seg_len)); } +/** Reset PAGE_DIRECTION and PAGE_N_DIRECTION. +@param[in,out] ptr the PAGE_DIRECTION_B field +@param[in,out] page index tree page frame +@param[in] page_zip compressed page descriptor, or NULL */ +static inline +void +page_direction_reset(byte* ptr, page_t* page, page_zip_des_t* page_zip) +{ + ut_ad(ptr == PAGE_HEADER + PAGE_DIRECTION_B + page); + page_ptr_set_direction(ptr, PAGE_NO_DIRECTION); + if (page_zip) { + page_zip_write_header(page_zip, ptr, 1, NULL); + } + ptr = PAGE_HEADER + PAGE_N_DIRECTION + page; + *reinterpret_cast<uint16_t*>(ptr) = 0; + if (page_zip) { + page_zip_write_header(page_zip, ptr, 2, NULL); + } +} + +/** Increment PAGE_N_DIRECTION. +@param[in,out] ptr the PAGE_DIRECTION_B field +@param[in,out] page index tree page frame +@param[in] page_zip compressed page descriptor, or NULL +@param[in] dir PAGE_RIGHT or PAGE_LEFT */ +static inline +void +page_direction_increment( + byte* ptr, + page_t* page, + page_zip_des_t* page_zip, + uint dir) +{ + ut_ad(ptr == PAGE_HEADER + PAGE_DIRECTION_B + page); + ut_ad(dir == PAGE_RIGHT || dir == PAGE_LEFT); + page_ptr_set_direction(ptr, dir); + if (page_zip) { + page_zip_write_header(page_zip, ptr, 1, NULL); + } + page_header_set_field( + page, page_zip, PAGE_N_DIRECTION, + 1U + page_header_get_field(page, PAGE_N_DIRECTION)); +} + /************************************************************//** Allocates a block of memory from the heap of an index page. @return pointer to start of allocated buffer, or NULL if allocation fails */ @@ -1255,7 +1289,7 @@ page_cur_insert_rec_low( == (ibool) !!page_is_comp(page)); ut_ad(fil_page_index_page_check(page)); ut_ad(mach_read_from_8(page + PAGE_HEADER + PAGE_INDEX_ID) == index->id - || recv_recovery_is_on() + || index->is_dummy || (mtr ? mtr->is_inside_ibuf() : dict_index_is_ibuf(index))); ut_ad(!page_rec_is_supremum(current_rec)); @@ -1263,7 +1297,7 @@ page_cur_insert_rec_low( /* 1. Get the size of the physical record in the page */ rec_size = rec_offs_size(offsets); -#ifdef HAVE_valgrind_or_MSAN +#ifdef HAVE_valgrind { const void* rec_start = rec - rec_offs_extra_size(offsets); @@ -1278,7 +1312,7 @@ page_cur_insert_rec_low( /* The variable-length header must be valid. */ MEM_CHECK_DEFINED(rec_start, extra_size); } -#endif /* HAVE_valgrind_or_MSAN */ +#endif /* HAVE_valgrind */ /* 2. Try to find suitable space from page memory management */ @@ -1292,7 +1326,8 @@ page_cur_insert_rec_low( rec_offs_init(foffsets_); foffsets = rec_get_offsets( - free_rec, index, foffsets, page_is_leaf(page), + free_rec, index, foffsets, + page_is_leaf(page) ? index->n_core_fields : 0, ULINT_UNDEFINED, &heap); if (rec_offs_size(foffsets) < rec_size) { if (UNIV_LIKELY_NULL(heap)) { @@ -1332,28 +1367,7 @@ use_heap: /* 3. 
Create the record */ insert_rec = rec_copy(insert_buf, rec, offsets); - rec_offs_make_valid(insert_rec, index, offsets); - - /* This is because assertion below is debug assertion */ -#ifdef UNIV_DEBUG - if (UNIV_UNLIKELY(current_rec == insert_rec)) { - ulint extra_len, data_len; - extra_len = rec_offs_extra_size(offsets); - data_len = rec_offs_data_size(offsets); - - fprintf(stderr, "InnoDB: Error: current_rec == insert_rec " - " extra_len " ULINTPF - " data_len " ULINTPF " insert_buf %p rec %p\n", - extra_len, data_len, insert_buf, rec); - fprintf(stderr, "InnoDB; Physical record: \n"); - rec_print(stderr, rec, index); - fprintf(stderr, "InnoDB: Inserted record: \n"); - rec_print(stderr, insert_rec, index); - fprintf(stderr, "InnoDB: Current record: \n"); - rec_print(stderr, current_rec, index); - ut_a(current_rec != insert_rec); - } -#endif /* UNIV_DEBUG */ + rec_offs_make_valid(insert_rec, index, page_is_leaf(page), offsets); /* 4. Insert the record in the linked list of records */ ut_ad(current_rec != insert_rec); @@ -1363,9 +1377,24 @@ use_heap: rec_t* next_rec = page_rec_get_next(current_rec); #ifdef UNIV_DEBUG if (page_is_comp(page)) { - ut_ad(rec_get_status(current_rec) - <= REC_STATUS_INFIMUM); - ut_ad(rec_get_status(insert_rec) < REC_STATUS_INFIMUM); + switch (rec_get_status(current_rec)) { + case REC_STATUS_ORDINARY: + case REC_STATUS_NODE_PTR: + case REC_STATUS_COLUMNS_ADDED: + case REC_STATUS_INFIMUM: + break; + case REC_STATUS_SUPREMUM: + ut_ad(!"wrong status on current_rec"); + } + switch (rec_get_status(insert_rec)) { + case REC_STATUS_ORDINARY: + case REC_STATUS_NODE_PTR: + case REC_STATUS_COLUMNS_ADDED: + break; + case REC_STATUS_INFIMUM: + case REC_STATUS_SUPREMUM: + ut_ad(!"wrong status on insert_rec"); + } ut_ad(rec_get_status(next_rec) != REC_STATUS_INFIMUM); } #endif @@ -1374,7 +1403,7 @@ use_heap: } page_header_set_field(page, NULL, PAGE_N_RECS, - 1 + page_get_n_recs(page)); + 1U + page_get_n_recs(page)); /* 5. 
Set the n_owned field in the inserted record to zero, and set the heap_no field */ @@ -1396,34 +1425,18 @@ use_heap: == rec_get_node_ptr_flag(insert_rec)); if (!dict_index_is_spatial(index)) { + byte* ptr = PAGE_HEADER + PAGE_DIRECTION_B + page; if (UNIV_UNLIKELY(last_insert == NULL)) { - page_header_set_field(page, NULL, PAGE_DIRECTION, - PAGE_NO_DIRECTION); - page_header_set_field(page, NULL, PAGE_N_DIRECTION, 0); - - } else if ((last_insert == current_rec) - && (page_header_get_field(page, PAGE_DIRECTION) - != PAGE_LEFT)) { - - page_header_set_field(page, NULL, PAGE_DIRECTION, - PAGE_RIGHT); - page_header_set_field(page, NULL, PAGE_N_DIRECTION, - page_header_get_field( - page, PAGE_N_DIRECTION) + 1); - - } else if ((page_rec_get_next(insert_rec) == last_insert) - && (page_header_get_field(page, PAGE_DIRECTION) - != PAGE_RIGHT)) { - - page_header_set_field(page, NULL, PAGE_DIRECTION, - PAGE_LEFT); - page_header_set_field(page, NULL, PAGE_N_DIRECTION, - page_header_get_field( - page, PAGE_N_DIRECTION) + 1); +no_direction: + page_direction_reset(ptr, page, NULL); + } else if (last_insert == current_rec + && page_ptr_get_direction(ptr) != PAGE_LEFT) { + page_direction_increment(ptr, page, NULL, PAGE_RIGHT); + } else if (page_ptr_get_direction(ptr) != PAGE_RIGHT + && page_rec_get_next(insert_rec) == last_insert) { + page_direction_increment(ptr, page, NULL, PAGE_LEFT); } else { - page_header_set_field(page, NULL, PAGE_DIRECTION, - PAGE_NO_DIRECTION); - page_header_set_field(page, NULL, PAGE_N_DIRECTION, 0); + goto no_direction; } } @@ -1504,9 +1517,9 @@ page_cur_insert_rec_zip( ut_ad(page_is_comp(page)); ut_ad(fil_page_index_page_check(page)); ut_ad(mach_read_from_8(page + PAGE_HEADER + PAGE_INDEX_ID) == index->id - || (mtr ? mtr->is_inside_ibuf() : dict_index_is_ibuf(index)) - || recv_recovery_is_on()); - + || index->is_dummy + || (mtr ? mtr->is_inside_ibuf() : dict_index_is_ibuf(index))); + ut_ad(!page_get_instant(page)); ut_ad(!page_cur_is_after_last(cursor)); #ifdef UNIV_ZIP_DEBUG ut_a(page_zip_validate(page_zip, page, index)); @@ -1515,7 +1528,7 @@ page_cur_insert_rec_zip( /* 1. Get the size of the physical record in the page */ rec_size = rec_offs_size(offsets); -#ifdef HAVE_valgrind_or_MSAN +#ifdef HAVE_valgrind { const void* rec_start = rec - rec_offs_extra_size(offsets); @@ -1530,7 +1543,7 @@ page_cur_insert_rec_zip( /* The variable-length header must be valid. */ MEM_CHECK_DEFINED(rec_start, extra_size); } -#endif /* HAVE_valgrind_or_MSAN */ +#endif /* HAVE_valgrind */ const bool reorg_before_insert = page_has_garbage(page) && rec_size > page_get_max_insert_size(page, 1) @@ -1624,11 +1637,13 @@ page_cur_insert_rec_zip( because the MLOG_COMP_REC_INSERT should only be logged after a successful operation. */ ut_ad(!recv_recovery_is_on()); + ut_ad(!index->is_dummy); } else if (recv_recovery_is_on()) { /* This should be followed by MLOG_ZIP_PAGE_COMPRESS_NO_DATA, which should succeed. 
*/ - rec_offs_make_valid(insert_rec, index, offsets); + rec_offs_make_valid(insert_rec, index, + page_is_leaf(page), offsets); } else { ulint pos = page_rec_get_n_recs_before(insert_rec); ut_ad(pos > 0); @@ -1644,7 +1659,8 @@ page_cur_insert_rec_zip( level, page, index, mtr); rec_offs_make_valid( - insert_rec, index, offsets); + insert_rec, index, + page_is_leaf(page), offsets); return(insert_rec); } @@ -1687,7 +1703,8 @@ page_cur_insert_rec_zip( insert_rec = page + rec_get_next_offs( cursor->rec, TRUE); rec_offs_make_valid( - insert_rec, index, offsets); + insert_rec, index, + page_is_leaf(page), offsets); return(insert_rec); } @@ -1723,7 +1740,8 @@ page_cur_insert_rec_zip( rec_offs_init(foffsets_); foffsets = rec_get_offsets(free_rec, index, foffsets, - page_rec_is_leaf(free_rec), + page_rec_is_leaf(free_rec) + ? index->n_core_fields : 0, ULINT_UNDEFINED, &heap); if (rec_offs_size(foffsets) < rec_size) { too_small: @@ -1738,14 +1756,13 @@ too_small: /* On compressed pages, do not relocate records from the free list. If extra_size would grow, use the heap. */ - extra_size_diff - = rec_offs_extra_size(offsets) - - rec_offs_extra_size(foffsets); + extra_size_diff = lint(rec_offs_extra_size(offsets) + - rec_offs_extra_size(foffsets)); if (UNIV_UNLIKELY(extra_size_diff < 0)) { /* Add an offset to the extra_size. */ if (rec_offs_size(foffsets) - < rec_size - extra_size_diff) { + < rec_size - ulint(extra_size_diff)) { goto too_small; } @@ -1829,7 +1846,7 @@ use_heap: /* 3. Create the record */ insert_rec = rec_copy(insert_buf, rec, offsets); - rec_offs_make_valid(insert_rec, index, offsets); + rec_offs_make_valid(insert_rec, index, page_is_leaf(page), offsets); /* 4. Insert the record in the linked list of records */ ut_ad(cursor->rec != insert_rec); @@ -1848,7 +1865,7 @@ use_heap: } page_header_set_field(page, page_zip, PAGE_N_RECS, - 1 + page_get_n_recs(page)); + 1U + page_get_n_recs(page)); /* 5. 
Set the n_owned field in the inserted record to zero, and set the heap_no field */ @@ -1868,36 +1885,20 @@ use_heap: == rec_get_node_ptr_flag(insert_rec)); if (!dict_index_is_spatial(index)) { + byte* ptr = PAGE_HEADER + PAGE_DIRECTION_B + page; if (UNIV_UNLIKELY(last_insert == NULL)) { - page_header_set_field(page, page_zip, PAGE_DIRECTION, - PAGE_NO_DIRECTION); - page_header_set_field(page, page_zip, - PAGE_N_DIRECTION, 0); - - } else if ((last_insert == cursor->rec) - && (page_header_get_field(page, PAGE_DIRECTION) - != PAGE_LEFT)) { - - page_header_set_field(page, page_zip, PAGE_DIRECTION, - PAGE_RIGHT); - page_header_set_field(page, page_zip, PAGE_N_DIRECTION, - page_header_get_field( - page, PAGE_N_DIRECTION) + 1); - - } else if ((page_rec_get_next(insert_rec) == last_insert) - && (page_header_get_field(page, PAGE_DIRECTION) - != PAGE_RIGHT)) { - - page_header_set_field(page, page_zip, PAGE_DIRECTION, - PAGE_LEFT); - page_header_set_field(page, page_zip, PAGE_N_DIRECTION, - page_header_get_field( - page, PAGE_N_DIRECTION) + 1); +no_direction: + page_direction_reset(ptr, page, page_zip); + } else if (last_insert == cursor->rec + && page_ptr_get_direction(ptr) != PAGE_LEFT) { + page_direction_increment(ptr, page, page_zip, + PAGE_RIGHT); + } else if (page_ptr_get_direction(ptr) != PAGE_RIGHT + && page_rec_get_next(insert_rec) == last_insert) { + page_direction_increment(ptr, page, page_zip, + PAGE_LEFT); } else { - page_header_set_field(page, page_zip, PAGE_DIRECTION, - PAGE_NO_DIRECTION); - page_header_set_field(page, page_zip, - PAGE_N_DIRECTION, 0); + goto no_direction; } } @@ -1948,7 +1949,7 @@ page_copy_rec_list_to_created_page_write_log( byte* log_ptr; ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table)); - ut_ad(mtr->is_named_space(index->space)); + ut_ad(mtr->is_named_space(index->table->space)); log_ptr = mlog_open_and_write_index(mtr, page, index, page_is_comp(page) @@ -1978,6 +1979,8 @@ page_parse_copy_rec_list_to_created_page( page_t* page; page_zip_des_t* page_zip; + ut_ad(index->is_dummy); + if (ptr + 4 > end_ptr) { return(NULL); @@ -1999,6 +2002,13 @@ page_parse_copy_rec_list_to_created_page( } ut_ad(fil_page_index_page_check(block->frame)); + /* This function is never invoked on the clustered index root page, + except in the redo log apply of + page_copy_rec_list_end_to_created_page() which was logged by. + page_copy_rec_list_to_created_page_write_log(). + For other pages, this field must be zero-initialized. */ + ut_ad(!page_get_instant(block->frame) + || !page_has_siblings(block->frame)); while (ptr < rec_end) { ptr = page_cur_parse_insert_rec(TRUE, ptr, end_ptr, @@ -2013,9 +2023,8 @@ page_parse_copy_rec_list_to_created_page( page_header_set_ptr(page, page_zip, PAGE_LAST_INSERT, NULL); if (!dict_index_is_spatial(index)) { - page_header_set_field(page, page_zip, PAGE_DIRECTION, - PAGE_NO_DIRECTION); - page_header_set_field(page, page_zip, PAGE_N_DIRECTION, 0); + page_direction_reset(PAGE_HEADER + PAGE_DIRECTION_B + page, + page, page_zip); } return(rec_end); @@ -2056,6 +2065,9 @@ page_copy_rec_list_end_to_created_page( ut_ad(page_align(rec) != new_page); ut_ad(page_rec_is_comp(rec) == page_is_comp(new_page)); ut_ad(fil_page_index_page_check(new_page)); + /* This function is never invoked on the clustered index root page, + except in btr_lift_page_up(). 
*/ + ut_ad(!page_get_instant(new_page) || !page_has_siblings(new_page)); if (page_rec_is_infimum(rec)) { @@ -2070,9 +2082,9 @@ page_copy_rec_list_end_to_created_page( #ifdef UNIV_DEBUG /* To pass the debug tests we have to set these dummy values in the debug version */ - page_dir_set_n_slots(new_page, NULL, UNIV_PAGE_SIZE / 2); + page_dir_set_n_slots(new_page, NULL, srv_page_size / 2); page_header_set_ptr(new_page, NULL, PAGE_HEAP_TOP, - new_page + UNIV_PAGE_SIZE - 1); + new_page + srv_page_size - 1); #endif log_ptr = page_copy_rec_list_to_created_page_write_log(new_page, index, mtr); @@ -2081,7 +2093,7 @@ page_copy_rec_list_end_to_created_page( /* Individual inserts are logged in a shorter form */ - const mtr_log_t log_mode = dict_table_is_temporary(index->table) + const mtr_log_t log_mode = index->table->is_temporary() || !index->is_readable() /* IMPORT TABLESPACE */ ? mtr_get_log_mode(mtr) : mtr_set_log_mode(mtr, MTR_LOG_SHORT_INSERTS); @@ -2096,10 +2108,11 @@ page_copy_rec_list_end_to_created_page( slot_index = 0; n_recs = 0; - ut_d(const bool is_leaf = page_is_leaf(new_page)); + const ulint n_core = page_is_leaf(new_page) + ? index->n_core_fields : 0; do { - offsets = rec_get_offsets(rec, index, offsets, is_leaf, + offsets = rec_get_offsets(rec, index, offsets, n_core, ULINT_UNDEFINED, &heap); insert_rec = rec_copy(heap_top, rec, offsets); @@ -2137,11 +2150,11 @@ page_copy_rec_list_end_to_created_page( rec_size = rec_offs_size(offsets); - ut_ad(heap_top < new_page + UNIV_PAGE_SIZE); + ut_ad(heap_top < new_page + srv_page_size); heap_top += rec_size; - rec_offs_make_valid(insert_rec, index, offsets); + rec_offs_make_valid(insert_rec, index, n_core != 0, offsets); page_cur_insert_rec_write_log(insert_rec, rec_size, prev_rec, index, mtr); prev_rec = insert_rec; @@ -2171,9 +2184,13 @@ page_copy_rec_list_end_to_created_page( mem_heap_free(heap); } + /* Restore the log mode */ + + mtr_set_log_mode(mtr, log_mode); + log_data_len = mtr->get_log()->size() - log_data_len; - ut_a(log_data_len < 100 * UNIV_PAGE_SIZE); + ut_a(log_data_len < 100U << srv_page_size_shift); if (log_ptr != NULL) { mach_write_to_4(log_ptr, log_data_len); @@ -2195,15 +2212,10 @@ page_copy_rec_list_end_to_created_page( page_dir_set_n_heap(new_page, NULL, PAGE_HEAP_NO_USER_LOW + n_recs); page_header_set_field(new_page, NULL, PAGE_N_RECS, n_recs); - page_header_set_ptr(new_page, NULL, PAGE_LAST_INSERT, NULL); - - page_header_set_field(new_page, NULL, PAGE_DIRECTION, - PAGE_NO_DIRECTION); - page_header_set_field(new_page, NULL, PAGE_N_DIRECTION, 0); - - /* Restore the log mode */ - - mtr_set_log_mode(mtr, log_mode); + *reinterpret_cast<uint16_t*>(PAGE_HEADER + PAGE_LAST_INSERT + new_page) + = 0; + page_direction_reset(PAGE_HEADER + PAGE_DIRECTION_B + new_page, + new_page, NULL); } /***********************************************************//** @@ -2219,7 +2231,7 @@ page_cur_delete_rec_write_log( byte* log_ptr; ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table)); - ut_ad(mtr->is_named_space(index->space)); + ut_ad(mtr->is_named_space(index->table->space)); log_ptr = mlog_open_and_write_index(mtr, rec, index, page_rec_is_comp(rec) @@ -2279,7 +2291,8 @@ page_cur_parse_delete_rec( page_cur_delete_rec(&cursor, index, rec_get_offsets(rec, index, offsets_, - page_rec_is_leaf(rec), + page_rec_is_leaf(rec) + ? 
index->n_core_fields : 0, ULINT_UNDEFINED, &heap), mtr); if (UNIV_LIKELY_NULL(heap)) { @@ -2330,9 +2343,9 @@ page_cur_delete_rec( ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table)); ut_ad(fil_page_index_page_check(page)); ut_ad(mach_read_from_8(page + PAGE_HEADER + PAGE_INDEX_ID) == index->id - || (mtr ? mtr->is_inside_ibuf() : dict_index_is_ibuf(index)) - || recv_recovery_is_on()); - ut_ad(mtr == NULL || mtr->is_named_space(index->space)); + || index->is_dummy + || (mtr ? mtr->is_inside_ibuf() : dict_index_is_ibuf(index))); + ut_ad(!mtr || mtr->is_named_space(index->table->space)); /* The record must not be the supremum or infimum record. */ ut_ad(page_rec_is_user_rec(current_rec)); @@ -2408,9 +2421,7 @@ page_cur_delete_rec( prev_rec is owned by the same slot, i.e., PAGE_DIR_SLOT_MIN_N_OWNED >= 2. */ -#if PAGE_DIR_SLOT_MIN_N_OWNED < 2 -# error "PAGE_DIR_SLOT_MIN_N_OWNED < 2" -#endif + compile_time_assert(PAGE_DIR_SLOT_MIN_N_OWNED >= 2); ut_ad(cur_n_owned > 1); if (current_rec == page_dir_slot_get_rec(cur_dir_slot)) { diff --git a/storage/innobase/page/page0page.cc b/storage/innobase/page/page0page.cc index b078763c684..d69fdc7a202 100644 --- a/storage/innobase/page/page0page.cc +++ b/storage/innobase/page/page0page.cc @@ -2,7 +2,7 @@ Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. -Copyright (c) 2017, 2020, MariaDB Corporation. +Copyright (c) 2017, 2021, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -93,24 +93,24 @@ page_dir_find_owner_slot( const page_t* page = page_align(rec); const page_dir_slot_t* first_slot = page_dir_get_nth_slot(page, 0); const page_dir_slot_t* slot = page_dir_get_nth_slot( - page, page_dir_get_n_slots(page) - 1); + page, ulint(page_dir_get_n_slots(page)) - 1); const rec_t* r = rec; if (page_is_comp(page)) { while (rec_get_n_owned_new(r) == 0) { r = rec_get_next_ptr_const(r, TRUE); ut_ad(r >= page + PAGE_NEW_SUPREMUM); - ut_ad(r < page + (UNIV_PAGE_SIZE - PAGE_DIR)); + ut_ad(r < page + (srv_page_size - PAGE_DIR)); } } else { while (rec_get_n_owned_old(r) == 0) { r = rec_get_next_ptr_const(r, FALSE); ut_ad(r >= page + PAGE_OLD_SUPREMUM); - ut_ad(r < page + (UNIV_PAGE_SIZE - PAGE_DIR)); + ut_ad(r < page + (srv_page_size - PAGE_DIR)); } } - uint16 rec_offs_bytes = mach_encode_2(r - page); + uint16 rec_offs_bytes = mach_encode_2(ulint(r - page)); while (UNIV_LIKELY(*(uint16*) slot != rec_offs_bytes)) { @@ -235,9 +235,9 @@ page_set_autoinc( { ut_ad(mtr_memo_contains_flagged( mtr, block, MTR_MEMO_PAGE_X_FIX | MTR_MEMO_PAGE_SX_FIX)); - ut_ad(dict_index_is_clust(index)); + ut_ad(index->is_primary()); ut_ad(index->page == block->page.id.page_no()); - ut_ad(index->space == block->page.id.space()); + ut_ad(index->table->space_id == block->page.id.space()); byte* field = PAGE_HEADER + PAGE_ROOT_AUTO_INC + buf_block_get_frame(block); @@ -321,12 +321,10 @@ page_create_low( { page_t* page; -#if PAGE_BTR_IBUF_FREE_LIST + FLST_BASE_NODE_SIZE > PAGE_DATA -# error "PAGE_BTR_IBUF_FREE_LIST + FLST_BASE_NODE_SIZE > PAGE_DATA" -#endif -#if PAGE_BTR_IBUF_FREE_LIST_NODE + FLST_NODE_SIZE > PAGE_DATA -# error "PAGE_BTR_IBUF_FREE_LIST_NODE + FLST_NODE_SIZE > PAGE_DATA" -#endif + compile_time_assert(PAGE_BTR_IBUF_FREE_LIST + FLST_BASE_NODE_SIZE + <= PAGE_DATA); + compile_time_assert(PAGE_BTR_IBUF_FREE_LIST_NODE + FLST_NODE_SIZE + <= PAGE_DATA); buf_block_modify_clock_inc(block); @@ 
-340,7 +338,8 @@ page_create_low( memset(page + PAGE_HEADER, 0, PAGE_HEADER_PRIV_END); page[PAGE_HEADER + PAGE_N_DIR_SLOTS + 1] = 2; - page[PAGE_HEADER + PAGE_DIRECTION + 1] = PAGE_NO_DIRECTION; + page[PAGE_HEADER + PAGE_INSTANT] = 0; + page[PAGE_HEADER + PAGE_DIRECTION_B] = PAGE_NO_DIRECTION; if (comp) { page[PAGE_HEADER + PAGE_N_HEAP] = 0x80;/*page_is_comp()*/ @@ -350,10 +349,10 @@ page_create_low( sizeof infimum_supremum_compact); memset(page + PAGE_NEW_SUPREMUM_END, 0, - UNIV_PAGE_SIZE - PAGE_DIR - PAGE_NEW_SUPREMUM_END); - page[UNIV_PAGE_SIZE - PAGE_DIR - PAGE_DIR_SLOT_SIZE * 2 + 1] + srv_page_size - PAGE_DIR - PAGE_NEW_SUPREMUM_END); + page[srv_page_size - PAGE_DIR - PAGE_DIR_SLOT_SIZE * 2 + 1] = PAGE_NEW_SUPREMUM; - page[UNIV_PAGE_SIZE - PAGE_DIR - PAGE_DIR_SLOT_SIZE + 1] + page[srv_page_size - PAGE_DIR - PAGE_DIR_SLOT_SIZE + 1] = PAGE_NEW_INFIMUM; } else { page[PAGE_HEADER + PAGE_N_HEAP + 1] = PAGE_HEAP_NO_USER_LOW; @@ -362,10 +361,10 @@ page_create_low( sizeof infimum_supremum_redundant); memset(page + PAGE_OLD_SUPREMUM_END, 0, - UNIV_PAGE_SIZE - PAGE_DIR - PAGE_OLD_SUPREMUM_END); - page[UNIV_PAGE_SIZE - PAGE_DIR - PAGE_DIR_SLOT_SIZE * 2 + 1] + srv_page_size - PAGE_DIR - PAGE_OLD_SUPREMUM_END); + page[srv_page_size - PAGE_DIR - PAGE_DIR_SLOT_SIZE * 2 + 1] = PAGE_OLD_SUPREMUM; - page[UNIV_PAGE_SIZE - PAGE_DIR - PAGE_DIR_SLOT_SIZE + 1] + page[srv_page_size - PAGE_DIR - PAGE_DIR_SLOT_SIZE + 1] = PAGE_OLD_INFIMUM; } @@ -438,19 +437,19 @@ page_create_zip( /* PAGE_MAX_TRX_ID or PAGE_ROOT_AUTO_INC are always 0 for temporary tables. */ - ut_ad(max_trx_id == 0 || !dict_table_is_temporary(index->table)); + ut_ad(max_trx_id == 0 || !index->table->is_temporary()); /* In secondary indexes and the change buffer, PAGE_MAX_TRX_ID must be zero on non-leaf pages. max_trx_id can be 0 when the index consists of an empty root (leaf) page. */ ut_ad(max_trx_id == 0 || level == 0 || !dict_index_is_sec_or_ibuf(index) - || dict_table_is_temporary(index->table)); + || index->table->is_temporary()); /* In the clustered index, PAGE_ROOT_AUTOINC or PAGE_MAX_TRX_ID must be 0 on other pages than the root. */ ut_ad(level == 0 || max_trx_id == 0 || !dict_index_is_sec_or_ibuf(index) - || dict_table_is_temporary(index->table)); + || index->table->is_temporary()); page = page_create_low(block, TRUE, is_spatial); mach_write_to_2(PAGE_HEADER + PAGE_LEVEL + page, level); @@ -491,14 +490,14 @@ page_create_empty( ut_ad(fil_page_index_page_check(page)); ut_ad(!index->is_dummy); - ut_ad(block->page.id.space() == index->space); + ut_ad(block->page.id.space() == index->table->space->id); /* Multiple transactions cannot simultaneously operate on the same temp-table in parallel. max_trx_id is ignored for temp tables because it not required for MVCC. */ if (dict_index_is_sec_or_ibuf(index) - && !dict_table_is_temporary(index->table) + && !index->table->is_temporary() && page_is_leaf(page)) { max_trx_id = page_get_max_trx_id(page); ut_ad(max_trx_id); @@ -510,7 +509,7 @@ page_create_empty( } if (page_zip) { - ut_ad(!dict_table_is_temporary(index->table)); + ut_ad(!index->table->is_temporary()); page_create_zip(block, index, page_header_get_field(page, PAGE_LEVEL), max_trx_id, NULL, mtr); @@ -559,9 +558,10 @@ page_copy_rec_list_end_no_locks( btr_assert_not_corrupted(new_block, index); ut_a(page_is_comp(new_page) == page_rec_is_comp(rec)); - ut_a(mach_read_from_2(new_page + UNIV_PAGE_SIZE - 10) == (ulint) + ut_a(mach_read_from_2(new_page + srv_page_size - 10) == (ulint) (page_is_comp(new_page) ? 
PAGE_NEW_INFIMUM : PAGE_OLD_INFIMUM)); - ut_d(const bool is_leaf = page_is_leaf(block->frame)); + const ulint n_core = page_is_leaf(block->frame) + ? index->n_core_fields : 0; cur2 = page_get_infimum_rec(buf_block_get_frame(new_block)); @@ -569,7 +569,7 @@ page_copy_rec_list_end_no_locks( while (!page_cur_is_after_last(&cur1)) { rec_t* ins_rec; - offsets = rec_get_offsets(cur1.rec, index, offsets, is_leaf, + offsets = rec_get_offsets(cur1.rec, index, offsets, n_core, ULINT_UNDEFINED, &heap); ins_rec = page_cur_insert_rec_low(cur2, index, cur1.rec, offsets, mtr); @@ -680,7 +680,7 @@ page_copy_rec_list_end( for MVCC. */ if (dict_index_is_sec_or_ibuf(index) && page_is_leaf(page) - && !dict_table_is_temporary(index->table)) { + && !index->table->is_temporary()) { page_update_max_trx_id(new_block, NULL, page_get_max_trx_id(page), mtr); } @@ -731,9 +731,10 @@ page_copy_rec_list_end( /* Update the lock table and possible hash index */ - if (dict_index_is_spatial(index) && rec_move) { + if (dict_table_is_locking_disabled(index->table)) { + } else if (rec_move && dict_index_is_spatial(index)) { lock_rtr_move_rec_list(new_block, block, rec_move, num_moved); - } else if (!dict_table_is_locking_disabled(index->table)) { + } else { lock_move_rec_list_end(new_block, block, rec); } @@ -741,7 +742,7 @@ page_copy_rec_list_end( mem_heap_free(heap); } - btr_search_move_or_delete_hash_entries(new_block, block, index); + btr_search_move_or_delete_hash_entries(new_block, block); return(ret); } @@ -800,10 +801,11 @@ page_copy_rec_list_start( cur2 = ret; - const bool is_leaf = page_rec_is_leaf(rec); + const ulint n_core = page_rec_is_leaf(rec) ? index->n_core_fields : 0; /* Copy records from the original page to the new page */ - if (dict_index_is_spatial(index)) { + if (index->is_spatial()) { + ut_ad(!index->is_instant()); ulint max_to_move = page_get_n_recs( buf_block_get_frame(block)); heap = mem_heap_create(256); @@ -821,7 +823,7 @@ page_copy_rec_list_start( } else { while (page_cur_get_rec(&cur1) != rec) { offsets = rec_get_offsets(cur1.rec, index, offsets, - is_leaf, + n_core, ULINT_UNDEFINED, &heap); cur2 = page_cur_insert_rec_low(cur2, index, cur1.rec, offsets, mtr); @@ -841,8 +843,8 @@ page_copy_rec_list_start( same temp-table in parallel. max_trx_id is ignored for temp tables because it not required for MVCC. 
*/ - if (is_leaf && dict_index_is_sec_or_ibuf(index) - && !dict_table_is_temporary(index->table)) { + if (n_core && dict_index_is_sec_or_ibuf(index) + && !index->table->is_temporary()) { page_update_max_trx_id(new_block, NULL, page_get_max_trx_id(page_align(rec)), mtr); @@ -893,9 +895,10 @@ zip_reorganize: /* Update the lock table and possible hash index */ - if (dict_index_is_spatial(index)) { + if (dict_table_is_locking_disabled(index->table)) { + } else if (dict_index_is_spatial(index)) { lock_rtr_move_rec_list(new_block, block, rec_move, num_moved); - } else if (!dict_table_is_locking_disabled(index->table)) { + } else { lock_move_rec_list_start(new_block, block, rec, ret); } @@ -903,7 +906,7 @@ zip_reorganize: mem_heap_free(heap); } - btr_search_move_or_delete_hash_entries(new_block, block, index); + btr_search_move_or_delete_hash_entries(new_block, block); return(ret); } @@ -1017,7 +1020,7 @@ page_delete_rec_list_end( rec_offs* offsets = offsets_; rec_offs_init(offsets_); - ut_ad(size == ULINT_UNDEFINED || size < UNIV_PAGE_SIZE); + ut_ad(size == ULINT_UNDEFINED || size < srv_page_size); ut_ad(!page_zip || page_rec_is_comp(rec)); #ifdef UNIV_ZIP_DEBUG ut_a(!page_zip || page_zip_validate(page_zip, page, index)); @@ -1071,7 +1074,7 @@ delete_all: ? MLOG_COMP_LIST_END_DELETE : MLOG_LIST_END_DELETE, mtr); - ut_d(const bool is_leaf = page_is_leaf(page)); + const ulint n_core = page_is_leaf(page) ? index->n_core_fields : 0; if (page_zip) { mtr_log_t log_mode; @@ -1085,7 +1088,7 @@ delete_all: page_cur_t cur; page_cur_position(rec, block, &cur); - offsets = rec_get_offsets(rec, index, offsets, is_leaf, + offsets = rec_get_offsets(rec, index, offsets, n_core, ULINT_UNDEFINED, &heap); rec = rec_get_next_ptr(rec, TRUE); #ifdef UNIV_ZIP_DEBUG @@ -1118,13 +1121,13 @@ delete_all: do { ulint s; - offsets = rec_get_offsets(rec2, index, offsets, - is_leaf, + offsets = rec_get_offsets(rec2, index, offsets, n_core, ULINT_UNDEFINED, &heap); s = rec_offs_size(offsets); - ut_ad(rec2 - page + s - rec_offs_extra_size(offsets) - < UNIV_PAGE_SIZE); - ut_ad(size + s < UNIV_PAGE_SIZE); + ut_ad(ulint(rec2 - page) + s + - rec_offs_extra_size(offsets) + < srv_page_size); + ut_ad(size + s < srv_page_size); size += s; n_recs++; @@ -1141,7 +1144,7 @@ delete_all: } } - ut_ad(size < UNIV_PAGE_SIZE); + ut_ad(size < srv_page_size); /* Update the page directory; there is no need to balance the number of the records owned by the supremum record, as it is allowed to be @@ -1264,11 +1267,12 @@ page_delete_rec_list_start( /* Individual deletes are not logged */ mtr_log_t log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE); - ut_d(const bool is_leaf = page_rec_is_leaf(rec)); + const ulint n_core = page_rec_is_leaf(rec) + ? 
index->n_core_fields : 0; while (page_cur_get_rec(&cur1) != rec) { offsets = rec_get_offsets(page_cur_get_rec(&cur1), index, - offsets, is_leaf, + offsets, n_core, ULINT_UNDEFINED, &heap); page_cur_delete_rec(&cur1, index, offsets, mtr); } @@ -1598,7 +1602,7 @@ page_rec_get_nth_const( return(page_get_infimum_rec(page)); } - ut_ad(nth < UNIV_PAGE_SIZE / (REC_N_NEW_EXTRA_BYTES + 1)); + ut_ad(nth < srv_page_size / (REC_N_NEW_EXTRA_BYTES + 1)); for (i = 0;; i++) { @@ -1660,7 +1664,7 @@ page_rec_get_n_recs_before( slot = page_dir_get_nth_slot(page, i); slot_rec = page_dir_slot_get_rec(slot); - n += rec_get_n_owned_new(slot_rec); + n += lint(rec_get_n_owned_new(slot_rec)); if (rec == slot_rec) { @@ -1678,7 +1682,7 @@ page_rec_get_n_recs_before( slot = page_dir_get_nth_slot(page, i); slot_rec = page_dir_slot_get_rec(slot); - n += rec_get_n_owned_old(slot_rec); + n += lint(rec_get_n_owned_old(slot_rec)); if (rec == slot_rec) { @@ -1690,7 +1694,7 @@ page_rec_get_n_recs_before( n--; ut_ad(n >= 0); - ut_ad((ulong) n < UNIV_PAGE_SIZE / (REC_N_NEW_EXTRA_BYTES + 1)); + ut_ad((ulong) n < srv_page_size / (REC_N_NEW_EXTRA_BYTES + 1)); return((ulint) n); } @@ -1842,20 +1846,20 @@ page_header_print( fprintf(stderr, "--------------------------------\n" "PAGE HEADER INFO\n" - "Page address %p, n records %lu (%s)\n" - "n dir slots %lu, heap top %lu\n" - "Page n heap %lu, free %lu, garbage %lu\n" - "Page last insert %lu, direction %lu, n direction %lu\n", - page, (ulong) page_header_get_field(page, PAGE_N_RECS), + "Page address %p, n records %u (%s)\n" + "n dir slots %u, heap top %u\n" + "Page n heap %u, free %u, garbage %u\n" + "Page last insert %u, direction %u, n direction %u\n", + page, page_header_get_field(page, PAGE_N_RECS), page_is_comp(page) ? "compact format" : "original format", - (ulong) page_header_get_field(page, PAGE_N_DIR_SLOTS), - (ulong) page_header_get_field(page, PAGE_HEAP_TOP), - (ulong) page_dir_get_n_heap(page), - (ulong) page_header_get_field(page, PAGE_FREE), - (ulong) page_header_get_field(page, PAGE_GARBAGE), - (ulong) page_header_get_field(page, PAGE_LAST_INSERT), - (ulong) page_header_get_field(page, PAGE_DIRECTION), - (ulong) page_header_get_field(page, PAGE_N_DIRECTION)); + page_header_get_field(page, PAGE_N_DIR_SLOTS), + page_header_get_field(page, PAGE_HEAP_TOP), + page_dir_get_n_heap(page), + page_header_get_field(page, PAGE_FREE), + page_header_get_field(page, PAGE_GARBAGE), + page_header_get_field(page, PAGE_LAST_INSERT), + page_get_direction(page), + page_header_get_field(page, PAGE_N_DIRECTION)); } /***************************************************************//** @@ -1983,7 +1987,7 @@ page_simple_validate_old( n_slots = page_dir_get_n_slots(page); - if (UNIV_UNLIKELY(n_slots < 2 || n_slots > UNIV_PAGE_SIZE / 4)) { + if (UNIV_UNLIKELY(n_slots < 2 || n_slots > srv_page_size / 4)) { ib::error() << "Nonsensical number of page dir slots: " << n_slots; goto func_exit; @@ -2022,7 +2026,7 @@ page_simple_validate_old( goto func_exit; } - if (UNIV_UNLIKELY(rec_get_n_owned_old(rec))) { + if (UNIV_UNLIKELY(rec_get_n_owned_old(rec) != 0)) { /* This is a record pointed to by a dir slot */ if (UNIV_UNLIKELY(rec_get_n_owned_old(rec) != own_count)) { @@ -2058,7 +2062,7 @@ page_simple_validate_old( if (UNIV_UNLIKELY (rec_get_next_offs(rec, FALSE) < FIL_PAGE_DATA - || rec_get_next_offs(rec, FALSE) >= UNIV_PAGE_SIZE)) { + || rec_get_next_offs(rec, FALSE) >= srv_page_size)) { ib::error() << "Next record offset nonsensical " << rec_get_next_offs(rec, FALSE) << " for rec " @@ -2069,7 +2073,7 @@ 
page_simple_validate_old( count++; - if (UNIV_UNLIKELY(count > UNIV_PAGE_SIZE)) { + if (UNIV_UNLIKELY(count > srv_page_size)) { ib::error() << "Page record list appears" " to be circular " << count; goto func_exit; @@ -2106,7 +2110,7 @@ page_simple_validate_old( while (rec != NULL) { if (UNIV_UNLIKELY(rec < page + FIL_PAGE_DATA - || rec >= page + UNIV_PAGE_SIZE)) { + || rec >= page + srv_page_size)) { ib::error() << "Free list record has" " a nonsensical offset " << (rec - page); @@ -2123,7 +2127,7 @@ page_simple_validate_old( count++; - if (UNIV_UNLIKELY(count > UNIV_PAGE_SIZE)) { + if (UNIV_UNLIKELY(count > srv_page_size)) { ib::error() << "Page free list appears" " to be circular " << count; goto func_exit; @@ -2222,7 +2226,7 @@ page_simple_validate_new( goto func_exit; } - if (UNIV_UNLIKELY(rec_get_n_owned_new(rec))) { + if (UNIV_UNLIKELY(rec_get_n_owned_new(rec) != 0)) { /* This is a record pointed to by a dir slot */ if (UNIV_UNLIKELY(rec_get_n_owned_new(rec) != own_count)) { @@ -2258,7 +2262,7 @@ page_simple_validate_new( if (UNIV_UNLIKELY (rec_get_next_offs(rec, TRUE) < FIL_PAGE_DATA - || rec_get_next_offs(rec, TRUE) >= UNIV_PAGE_SIZE)) { + || rec_get_next_offs(rec, TRUE) >= srv_page_size)) { ib::error() << "Next record offset nonsensical " << rec_get_next_offs(rec, TRUE) @@ -2269,7 +2273,7 @@ page_simple_validate_new( count++; - if (UNIV_UNLIKELY(count > UNIV_PAGE_SIZE)) { + if (UNIV_UNLIKELY(count > srv_page_size)) { ib::error() << "Page record list appears to be" " circular " << count; goto func_exit; @@ -2306,7 +2310,7 @@ page_simple_validate_new( while (rec != NULL) { if (UNIV_UNLIKELY(rec < page + FIL_PAGE_DATA - || rec >= page + UNIV_PAGE_SIZE)) { + || rec >= page + srv_page_size)) { ib::error() << "Free list record has" " a nonsensical offset " << page_offset(rec); @@ -2324,7 +2328,7 @@ page_simple_validate_new( count++; - if (UNIV_UNLIKELY(count > UNIV_PAGE_SIZE)) { + if (UNIV_UNLIKELY(count > srv_page_size)) { ib::error() << "Page free list appears to be" " circular " << count; goto func_exit; @@ -2357,30 +2361,19 @@ func_exit: return(ret); } -/***************************************************************//** -This function checks the consistency of an index page. -@return TRUE if ok */ -ibool -page_validate( -/*==========*/ - const page_t* page, /*!< in: index page */ - dict_index_t* index) /*!< in: data dictionary index containing - the page record type definition */ +/** Check the consistency of an index page. +@param[in] page index page +@param[in] index B-tree or R-tree index +@return whether the page is valid */ +bool page_validate(const page_t* page, const dict_index_t* index) { const page_dir_slot_t* slot; - mem_heap_t* heap; - byte* buf; - ulint count; - ulint own_count; - ulint rec_own_count; - ulint slot_no; - ulint data_size; const rec_t* rec; const rec_t* old_rec = NULL; const rec_t* first_rec = NULL; - ulint offs; + ulint offs = 0; ulint n_slots; - ibool ret = FALSE; + ibool ret = TRUE; ulint i; rec_offs offsets_1[REC_OFFS_NORMAL_SIZE]; rec_offs offsets_2[REC_OFFS_NORMAL_SIZE]; @@ -2401,8 +2394,10 @@ page_validate( ib::error() << "'compact format' flag mismatch"; func_exit2: ib::error() << "Apparent corruption in space " - << page_get_space_id(page) << " page " - << page_get_page_no(page) << " index " << index->name; + << page_get_space_id(page) << " page " + << page_get_page_no(page) + << " of index " << index->name + << " of table " << index->table->name; return FALSE; } @@ -2420,84 +2415,174 @@ func_exit2: same temp-table in parallel. 
max_trx_id is ignored for temp tables because it not required for MVCC. */ - if (dict_index_is_sec_or_ibuf(index) - && !dict_table_is_temporary(index->table) - && page_is_leaf(page) - && !page_is_empty(page)) { + if (!page_is_leaf(page) || page_is_empty(page) + || !dict_index_is_sec_or_ibuf(index) + || index->table->is_temporary()) { + } else if (trx_id_t sys_max_trx_id = trx_sys.get_max_trx_id()) { trx_id_t max_trx_id = page_get_max_trx_id(page); - trx_id_t sys_max_trx_id = trx_sys_get_max_trx_id(); if (max_trx_id == 0 || max_trx_id > sys_max_trx_id) { ib::error() << "PAGE_MAX_TRX_ID out of bounds: " << max_trx_id << ", " << sys_max_trx_id; - goto func_exit2; + ret = FALSE; } + } else { + ut_ad(srv_force_recovery >= SRV_FORCE_NO_UNDO_LOG_SCAN); } - heap = mem_heap_create(UNIV_PAGE_SIZE + 200); - - /* The following buffer is used to check that the - records in the page record heap do not overlap */ - - buf = static_cast<byte*>(mem_heap_zalloc(heap, UNIV_PAGE_SIZE)); - /* Check first that the record heap and the directory do not overlap. */ n_slots = page_dir_get_n_slots(page); - const void* top = page_header_get_ptr(page, PAGE_HEAP_TOP); - const void* last_slot = page_dir_get_nth_slot(page, n_slots - 1); + if (UNIV_UNLIKELY(!(page_header_get_ptr(page, PAGE_HEAP_TOP) + <= page_dir_get_nth_slot(page, n_slots - 1)))) { - if (UNIV_UNLIKELY(top > last_slot)) { - ib::warn() << "Record heap and dir overlap on space " - << page_get_space_id(page) << " page " - << page_get_page_no(page) << " index " << index->name - << ", " << top << ", " << last_slot; - goto func_exit; + ib::warn() << "Record heap and directory overlap"; + goto func_exit2; + } + + switch (uint16_t type = fil_page_get_type(page)) { + case FIL_PAGE_RTREE: + if (!index->is_spatial()) { +wrong_page_type: + ib::warn() << "Wrong page type " << type; + ret = FALSE; + } + break; + case FIL_PAGE_TYPE_INSTANT: + if (index->is_instant() + && page_get_page_no(page) == index->page) { + break; + } + goto wrong_page_type; + case FIL_PAGE_INDEX: + if (index->is_spatial()) { + goto wrong_page_type; + } + if (index->is_instant() + && page_get_page_no(page) == index->page) { + goto wrong_page_type; + } + break; + default: + goto wrong_page_type; } + /* The following buffer is used to check that the + records in the page record heap do not overlap */ + mem_heap_t* heap = mem_heap_create(srv_page_size + 200);; + byte* buf = static_cast<byte*>(mem_heap_zalloc(heap, srv_page_size)); + /* Validate the record list in a loop checking also that it is consistent with the directory. */ - count = 0; - data_size = 0; - own_count = 1; + ulint count = 0, data_size = 0, own_count = 1, slot_no = 0; + ulint info_bits; slot_no = 0; slot = page_dir_get_nth_slot(page, slot_no); rec = page_get_infimum_rec(page); + const ulint n_core = page_is_leaf(page) ? 
index->n_core_fields : 0; + for (;;) { - offsets = rec_get_offsets(rec, index, offsets, - page_is_leaf(page), + offsets = rec_get_offsets(rec, index, offsets, n_core, ULINT_UNDEFINED, &heap); if (page_is_comp(page) && page_rec_is_user_rec(rec) && UNIV_UNLIKELY(rec_get_node_ptr_flag(rec) == page_is_leaf(page))) { ib::error() << "'node_ptr' flag mismatch"; - goto func_exit; + ret = FALSE; + goto next_rec; } if (UNIV_UNLIKELY(!page_rec_validate(rec, offsets))) { - goto func_exit; + ret = FALSE; + goto next_rec; + } + + info_bits = rec_get_info_bits(rec, page_is_comp(page)); + if (info_bits + & ~(REC_INFO_MIN_REC_FLAG | REC_INFO_DELETED_FLAG)) { + ib::error() << "info_bits has an incorrect value " + << info_bits; + ret = false; } if (rec == first_rec) { - if ((rec_get_info_bits(rec, page_is_comp(page)) - & REC_INFO_MIN_REC_FLAG) - && page_is_leaf(page)) { - ib::error() << "REC_INFO_MIN_REC_FLAG " - "is set in a leaf-page record"; + if (info_bits & REC_INFO_MIN_REC_FLAG) { + if (page_has_prev(page)) { + ib::error() << "REC_INFO_MIN_REC_FLAG " + "is set on non-left page"; + ret = false; + } else if (!page_is_leaf(page)) { + /* leftmost node pointer page */ + } else if (!index->is_instant()) { + ib::error() << "REC_INFO_MIN_REC_FLAG " + "is set in a leaf-page record"; + ret = false; + } else if (info_bits & REC_INFO_DELETED_FLAG) { + /* If this were a 10.4 metadata + record for index->table->instant + we should not get here in 10.3, because + the metadata record should not have + been recognized by + btr_cur_instant_init_low(). */ + ib::error() << "Metadata record " + "is delete-marked"; + ret = false; + } + } else if (!page_has_prev(page) + && index->is_instant()) { + ib::error() << "Metadata record is missing"; ret = false; } - } else if (rec_get_info_bits(rec, page_is_comp(page)) - & REC_INFO_MIN_REC_FLAG) { + } else if (info_bits & REC_INFO_MIN_REC_FLAG) { ib::error() << "REC_INFO_MIN_REC_FLAG record is not " "first in page"; ret = false; } + if (page_is_comp(page)) { + const rec_comp_status_t status = rec_get_status(rec); + if (status != REC_STATUS_ORDINARY + && status != REC_STATUS_NODE_PTR + && status != REC_STATUS_INFIMUM + && status != REC_STATUS_SUPREMUM + && status != REC_STATUS_COLUMNS_ADDED) { + ib::error() << "impossible record status " + << status; + ret = false; + } else if (page_rec_is_infimum(rec)) { + if (status != REC_STATUS_INFIMUM) { + ib::error() + << "infimum record has status " + << status; + ret = false; + } + } else if (page_rec_is_supremum(rec)) { + if (status != REC_STATUS_SUPREMUM) { + ib::error() << "supremum record has " + "status " + << status; + ret = false; + } + } else if (!page_is_leaf(page)) { + if (status != REC_STATUS_NODE_PTR) { + ib::error() << "node ptr record has " + "status " + << status; + ret = false; + } + } else if (!index->is_instant() + && status == REC_STATUS_COLUMNS_ADDED) { + ib::error() << "instantly added record in a " + "non-instant index"; + ret = false; + } + } + /* Check that the records are in the ascending order */ if (count >= PAGE_HEAP_NO_USER_LOW && !page_rec_is_supremum(rec)) { @@ -2507,16 +2592,10 @@ func_exit2: /* For spatial index, on nonleaf leavel, we allow recs to be equal. 
*/ - bool rtr_equal_nodeptrs = - (ret == 0 && dict_index_is_spatial(index) - && !page_is_leaf(page)); - - if (ret <= 0 && !rtr_equal_nodeptrs) { + if (ret <= 0 && !(ret == 0 && index->is_spatial() + && !page_is_leaf(page))) { - ib::error() << "Records in wrong order on" - " space " << page_get_space_id(page) - << " page " << page_get_page_no(page) - << " index " << index->name; + ib::error() << "Records in wrong order"; fputs("\nInnoDB: previous record ", stderr); /* For spatial index, print the mbr info.*/ @@ -2537,7 +2616,7 @@ func_exit2: putc('\n', stderr); } - goto func_exit; + ret = FALSE; } } @@ -2545,7 +2624,7 @@ func_exit2: data_size += rec_offs_size(offsets); -#if UNIV_GIS_DEBUG +#if defined(UNIV_GIS_DEBUG) /* For spatial index, print the mbr info.*/ if (index->type & DICT_SPATIAL) { rec_print_mbr_rec(stderr, rec, offsets); @@ -2556,42 +2635,42 @@ func_exit2: offs = page_offset(rec_get_start(rec, offsets)); i = rec_offs_size(offsets); - if (UNIV_UNLIKELY(offs + i >= UNIV_PAGE_SIZE)) { - ib::error() << "Record offset out of bounds"; - goto func_exit; + if (UNIV_UNLIKELY(offs + i >= srv_page_size)) { + ib::error() << "Record offset out of bounds: " + << offs << '+' << i; + ret = FALSE; + goto next_rec; } - while (i--) { if (UNIV_UNLIKELY(buf[offs + i])) { - /* No other record may overlap this */ - ib::error() << "Record overlaps another"; - goto func_exit; + ib::error() << "Record overlaps another: " + << offs << '+' << i; + ret = FALSE; + break; } - buf[offs + i] = 1; } - if (page_is_comp(page)) { - rec_own_count = rec_get_n_owned_new(rec); - } else { - rec_own_count = rec_get_n_owned_old(rec); - } - - if (UNIV_UNLIKELY(rec_own_count)) { + if (ulint rec_own_count = page_is_comp(page) + ? rec_get_n_owned_new(rec) + : rec_get_n_owned_old(rec)) { /* This is a record pointed to by a dir slot */ if (UNIV_UNLIKELY(rec_own_count != own_count)) { - ib::error() << "Wrong owned count " - << rec_own_count << ", " << own_count; - goto func_exit; + ib::error() << "Wrong owned count at " << offs + << ": " << rec_own_count + << ", " << own_count; + ret = FALSE; } if (page_dir_slot_get_rec(slot) != rec) { ib::error() << "Dir slot does not" - " point to right rec"; - goto func_exit; + " point to right rec at " << offs; + ret = FALSE; } - page_dir_slot_check(slot); + if (ret) { + page_dir_slot_check(slot); + } own_count = 0; if (!page_rec_is_supremum(rec)) { @@ -2600,6 +2679,7 @@ func_exit2: } } +next_rec: if (page_rec_is_supremum(rec)) { break; } @@ -2625,14 +2705,14 @@ func_exit2: } } else if (UNIV_UNLIKELY(rec_get_n_owned_old(rec) == 0)) { n_owned_zero: - ib::error() << "n owned is zero"; - goto func_exit; + ib::error() << "n owned is zero at " << offs; + ret = FALSE; } if (UNIV_UNLIKELY(slot_no != n_slots - 1)) { ib::error() << "n slots wrong " << slot_no << " " << (n_slots - 1); - goto func_exit; + ret = FALSE; } if (UNIV_UNLIKELY(ulint(page_header_get_field(page, PAGE_N_RECS)) @@ -2641,21 +2721,20 @@ n_owned_zero: ib::error() << "n recs wrong " << page_header_get_field(page, PAGE_N_RECS) + PAGE_HEAP_NO_USER_LOW << " " << (count + 1); - goto func_exit; + ret = FALSE; } if (UNIV_UNLIKELY(data_size != page_get_data_size(page))) { ib::error() << "Summed data size " << data_size << ", returned by func " << page_get_data_size(page); - goto func_exit; + ret = FALSE; } /* Check then the free list */ rec = page_header_get_ptr(page, PAGE_FREE); while (rec != NULL) { - offsets = rec_get_offsets(rec, index, offsets, - page_is_leaf(page), + offsets = rec_get_offsets(rec, index, offsets, n_core, 
ULINT_UNDEFINED, &heap); if (UNIV_UNLIKELY(!page_rec_validate(rec, offsets))) { ret = FALSE; @@ -2685,15 +2764,13 @@ next_free: ret = FALSE; goto next_free; } - while (i--) { - if (UNIV_UNLIKELY(buf[offs + i])) { - ib::error() << "Record overlaps another" - " in free list"; - goto func_exit; + ib::error() << "Free record overlaps another: " + << offs << '+' << i; + ret = FALSE; + break; } - buf[offs + i] = 1; } @@ -2703,15 +2780,12 @@ next_free: if (UNIV_UNLIKELY(page_dir_get_n_heap(page) != count + 1)) { ib::error() << "N heap is wrong " << page_dir_get_n_heap(page) << " " << count + 1; - goto func_exit; + ret = FALSE; } - ret = TRUE; - -func_exit: mem_heap_free(heap); - if (UNIV_UNLIKELY(ret == FALSE)) { + if (UNIV_UNLIKELY(!ret)) { goto func_exit2; } @@ -2776,7 +2850,11 @@ page_delete_rec( belongs to */ page_cur_t* pcur, /*!< in/out: page cursor on record to delete */ - page_zip_des_t* page_zip,/*!< in: compressed page descriptor */ + page_zip_des_t* +#ifdef UNIV_ZIP_DEBUG + page_zip/*!< in: compressed page descriptor */ +#endif + , const rec_offs* offsets)/*!< in: offsets for record */ { bool no_compress_needed; @@ -2829,19 +2907,26 @@ page_find_rec_max_not_deleted( const rec_t* rec = page_get_infimum_rec(page); const rec_t* prev_rec = NULL; // remove warning - /* Because the page infimum is never delete-marked, + /* Because the page infimum is never delete-marked + and never the metadata pseudo-record (MIN_REC_FLAG)), prev_rec will always be assigned to it first. */ - ut_ad(!rec_get_deleted_flag(rec, page_rec_is_comp(rec))); + ut_ad(!rec_get_info_bits(rec, page_rec_is_comp(rec))); + ut_ad(page_is_leaf(page)); + if (page_is_comp(page)) { do { - if (!rec_get_deleted_flag(rec, true)) { + if (!(rec[-REC_NEW_INFO_BITS] + & (REC_INFO_DELETED_FLAG + | REC_INFO_MIN_REC_FLAG))) { prev_rec = rec; } rec = page_rec_get_next_low(rec, true); } while (rec != page + PAGE_NEW_SUPREMUM); } else { do { - if (!rec_get_deleted_flag(rec, false)) { + if (!(rec[-REC_OLD_INFO_BITS] + & (REC_INFO_DELETED_FLAG + | REC_INFO_MIN_REC_FLAG))) { prev_rec = rec; } rec = page_rec_get_next_low(rec, false); diff --git a/storage/innobase/page/page0zip.cc b/storage/innobase/page/page0zip.cc index 1acbfd2e230..d9146407833 100644 --- a/storage/innobase/page/page0zip.cc +++ b/storage/innobase/page/page0zip.cc @@ -2,7 +2,7 @@ Copyright (c) 2005, 2016, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. -Copyright (c) 2014, 2020, MariaDB Corporation. +Copyright (c) 2014, 2021, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -154,7 +154,7 @@ page_zip_empty_size( ulint n_fields, /*!< in: number of columns in the index */ ulint zip_size) /*!< in: compressed page size in bytes */ { - lint size = zip_size + ulint size = zip_size /* subtract the page header and the longest uncompressed data needed for one record */ - (PAGE_DATA @@ -164,7 +164,7 @@ page_zip_empty_size( - REC_N_NEW_EXTRA_BYTES/* omitted bytes */) /* subtract the space for page_zip_fields_encode() */ - compressBound(static_cast<uLong>(2 * (n_fields + 1))); - return(size > 0 ? (ulint) size : 0); + return(lint(size) > 0 ? size : 0); } /** Check whether a tuple is too big for compressed table @@ -228,7 +228,8 @@ page_zip_dir_elems( const page_zip_des_t* page_zip) /*!< in: compressed page */ { /* Exclude the page infimum and supremum from the record count. 
*/ - return(page_dir_get_n_heap(page_zip->data) - PAGE_HEAP_NO_USER_LOW); + return ulint(page_dir_get_n_heap(page_zip->data)) + - PAGE_HEAP_NO_USER_LOW; } /*************************************************************//** @@ -287,7 +288,7 @@ page_zip_dir_user_size( const page_zip_des_t* page_zip) /*!< in: compressed page */ { ulint size = PAGE_ZIP_DIR_SLOT_SIZE - * page_get_n_recs(page_zip->data); + * ulint(page_get_n_recs(page_zip->data)); ut_ad(size <= page_zip_dir_size(page_zip)); return(size); } @@ -395,7 +396,7 @@ page_zip_compress_write_log( } /* Read the number of user records. */ - trailer_size = page_dir_get_n_heap(page_zip->data) + trailer_size = ulint(page_dir_get_n_heap(page_zip->data)) - PAGE_HEAP_NO_USER_LOW; /* Multiply by uncompressed of size stored per record */ if (!page_is_leaf(page)) { @@ -409,15 +410,13 @@ page_zip_compress_write_log( /* Add the space occupied by BLOB pointers. */ trailer_size += page_zip->n_blobs * BTR_EXTERN_FIELD_REF_SIZE; ut_a(page_zip->m_end > PAGE_DATA); -#if FIL_PAGE_DATA > PAGE_DATA -# error "FIL_PAGE_DATA > PAGE_DATA" -#endif + compile_time_assert(FIL_PAGE_DATA <= PAGE_DATA); ut_a(page_zip->m_end + trailer_size <= page_zip_get_size(page_zip)); log_ptr = mlog_write_initial_log_record_fast((page_t*) page, MLOG_ZIP_PAGE_COMPRESS, log_ptr, mtr); - mach_write_to_2(log_ptr, page_zip->m_end - FIL_PAGE_TYPE); + mach_write_to_2(log_ptr, ulint(page_zip->m_end - FIL_PAGE_TYPE)); log_ptr += 2; mach_write_to_2(log_ptr, trailer_size); log_ptr += 2; @@ -429,7 +428,7 @@ page_zip_compress_write_log( /* Write most of the page header, the compressed stream and the modification log. */ mlog_catenate_string(mtr, page_zip->data + FIL_PAGE_TYPE, - page_zip->m_end - FIL_PAGE_TYPE); + ulint(page_zip->m_end - FIL_PAGE_TYPE)); /* Write the uncompressed trailer of the compressed page. */ mlog_catenate_string(mtr, page_zip->data + page_zip_get_size(page_zip) - trailer_size, trailer_size); @@ -612,7 +611,7 @@ page_zip_fields_encode( } buf = page_zip_fixed_field_encode( - buf, field->fixed_len << 1); + buf, ulint(field->fixed_len) << 1); col++; } } @@ -692,15 +691,14 @@ page_zip_dir_encode( heap_no = rec_get_heap_no_new(rec); ut_a(heap_no >= PAGE_HEAP_NO_USER_LOW); ut_a(heap_no < n_heap); - ut_a(offs < UNIV_PAGE_SIZE - PAGE_DIR); + ut_a(offs < srv_page_size - PAGE_DIR); ut_a(offs >= PAGE_ZIP_START); -#if PAGE_ZIP_DIR_SLOT_MASK & (PAGE_ZIP_DIR_SLOT_MASK + 1) -# error PAGE_ZIP_DIR_SLOT_MASK is not 1 less than a power of 2 -#endif -#if PAGE_ZIP_DIR_SLOT_MASK < UNIV_ZIP_SIZE_MAX - 1 -# error PAGE_ZIP_DIR_SLOT_MASK < UNIV_ZIP_SIZE_MAX - 1 -#endif - if (UNIV_UNLIKELY(rec_get_n_owned_new(rec))) { + compile_time_assert(!(PAGE_ZIP_DIR_SLOT_MASK + & (PAGE_ZIP_DIR_SLOT_MASK + 1))); + compile_time_assert(PAGE_ZIP_DIR_SLOT_MASK + >= UNIV_ZIP_SIZE_MAX - 1); + + if (UNIV_UNLIKELY(rec_get_n_owned_new(rec) != 0)) { offs |= PAGE_ZIP_DIR_SLOT_OWNED; } @@ -723,7 +721,7 @@ page_zip_dir_encode( recs[heap_no - PAGE_HEAP_NO_USER_LOW] = rec; } - ut_a(rec_get_status(rec) == status); + ut_a(ulint(rec_get_status(rec)) == status); } offs = page_header_get_field(page, PAGE_FREE); @@ -738,7 +736,7 @@ page_zip_dir_encode( ut_a(heap_no < n_heap); ut_a(!rec[-REC_N_NEW_EXTRA_BYTES]); /* info_bits and n_owned */ - ut_a(rec_get_status(rec) == status); + ut_a(ulint(rec_get_status(rec)) == status); mach_write_to_2(buf - PAGE_ZIP_DIR_SLOT_SIZE * ++i, offs); @@ -807,7 +805,7 @@ page_zip_set_alloc( #ifdef PAGE_ZIP_COMPRESS_DBG /** Set this variable in a debugger to enable excessive logging in page_zip_compress(). 
*/ -static ibool page_zip_compress_dbg; +static bool page_zip_compress_dbg; /** Set this variable in a debugger to enable binary logging of the data passed to deflate(). When this variable is nonzero, it will act @@ -883,7 +881,7 @@ page_zip_compress_node_ptrs( do { const rec_t* rec = *recs++; - offsets = rec_get_offsets(rec, index, offsets, false, + offsets = rec_get_offsets(rec, index, offsets, 0, ULINT_UNDEFINED, &heap); /* Only leaf nodes may contain externally stored columns. */ ut_ad(!rec_offs_any_extern(offsets)); @@ -1132,7 +1130,7 @@ page_zip_compress_clust( do { const rec_t* rec = *recs++; - offsets = rec_get_offsets(rec, index, offsets, true, + offsets = rec_get_offsets(rec, index, offsets, index->n_fields, ULINT_UNDEFINED, &heap); ut_ad(rec_offs_n_fields(offsets) == dict_index_get_n_fields(index)); @@ -1325,7 +1323,7 @@ page_zip_compress( } /* The dense directory excludes the infimum and supremum records. */ - n_dense = page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW; + n_dense = ulint(page_dir_get_n_heap(page)) - PAGE_HEAP_NO_USER_LOW; #ifdef PAGE_ZIP_COMPRESS_DBG if (UNIV_UNLIKELY(page_zip_compress_dbg)) { ib::info() << "compress " @@ -1344,8 +1342,8 @@ page_zip_compress( if (logfile) { /* Write the uncompressed page to the log. */ - if (fwrite(page, 1, UNIV_PAGE_SIZE, logfile) - != UNIV_PAGE_SIZE) { + if (fwrite(page, 1, srv_page_size, logfile) + != srv_page_size) { perror("fwrite"); } /* Record the compressed size as zero. @@ -1377,7 +1375,7 @@ page_zip_compress( + REC_OFFS_HEADER_SIZE + n_dense * ((sizeof *recs) - PAGE_ZIP_DIR_SLOT_SIZE) - + UNIV_PAGE_SIZE * 4 + + srv_page_size * 4 + (512 << MAX_MEM_LEVEL)); recs = static_cast<const rec_t**>( @@ -1394,7 +1392,7 @@ page_zip_compress( page_zip_set_alloc(&c_stream, heap); err = deflateInit2(&c_stream, static_cast<int>(level), - Z_DEFLATED, UNIV_PAGE_SIZE_SHIFT, + Z_DEFLATED, srv_page_size_shift, MAX_MEM_LEVEL, Z_DEFAULT_STRATEGY); ut_a(err == Z_OK); @@ -1515,7 +1513,7 @@ page_zip_compress( c_stream.avail_in = static_cast<uInt>( page_header_get_field(page, PAGE_HEAP_TOP) - (c_stream.next_in - page)); - ut_a(c_stream.avail_in <= UNIV_PAGE_SIZE - PAGE_ZIP_START - PAGE_DIR); + ut_a(c_stream.avail_in <= srv_page_size - PAGE_ZIP_START - PAGE_DIR); MEM_CHECK_DEFINED(c_stream.next_in, c_stream.avail_in); err = deflate(&c_stream, Z_FINISH); @@ -1552,11 +1550,11 @@ err_exit: ut_ad(buf + c_stream.total_out == c_stream.next_out); ut_ad((ulint) (storage - c_stream.next_out) >= c_stream.avail_out); -#ifdef HAVE_valgrind +#if defined HAVE_valgrind && !__has_feature(memory_sanitizer) /* Valgrind believes that zlib does not initialize some bits in the last 7 or 8 bytes of the stream. Make Valgrind happy. */ MEM_MAKE_DEFINED(buf, c_stream.total_out); -#endif /* HAVE_valgrind */ +#endif /* HAVE_valgrind && !memory_sanitizer */ /* Zero out the area reserved for the modification log. Space for the end marker of the modification log is not @@ -1595,7 +1593,7 @@ err_exit: /* Record the compressed size of the block. 
*/ byte sz[4]; mach_write_to_4(sz, c_stream.total_out); - fseek(logfile, UNIV_PAGE_SIZE, SEEK_SET); + fseek(logfile, srv_page_size, SEEK_SET); if (fwrite(sz, 1, sizeof sz, logfile) != sizeof sz) { perror("fwrite"); } @@ -1680,11 +1678,9 @@ page_zip_fields_decode( return(NULL); } - table = dict_mem_table_create("ZIP_DUMMY", DICT_HDR_SPACE, n, 0, + table = dict_mem_table_create("ZIP_DUMMY", NULL, n, 0, DICT_TF_COMPACT, 0); - index = dict_mem_index_create("ZIP_DUMMY", "ZIP_DUMMY", - DICT_HDR_SPACE, 0, n); - index->table = table; + index = dict_mem_index_create(table, "ZIP_DUMMY", 0, n); index->n_uniq = unsigned(n); /* avoid ut_ad(index->cached) in dict_index_get_n_unique_in_tree */ index->cached = TRUE; @@ -1748,6 +1744,11 @@ fail: } } + /* ROW_FORMAT=COMPRESSED does not support instant ADD COLUMN */ + index->n_core_fields = index->n_fields; + index->n_core_null_bytes + = UT_BITS_IN_BYTES(unsigned(index->n_nullable)); + ut_ad(b == end); if (is_spatial) { @@ -1789,7 +1790,7 @@ page_zip_dir_decode( /* Traverse the list of stored records in the sorting order, starting from the first user record. */ - slot = page + (UNIV_PAGE_SIZE - PAGE_DIR - PAGE_DIR_SLOT_SIZE); + slot = page + (srv_page_size - PAGE_DIR - PAGE_DIR_SLOT_SIZE); UNIV_PREFETCH_RW(slot); /* Zero out the page trailer. */ @@ -1823,7 +1824,7 @@ page_zip_dir_decode( mach_write_to_2(slot, PAGE_NEW_SUPREMUM); { const page_dir_slot_t* last_slot = page_dir_get_nth_slot( - page, page_dir_get_n_slots(page) - 1); + page, page_dir_get_n_slots(page) - 1U); if (UNIV_UNLIKELY(slot != last_slot)) { page_zip_fail(("page_zip_dir_decode 3: %p != %p\n", @@ -1906,7 +1907,7 @@ page_zip_set_extra_bytes( page[PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES] = (byte) n_owned; /* The dense directory excludes the infimum and supremum records. */ - n = page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW; + n = ulint(page_dir_get_n_heap(page)) - PAGE_HEAP_NO_USER_LOW; if (i >= n) { if (UNIV_LIKELY(i == n)) { @@ -1992,8 +1993,8 @@ page_zip_apply_log_ext( return(NULL); } - memcpy(next_out, data, dst - next_out); - data += dst - next_out; + memcpy(next_out, data, ulint(dst - next_out)); + data += ulint(dst - next_out); next_out = dst + (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); } else if (rec_offs_nth_extern(offsets, i)) { @@ -2002,7 +2003,7 @@ page_zip_apply_log_ext( ut_ad(len >= BTR_EXTERN_FIELD_REF_SIZE); - len += dst - next_out + len += ulint(dst - next_out) - BTR_EXTERN_FIELD_REF_SIZE; if (UNIV_UNLIKELY(data + len >= end)) { @@ -2022,7 +2023,7 @@ page_zip_apply_log_ext( } /* Copy the last bytes of the record. */ - len = rec_get_end(rec, offsets) - next_out; + len = ulint(rec_get_end(rec, offsets) - next_out); if (UNIV_UNLIKELY(data + len >= end)) { page_zip_fail(("page_zip_apply_log_ext:" " last %p+%lu >= %p\n", @@ -2051,7 +2052,7 @@ page_zip_apply_log( sorted by address (indexed by heap_no - PAGE_HEAP_NO_USER_LOW) */ ulint n_dense,/*!< in: size of recs[] */ - bool is_leaf,/*!< in: whether this is a leaf page */ + ulint n_core, /*!< in: index->n_fields, or 0 for non-leaf */ ulint trx_id_col,/*!< in: column number of trx_id in the index, or ULINT_UNDEFINED if none */ ulint heap_status, @@ -2127,7 +2128,7 @@ page_zip_apply_log( /* Clear the data bytes of the record. 
*/ mem_heap_t* heap = NULL; rec_offs* offs; - offs = rec_get_offsets(rec, index, offsets, is_leaf, + offs = rec_get_offsets(rec, index, offsets, n_core, ULINT_UNDEFINED, &heap); memset(rec, 0, rec_offs_data_size(offs)); @@ -2137,13 +2138,11 @@ page_zip_apply_log( continue; } -#if REC_STATUS_NODE_PTR != TRUE -# error "REC_STATUS_NODE_PTR != TRUE" -#endif + compile_time_assert(REC_STATUS_NODE_PTR == TRUE); rec_get_offsets_reverse(data, index, hs & REC_STATUS_NODE_PTR, offsets); - rec_offs_make_valid(rec, index, offsets); + rec_offs_make_valid(rec, index, n_core != 0, offsets); /* Copy the extra bytes (backwards). */ { @@ -2224,7 +2223,7 @@ page_zip_apply_log( /* Copy any bytes following DB_TRX_ID, DB_ROLL_PTR. */ b = rec + l + (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); - len = rec_get_end(rec, offsets) - b; + len = ulint(rec_get_end(rec, offsets) - b); if (UNIV_UNLIKELY(data + len >= end)) { page_zip_fail(("page_zip_apply_log:" " clust %p+%lu >= %p\n", @@ -2297,7 +2296,7 @@ page_zip_decompress_node_ptrs( d_stream->avail_out = static_cast<uInt>( rec - REC_N_NEW_EXTRA_BYTES - d_stream->next_out); - ut_ad(d_stream->avail_out < UNIV_PAGE_SIZE + ut_ad(d_stream->avail_out < srv_page_size - PAGE_ZIP_START - PAGE_DIR); switch (inflate(d_stream, Z_SYNC_FLUSH)) { case Z_STREAM_END: @@ -2323,7 +2322,7 @@ page_zip_decompress_node_ptrs( } /* Read the offsets. The status bits are needed here. */ - offsets = rec_get_offsets(rec, index, offsets, false, + offsets = rec_get_offsets(rec, index, offsets, 0, ULINT_UNDEFINED, &heap); /* Non-leaf nodes should not have any externally @@ -2364,7 +2363,7 @@ page_zip_decompress_node_ptrs( d_stream->avail_out = static_cast<uInt>( page_header_get_field(page_zip->data, PAGE_HEAP_TOP) - page_offset(d_stream->next_out)); - if (UNIV_UNLIKELY(d_stream->avail_out > UNIV_PAGE_SIZE + if (UNIV_UNLIKELY(d_stream->avail_out > srv_page_size - PAGE_ZIP_START - PAGE_DIR)) { page_zip_fail(("page_zip_decompress_node_ptrs:" @@ -2395,9 +2394,10 @@ zlib_done: /* Clear the unused heap space on the uncompressed page. */ memset(d_stream->next_out, 0, - page_dir_get_nth_slot(page, - page_dir_get_n_slots(page) - 1) - - d_stream->next_out); + ulint(page_dir_get_nth_slot(page, + page_dir_get_n_slots(page) + - 1U) + - d_stream->next_out)); } #ifdef UNIV_DEBUG @@ -2409,7 +2409,7 @@ zlib_done: const byte* mod_log_ptr; mod_log_ptr = page_zip_apply_log(d_stream->next_in, d_stream->avail_in + 1, - recs, n_dense, false, + recs, n_dense, 0, ULINT_UNDEFINED, heap_status, index, offsets); @@ -2440,7 +2440,7 @@ zlib_done: for (slot = 0; slot < n_dense; slot++) { rec_t* rec = recs[slot]; - offsets = rec_get_offsets(rec, index, offsets, false, + offsets = rec_get_offsets(rec, index, offsets, 0, ULINT_UNDEFINED, &heap); /* Non-leaf nodes should not have any externally stored columns. */ @@ -2518,7 +2518,7 @@ page_zip_decompress_sec( d_stream->avail_out = static_cast<uInt>( page_header_get_field(page_zip->data, PAGE_HEAP_TOP) - page_offset(d_stream->next_out)); - if (UNIV_UNLIKELY(d_stream->avail_out > UNIV_PAGE_SIZE + if (UNIV_UNLIKELY(d_stream->avail_out > srv_page_size - PAGE_ZIP_START - PAGE_DIR)) { page_zip_fail(("page_zip_decompress_sec:" @@ -2549,9 +2549,10 @@ zlib_done: /* Clear the unused heap space on the uncompressed page. 
*/ memset(d_stream->next_out, 0, - page_dir_get_nth_slot(page, - page_dir_get_n_slots(page) - 1) - - d_stream->next_out); + ulint(page_dir_get_nth_slot(page, + page_dir_get_n_slots(page) + - 1U) + - d_stream->next_out)); } ut_d(page_zip->m_start = unsigned(PAGE_DATA + d_stream->total_in)); @@ -2561,7 +2562,8 @@ zlib_done: const byte* mod_log_ptr; mod_log_ptr = page_zip_apply_log(d_stream->next_in, d_stream->avail_in + 1, - recs, n_dense, true, + recs, n_dense, + index->n_fields, ULINT_UNDEFINED, heap_status, index, offsets); @@ -2737,7 +2739,7 @@ page_zip_decompress_clust( d_stream->avail_out =static_cast<uInt>( rec - REC_N_NEW_EXTRA_BYTES - d_stream->next_out); - ut_ad(d_stream->avail_out < UNIV_PAGE_SIZE + ut_ad(d_stream->avail_out < srv_page_size - PAGE_ZIP_START - PAGE_DIR); err = inflate(d_stream, Z_SYNC_FLUSH); switch (err) { @@ -2764,7 +2766,7 @@ page_zip_decompress_clust( } /* Read the offsets. The status bits are needed here. */ - offsets = rec_get_offsets(rec, index, offsets, true, + offsets = rec_get_offsets(rec, index, offsets, index->n_fields, ULINT_UNDEFINED, &heap); /* This is a leaf page in a clustered index. */ @@ -2847,7 +2849,7 @@ page_zip_decompress_clust( d_stream->avail_out = static_cast<uInt>( page_header_get_field(page_zip->data, PAGE_HEAP_TOP) - page_offset(d_stream->next_out)); - if (UNIV_UNLIKELY(d_stream->avail_out > UNIV_PAGE_SIZE + if (UNIV_UNLIKELY(d_stream->avail_out > srv_page_size - PAGE_ZIP_START - PAGE_DIR)) { page_zip_fail(("page_zip_decompress_clust:" @@ -2878,9 +2880,10 @@ zlib_done: /* Clear the unused heap space on the uncompressed page. */ memset(d_stream->next_out, 0, - page_dir_get_nth_slot(page, - page_dir_get_n_slots(page) - 1) - - d_stream->next_out); + ulint(page_dir_get_nth_slot(page, + page_dir_get_n_slots(page) + - 1U) + - d_stream->next_out)); } ut_d(page_zip->m_start = unsigned(PAGE_DATA + d_stream->total_in)); @@ -2890,7 +2893,8 @@ zlib_done: const byte* mod_log_ptr; mod_log_ptr = page_zip_apply_log(d_stream->next_in, d_stream->avail_in + 1, - recs, n_dense, true, + recs, n_dense, + index->n_fields, trx_id_col, heap_status, index, offsets); @@ -2924,9 +2928,9 @@ zlib_done: ulint len; byte* dst; rec_t* rec = recs[slot]; - ibool exists = !page_zip_dir_find_free( + bool exists = !page_zip_dir_find_free( page_zip, page_offset(rec)); - offsets = rec_get_offsets(rec, index, offsets, true, + offsets = rec_get_offsets(rec, index, offsets, index->n_fields, ULINT_UNDEFINED, &heap); dst = rec_get_nth_field(rec, offsets, @@ -3033,7 +3037,7 @@ page_zip_decompress_low( return(FALSE); } - heap = mem_heap_create(n_dense * (3 * sizeof *recs) + UNIV_PAGE_SIZE); + heap = mem_heap_create(n_dense * (3 * sizeof *recs) + srv_page_size); recs = static_cast<rec_t**>( mem_heap_alloc(heap, n_dense * sizeof *recs)); @@ -3100,9 +3104,9 @@ zlib_error: d_stream.avail_in = static_cast<uInt>( page_zip_get_size(page_zip) - (PAGE_DATA + 1)); d_stream.next_out = page + PAGE_ZIP_START; - d_stream.avail_out = uInt(UNIV_PAGE_SIZE - PAGE_ZIP_START); + d_stream.avail_out = uInt(srv_page_size - PAGE_ZIP_START); - if (UNIV_UNLIKELY(inflateInit2(&d_stream, UNIV_PAGE_SIZE_SHIFT) + if (UNIV_UNLIKELY(inflateInit2(&d_stream, srv_page_size_shift) != Z_OK)) { ut_error; } @@ -3286,7 +3290,7 @@ page_zip_hexdump_func( #define page_zip_hexdump(buf, size) page_zip_hexdump_func(#buf, buf, size) /** Flag: make page_zip_validate() compare page headers only */ -ibool page_zip_validate_header_only = FALSE; +bool page_zip_validate_header_only; 
/**********************************************************************//** Check that the compressed and decompressed pages match. @@ -3325,7 +3329,7 @@ page_zip_validate_low( page_zip_fail(("page_zip_validate: page header\n")); page_zip_hexdump(page_zip, sizeof *page_zip); page_zip_hexdump(page_zip->data, page_zip_get_size(page_zip)); - page_zip_hexdump(page, UNIV_PAGE_SIZE); + page_zip_hexdump(page, srv_page_size); return(FALSE); } @@ -3336,9 +3340,10 @@ page_zip_validate_low( } /* page_zip_decompress() expects the uncompressed page to be - UNIV_PAGE_SIZE aligned. */ - temp_page_buf = static_cast<byte*>(ut_malloc_nokey(2 * UNIV_PAGE_SIZE)); - temp_page = static_cast<byte*>(ut_align(temp_page_buf, UNIV_PAGE_SIZE)); + srv_page_size aligned. */ + temp_page_buf = static_cast<byte*>( + ut_malloc_nokey(2 << srv_page_size_shift)); + temp_page = static_cast<byte*>(ut_align(temp_page_buf, srv_page_size)); MEM_CHECK_DEFINED(page, srv_page_size); MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip)); @@ -3373,7 +3378,7 @@ page_zip_validate_low( valid = FALSE; } if (memcmp(page + PAGE_HEADER, temp_page + PAGE_HEADER, - UNIV_PAGE_SIZE - PAGE_HEADER - FIL_PAGE_DATA_END)) { + srv_page_size - PAGE_HEADER - FIL_PAGE_DATA_END)) { /* In crash recovery, the "minimum record" flag may be set incorrectly until the mini-transaction is @@ -3397,7 +3402,7 @@ page_zip_validate_low( if (!memcmp(page + PAGE_HEADER, temp_page + PAGE_HEADER, - UNIV_PAGE_SIZE - PAGE_HEADER + srv_page_size - PAGE_HEADER - FIL_PAGE_DATA_END)) { /* Only the minimum record flag @@ -3449,7 +3454,7 @@ page_zip_validate_low( page + PAGE_NEW_INFIMUM, TRUE); trec = page_rec_get_next_low( temp_page + PAGE_NEW_INFIMUM, TRUE); - ut_d(const bool is_leaf = page_is_leaf(page)); + const ulint n_core = page_is_leaf(page) ? index->n_fields : 0; do { if (page_offset(rec) != page_offset(trec)) { @@ -3464,7 +3469,7 @@ page_zip_validate_low( if (index) { /* Compare the data. */ offsets = rec_get_offsets( - rec, index, offsets, is_leaf, + rec, index, offsets, n_core, ULINT_UNDEFINED, &heap); if (memcmp(rec - rec_offs_extra_size(offsets), @@ -3492,8 +3497,8 @@ func_exit: if (!valid) { page_zip_hexdump(page_zip, sizeof *page_zip); page_zip_hexdump(page_zip->data, page_zip_get_size(page_zip)); - page_zip_hexdump(page, UNIV_PAGE_SIZE); - page_zip_hexdump(temp_page, UNIV_PAGE_SIZE); + page_zip_hexdump(page, srv_page_size); + page_zip_hexdump(temp_page, srv_page_size); } ut_free(temp_page_buf); return(valid); @@ -3591,7 +3596,7 @@ page_zip_write_rec_ext( memmove(ext_end - n_ext * BTR_EXTERN_FIELD_REF_SIZE, ext_end, - externs - ext_end); + ulint(externs - ext_end)); } ut_a(blob_no + n_ext <= page_zip->n_blobs); @@ -3617,7 +3622,7 @@ page_zip_write_rec_ext( /* Log the preceding fields. */ ASSERT_ZERO(data, src - start); - memcpy(data, start, src - start); + memcpy(data, start, ulint(src - start)); data += src - start; start = src + (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); @@ -3637,7 +3642,7 @@ page_zip_write_rec_ext( src += len - BTR_EXTERN_FIELD_REF_SIZE; ASSERT_ZERO(data, src - start); - memcpy(data, start, src - start); + memcpy(data, start, ulint(src - start)); data += src - start; start = src + BTR_EXTERN_FIELD_REF_SIZE; @@ -3649,7 +3654,7 @@ page_zip_write_rec_ext( } /* Log the last bytes of the record. 
*/ - len = rec_offs_data_size(offsets) - (start - rec); + len = rec_offs_data_size(offsets) - ulint(start - rec); ASSERT_ZERO(data, len); memcpy(data, start, len); @@ -3709,7 +3714,7 @@ page_zip_write_rec( } ut_ad(rec_get_start((rec_t*) rec, offsets) >= page + PAGE_ZIP_START); - ut_ad(rec_get_end((rec_t*) rec, offsets) <= page + UNIV_PAGE_SIZE + ut_ad(rec_get_end((rec_t*) rec, offsets) <= page + srv_page_size - PAGE_DIR - PAGE_DIR_SLOT_SIZE * page_dir_get_n_slots(page)); @@ -3780,7 +3785,7 @@ page_zip_write_rec( /* Log the preceding fields. */ ASSERT_ZERO(data, src - rec); - memcpy(data, rec, src - rec); + memcpy(data, rec, ulint(src - rec)); data += src - rec; /* Store trx_id and roll_ptr. */ @@ -3794,7 +3799,7 @@ page_zip_write_rec( /* Log the last bytes of the record. */ len = rec_offs_data_size(offsets) - - (src - rec); + - ulint(src - rec); ASSERT_ZERO(data, len); memcpy(data, src, len); @@ -3875,8 +3880,8 @@ page_zip_parse_write_blob_ptr( z_offset = mach_read_from_2(ptr + 2); if (offset < PAGE_ZIP_START - || offset >= UNIV_PAGE_SIZE - || z_offset >= UNIV_PAGE_SIZE) { + || offset >= srv_page_size + || z_offset >= srv_page_size) { corrupt: recv_sys->found_corrupt_log = TRUE; @@ -3982,7 +3987,7 @@ page_zip_write_blob_ptr( (byte*) field, MLOG_ZIP_WRITE_BLOB_PTR, log_ptr, mtr); mach_write_to_2(log_ptr, page_offset(field)); log_ptr += 2; - mach_write_to_2(log_ptr, externs - page_zip->data); + mach_write_to_2(log_ptr, ulint(externs - page_zip->data)); log_ptr += 2; memcpy(log_ptr, externs, BTR_EXTERN_FIELD_REF_SIZE); log_ptr += BTR_EXTERN_FIELD_REF_SIZE; @@ -4017,8 +4022,8 @@ page_zip_parse_write_node_ptr( z_offset = mach_read_from_2(ptr + 2); if (offset < PAGE_ZIP_START - || offset >= UNIV_PAGE_SIZE - || z_offset >= UNIV_PAGE_SIZE) { + || offset >= srv_page_size + || z_offset >= srv_page_size) { corrupt: recv_sys->found_corrupt_log = TRUE; @@ -4045,7 +4050,7 @@ corrupt: storage_end = page_zip_dir_start(page_zip); - heap_no = 1 + (storage_end - storage) / REC_NODE_PTR_SIZE; + heap_no = 1 + ulint(storage_end - storage) / REC_NODE_PTR_SIZE; if (UNIV_UNLIKELY((storage_end - storage) % REC_NODE_PTR_SIZE) || UNIV_UNLIKELY(heap_no < PAGE_HEAP_NO_USER_LOW) @@ -4103,9 +4108,7 @@ page_zip_write_node_ptr( #if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG ut_a(!memcmp(storage, field, REC_NODE_PTR_SIZE)); #endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ -#if REC_NODE_PTR_SIZE != 4 -# error "REC_NODE_PTR_SIZE != 4" -#endif + compile_time_assert(REC_NODE_PTR_SIZE == 4); mach_write_to_4(field, ptr); memcpy(storage, field, REC_NODE_PTR_SIZE); @@ -4120,7 +4123,7 @@ page_zip_write_node_ptr( field, MLOG_ZIP_WRITE_NODE_PTR, log_ptr, mtr); mach_write_to_2(log_ptr, page_offset(field)); log_ptr += 2; - mach_write_to_2(log_ptr, storage - page_zip->data); + mach_write_to_2(log_ptr, ulint(storage - page_zip->data)); log_ptr += 2; memcpy(log_ptr, field, REC_NODE_PTR_SIZE); log_ptr += REC_NODE_PTR_SIZE; @@ -4128,17 +4131,23 @@ page_zip_write_node_ptr( } } -/**********************************************************************//** -Write the trx_id and roll_ptr of a record on a B-tree leaf node page. */ +/** Write the DB_TRX_ID,DB_ROLL_PTR into a clustered index leaf page record. 
+@param[in,out] page_zip compressed page +@param[in,out] rec record +@param[in] offsets rec_get_offsets(rec, index) +@param[in] trx_id_field field number of DB_TRX_ID (number of PK fields) +@param[in] trx_id DB_TRX_ID value (transaction identifier) +@param[in] roll_ptr DB_ROLL_PTR value (undo log pointer) +@param[in,out] mtr mini-transaction, or NULL to skip logging */ void page_zip_write_trx_id_and_roll_ptr( -/*===============================*/ - page_zip_des_t* page_zip,/*!< in/out: compressed page */ - byte* rec, /*!< in/out: record */ - const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */ - ulint trx_id_col,/*!< in: column number of TRX_ID in rec */ - trx_id_t trx_id, /*!< in: transaction identifier */ - roll_ptr_t roll_ptr)/*!< in: roll_ptr */ + page_zip_des_t* page_zip, + byte* rec, + const rec_offs* offsets, + ulint trx_id_col, + trx_id_t trx_id, + roll_ptr_t roll_ptr, + mtr_t* mtr) { byte* field; byte* storage; @@ -4165,9 +4174,7 @@ page_zip_write_trx_id_and_roll_ptr( - (rec_get_heap_no_new(rec) - 1) * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); -#if DATA_TRX_ID + 1 != DATA_ROLL_PTR -# error "DATA_TRX_ID + 1 != DATA_ROLL_PTR" -#endif + compile_time_assert(DATA_TRX_ID + 1 == DATA_ROLL_PTR); field = rec_get_nth_field(rec, offsets, trx_id_col, &len); ut_ad(len == DATA_TRX_ID_LEN); ut_ad(field + DATA_TRX_ID_LEN @@ -4176,13 +4183,9 @@ page_zip_write_trx_id_and_roll_ptr( #if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG ut_a(!memcmp(storage, field, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)); #endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ -#if DATA_TRX_ID_LEN != 6 -# error "DATA_TRX_ID_LEN != 6" -#endif + compile_time_assert(DATA_TRX_ID_LEN == 6); mach_write_to_6(field, trx_id); -#if DATA_ROLL_PTR_LEN != 7 -# error "DATA_ROLL_PTR_LEN != 7" -#endif + compile_time_assert(DATA_ROLL_PTR_LEN == 7); mach_write_to_7(field + DATA_TRX_ID_LEN, roll_ptr); memcpy(storage, field, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); @@ -4190,6 +4193,83 @@ page_zip_write_trx_id_and_roll_ptr( MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets), rec_offs_extra_size(offsets)); MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip)); + + if (mtr) { + byte* log_ptr = mlog_open( + mtr, 11 + 2 + 2 + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + if (UNIV_UNLIKELY(!log_ptr)) { + return; + } + + log_ptr = mlog_write_initial_log_record_fast( + (byte*) field, MLOG_ZIP_WRITE_TRX_ID, log_ptr, mtr); + mach_write_to_2(log_ptr, page_offset(field)); + log_ptr += 2; + mach_write_to_2(log_ptr, ulint(storage - page_zip->data)); + log_ptr += 2; + memcpy(log_ptr, field, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + log_ptr += DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; + mlog_close(mtr, log_ptr); + } +} + +/** Parse a MLOG_ZIP_WRITE_TRX_ID record. 
+@param[in] ptr redo log buffer +@param[in] end_ptr end of redo log buffer +@param[in,out] page uncompressed page +@param[in,out] page_zip compressed page +@return end of log record +@retval NULL if the log record is incomplete */ +byte* +page_zip_parse_write_trx_id( + byte* ptr, + byte* end_ptr, + page_t* page, + page_zip_des_t* page_zip) +{ + byte* const end = 2 + 2 + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN + ptr; + + if (UNIV_UNLIKELY(end_ptr < end)) { + return(NULL); + } + + uint offset = mach_read_from_2(ptr); + uint z_offset = mach_read_from_2(ptr + 2); + + if (offset < PAGE_ZIP_START + || offset >= srv_page_size + || z_offset >= srv_page_size) { +corrupt: + recv_sys->found_corrupt_log = TRUE; + + return(NULL); + } + + if (page) { + if (!page_zip || !page_is_leaf(page)) { + goto corrupt; + } + +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page, NULL)); +#endif /* UNIV_ZIP_DEBUG */ + + byte* field = page + offset; + byte* storage = page_zip->data + z_offset; + + if (storage >= page_zip_dir_start(page_zip)) { + goto corrupt; + } + + memcpy(field, ptr + 4, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + memcpy(storage, ptr + 4, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page, NULL)); +#endif /* UNIV_ZIP_DEBUG */ + } + + return end; } /**********************************************************************//** @@ -4365,7 +4445,7 @@ page_zip_dir_insert( /* Read the old n_dense (n_heap may have been incremented). */ n_dense = page_dir_get_n_heap(page_zip->data) - - (PAGE_HEAP_NO_USER_LOW + 1); + - (PAGE_HEAP_NO_USER_LOW + 1U); if (UNIV_LIKELY_NULL(free_rec)) { /* The record was allocated from the free list. @@ -4392,7 +4472,7 @@ page_zip_dir_insert( /* Shift the dense directory to allocate place for rec. */ memmove(slot_free - PAGE_ZIP_DIR_SLOT_SIZE, slot_free, - slot_rec - slot_free); + ulint(slot_rec - slot_free)); /* Write the entry for the inserted record. The "owned" and "deleted" flags must be zero. */ @@ -4452,7 +4532,7 @@ page_zip_dir_delete( if (UNIV_LIKELY(slot_rec > slot_free)) { memmove(slot_free + PAGE_ZIP_DIR_SLOT_SIZE, slot_free, - slot_rec - slot_free); + ulint(slot_rec - slot_free)); } /* Write the entry for the deleted record. @@ -4465,7 +4545,7 @@ page_zip_dir_delete( } n_ext = rec_offs_n_extern(offsets); - if (UNIV_UNLIKELY(n_ext)) { + if (UNIV_UNLIKELY(n_ext != 0)) { /* Shift and zero fill the array of BLOB pointers. */ ulint blob_no; byte* externs; @@ -4485,7 +4565,7 @@ page_zip_dir_delete( page_zip->n_blobs -= static_cast<unsigned>(n_ext); /* Shift and zero fill the array. */ memmove(ext_end + n_ext * BTR_EXTERN_FIELD_REF_SIZE, ext_end, - (page_zip->n_blobs - blob_no) + ulint(page_zip->n_blobs - blob_no) * BTR_EXTERN_FIELD_REF_SIZE); memset(ext_end, 0, n_ext * BTR_EXTERN_FIELD_REF_SIZE); } @@ -4516,7 +4596,7 @@ page_zip_dir_add_slot( /* Read the old n_dense (n_heap has already been incremented). 
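The new logging in page_zip_write_trx_id_and_roll_ptr() and the parser page_zip_parse_write_trx_id() above agree on a fixed type-specific payload: a 2-byte offset of the DB_TRX_ID field within the uncompressed page, a 2-byte offset of its copy inside page_zip->data, then the 6+7 bytes of DB_TRX_ID and DB_ROLL_PTR. The sketch below decodes only that payload in plain C++; the initial log-record header that mlog_write_initial_log_record_fast() prepends (the 11 in the mlog_open() size above) is not modelled, and the struct and function names are illustrative rather than InnoDB symbols.

#include <cstddef>
#include <cstdint>
#include <cstring>

/* MLOG_ZIP_WRITE_TRX_ID payload, as written and parsed above,
   assuming the usual big-endian mach_read_from_2() encoding:
   2 bytes  offset of DB_TRX_ID in the uncompressed page
   2 bytes  offset of the copy in page_zip->data
   6 bytes  DB_TRX_ID
   7 bytes  DB_ROLL_PTR */
struct zip_write_trx_id_payload {
	uint16_t	page_offset;
	uint16_t	zip_offset;
	unsigned char	sys[6 + 7];
};

/* Decode the payload; returns false if the buffer is too short.
   The bounds checks against PAGE_ZIP_START and srv_page_size that
   the real parser performs are omitted here. */
static bool parse_zip_write_trx_id(const unsigned char* p, size_t len,
				   zip_write_trx_id_payload* out)
{
	if (len < 2 + 2 + 6 + 7) {
		return false;
	}
	out->page_offset = uint16_t(p[0] << 8 | p[1]);
	out->zip_offset = uint16_t(p[2] << 8 | p[3]);
	std::memcpy(out->sys, p + 4, sizeof out->sys);
	return true;
}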
*/ n_dense = page_dir_get_n_heap(page_zip->data) - - (PAGE_HEAP_NO_USER_LOW + 1); + - (PAGE_HEAP_NO_USER_LOW + 1U); dir = page_zip->data + page_zip_get_size(page_zip) - PAGE_ZIP_DIR_SLOT_SIZE * n_dense; @@ -4536,7 +4616,7 @@ page_zip_dir_add_slot( ASSERT_ZERO(externs - PAGE_ZIP_CLUST_LEAF_SLOT_SIZE, PAGE_ZIP_CLUST_LEAF_SLOT_SIZE); memmove(externs - PAGE_ZIP_CLUST_LEAF_SLOT_SIZE, - externs, stored - externs); + externs, ulint(stored - externs)); } else { stored = dir - page_zip->n_blobs * BTR_EXTERN_FIELD_REF_SIZE; @@ -4546,7 +4626,7 @@ page_zip_dir_add_slot( /* Move the uncompressed area backwards to make space for one directory slot. */ - memmove(stored - PAGE_ZIP_DIR_SLOT_SIZE, stored, dir - stored); + memmove(stored - PAGE_ZIP_DIR_SLOT_SIZE, stored, ulint(dir - stored)); } /***********************************************************//** @@ -4621,9 +4701,7 @@ page_zip_write_header_log( ut_ad(offset < PAGE_DATA); ut_ad(offset + length < PAGE_DATA); -#if PAGE_DATA > 255 -# error "PAGE_DATA > 255" -#endif + compile_time_assert(PAGE_DATA < 256U); ut_ad(length > 0); ut_ad(length < 256); @@ -4672,7 +4750,7 @@ page_zip_reorganize( ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); ut_ad(page_is_comp(page)); ut_ad(!dict_index_is_ibuf(index)); - ut_ad(!dict_table_is_temporary(index->table)); + ut_ad(!index->table->is_temporary()); /* Note that page_zip_validate(page_zip, page, index) may fail here. */ MEM_CHECK_DEFINED(page, srv_page_size); MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip)); @@ -4748,7 +4826,7 @@ page_zip_copy_recs( ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX)); ut_ad(mtr_memo_contains_page(mtr, src, MTR_MEMO_PAGE_X_FIX)); ut_ad(!dict_index_is_ibuf(index)); - ut_ad(!dict_table_is_temporary(index->table)); + ut_ad(!index->table->is_temporary()); #ifdef UNIV_ZIP_DEBUG /* The B-tree operations that call this function may set FIL_PAGE_PREV or PAGE_LEVEL, causing a temporary min_rec_flag @@ -4771,13 +4849,11 @@ page_zip_copy_recs( the records stored in the page. Also copy the field PAGE_MAX_TRX_ID. Skip the rest of the page header and trailer. On the compressed page, there is no trailer. */ -#if PAGE_MAX_TRX_ID + 8 != PAGE_HEADER_PRIV_END -# error "PAGE_MAX_TRX_ID + 8 != PAGE_HEADER_PRIV_END" -#endif + compile_time_assert(PAGE_MAX_TRX_ID + 8 == PAGE_HEADER_PRIV_END); memcpy(PAGE_HEADER + page, PAGE_HEADER + src, PAGE_HEADER_PRIV_END); memcpy(PAGE_DATA + page, PAGE_DATA + src, - UNIV_PAGE_SIZE - PAGE_DATA - FIL_PAGE_DATA_END); + srv_page_size - PAGE_DATA - FIL_PAGE_DATA_END); memcpy(PAGE_HEADER + page_zip->data, PAGE_HEADER + src_zip->data, PAGE_HEADER_PRIV_END); memcpy(PAGE_DATA + page_zip->data, PAGE_DATA + src_zip->data, diff --git a/storage/innobase/pars/pars0opt.cc b/storage/innobase/pars/pars0opt.cc index 4874ce610ca..0cfd93d4120 100644 --- a/storage/innobase/pars/pars0opt.cc +++ b/storage/innobase/pars/pars0opt.cc @@ -205,7 +205,7 @@ opt_look_for_col_in_comparison_before( if (opt_check_exp_determined_before(exp, sel_node, nth_table)) { - *op = search_cond->func; + *op = ulint(search_cond->func); return(exp); } @@ -224,7 +224,8 @@ opt_look_for_col_in_comparison_before( if (opt_check_exp_determined_before(exp, sel_node, nth_table)) { - *op = opt_invert_cmp_op(search_cond->func); + *op = ulint(opt_invert_cmp_op( + search_cond->func)); return(exp); } @@ -1256,7 +1257,7 @@ opt_print_query_plan( fprintf(stderr, "Index %s of table %s" "; exact m. 
%lu, match %lu, end conds %lu\n", - plan->index->name(), plan->index->table_name, + plan->index->name(), plan->index->table->name.m_name, (unsigned long) plan->n_exact_match, (unsigned long) n_fields, (unsigned long) UT_LIST_GET_LEN(plan->end_conds)); diff --git a/storage/innobase/pars/pars0pars.cc b/storage/innobase/pars/pars0pars.cc index 991762673aa..6a187787c12 100644 --- a/storage/innobase/pars/pars0pars.cc +++ b/storage/innobase/pars/pars0pars.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2019, MariaDB Corporation. +Copyright (c) 2018, 2019, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -1040,7 +1040,7 @@ pars_update_statement_start( node = upd_node_create(pars_sym_tab_global->heap); - node->is_delete = is_delete; + node->is_delete = is_delete ? PLAIN_DELETE : NO_DELETE; node->table_sym = table_sym; node->col_assign_list = col_assign_list; @@ -1205,9 +1205,9 @@ pars_update_statement( node->select = sel_node; ut_a(!node->is_delete || (node->col_assign_list == NULL)); - ut_a(node->is_delete || (node->col_assign_list != NULL)); + ut_a(node->is_delete == PLAIN_DELETE || node->col_assign_list != NULL); - if (node->is_delete) { + if (node->is_delete == PLAIN_DELETE) { node->cmpl_info = 0; } else { pars_process_assign_list(node); @@ -1748,7 +1748,7 @@ pars_column_def( ulint len2; if (len) { - len2 = eval_node_get_int_val(len); + len2 = ulint(eval_node_get_int_val(len)); } else { len2 = 0; } @@ -1783,14 +1783,15 @@ pars_create_table( n_cols = que_node_list_get_len(column_defs); table = dict_mem_table_create( - table_sym->name, 0, n_cols, 0, flags, flags2); + table_sym->name, NULL, n_cols, 0, flags, flags2); + mem_heap_t* heap = pars_sym_tab_global->heap; column = column_defs; while (column) { dtype = dfield_get_type(que_node_get_val(column)); - dict_mem_table_add_col(table, table->heap, + dict_mem_table_add_col(table, heap, column->name, dtype->mtype, dtype->prtype, dtype->len); column->resolved = TRUE; @@ -1799,8 +1800,10 @@ pars_create_table( column = static_cast<sym_node_t*>(que_node_get_next(column)); } - node = tab_create_graph_create(table, pars_sym_tab_global->heap, - FIL_ENCRYPTION_DEFAULT, FIL_DEFAULT_ENCRYPTION_KEY); + dict_table_add_system_columns(table, heap); + node = tab_create_graph_create(table, heap, + FIL_ENCRYPTION_DEFAULT, + FIL_DEFAULT_ENCRYPTION_KEY); table_sym->resolved = TRUE; table_sym->token_type = SYM_TABLE; @@ -1840,7 +1843,7 @@ pars_create_index( ind_type = ind_type | DICT_CLUSTERED; } - index = dict_mem_index_create(table_sym->name, index_sym->name, 0, + index = dict_mem_index_create(NULL, index_sym->name, ind_type, n_fields); column = column_list; @@ -1853,7 +1856,8 @@ pars_create_index( column = static_cast<sym_node_t*>(que_node_get_next(column)); } - node = ind_create_graph_create(index, pars_sym_tab_global->heap, NULL); + node = ind_create_graph_create(index, table_sym->name, + pars_sym_tab_global->heap); table_sym->resolved = TRUE; table_sym->token_type = SYM_TABLE; @@ -2204,7 +2208,7 @@ pars_info_add_int4_literal( /*=======================*/ pars_info_t* info, /*!< in: info struct */ const char* name, /*!< in: name */ - lint val) /*!< in: value */ + ulint val) /*!< in: value */ { byte* buf = static_cast<byte*>(mem_heap_alloc(info->heap, 4)); diff --git 
a/storage/innobase/que/que0que.cc b/storage/innobase/que/que0que.cc index 0b630bb1c8c..67da45a64e4 100644 --- a/storage/innobase/que/que0que.cc +++ b/storage/innobase/que/que0que.cc @@ -480,7 +480,7 @@ que_graph_free_recursive( if (upd->in_mysql_interface) { btr_pcur_free_for_mysql(upd->pcur); - upd->in_mysql_interface = FALSE; + upd->in_mysql_interface = false; } que_graph_free_recursive(upd->cascade_node); @@ -997,11 +997,6 @@ que_thr_step( } else if (type == QUE_NODE_FOR) { for_step(thr); } else if (type == QUE_NODE_PROC) { - - /* We can access trx->undo_no without reserving - trx->undo_mutex, because there cannot be active query - threads doing updating or inserting at the moment! */ - if (thr->prev_node == que_node_get_parent(node)) { trx->last_sql_stat_start.least_undo_no = trx->undo_no; @@ -1018,8 +1013,10 @@ que_thr_step( } else if (type == QUE_NODE_SELECT) { thr = row_sel_step(thr); } else if (type == QUE_NODE_INSERT) { + trx_start_if_not_started_xa(thr_get_trx(thr), true); thr = row_ins_step(thr); } else if (type == QUE_NODE_UPDATE) { + trx_start_if_not_started_xa(thr_get_trx(thr), true); thr = row_upd_step(thr); } else if (type == QUE_NODE_FETCH) { thr = fetch_step(thr); diff --git a/storage/innobase/read/read0read.cc b/storage/innobase/read/read0read.cc index 3fd52d5d6dd..e14f564e264 100644 --- a/storage/innobase/read/read0read.cc +++ b/storage/innobase/read/read0read.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2019, MariaDB Corporation. +Copyright (c) 2018, 2019, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -24,10 +24,11 @@ Cursor read Created 2/16/1997 Heikki Tuuri *******************************************************/ -#include "read0read.h" +#include "read0types.h" #include "srv0srv.h" #include "trx0sys.h" +#include "trx0purge.h" /* ------------------------------------------------------------------------------- @@ -163,8 +164,8 @@ For details see: row_vers_old_has_index_entry() and row_purge_poss_sec() Some additional issues: -What if trx_sys->view_list == NULL and some transaction T1 and Purge both -try to open read_view at same time. Only one can acquire trx_sys->mutex. +What if trx_sys.view_list == NULL and some transaction T1 and Purge both +try to open read_view at same time. Only one can acquire trx_sys.mutex. In which order will the views be opened? Should it matter? If no, why? The order does not matter. No new transactions can be created and no running @@ -172,617 +173,124 @@ RW transaction can commit or rollback (or free views). AC-NL-RO transactions will mark their views as closed but not actually free their views. */ -/** Minimum number of elements to reserve in ReadView::ids_t */ -static const ulint MIN_TRX_IDS = 32; - -#ifdef UNIV_DEBUG -/** Functor to validate the view list. */ -struct ViewCheck { - - ViewCheck() : m_prev_view() { } - - void operator()(const ReadView* view) - { - ut_a(m_prev_view == NULL - || view->is_closed() - || view->le(m_prev_view)); - - m_prev_view = view; - } - - const ReadView* m_prev_view; -}; - -/** -Validates a read view list. */ - -bool -MVCC::validate() const -{ - ViewCheck check; - - ut_ad(mutex_own(&trx_sys->mutex)); - - ut_list_map(m_views, check); - - return(true); -} -#endif /* UNIV_DEBUG */ - -/** -Try and increase the size of the array. 
Old elements are -copied across. -@param n Make space for n elements */ - -void -ReadView::ids_t::reserve(ulint n) -{ - if (n <= capacity()) { - return; - } - - /** Keep a minimum threshold */ - if (n < MIN_TRX_IDS) { - n = MIN_TRX_IDS; - } - - value_type* p = m_ptr; - - m_ptr = UT_NEW_ARRAY_NOKEY(value_type, n); - - m_reserved = n; - - ut_ad(size() < capacity()); - - if (p != NULL) { - - ::memmove(m_ptr, p, size() * sizeof(value_type)); - - UT_DELETE_ARRAY(p); - } -} - -/** -Copy and overwrite this array contents -@param start Source array -@param end Pointer to end of array */ - -void -ReadView::ids_t::assign(const value_type* start, const value_type* end) -{ - ut_ad(end >= start); - - ulint n = end - start; - - /* No need to copy the old contents across during reserve(). */ - clear(); - - /* Create extra space if required. */ - reserve(n); - - resize(n); - - ut_ad(size() == n); - - ::memmove(m_ptr, start, size() * sizeof(value_type)); -} - -/** -Append a value to the array. -@param value the value to append */ - -void -ReadView::ids_t::push_back(value_type value) -{ - if (capacity() <= size()) { - reserve(size() * 2); - } - - m_ptr[m_size++] = value; - ut_ad(size() <= capacity()); -} - -/** -Insert the value in the correct slot, preserving the order. Doesn't -check for duplicates. */ - -void -ReadView::ids_t::insert(value_type value) -{ - ut_ad(value > 0); - - reserve(size() + 1); - - if (empty() || back() < value) { - push_back(value); - return; - } - - value_type* end = data() + size(); - value_type* ub = std::upper_bound(data(), end, value); - - if (ub == end) { - push_back(value); - } else { - ut_ad(ub < end); - - ulint n_elems = std::distance(ub, end); - ulint n = n_elems * sizeof(value_type); - - /* Note: Copying overlapped memory locations. */ - ::memmove(ub + 1, ub, n); - - *ub = value; - - resize(size() + 1); - } -} - -/** -ReadView constructor */ -ReadView::ReadView() - : - m_low_limit_id(), - m_up_limit_id(), - m_creator_trx_id(), - m_ids(), - m_low_limit_no() -{ - ut_d(::memset(&m_view_list, 0x0, sizeof(m_view_list))); -} - -/** -ReadView destructor */ -ReadView::~ReadView() -{ - // Do nothing -} - -/** Constructor -@param size Number of views to pre-allocate */ -MVCC::MVCC(ulint size) -{ - UT_LIST_INIT(m_free, &ReadView::m_view_list); - UT_LIST_INIT(m_views, &ReadView::m_view_list); - - for (ulint i = 0; i < size; ++i) { - ReadView* view = UT_NEW_NOKEY(ReadView()); - - UT_LIST_ADD_FIRST(m_free, view); - } -} - -MVCC::~MVCC() -{ - for (ReadView* view = UT_LIST_GET_FIRST(m_free); - view != NULL; - view = UT_LIST_GET_FIRST(m_free)) { - - UT_LIST_REMOVE(m_free, view); - - UT_DELETE(view); - } - - ut_a(UT_LIST_GET_LEN(m_views) == 0); -} - -/** -Copy the transaction ids from the source vector */ - -void -ReadView::copy_trx_ids(const trx_ids_t& trx_ids) -{ - ulint size = trx_ids.size(); - - if (m_creator_trx_id > 0) { - ut_ad(size > 0); - --size; - } - - if (size == 0) { - m_ids.clear(); - return; - } - - m_ids.reserve(size); - m_ids.resize(size); - - ids_t::value_type* p = m_ids.data(); - - /* Copy all the trx_ids except the creator trx id */ - - if (m_creator_trx_id > 0) { - - /* Note: We go through all this trouble because it is - unclear whether std::vector::resize() will cause an - overhead or not. We should test this extensively and - if the vector to vector copy is fast enough then get - rid of this code and replace it with more readable - and obvious code. The code below does exactly one copy, - and filters out the creator's trx id. 
*/ - - trx_ids_t::const_iterator it = std::lower_bound( - trx_ids.begin(), trx_ids.end(), m_creator_trx_id); - - ut_ad(it != trx_ids.end() && *it == m_creator_trx_id); - - ulint i = std::distance(trx_ids.begin(), it); - ulint n = i * sizeof(trx_ids_t::value_type); - - ::memmove(p, &trx_ids[0], n); - - n = (trx_ids.size() - i - 1) * sizeof(trx_ids_t::value_type); - - ut_ad(i + (n / sizeof(trx_ids_t::value_type)) == m_ids.size()); - - if (n > 0) { - ::memmove(p + i, &trx_ids[i + 1], n); - } - } else { - ulint n = size * sizeof(trx_ids_t::value_type); - - ::memmove(p, &trx_ids[0], n); - } - -#ifdef UNIV_DEBUG - /* Assert that all transaction ids in list are active. */ - for (trx_ids_t::const_iterator it = trx_ids.begin(); - it != trx_ids.end(); ++it) { - - trx_t* trx = trx_get_rw_trx_by_id(*it); - ut_ad(trx != NULL); - switch (trx->state) { - case TRX_STATE_ACTIVE: - case TRX_STATE_PREPARED: - case TRX_STATE_PREPARED_RECOVERED: - case TRX_STATE_COMMITTED_IN_MEMORY: - continue; - case TRX_STATE_NOT_STARTED: - break; - } - ut_ad(!"invalid state"); - } -#endif /* UNIV_DEBUG */ -} - -/** -Opens a read view where exactly the transactions serialized before this -point in time are seen in the view. -@param id Creator transaction id */ - -void -ReadView::prepare(trx_id_t id) -{ - ut_ad(mutex_own(&trx_sys->mutex)); - - m_creator_trx_id = id; - - m_low_limit_no = m_low_limit_id = trx_sys->max_trx_id; - - if (!trx_sys->rw_trx_ids.empty()) { - copy_trx_ids(trx_sys->rw_trx_ids); - } else { - m_ids.clear(); - } - - if (UT_LIST_GET_LEN(trx_sys->serialisation_list) > 0) { - const trx_t* trx; - - trx = UT_LIST_GET_FIRST(trx_sys->serialisation_list); - - if (trx->no < m_low_limit_no) { - m_low_limit_no = trx->no; - } - } -} - -/** -Complete the read view creation */ - -void -ReadView::complete() -{ - /* The first active transaction has the smallest id. */ - m_up_limit_id = !m_ids.empty() ? m_ids.front() : m_low_limit_id; - - ut_ad(m_up_limit_id <= m_low_limit_id); - - m_closed = false; -} - -/** -Find a free view from the active list, if none found then allocate -a new view. -@return a view to use */ - -ReadView* -MVCC::get_view() -{ - ut_ad(mutex_own(&trx_sys->mutex)); - - ReadView* view; - - if (UT_LIST_GET_LEN(m_free) > 0) { - view = UT_LIST_GET_FIRST(m_free); - UT_LIST_REMOVE(m_free, view); - } else { - view = UT_NEW_NOKEY(ReadView()); - - if (view == NULL) { - ib::error() << "Failed to allocate MVCC view"; - } - } - - return(view); -} - -/** -Release a view that is inactive but not closed. Caller must own -the trx_sys_t::mutex. -@param view View to release */ -void -MVCC::view_release(ReadView*& view) -{ - ut_ad(!srv_read_only_mode); - ut_ad(trx_sys_mutex_own()); - - uintptr_t p = reinterpret_cast<uintptr_t>(view); - - ut_a(p & 0x1); - - view = reinterpret_cast<ReadView*>(p & ~1); - - ut_ad(view->m_closed); - - /** RW transactions should not free their views here. Their views - should freed using view_close_view() */ - - ut_ad(view->m_creator_trx_id == 0); - - UT_LIST_REMOVE(m_views, view); - - UT_LIST_ADD_LAST(m_free, view); - - view = NULL; -} - -/** -Allocate and create a view. -@param view view owned by this class created for the - caller. Must be freed by calling view_close() -@param trx transaction instance of caller */ -void -MVCC::view_open(ReadView*& view, trx_t* trx) -{ - ut_ad(!srv_read_only_mode); - - /** If no new RW transaction has been started since the last view - was created then reuse the the existing view. 
*/ - if (view != NULL) { - - uintptr_t p = reinterpret_cast<uintptr_t>(view); - - view = reinterpret_cast<ReadView*>(p & ~1); - - ut_ad(view->m_closed); - - /* NOTE: This can be optimised further, for now we only - resuse the view iff there are no active RW transactions. - - There is an inherent race here between purge and this - thread. Purge will skip views that are marked as closed. - Therefore we must set the low limit id after we reset the - closed status after the check. */ - - if (trx_is_autocommit_non_locking(trx) && view->empty()) { - - view->m_closed = false; - - if (view->m_low_limit_id == trx_sys_get_max_trx_id()) { - return; - } else { - view->m_closed = true; - } - } - - mutex_enter(&trx_sys->mutex); - - UT_LIST_REMOVE(m_views, view); - - } else { - mutex_enter(&trx_sys->mutex); - - view = get_view(); - } - - if (view != NULL) { - - view->prepare(trx->id); - - view->complete(); - - UT_LIST_ADD_FIRST(m_views, view); - - ut_ad(!view->is_closed()); - - ut_ad(validate()); - } - - trx_sys_mutex_exit(); -} /** -Get the oldest (active) view in the system. -@return oldest view if found or NULL */ + Creates a snapshot where exactly the transactions serialized before this + point in time are seen in the view. -ReadView* -MVCC::get_oldest_view() const + @param[in,out] trx transaction +*/ +inline void ReadView::snapshot(trx_t *trx) { - ReadView* view; - - ut_ad(mutex_own(&trx_sys->mutex)); - - for (view = UT_LIST_GET_LAST(m_views); - view != NULL; - view = UT_LIST_GET_PREV(m_view_list, view)) { - - if (!view->is_closed()) { - break; - } - } - - return(view); + trx_sys.snapshot_ids(trx, &m_ids, &m_low_limit_id, &m_low_limit_no); + std::sort(m_ids.begin(), m_ids.end()); + m_up_limit_id= m_ids.empty() ? m_low_limit_id : m_ids.front(); + ut_ad(m_up_limit_id <= m_low_limit_id); } -/** -Copy state from another view. Must call copy_complete() to finish. -@param other view to copy from */ - -void -ReadView::copy_prepare(const ReadView& other) -{ - ut_ad(&other != this); - - if (!other.m_ids.empty()) { - const ids_t::value_type* p = other.m_ids.data(); - - m_ids.assign(p, p + other.m_ids.size()); - } else { - m_ids.clear(); - } - - m_up_limit_id = other.m_up_limit_id; - - m_low_limit_no = other.m_low_limit_no; - - m_low_limit_id = other.m_low_limit_id; - - m_creator_trx_id = other.m_creator_trx_id; -} /** -Complete the copy, insert the creator transaction id into the -m_ids too and adjust the m_up_limit_id, if required */ - -void -ReadView::copy_complete() -{ - ut_ad(!trx_sys_mutex_own()); - - if (m_creator_trx_id > 0) { - m_ids.insert(m_creator_trx_id); - } - - if (!m_ids.empty()) { - /* The last active transaction has the smallest id. */ - m_up_limit_id = std::min(m_ids.front(), m_up_limit_id); - } - - ut_ad(m_up_limit_id <= m_low_limit_id); - - /* We added the creator transaction ID to the m_ids. */ - m_creator_trx_id = 0; -} - -/** Clones the oldest view and stores it in view. No need to -call view_close(). The caller owns the view that is passed in. -This function is called by Purge to determine whether it should -purge the delete marked record or not. -@param view Preallocated view, owned by the caller */ - -void -MVCC::clone_oldest_view(ReadView* view) -{ - mutex_enter(&trx_sys->mutex); - - ReadView* oldest_view = get_oldest_view(); - - if (oldest_view == NULL) { + Opens a read view where exactly the transactions serialized before this + point in time are seen in the view. - view->prepare(0); + View becomes visible to purge thread. 
- trx_sys_mutex_exit(); - - view->complete(); - - } else { - view->copy_prepare(*oldest_view); - - trx_sys_mutex_exit(); - - view->copy_complete(); - } -} - -/** -@return the number of active views */ - -ulint -MVCC::size() const + @param[in,out] trx transaction +*/ +void ReadView::open(trx_t *trx) { - trx_sys_mutex_enter(); - - ulint size = 0; - - for (const ReadView* view = UT_LIST_GET_FIRST(m_views); - view != NULL; - view = UT_LIST_GET_NEXT(m_view_list, view)) { - - if (!view->is_closed()) { - ++size; - } - } - - trx_sys_mutex_exit(); - - return(size); + ut_ad(this == &trx->read_view); + switch (m_state) + { + case READ_VIEW_STATE_OPEN: + ut_ad(!srv_read_only_mode); + return; + case READ_VIEW_STATE_CLOSED: + if (srv_read_only_mode) + return; + /* + Reuse closed view if there were no read-write transactions since (and at) + its creation time. + + Original comment states: there is an inherent race here between purge + and this thread. + + To avoid this race we should've checked trx_sys.get_max_trx_id() and + set state to READ_VIEW_STATE_OPEN atomically under trx_sys.mutex + protection. But we're cutting edges to achieve great scalability. + + There're at least two types of concurrent threads interested in this + value: purge coordinator thread (see trx_sys_t::clone_oldest_view()) and + InnoDB monitor thread (see lock_trx_print_wait_and_mvcc_state()). + + What bad things can happen because we allow this race? + + Speculative execution may reorder state change before get_max_trx_id(). + In this case purge thread has short gap to clone outdated view. Which is + probably not that bad: it just won't be able to purge things that it was + actually allowed to purge for a short while. + + This thread may as well get suspended after trx_sys.get_max_trx_id() and + before state is set to READ_VIEW_STATE_OPEN. New read-write transaction + may get started, committed and purged meanwhile. It is acceptable as + well, since this view doesn't see it. + */ + if (trx_is_autocommit_non_locking(trx) && m_ids.empty() && + m_low_limit_id == trx_sys.get_max_trx_id()) + goto reopen; + + /* + Can't reuse view, take new snapshot. + + Alas this empty critical section is simplest way to make sure concurrent + purge thread completed snapshot copy. Of course purge thread may come + again and try to copy once again after we release this mutex, but in + this case it is guaranteed to see READ_VIEW_STATE_REGISTERED and thus + it'll skip this view. + + This critical section can be replaced with new state, which purge thread + would set to inform us to wait until it completes snapshot. However it'd + complicate m_state even further. + */ + mutex_enter(&trx_sys.mutex); + mutex_exit(&trx_sys.mutex); + my_atomic_store32_explicit(&m_state, READ_VIEW_STATE_SNAPSHOT, + MY_MEMORY_ORDER_RELAXED); + break; + default: + ut_ad(0); + } + + snapshot(trx); +reopen: + m_creator_trx_id= trx->id; + my_atomic_store32_explicit(&m_state, READ_VIEW_STATE_OPEN, + MY_MEMORY_ORDER_RELEASE); } -/** -Close a view created by the above function. -@param view view allocated by view_open() -@param own_mutex whether the caller owns trx_sys_t::mutex */ -void MVCC::view_close(ReadView*& view, bool own_mutex) -{ - uintptr_t p = reinterpret_cast<uintptr_t>(view); - - /* Note: The assumption here is that AC-NL-RO transactions will - call this function with own_mutex == false. */ - if (!own_mutex) { - /* Sanitise the pointer first. */ - ReadView* ptr = reinterpret_cast<ReadView*>(p & ~1); - - /* Note this can be called for a read view that - was already closed. 
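ReadView::snapshot() above captures three things: the sorted array m_ids of transactions that were active at snapshot time, m_low_limit_id (the maximum transaction id at that moment, which the reuse check compares against trx_sys.get_max_trx_id()), and m_up_limit_id (the smallest active id, or the low limit when none are active). The sketch below shows roughly the visibility rule such a snapshot implies; it is an illustration of the invariant only, not the actual ReadView member, which also considers the creator transaction id set in open() above.

#include <algorithm>
#include <cstdint>
#include <vector>

typedef uint64_t trx_id_t;	/* assumed stand-in for InnoDB's trx_id_t */

/* Changes of a transaction are visible to the view if it committed
   before the oldest active transaction, invisible if it had not yet
   started at snapshot time, and otherwise visible unless it was still
   active when the snapshot was taken. */
static bool changes_visible_sketch(trx_id_t id,
				   trx_id_t up_limit_id,
				   trx_id_t low_limit_id,
				   const std::vector<trx_id_t>& ids /* sorted */)
{
	if (id < up_limit_id) {
		return true;
	}
	if (id >= low_limit_id) {
		return false;
	}
	return !std::binary_search(ids.begin(), ids.end(), id);
}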
*/ - ptr->m_closed = true; - - /* Set the view as closed. */ - view = reinterpret_cast<ReadView*>(p | 0x1); - } else { - view = reinterpret_cast<ReadView*>(p & ~1); - - view->close(); - - UT_LIST_REMOVE(m_views, view); - UT_LIST_ADD_LAST(m_free, view); - - ut_ad(validate()); - - view = NULL; - } -} /** -Set the view creator transaction id. Note: This shouldbe set only -for views created by RW transactions. -@param view Set the creator trx id for this view -@param id Transaction id to set */ + Clones the oldest view and stores it in view. -void -MVCC::set_view_creator_trx_id(ReadView* view, trx_id_t id) + No need to call ReadView::close(). The caller owns the view that is passed + in. This function is called by purge thread to determine whether it should + purge the delete marked record or not. +*/ +void trx_sys_t::clone_oldest_view() { - ut_ad(id > 0); - ut_ad(mutex_own(&trx_sys->mutex)); - - view->creator_trx_id(id); + purge_sys.view.snapshot(0); + mutex_enter(&mutex); + /* Find oldest view. */ + for (const trx_t *trx= UT_LIST_GET_FIRST(trx_list); trx; + trx= UT_LIST_GET_NEXT(trx_list, trx)) + { + int32_t state; + + while ((state= trx->read_view.get_state()) == READ_VIEW_STATE_SNAPSHOT) + ut_delay(1); + + if (state == READ_VIEW_STATE_OPEN) + purge_sys.view.copy(trx->read_view); + } + mutex_exit(&mutex); } diff --git a/storage/innobase/rem/rem0cmp.cc b/storage/innobase/rem/rem0cmp.cc index ed44e91711c..35cd24f06d3 100644 --- a/storage/innobase/rem/rem0cmp.cc +++ b/storage/innobase/rem/rem0cmp.cc @@ -224,7 +224,6 @@ static int cmp_geometry_field( /*===============*/ - ulint mtype, /*!< in: main type */ ulint prtype, /*!< in: precise type */ const byte* a, /*!< in: data field */ unsigned int a_length, /*!< in: data field length, @@ -298,12 +297,10 @@ cmp_gis_field( not UNIV_SQL_NULL */ { if (mode == PAGE_CUR_MBR_EQUAL) { - /* TODO: Since the DATA_GEOMETRY is not used in compare - function, we could pass it instead of a specific type now */ - return(cmp_geometry_field(DATA_GEOMETRY, DATA_GIS_MBR, - a, a_length, b, b_length)); + return cmp_geometry_field(DATA_GIS_MBR, + a, a_length, b, b_length); } else { - return(rtree_key_cmp(mode, a, a_length, b, b_length)); + return rtree_key_cmp(mode, a, int(a_length), b, int(b_length)); } } @@ -374,8 +371,7 @@ cmp_whole_field( return(innobase_mysql_cmp(prtype, a, a_length, b, b_length)); case DATA_GEOMETRY: - return(cmp_geometry_field(mtype, prtype, a, a_length, b, - b_length)); + return cmp_geometry_field(prtype, a, a_length, b, b_length); default: ib::fatal() << "Unknown data type number " << mtype; } @@ -404,6 +400,9 @@ cmp_data( const byte* data2, ulint len2) { + ut_ad(len1 != UNIV_SQL_DEFAULT); + ut_ad(len2 != UNIV_SQL_DEFAULT); + if (len1 == UNIV_SQL_NULL || len2 == UNIV_SQL_NULL) { if (len1 == len2) { return(0); @@ -703,6 +702,11 @@ cmp_dtuple_rec_with_match_low( contain externally stored fields, and the first fields (primary key fields) should already differ. */ ut_ad(!rec_offs_nth_extern(offsets, cur_field)); + /* We should never compare against instantly added columns. + Columns can only be instantly added to clustered index + leaf page records, and the first fields (primary key fields) + should already differ. 
*/ + ut_ad(!rec_offs_nth_default(offsets, cur_field)); rec_b_ptr = rec_get_nth_field(rec, offsets, cur_field, &rec_f_len); @@ -785,20 +789,23 @@ cmp_dtuple_rec_with_match_bytes( ulint* matched_fields, ulint* matched_bytes) { - ulint n_cmp = dtuple_get_n_fields_cmp(dtuple); - ulint cur_field; /* current field number */ - ulint cur_bytes; - int ret; /* return value */ - ut_ad(dtuple_check_typed(dtuple)); ut_ad(rec_offs_validate(rec, index, offsets)); ut_ad(!(REC_INFO_MIN_REC_FLAG & dtuple_get_info_bits(dtuple))); - ut_ad(!(REC_INFO_MIN_REC_FLAG - & rec_get_info_bits(rec, rec_offs_comp(offsets)))); - cur_field = *matched_fields; - cur_bytes = *matched_bytes; + if (UNIV_UNLIKELY(REC_INFO_MIN_REC_FLAG + & rec_get_info_bits(rec, rec_offs_comp(offsets)))) { + ut_ad(page_rec_is_first(rec, page_align(rec))); + ut_ad(!page_has_prev(page_align(rec))); + ut_ad(rec_is_metadata(rec, index)); + return 1; + } + + ulint cur_field = *matched_fields; + ulint cur_bytes = *matched_bytes; + ulint n_cmp = dtuple_get_n_fields_cmp(dtuple); + int ret; ut_ad(n_cmp <= dtuple_get_n_fields(dtuple)); ut_ad(cur_field <= n_cmp); @@ -818,6 +825,8 @@ cmp_dtuple_rec_with_match_bytes( dtuple_b_ptr = static_cast<const byte*>( dfield_get_data(dfield)); + + ut_ad(!rec_offs_nth_default(offsets, cur_field)); rec_b_ptr = rec_get_nth_field(rec, offsets, cur_field, &rec_f_len); ut_ad(!rec_offs_nth_extern(offsets, cur_field)); @@ -1139,10 +1148,9 @@ cmp_rec_rec( /* Test if rec is the predefined minimum record */ if (UNIV_UNLIKELY(rec_get_info_bits(rec1, comp) & REC_INFO_MIN_REC_FLAG)) { - /* There should only be one such record. */ - ut_ad(!(rec_get_info_bits(rec2, comp) - & REC_INFO_MIN_REC_FLAG)); - ret = -1; + ret = UNIV_UNLIKELY(rec_get_info_bits(rec2, comp) + & REC_INFO_MIN_REC_FLAG) + ? 0 : -1; goto order_resolved; } else if (UNIV_UNLIKELY (rec_get_info_bits(rec2, comp) @@ -1196,6 +1204,8 @@ cmp_rec_rec( DB_ROLL_PTR, and any externally stored columns. */ ut_ad(!rec_offs_nth_extern(offsets1, cur_field)); ut_ad(!rec_offs_nth_extern(offsets2, cur_field)); + ut_ad(!rec_offs_nth_default(offsets1, cur_field)); + ut_ad(!rec_offs_nth_default(offsets2, cur_field)); rec1_b_ptr = rec_get_nth_field(rec1, offsets1, cur_field, &rec1_f_len); diff --git a/storage/innobase/rem/rem0rec.cc b/storage/innobase/rem/rem0rec.cc index fa6a6c738e6..221e1f46b4f 100644 --- a/storage/innobase/rem/rem0rec.cc +++ b/storage/innobase/rem/rem0rec.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2019, MariaDB Corporation. +Copyright (c) 2017, 2021, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -29,6 +29,7 @@ Created 5/30/1994 Heikki Tuuri #include "mtr0log.h" #include "fts0fts.h" #include "trx0sys.h" +#include "row0log.h" /* PHYSICAL RECORD (OLD STYLE) =========================== @@ -166,7 +167,10 @@ rec_get_n_extern_new( ulint i; ut_ad(dict_table_is_comp(index->table)); - ut_ad(rec_get_status(rec) == REC_STATUS_ORDINARY); + ut_ad(!index->table->supports_instant() || index->is_dummy); + ut_ad(!index->is_instant()); + ut_ad(rec_get_status(rec) == REC_STATUS_ORDINARY + || rec_get_status(rec) == REC_STATUS_COLUMNS_ADDED); ut_ad(n == ULINT_UNDEFINED || n <= dict_index_get_n_fields(index)); if (n == ULINT_UNDEFINED) { @@ -228,50 +232,123 @@ rec_get_n_extern_new( return(n_extern); } -/******************************************************//** -Determine the offset to each field in a leaf-page record -in ROW_FORMAT=COMPACT. This is a special case of -rec_init_offsets() and rec_get_offsets_func(). */ -UNIV_INLINE MY_ATTRIBUTE((nonnull)) +/** Get the added field count in a REC_STATUS_COLUMNS_ADDED record. +@param[in,out] header variable header of a REC_STATUS_COLUMNS_ADDED record +@return number of added fields */ +static inline unsigned rec_get_n_add_field(const byte*& header) +{ + unsigned n_fields_add = *--header; + if (n_fields_add < 0x80) { + ut_ad(rec_get_n_add_field_len(n_fields_add) == 1); + return n_fields_add; + } + + n_fields_add &= 0x7f; + n_fields_add |= unsigned(*--header) << 7; + ut_ad(n_fields_add < REC_MAX_N_FIELDS); + ut_ad(rec_get_n_add_field_len(n_fields_add) == 2); + return n_fields_add; +} + +/** Format of a leaf-page ROW_FORMAT!=REDUNDANT record */ +enum rec_leaf_format { + /** Temporary file record */ + REC_LEAF_TEMP, + /** Temporary file record, with added columns + (REC_STATUS_COLUMNS_ADDED) */ + REC_LEAF_TEMP_COLUMNS_ADDED, + /** Normal (REC_STATUS_ORDINARY) */ + REC_LEAF_ORDINARY, + /** With added columns (REC_STATUS_COLUMNS_ADDED) */ + REC_LEAF_COLUMNS_ADDED +}; + +/** Determine the offset to each field in a leaf-page record +in ROW_FORMAT=COMPACT,DYNAMIC,COMPRESSED. +This is a special case of rec_init_offsets() and rec_get_offsets_func(). +@param[in] rec leaf-page record +@param[in] index the index that the record belongs in +@param[in] n_core number of core fields (index->n_core_fields) +@param[in] def_val default values for non-core fields, or + NULL to refer to index->fields[].col->def_val +@param[in,out] offsets offsets, with valid rec_offs_n_fields(offsets) +@param[in] format record format */ +static inline void rec_init_offsets_comp_ordinary( -/*===========================*/ - const rec_t* rec, /*!< in: physical record in - ROW_FORMAT=COMPACT */ - bool temp, /*!< in: whether to use the - format for temporary files in - index creation */ - const dict_index_t* index, /*!< in: record descriptor */ - rec_offs* offsets)/*!< in/out: array of offsets; - in: n=rec_offs_n_fields(offsets) */ + const rec_t* rec, + const dict_index_t* index, + rec_offs* offsets, + ulint n_core, + const dict_col_t::def_t*def_val, + rec_leaf_format format) { - ulint i = 0; rec_offs offs = 0; - rec_offs any_ext = 0; - ulint n_null = index->n_nullable; - const byte* nulls = temp - ? 
rec - 1 - : rec - (1 + REC_N_NEW_EXTRA_BYTES); - const byte* lens = nulls - UT_BITS_IN_BYTES(n_null); + rec_offs any = 0; + const byte* nulls = rec; + const byte* lens = NULL; + ulint n_fields = n_core; ulint null_mask = 1; + ut_ad(n_core > 0); + ut_ad(index->n_core_fields >= n_core); + ut_ad(index->n_fields >= index->n_core_fields); + ut_ad(index->n_core_null_bytes <= UT_BITS_IN_BYTES(index->n_nullable)); + ut_ad(format == REC_LEAF_TEMP || format == REC_LEAF_TEMP_COLUMNS_ADDED + || dict_table_is_comp(index->table)); + ut_ad(format != REC_LEAF_TEMP_COLUMNS_ADDED + || index->n_fields == rec_offs_n_fields(offsets)); + ut_d(ulint n_null= 0); + + const unsigned n_core_null_bytes = UNIV_UNLIKELY(index->n_core_fields + != n_core) + ? UT_BITS_IN_BYTES(unsigned(index->get_n_nullable(n_core))) + : index->n_core_null_bytes; + + switch (format) { + case REC_LEAF_TEMP: + if (dict_table_is_comp(index->table)) { + /* No need to do adjust fixed_len=0. We only need to + adjust it for ROW_FORMAT=REDUNDANT. */ + format = REC_LEAF_ORDINARY; + } + goto ordinary; + case REC_LEAF_ORDINARY: + nulls -= REC_N_NEW_EXTRA_BYTES; +ordinary: + lens = --nulls - n_core_null_bytes; + + ut_d(n_null = std::min(n_core_null_bytes * 8U, + index->n_nullable)); + break; + case REC_LEAF_COLUMNS_ADDED: + /* We would have !index->is_instant() when rolling back + an instant ADD COLUMN operation. */ + nulls -= REC_N_NEW_EXTRA_BYTES; + ut_ad(index->is_instant()); + /* fall through */ + case REC_LEAF_TEMP_COLUMNS_ADDED: + n_fields = n_core + 1 + rec_get_n_add_field(nulls); + ut_ad(n_fields <= index->n_fields); + const ulint n_nullable = index->get_n_nullable(n_fields); + const ulint n_null_bytes = UT_BITS_IN_BYTES(n_nullable); + ut_d(n_null = n_nullable); + ut_ad(n_null <= index->n_nullable); + ut_ad(n_null_bytes >= n_core_null_bytes + || n_core < index->n_core_fields); + lens = --nulls - n_null_bytes; + } + #ifdef UNIV_DEBUG - /* We cannot invoke rec_offs_make_valid() here if temp=true. + /* We cannot invoke rec_offs_make_valid() if format==REC_LEAF_TEMP. Similarly, rec_offs_validate() will fail in that case, because it invokes rec_get_status(). */ memcpy(&offsets[RECORD_OFFSET], &rec, sizeof(rec)); memcpy(&offsets[INDEX_OFFSET], &index, sizeof(index)); #endif /* UNIV_DEBUG */ - ut_ad(temp || dict_table_is_comp(index->table)); - - if (temp && dict_table_is_comp(index->table)) { - /* No need to do adjust fixed_len=0. We only need to - adjust it for ROW_FORMAT=REDUNDANT. 
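The rec_get_n_add_field() helper above (and the rec_set_n_add_field() call used when a REC_STATUS_COLUMNS_ADDED record is built) stores the added-field count in the variable-size record header: one byte for values 0..127, otherwise two bytes with the low seven bits plus a 0x80 flag in the byte nearest the record and the high bits in the next one. Below is a stand-alone encode/decode sketch of that scheme, using an explicit end-of-header pointer instead of the reference parameter the real helper takes; the encoder is only what the shown decoder requires, not a copy of the InnoDB function.

/* Decode the added-field count stored just before header_end.
   Mirrors rec_get_n_add_field() above; *bytes_used reports how many
   header bytes the value occupied. */
static unsigned decode_n_add(const unsigned char* header_end,
			     unsigned* bytes_used)
{
	unsigned n = *--header_end;
	if (n < 0x80) {
		*bytes_used = 1;
		return n;
	}
	n &= 0x7f;
	n |= unsigned(*--header_end) << 7;
	*bytes_used = 2;
	return n;
}

/* Matching encoder: the byte nearest the record carries the low
   7 bits plus the 0x80 continuation flag, the next byte the rest. */
static unsigned encode_n_add(unsigned char* header_end, unsigned n)
{
	if (n < 0x80) {
		*--header_end = (unsigned char) n;
		return 1;
	}
	*--header_end = (unsigned char) (0x80 | (n & 0x7f));
	*--header_end = (unsigned char) (n >> 7);
	return 2;
}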
*/ - temp = false; - } - - /* read the lengths of fields 0..n */ + /* read the lengths of fields 0..n_fields */ + ulint i = 0; do { const dict_field_t* field = dict_index_get_nth_field(index, i); @@ -279,6 +356,32 @@ rec_init_offsets_comp_ordinary( = dict_field_get_col(field); rec_offs len; + /* set default value flag */ + if (i < n_fields) { + } else if (def_val) { + const dict_col_t::def_t& d = def_val[i - n_core]; + if (!d.data) { + len = combine(offs, SQL_NULL); + ut_ad(d.len == UNIV_SQL_NULL); + } else { + len = combine(offs, DEFAULT); + any |= REC_OFFS_DEFAULT; + } + + goto resolved; + } else { + ulint dlen; + if (!index->instant_field_value(i, &dlen)) { + len = combine(offs, SQL_NULL); + ut_ad(dlen == UNIV_SQL_NULL); + } else { + len = combine(offs, DEFAULT); + any |= REC_OFFS_DEFAULT; + } + + goto resolved; + } + if (!(col->prtype & DATA_NOT_NULL)) { /* nullable field => read the null flag */ ut_ad(n_null--); @@ -301,7 +404,8 @@ rec_init_offsets_comp_ordinary( } if (!field->fixed_len - || (temp && !dict_col_get_fixed_size(col, temp))) { + || (format == REC_LEAF_TEMP + && !dict_col_get_fixed_size(col, true))) { /* Variable-length field: read the length */ len = *lens--; /* If the maximum length of the field is up @@ -311,26 +415,21 @@ rec_init_offsets_comp_ordinary( stored in one byte for 0..127. The length will be encoded in two bytes when it is 128 or more, or when the field is stored externally. */ - if (DATA_BIG_COL(col)) { - if (len & 0x80) { - /* 1exxxxxxx xxxxxxxx */ - len <<= 8; - len |= *lens--; - - offs += get_value(len); - if (UNIV_UNLIKELY(len - & 0x4000)) { - ut_ad(dict_index_is_clust - (index)); - any_ext = REC_OFFS_EXTERNAL; - len = combine(offs, - STORED_OFFPAGE); - } else { - len = offs; - } - - goto resolved; + if ((len & 0x80) && DATA_BIG_COL(col)) { + /* 1exxxxxxx xxxxxxxx */ + len <<= 8; + len |= *lens--; + + offs += get_value(len); + if (UNIV_UNLIKELY(len & 0x4000)) { + ut_ad(dict_index_is_clust(index)); + any |= REC_OFFS_EXTERNAL; + len = combine(offs, STORED_OFFPAGE); + } else { + len = offs; } + + goto resolved; } len = offs += len; @@ -341,13 +440,118 @@ resolved: rec_offs_base(offsets)[i + 1] = len; } while (++i < rec_offs_n_fields(offsets)); - *rec_offs_base(offsets) = static_cast<rec_offs>(rec - (lens + 1)) - | REC_OFFS_COMPACT | any_ext; + *rec_offs_base(offsets) + = static_cast<rec_offs>(rec - (lens + 1)) | REC_OFFS_COMPACT | any; } -/******************************************************//** -The following function determines the offsets to each field in the -record. The offsets are written to a previously allocated array of +#ifdef UNIV_DEBUG +/** Update debug data in offsets, in order to tame rec_offs_validate(). +@param[in] rec record +@param[in] index the index that the record belongs in +@param[in] leaf whether the record resides in a leaf page +@param[in,out] offsets offsets from rec_get_offsets() to adjust */ +void +rec_offs_make_valid( + const rec_t* rec, + const dict_index_t* index, + bool leaf, + rec_offs* offsets) +{ + ut_ad(rec_offs_n_fields(offsets) + <= (leaf + ? dict_index_get_n_fields(index) + : dict_index_get_n_unique_in_tree_nonleaf(index) + 1) + || index->is_dummy || dict_index_is_ibuf(index)); + const bool is_user_rec = (dict_table_is_comp(index->table) + ? rec_get_heap_no_new(rec) + : rec_get_heap_no_old(rec)) + >= PAGE_HEAP_NO_USER_LOW; + ulint n = rec_get_n_fields(rec, index); + /* The infimum and supremum records carry 1 field. 
*/ + ut_ad(is_user_rec || n == 1); + ut_ad(is_user_rec || rec_offs_n_fields(offsets) == 1); + ut_ad(!is_user_rec + || (n + (index->id == DICT_INDEXES_ID)) >= index->n_core_fields + || n >= rec_offs_n_fields(offsets)); + for (; n < rec_offs_n_fields(offsets); n++) { + ut_ad(leaf); + ut_ad(get_type(rec_offs_base(offsets)[1 + n]) == DEFAULT); + } + memcpy(&offsets[RECORD_OFFSET], &rec, sizeof(rec)); + memcpy(&offsets[INDEX_OFFSET], &index, sizeof(index)); +} + +/** Validate offsets returned by rec_get_offsets(). +@param[in] rec record, or NULL +@param[in] index the index that the record belongs in, or NULL +@param[in,out] offsets the offsets of the record +@return true */ +bool +rec_offs_validate( + const rec_t* rec, + const dict_index_t* index, + const rec_offs* offsets) +{ + ulint i = rec_offs_n_fields(offsets); + ulint last = ULINT_MAX; + ulint comp = *rec_offs_base(offsets) & REC_OFFS_COMPACT; + + if (rec) { + ut_ad(!memcmp(&rec, &offsets[RECORD_OFFSET], sizeof(rec))); + if (!comp) { + const bool is_user_rec = rec_get_heap_no_old(rec) + >= PAGE_HEAP_NO_USER_LOW; + ulint n = rec_get_n_fields_old(rec); + /* The infimum and supremum records carry 1 field. */ + ut_ad(is_user_rec || n == 1); + ut_ad(is_user_rec || i == 1); + ut_ad(!is_user_rec || n >= i || !index + || (n + (index->id == DICT_INDEXES_ID)) + >= index->n_core_fields); + for (; n < i; n++) { + ut_ad(get_type(rec_offs_base(offsets)[1 + n]) + == DEFAULT); + } + } + } + if (index) { + ulint max_n_fields; + ut_ad(!memcmp(&index, &offsets[INDEX_OFFSET], sizeof(index))); + max_n_fields = ut_max( + dict_index_get_n_fields(index), + dict_index_get_n_unique_in_tree(index) + 1); + if (comp && rec) { + switch (rec_get_status(rec)) { + case REC_STATUS_COLUMNS_ADDED: + case REC_STATUS_ORDINARY: + break; + case REC_STATUS_NODE_PTR: + max_n_fields = dict_index_get_n_unique_in_tree( + index) + 1; + break; + case REC_STATUS_INFIMUM: + case REC_STATUS_SUPREMUM: + max_n_fields = 1; + break; + default: + ut_error; + } + } + /* index->n_def == 0 for dummy indexes if !comp */ + ut_a(!comp || index->n_def); + ut_a(!index->n_def || i <= max_n_fields); + } + while (i--) { + ulint curr = get_value(rec_offs_base(offsets)[1 + i]); + ut_a(curr <= last); + last = curr; + } + return(TRUE); +} +#endif /* UNIV_DEBUG */ + +/** Determine the offsets to each field in the record. + The offsets are written to a previously allocated array of ulint, where rec_offs_n_fields(offsets) has been initialized to the number of fields in the record. The rest of the array will be initialized by this function. rec_offs_base(offsets)[0] will be set @@ -357,27 +561,34 @@ stored columns), and rec_offs_base(offsets)[1..n_fields] will be set to offsets past the end of fields 0..n_fields, or to the beginning of fields 1..n_fields+1. When the type of the offset at [i+1] is (SQL_NULL), the field i is NULL. When the type of the offset at [i+1] -is (STORED_OFFPAGE), the field i is stored externally. */ +is (STORED_OFFPAGE), the field i is stored externally. 
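The offsets array documented above is essentially a list of end offsets: rec_offs_base(offsets)[i + 1] is the offset just past field i, with flag bits marking NULL, externally stored and default values. Ignoring those flags, a field's position falls out of the difference of neighbouring entries, as the small sketch below shows (the names are illustrative, not InnoDB functions):

#include <cstddef>
#include <vector>

struct field_slice {
	size_t	start;	/* offset of the field from the record origin */
	size_t	len;	/* length of the field data */
};

/* end[i] holds the offset just past field i, as described above;
   field 0 starts at the record origin. Flag bits are assumed
   to have been stripped from the stored values. */
static field_slice nth_field_sketch(const std::vector<size_t>& end, size_t i)
{
	size_t start = i ? end[i - 1] : 0;
	field_slice s = { start, end[i] - start };
	return s;
}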
+@param[in] rec record +@param[in] index the index that the record belongs in +@param[in] n_core 0, or index->n_core_fields for leaf page +@param[in,out] offsets array of offsets, with valid rec_offs_n_fields() */ static void rec_init_offsets( -/*=============*/ - const rec_t* rec, /*!< in: physical record */ - const dict_index_t* index, /*!< in: record descriptor */ - rec_offs* offsets)/*!< in/out: array of offsets; - in: n=rec_offs_n_fields(offsets) */ + const rec_t* rec, + const dict_index_t* index, + ulint n_core, + rec_offs* offsets) { ulint i = 0; rec_offs offs; - rec_offs_make_valid(rec, index, offsets); + ut_ad(index->n_core_null_bytes <= UT_BITS_IN_BYTES(index->n_nullable)); + ut_d(memcpy(&offsets[RECORD_OFFSET], &rec, sizeof(rec))); + ut_d(memcpy(&offsets[INDEX_OFFSET], &index, sizeof(index))); + ut_ad(index->n_fields >= n_core); + ut_ad(index->n_core_fields >= n_core); if (dict_table_is_comp(index->table)) { const byte* nulls; const byte* lens; dict_field_t* field; ulint null_mask; - ulint status = rec_get_status(rec); + rec_comp_status_t status = rec_get_status(rec); ulint n_node_ptr_field = ULINT_UNDEFINED; switch (UNIV_EXPECT(status, REC_STATUS_ORDINARY)) { @@ -389,18 +600,38 @@ rec_init_offsets( rec_offs_base(offsets)[1] = 8; return; case REC_STATUS_NODE_PTR: + ut_ad(!n_core); n_node_ptr_field = dict_index_get_n_unique_in_tree_nonleaf( index); break; + case REC_STATUS_COLUMNS_ADDED: + ut_ad(index->is_instant()); + rec_init_offsets_comp_ordinary(rec, index, offsets, + n_core, + NULL, + REC_LEAF_COLUMNS_ADDED); + return; case REC_STATUS_ORDINARY: - rec_init_offsets_comp_ordinary( - rec, false, index, offsets); + rec_init_offsets_comp_ordinary(rec, index, offsets, + n_core, + NULL, + REC_LEAF_ORDINARY); return; } + /* The n_nullable flags in the clustered index node pointer + records in ROW_FORMAT=COMPACT or ROW_FORMAT=DYNAMIC must + reflect the number of 'core columns'. These flags are + useless garbage, and they are only reserved because of + file format compatibility. + (Clustered index node pointer records only contain the + PRIMARY KEY columns, which are always NOT NULL, + so we should have used n_nullable=0.) 
*/ + ut_ad(index->n_core_fields > 0); + nulls = rec - (REC_N_NEW_EXTRA_BYTES + 1); - lens = nulls - UT_BITS_IN_BYTES(index->n_nullable); + lens = nulls - index->n_core_null_bytes; offs = 0; null_mask = 1; @@ -475,15 +706,19 @@ resolved: rec_offs_base(offsets)[i + 1] = len; } while (++i < rec_offs_n_fields(offsets)); - *rec_offs_base(offsets) = static_cast<rec_offs>( - (rec - (lens + 1)) | REC_OFFS_COMPACT); + *rec_offs_base(offsets) + = static_cast<rec_offs>(rec - (lens + 1)) + | REC_OFFS_COMPACT; } else { /* Old-style record: determine extra size and end offsets */ offs = REC_N_OLD_EXTRA_BYTES; + const ulint n_fields = rec_get_n_fields_old(rec); + const ulint n = std::min(n_fields, rec_offs_n_fields(offsets)); + rec_offs any; + if (rec_get_1byte_offs_flag(rec)) { - offs += static_cast<rec_offs>( - rec_offs_n_fields(offsets)); - *rec_offs_base(offsets) = offs; + offs += static_cast<rec_offs>(n_fields); + any = offs; /* Determine offsets to fields */ do { offs = rec_1_get_field_end_info(rec, i); @@ -492,12 +727,10 @@ resolved: set_type(offs, SQL_NULL); } rec_offs_base(offsets)[1 + i] = offs; - } while (++i < rec_offs_n_fields(offsets)); + } while (++i < n); } else { - offs += 2 - * static_cast<rec_offs>( - rec_offs_n_fields(offsets)); - *rec_offs_base(offsets) = offs; + offs += 2 * static_cast<rec_offs>(n_fields); + any = offs; /* Determine offsets to fields */ do { offs = rec_2_get_field_end_info(rec, i); @@ -508,11 +741,28 @@ resolved: if (offs & REC_2BYTE_EXTERN_MASK) { offs &= ~REC_2BYTE_EXTERN_MASK; set_type(offs, STORED_OFFPAGE); - *rec_offs_base(offsets) |= REC_OFFS_EXTERNAL; + any |= REC_OFFS_EXTERNAL; } rec_offs_base(offsets)[1 + i] = offs; + } while (++i < n); + } + + if (i < rec_offs_n_fields(offsets)) { + ut_ad(index->is_instant() + || i + (index->id == DICT_INDEXES_ID) + == rec_offs_n_fields(offsets)); + + ut_ad(i != 0); + offs = combine(rec_offs_base(offsets)[i], DEFAULT); + + do { + rec_offs_base(offsets)[1 + i] = offs; } while (++i < rec_offs_n_fields(offsets)); + + any |= REC_OFFS_DEFAULT; } + + *rec_offs_base(offsets) = any; } } @@ -521,7 +771,7 @@ resolved: @param[in] index the index that the record belongs to @param[in,out] offsets array comprising offsets[0] allocated elements, or an array from rec_get_offsets(), or NULL -@param[in] leaf whether this is a leaf-page record +@param[in] n_core 0, or index->n_core_fields for leaf page @param[in] n_fields maximum number of offsets to compute (ULINT_UNDEFINED to compute all offsets) @param[in,out] heap memory heap @@ -531,9 +781,7 @@ rec_get_offsets_func( const rec_t* rec, const dict_index_t* index, rec_offs* offsets, -#ifdef UNIV_DEBUG - bool leaf, -#endif /* UNIV_DEBUG */ + ulint n_core, ulint n_fields, #ifdef UNIV_DEBUG const char* file, /*!< in: file name where called */ @@ -544,18 +792,22 @@ rec_get_offsets_func( ulint n; ulint size; + ut_ad(index->n_core_fields >= n_core); + ut_ad(index->n_fields >= index->n_core_fields); + if (dict_table_is_comp(index->table)) { switch (UNIV_EXPECT(rec_get_status(rec), REC_STATUS_ORDINARY)) { + case REC_STATUS_COLUMNS_ADDED: case REC_STATUS_ORDINARY: - ut_ad(leaf); + ut_ad(n_core); n = dict_index_get_n_fields(index); break; case REC_STATUS_NODE_PTR: /* Node pointer records consist of the uniquely identifying fields of the record followed by a child page number field. 
*/ - ut_ad(!leaf); + ut_ad(!n_core); n = dict_index_get_n_unique_in_tree_nonleaf(index) + 1; break; case REC_STATUS_INFIMUM: @@ -580,21 +832,26 @@ rec_get_offsets_func( page_rec_is_user_rec(rec) and similar predicates cannot be evaluated. We can still distinguish the infimum and supremum record based on the heap number. */ - ut_d(const bool is_user_rec = rec_get_heap_no_old(rec) - >= PAGE_HEAP_NO_USER_LOW); + const bool is_user_rec = rec_get_heap_no_old(rec) + >= PAGE_HEAP_NO_USER_LOW; /* The infimum and supremum records carry 1 field. */ ut_ad(is_user_rec || n == 1); - ut_ad(!is_user_rec || leaf || index->is_dummy + ut_ad(!is_user_rec || n_core || index->is_dummy || dict_index_is_ibuf(index) || n == n_fields /* dict_stats_analyze_index_level() */ || n == dict_index_get_n_unique_in_tree_nonleaf(index) + 1); - ut_ad(!is_user_rec || !leaf || index->is_dummy + ut_ad(!is_user_rec || !n_core || index->is_dummy || dict_index_is_ibuf(index) || n == n_fields /* btr_pcur_restore_position() */ - || n == index->n_fields - || (index->id == DICT_INDEXES_ID - && (n == DICT_NUM_FIELDS__SYS_INDEXES - 1))); + || (n + (index->id == DICT_INDEXES_ID) + >= n_core && n <= index->n_fields)); + + if (is_user_rec && n_core && n < index->n_fields) { + ut_ad(!index->is_dummy); + ut_ad(!dict_index_is_ibuf(index)); + n = index->n_fields; + } } if (UNIV_UNLIKELY(n_fields < n)) { @@ -618,7 +875,7 @@ rec_get_offsets_func( } rec_offs_set_n_fields(offsets, n); - rec_init_offsets(rec, index, offsets); + rec_init_offsets(rec, index, n_core, offsets); return(offsets); } @@ -649,8 +906,9 @@ rec_get_offsets_reverse( ulint n_node_ptr_field; ut_ad(dict_table_is_comp(index->table)); + ut_ad(!index->is_instant()); - if (UNIV_UNLIKELY(node_ptr)) { + if (UNIV_UNLIKELY(node_ptr != 0)) { n_node_ptr_field = dict_index_get_n_unique_in_tree_nonleaf(index); n = n_node_ptr_field + 1; @@ -737,8 +995,8 @@ resolved: ut_ad(lens >= extra); *rec_offs_base(offsets) - = static_cast<rec_offs>((lens - extra + REC_N_NEW_EXTRA_BYTES) - | REC_OFFS_COMPACT | any_ext); + = static_cast<rec_offs>(lens - extra + REC_N_NEW_EXTRA_BYTES) + | REC_OFFS_COMPACT | any_ext; } /************************************************************//** @@ -787,7 +1045,7 @@ rec_get_nth_field_offs_old( *len = next_os - os; - ut_ad(*len < UNIV_PAGE_SIZE); + ut_ad(*len < srv_page_size); return(os); } @@ -795,7 +1053,9 @@ rec_get_nth_field_offs_old( /**********************************************************//** Determines the size of a data tuple prefix in ROW_FORMAT=COMPACT. @return total size */ -UNIV_INLINE MY_ATTRIBUTE((warn_unused_result, nonnull(1,2))) +template<bool redundant_temp> +MY_ATTRIBUTE((warn_unused_result, nonnull(1,2))) +static inline ulint rec_get_converted_size_comp_prefix_low( /*===================================*/ @@ -806,21 +1066,34 @@ rec_get_converted_size_comp_prefix_low( const dfield_t* fields, /*!< in: array of data fields */ ulint n_fields,/*!< in: number of data fields */ ulint* extra, /*!< out: extra size */ + rec_comp_status_t status, /*!< in: status flags */ bool temp) /*!< in: whether this is a temporary file record */ { - ulint extra_size; + ulint extra_size = temp ? 0 : REC_N_NEW_EXTRA_BYTES; ulint data_size; ulint i; ut_ad(n_fields > 0); ut_ad(n_fields <= dict_index_get_n_fields(index)); - ut_ad(!temp || extra); - ut_d(ulint n_null = index->n_nullable); + ut_ad(status == REC_STATUS_ORDINARY || status == REC_STATUS_NODE_PTR + || status == REC_STATUS_COLUMNS_ADDED); + unsigned n_core_fields = redundant_temp + ? 
row_log_get_n_core_fields(index) + : index->n_core_fields; + + if (status == REC_STATUS_COLUMNS_ADDED + && (!temp || n_fields > n_core_fields)) { + if (!redundant_temp) { ut_ad(index->is_instant()); } + ut_ad(UT_BITS_IN_BYTES(n_null) >= index->n_core_null_bytes); + extra_size += UT_BITS_IN_BYTES(index->get_n_nullable(n_fields)) + + rec_get_n_add_field_len(n_fields - 1 + - n_core_fields); + } else { + ut_ad(n_fields <= n_core_fields); + extra_size += index->n_core_null_bytes; + } - extra_size = temp - ? UT_BITS_IN_BYTES(index->n_nullable) - : REC_N_NEW_EXTRA_BYTES + UT_BITS_IN_BYTES(index->n_nullable); data_size = 0; if (temp && dict_table_is_comp(index->table)) { @@ -932,8 +1205,9 @@ rec_get_converted_size_comp_prefix( ulint* extra) /*!< out: extra size */ { ut_ad(dict_table_is_comp(index->table)); - return(rec_get_converted_size_comp_prefix_low( - index, fields, n_fields, extra, false)); + return(rec_get_converted_size_comp_prefix_low<false>( + index, fields, n_fields, extra, + REC_STATUS_ORDINARY, false)); } /**********************************************************//** @@ -946,40 +1220,41 @@ rec_get_converted_size_comp( dict_table_is_comp() is assumed to hold, even if it does not */ - ulint status, /*!< in: status bits of the record */ + rec_comp_status_t status, /*!< in: status bits of the record */ const dfield_t* fields, /*!< in: array of data fields */ ulint n_fields,/*!< in: number of data fields */ ulint* extra) /*!< out: extra size */ { - ulint size; ut_ad(n_fields > 0); switch (UNIV_EXPECT(status, REC_STATUS_ORDINARY)) { case REC_STATUS_ORDINARY: - ut_ad(n_fields == dict_index_get_n_fields(index)); - size = 0; - break; + if (n_fields > index->n_core_fields) { + ut_ad(index->is_instant()); + status = REC_STATUS_COLUMNS_ADDED; + } + /* fall through */ + case REC_STATUS_COLUMNS_ADDED: + ut_ad(n_fields >= index->n_core_fields); + ut_ad(n_fields <= index->n_fields); + return rec_get_converted_size_comp_prefix_low<false>( + index, fields, n_fields, extra, status, false); case REC_STATUS_NODE_PTR: n_fields--; ut_ad(n_fields == dict_index_get_n_unique_in_tree_nonleaf( index)); ut_ad(dfield_get_len(&fields[n_fields]) == REC_NODE_PTR_SIZE); - size = REC_NODE_PTR_SIZE; /* child page number */ - break; + return REC_NODE_PTR_SIZE /* child page number */ + + rec_get_converted_size_comp_prefix_low<false>( + index, fields, n_fields, extra, status, false); case REC_STATUS_INFIMUM: case REC_STATUS_SUPREMUM: - /* infimum or supremum record, 8 data bytes */ - if (UNIV_LIKELY_NULL(extra)) { - *extra = REC_N_NEW_EXTRA_BYTES; - } - return(REC_N_NEW_EXTRA_BYTES + 8); - default: - ut_error; - return(ULINT_UNDEFINED); + /* not supported */ + break; } - return(size + rec_get_converted_size_comp_prefix_low( - index, fields, n_fields, extra, false)); + ut_error; + return(ULINT_UNDEFINED); } /***********************************************************//** @@ -1076,8 +1351,7 @@ rec_convert_dtuple_to_rec_old( /* Set the info bits of the record */ rec_set_info_bits_old(rec, dtuple_get_info_bits(dtuple) & REC_INFO_BITS_MASK); - /* Make rec_get_offsets() and rec_offs_make_valid() happy. */ - ut_d(rec_set_heap_no_old(rec, PAGE_HEAP_NO_USER_LOW)); + rec_set_heap_no_old(rec, PAGE_HEAP_NO_USER_LOW); /* Store the data and the offsets */ @@ -1149,78 +1423,91 @@ rec_convert_dtuple_to_rec_old( return(rec); } -/*********************************************************//** -Builds a ROW_FORMAT=COMPACT record out of a data tuple. */ -UNIV_INLINE +/** Convert a data tuple into a ROW_FORMAT=COMPACT record. 
+@param[out] rec converted record +@param[in] index index +@param[in] fields data fields to convert +@param[in] n_fields number of data fields +@param[in] status rec_get_status(rec) +@param[in] temp whether to use the format for temporary files + in index creation */ +template<bool redundant_temp> +static inline void rec_convert_dtuple_to_rec_comp( -/*===========================*/ - rec_t* rec, /*!< in: origin of record */ - const dict_index_t* index, /*!< in: record descriptor */ - const dfield_t* fields, /*!< in: array of data fields */ - ulint n_fields,/*!< in: number of data fields */ - ulint status, /*!< in: status bits of the record */ - bool temp) /*!< in: whether to use the - format for temporary files in - index creation */ + rec_t* rec, + const dict_index_t* index, + const dfield_t* fields, + ulint n_fields, + rec_comp_status_t status, + bool temp) { const dfield_t* field; const dtype_t* type; byte* end; - byte* nulls; - byte* lens; + byte* nulls = temp + ? rec - 1 : rec - (REC_N_NEW_EXTRA_BYTES + 1); + byte* UNINIT_VAR(lens); ulint len; ulint i; - ulint n_node_ptr_field; + ulint UNINIT_VAR(n_node_ptr_field); ulint fixed_len; ulint null_mask = 1; - + const ulint n_core_fields = redundant_temp + ? row_log_get_n_core_fields(index) + : index->n_core_fields; ut_ad(n_fields > 0); ut_ad(temp || dict_table_is_comp(index->table)); - ulint n_null = index->n_nullable; - const ulint n_null_bytes = UT_BITS_IN_BYTES(n_null); + ut_ad(index->n_core_null_bytes <= UT_BITS_IN_BYTES(index->n_nullable)); + + ut_d(ulint n_null = index->n_nullable); - if (temp) { - ut_ad(status == REC_STATUS_ORDINARY); + switch (status) { + case REC_STATUS_COLUMNS_ADDED: + if (!redundant_temp) { ut_ad(index->is_instant()); } + ut_ad(n_fields > n_core_fields); + rec_set_n_add_field(nulls, n_fields - 1 - n_core_fields); + /* fall through */ + case REC_STATUS_ORDINARY: ut_ad(n_fields <= dict_index_get_n_fields(index)); - n_node_ptr_field = ULINT_UNDEFINED; - nulls = rec - 1; - if (dict_table_is_comp(index->table)) { + if (!temp) { + rec_set_heap_no_new(rec, PAGE_HEAP_NO_USER_LOW); + rec_set_status(rec, n_fields == n_core_fields + ? REC_STATUS_ORDINARY + : REC_STATUS_COLUMNS_ADDED); + } if (dict_table_is_comp(index->table)) { /* No need to do adjust fixed_len=0. We only need to adjust it for ROW_FORMAT=REDUNDANT. */ temp = false; } - } else { - /* Make rec_get_offsets() and rec_offs_make_valid() happy. */ - ut_d(rec_set_heap_no_new(rec, PAGE_HEAP_NO_USER_LOW)); - nulls = rec - (REC_N_NEW_EXTRA_BYTES + 1); - switch (UNIV_EXPECT(status, REC_STATUS_ORDINARY)) { - case REC_STATUS_ORDINARY: - ut_ad(n_fields <= dict_index_get_n_fields(index)); - n_node_ptr_field = ULINT_UNDEFINED; - break; - case REC_STATUS_NODE_PTR: - ut_ad(n_fields - == dict_index_get_n_unique_in_tree_nonleaf(index) - + 1); - n_node_ptr_field = n_fields - 1; - break; - case REC_STATUS_INFIMUM: - case REC_STATUS_SUPREMUM: - ut_ad(n_fields == 1); - n_node_ptr_field = ULINT_UNDEFINED; - break; - default: - ut_error; - return; - } + n_node_ptr_field = ULINT_UNDEFINED; + lens = nulls - (index->is_instant() + ? 
UT_BITS_IN_BYTES(index->get_n_nullable( + n_fields)) + : UT_BITS_IN_BYTES( + unsigned(index->n_nullable))); + break; + case REC_STATUS_NODE_PTR: + ut_ad(!temp); + rec_set_heap_no_new(rec, PAGE_HEAP_NO_USER_LOW); + rec_set_status(rec, status); + ut_ad(n_fields + == dict_index_get_n_unique_in_tree_nonleaf(index) + 1); + ut_d(n_null = std::min<unsigned>(index->n_core_null_bytes * 8U, + index->n_nullable)); + n_node_ptr_field = n_fields - 1; + lens = nulls - index->n_core_null_bytes; + break; + case REC_STATUS_INFIMUM: + case REC_STATUS_SUPREMUM: + ut_error; + return; } end = rec; /* clear the SQL-null flags */ - lens = nulls - n_null_bytes; - memset(lens + 1, 0, nulls - lens); + memset(lens + 1, 0, ulint(nulls - lens)); /* Store the data and the offsets */ @@ -1326,21 +1613,26 @@ rec_convert_dtuple_to_rec_new( const dict_index_t* index, /*!< in: record descriptor */ const dtuple_t* dtuple) /*!< in: data tuple */ { + ut_ad(!(dtuple->info_bits + & ~(REC_NEW_STATUS_MASK | REC_INFO_DELETED_FLAG + | REC_INFO_MIN_REC_FLAG))); + rec_comp_status_t status = static_cast<rec_comp_status_t>( + dtuple->info_bits & REC_NEW_STATUS_MASK); + if (status == REC_STATUS_ORDINARY + && dtuple->n_fields > index->n_core_fields) { + ut_ad(index->is_instant()); + status = REC_STATUS_COLUMNS_ADDED; + } + ulint extra_size; - ulint status; - rec_t* rec; - status = dtuple_get_info_bits(dtuple) & REC_NEW_STATUS_MASK; rec_get_converted_size_comp( index, status, dtuple->fields, dtuple->n_fields, &extra_size); - rec = buf + extra_size; + rec_t* rec = buf + extra_size; - rec_convert_dtuple_to_rec_comp( + rec_convert_dtuple_to_rec_comp<false>( rec, index, dtuple->fields, dtuple->n_fields, status, false); - - /* Set the info bits of the record */ - rec_set_info_and_status_bits(rec, dtuple_get_info_bits(dtuple)); - + rec_set_info_bits_new(rec, dtuple->info_bits & ~REC_NEW_STATUS_MASK); return(rec); } @@ -1380,62 +1672,117 @@ rec_convert_dtuple_to_rec( @param[in] fields data fields @param[in] n_fields number of data fields @param[out] extra record header size +@param[in] status REC_STATUS_ORDINARY or REC_STATUS_COLUMNS_ADDED @return total size, in bytes */ +template<bool redundant_temp> ulint rec_get_converted_size_temp( const dict_index_t* index, const dfield_t* fields, ulint n_fields, - ulint* extra) + ulint* extra, + rec_comp_status_t status) { - return(rec_get_converted_size_comp_prefix_low( - index, fields, n_fields, extra, true)); + return rec_get_converted_size_comp_prefix_low<redundant_temp>( + index, fields, n_fields, extra, status, true); } -/******************************************************//** -Determine the offset to each field in temporary file. -@see rec_convert_dtuple_to_temp() */ +template ulint rec_get_converted_size_temp<false>( + const dict_index_t*, const dfield_t*, ulint, ulint*, + rec_comp_status_t); + +template ulint rec_get_converted_size_temp<true>( + const dict_index_t*, const dfield_t*, ulint, ulint*, + rec_comp_status_t); + +/** Determine the offset to each field in temporary file. 
+@param[in] rec temporary file record +@param[in] index index of that the record belongs to +@param[in,out] offsets offsets to the fields; in: rec_offs_n_fields(offsets) +@param[in] n_core number of core fields (index->n_core_fields) +@param[in] def_val default values for non-core fields +@param[in] status REC_STATUS_ORDINARY or REC_STATUS_COLUMNS_ADDED */ void rec_init_offsets_temp( -/*==================*/ - const rec_t* rec, /*!< in: temporary file record */ - const dict_index_t* index, /*!< in: record descriptor */ - rec_offs* offsets)/*!< in/out: array of offsets; - in: n=rec_offs_n_fields(offsets) */ + const rec_t* rec, + const dict_index_t* index, + rec_offs* offsets, + ulint n_core, + const dict_col_t::def_t*def_val, + rec_comp_status_t status) { - rec_init_offsets_comp_ordinary(rec, true, index, offsets); + ut_ad(status == REC_STATUS_ORDINARY + || status == REC_STATUS_COLUMNS_ADDED); + /* The table may have been converted to plain format + if it was emptied during an ALTER TABLE operation. */ + ut_ad(index->n_core_fields == n_core || !index->is_instant()); + ut_ad(index->n_core_fields >= n_core); + rec_init_offsets_comp_ordinary(rec, index, offsets, n_core, def_val, + status == REC_STATUS_COLUMNS_ADDED + ? REC_LEAF_TEMP_COLUMNS_ADDED + : REC_LEAF_TEMP); } -/*********************************************************//** -Builds a temporary file record out of a data tuple. -@see rec_init_offsets_temp() */ +/** Determine the offset to each field in temporary file. +@param[in] rec temporary file record +@param[in] index index of that the record belongs to +@param[in,out] offsets offsets to the fields; in: rec_offs_n_fields(offsets) +*/ +void +rec_init_offsets_temp( + const rec_t* rec, + const dict_index_t* index, + rec_offs* offsets) +{ + ut_ad(!index->is_instant()); + rec_init_offsets_comp_ordinary(rec, index, offsets, + index->n_core_fields, NULL, + REC_LEAF_TEMP); +} + +/** Convert a data tuple prefix to the temporary file format. +@param[out] rec record in temporary file format +@param[in] index clustered or secondary index +@param[in] fields data fields +@param[in] n_fields number of data fields +@param[in] status REC_STATUS_ORDINARY or REC_STATUS_COLUMNS_ADDED +*/ +template<bool redundant_temp> void rec_convert_dtuple_to_temp( -/*=======================*/ - rec_t* rec, /*!< out: record */ - const dict_index_t* index, /*!< in: record descriptor */ - const dfield_t* fields, /*!< in: array of data fields */ - ulint n_fields) /*!< in: number of fields */ + rec_t* rec, + const dict_index_t* index, + const dfield_t* fields, + ulint n_fields, + rec_comp_status_t status) { - rec_convert_dtuple_to_rec_comp(rec, index, fields, n_fields, - REC_STATUS_ORDINARY, true); + rec_convert_dtuple_to_rec_comp<redundant_temp>( + rec, index, fields, n_fields, status, true); } +template void rec_convert_dtuple_to_temp<false>( + rec_t*, const dict_index_t*, const dfield_t*, + ulint, rec_comp_status_t); + +template void rec_convert_dtuple_to_temp<true>( + rec_t*, const dict_index_t*, const dfield_t*, + ulint, rec_comp_status_t); + /** Copy the first n fields of a (copy of a) physical record to a data tuple. The fields are copied into the memory heap. 
@param[out] tuple data tuple @param[in] rec index record, or a copy thereof -@param[in] is_leaf whether rec is a leaf page record +@param[in] index index of rec +@param[in] n_core index->n_core_fields at the time rec was + copied, or 0 if non-leaf page record @param[in] n_fields number of fields to copy @param[in,out] heap memory heap */ void -rec_copy_prefix_to_dtuple_func( +rec_copy_prefix_to_dtuple( dtuple_t* tuple, const rec_t* rec, const dict_index_t* index, -#ifdef UNIV_DEBUG - bool is_leaf, -#endif /* UNIV_DEBUG */ + ulint n_core, ulint n_fields, mem_heap_t* heap) { @@ -1443,17 +1790,18 @@ rec_copy_prefix_to_dtuple_func( rec_offs* offsets = offsets_; rec_offs_init(offsets_); - ut_ad(is_leaf || n_fields + ut_ad(n_core <= index->n_core_fields); + ut_ad(n_core || n_fields <= dict_index_get_n_unique_in_tree_nonleaf(index) + 1); - offsets = rec_get_offsets(rec, index, offsets, is_leaf, + offsets = rec_get_offsets(rec, index, offsets, n_core, n_fields, &heap); ut_ad(rec_validate(rec, offsets)); + ut_ad(!rec_offs_any_default(offsets)); ut_ad(dtuple_check_typed(tuple)); - dtuple_set_info_bits(tuple, rec_get_info_bits( - rec, dict_table_is_comp(index->table))); + tuple->info_bits = rec_get_info_bits(rec, rec_offs_comp(offsets)); for (ulint i = 0; i < n_fields; i++) { dfield_t* field; @@ -1531,14 +1879,8 @@ rec_copy_prefix_to_buf( or NULL */ ulint* buf_size) /*!< in/out: buffer size */ { - const byte* nulls; - const byte* lens; - ulint i; - ulint prefix_len; - ulint null_mask; - ulint status; - bool is_rtr_node_ptr = false; - + ut_ad(n_fields <= index->n_fields || dict_index_is_ibuf(index)); + ut_ad(index->n_core_null_bytes <= UT_BITS_IN_BYTES(index->n_nullable)); UNIV_PREFETCH_RW(*buf); if (!dict_table_is_comp(index->table)) { @@ -1549,40 +1891,62 @@ rec_copy_prefix_to_buf( buf, buf_size)); } - status = rec_get_status(rec); + ulint prefix_len = 0; + ulint instant_omit = 0; + const byte* nulls = rec - (REC_N_NEW_EXTRA_BYTES + 1); + const byte* nullf = nulls; + const byte* lens = nulls - index->n_core_null_bytes; - switch (status) { + switch (rec_get_status(rec)) { + default: + /* infimum or supremum record: no sense to copy anything */ + ut_error; + return(NULL); case REC_STATUS_ORDINARY: - ut_ad(n_fields <= dict_index_get_n_fields(index)); + ut_ad(n_fields <= index->n_core_fields); break; case REC_STATUS_NODE_PTR: /* For R-tree, we need to copy the child page number field. */ + compile_time_assert(DICT_INDEX_SPATIAL_NODEPTR_SIZE == 1); if (dict_index_is_spatial(index)) { + ut_ad(index->n_core_null_bytes == 0); ut_ad(n_fields == DICT_INDEX_SPATIAL_NODEPTR_SIZE + 1); - is_rtr_node_ptr = true; - } else { - /* it doesn't make sense to copy the child page number - field */ - ut_ad(n_fields <= - dict_index_get_n_unique_in_tree_nonleaf(index)); + ut_ad(index->fields[0].col->prtype & DATA_NOT_NULL); + ut_ad(DATA_BIG_COL(index->fields[0].col)); + /* This is a deficiency of the format introduced + in MySQL 5.7. The length in the R-tree index should + always be DATA_MBR_LEN. 
*/ + ut_ad(!index->fields[0].fixed_len); + ut_ad(*lens == DATA_MBR_LEN); + lens--; + prefix_len = DATA_MBR_LEN + REC_NODE_PTR_SIZE; + n_fields = 0; /* skip the "for" loop below */ + break; } + /* it doesn't make sense to copy the child page number field */ + ut_ad(n_fields + <= dict_index_get_n_unique_in_tree_nonleaf(index)); break; - case REC_STATUS_INFIMUM: - case REC_STATUS_SUPREMUM: - /* infimum or supremum record: no sense to copy anything */ - default: - ut_error; - return(NULL); + case REC_STATUS_COLUMNS_ADDED: + /* We would have !index->is_instant() when rolling back + an instant ADD COLUMN operation. */ + ut_ad(index->is_instant() || page_rec_is_metadata(rec)); + nulls++; + const ulint n_rec = ulint(index->n_core_fields) + 1 + + rec_get_n_add_field(nulls); + instant_omit = ulint(&rec[-REC_N_NEW_EXTRA_BYTES] - nulls); + ut_ad(instant_omit == 1 || instant_omit == 2); + nullf = nulls; + const uint nb = UT_BITS_IN_BYTES(index->get_n_nullable(n_rec)); + instant_omit += nb - index->n_core_null_bytes; + lens = --nulls - nb; } - nulls = rec - (REC_N_NEW_EXTRA_BYTES + 1); - lens = nulls - UT_BITS_IN_BYTES(index->n_nullable); + const byte* const lenf = lens; UNIV_PREFETCH_R(lens); - prefix_len = 0; - null_mask = 1; /* read the lengths of fields 0..n */ - for (i = 0; i < n_fields; i++) { + for (ulint i = 0, null_mask = 1; i < n_fields; i++) { const dict_field_t* field; const dict_col_t* col; @@ -1604,11 +1968,7 @@ rec_copy_prefix_to_buf( null_mask <<= 1; } - if (is_rtr_node_ptr && i == 1) { - /* For rtree node ptr rec, we need to - copy the page no field with 4 bytes len. */ - prefix_len += 4; - } else if (field->fixed_len) { + if (field->fixed_len) { prefix_len += field->fixed_len; } else { ulint len = *lens--; @@ -1634,17 +1994,41 @@ rec_copy_prefix_to_buf( UNIV_PREFETCH_R(rec + prefix_len); - prefix_len += rec - (lens + 1); + ulint size = prefix_len + ulint(rec - (lens + 1)) - instant_omit; - if ((*buf == NULL) || (*buf_size < prefix_len)) { + if (*buf == NULL || *buf_size < size) { ut_free(*buf); - *buf_size = prefix_len; - *buf = static_cast<byte*>(ut_malloc_nokey(prefix_len)); + *buf_size = size; + *buf = static_cast<byte*>(ut_malloc_nokey(size)); } - memcpy(*buf, lens + 1, prefix_len); - - return(*buf + (rec - (lens + 1))); + if (instant_omit) { + /* Copy and convert the record header to a format where + instant ADD COLUMN has not been used: + + lengths of variable-length fields in the prefix + - omit any null flag bytes for any instantly added columns + + index->n_core_null_bytes of null flags + - omit the n_add_fields header (1 or 2 bytes) + + REC_N_NEW_EXTRA_BYTES of fixed header */ + byte* b = *buf; + /* copy the lengths of the variable-length fields */ + memcpy(b, lens + 1, ulint(lenf - lens)); + b += ulint(lenf - lens); + /* copy the null flags */ + memcpy(b, nullf - index->n_core_null_bytes, + index->n_core_null_bytes); + b += index->n_core_null_bytes + REC_N_NEW_EXTRA_BYTES; + ut_ad(ulint(b - *buf) + prefix_len == size); + /* copy the fixed-size header and the record prefix */ + memcpy(b - REC_N_NEW_EXTRA_BYTES, rec - REC_N_NEW_EXTRA_BYTES, + prefix_len + REC_N_NEW_EXTRA_BYTES); + ut_ad(rec_get_status(b) == REC_STATUS_COLUMNS_ADDED); + rec_set_status(b, REC_STATUS_ORDINARY); + return b; + } else { + memcpy(*buf, lens + 1, size); + return *buf + (rec - (lens + 1)); + } } /***************************************************************//** @@ -1672,7 +2056,7 @@ rec_validate_old( for (i = 0; i < n_fields; i++) { rec_get_nth_field_offs_old(rec, i, &len); - if (!((len < 
UNIV_PAGE_SIZE) || (len == UNIV_SQL_NULL))) { + if (!((len < srv_page_size) || (len == UNIV_SQL_NULL))) { ib::error() << "Record field " << i << " len " << len; return(FALSE); } @@ -1714,20 +2098,27 @@ rec_validate( return(FALSE); } - ut_a(rec_offs_comp(offsets) || n_fields <= rec_get_n_fields_old(rec)); + ut_a(rec_offs_any_flag(offsets, REC_OFFS_COMPACT | REC_OFFS_DEFAULT) + || n_fields <= rec_get_n_fields_old(rec)); for (i = 0; i < n_fields; i++) { rec_get_nth_field_offs(offsets, i, &len); - if (!((len < UNIV_PAGE_SIZE) || (len == UNIV_SQL_NULL))) { - ib::error() << "Record field " << i << " len " << len; - return(FALSE); - } - - if (len != UNIV_SQL_NULL) { + switch (len) { + default: + if (len >= srv_page_size) { + ib::error() << "Record field " << i + << " len " << len; + return(FALSE); + } len_sum += len; - } else if (!rec_offs_comp(offsets)) { - len_sum += rec_get_nth_field_size(rec, i); + break; + case UNIV_SQL_DEFAULT: + break; + case UNIV_SQL_NULL: + if (!rec_offs_comp(offsets)) { + len_sum += rec_get_nth_field_size(rec, i); + } } } @@ -1807,14 +2198,22 @@ rec_print_comp( ulint i; for (i = 0; i < rec_offs_n_fields(offsets); i++) { - const byte* data; + const byte* UNINIT_VAR(data); ulint len; - data = rec_get_nth_field(rec, offsets, i, &len); + if (rec_offs_nth_default(offsets, i)) { + len = UNIV_SQL_DEFAULT; + } else { + data = rec_get_nth_field(rec, offsets, i, &len); + } fprintf(file, " " ULINTPF ":", i); - if (len != UNIV_SQL_NULL) { + if (len == UNIV_SQL_NULL) { + fputs(" SQL NULL", file); + } else if (len == UNIV_SQL_DEFAULT) { + fputs(" SQL DEFAULT", file); + } else { if (len <= 30) { ut_print_buf(file, data, len); @@ -1832,8 +2231,6 @@ rec_print_comp( fprintf(file, " (total " ULINTPF " bytes)", len); } - } else { - fputs(" SQL NULL", file); } putc(';', file); putc('\n', file); @@ -1925,6 +2322,7 @@ rec_print_mbr_rec( const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */ { ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_ad(!rec_offs_any_default(offsets)); if (!rec_offs_comp(offsets)) { rec_print_mbr_old(file, rec); @@ -2034,7 +2432,8 @@ rec_print( rec_print_new(file, rec, rec_get_offsets(rec, index, offsets_, - page_rec_is_leaf(rec), + page_rec_is_leaf(rec) + ? index->n_core_fields : 0, ULINT_UNDEFINED, &heap)); if (UNIV_LIKELY_NULL(heap)) { mem_heap_free(heap); @@ -2072,6 +2471,11 @@ rec_print( data = rec_get_nth_field(rec, offsets, i, &len); + if (len == UNIV_SQL_DEFAULT) { + o << "DEFAULT"; + continue; + } + if (len == UNIV_SQL_NULL) { o << "NULL"; continue; @@ -2105,7 +2509,8 @@ operator<<(std::ostream& o, const rec_index_print& r) { mem_heap_t* heap = NULL; rec_offs* offsets = rec_get_offsets( - r.m_rec, r.m_index, NULL, page_rec_is_leaf(r.m_rec), + r.m_rec, r.m_index, NULL, page_rec_is_leaf(r.m_rec) + ? 
r.m_index->n_core_fields : 0, ULINT_UNDEFINED, &heap); rec_print(o, r.m_rec, rec_get_info_bits(r.m_rec, rec_offs_comp(offsets)), @@ -2151,7 +2556,7 @@ rec_get_trx_id( ut_ad(trx_id_col > 0); ut_ad(trx_id_col != ULINT_UNDEFINED); - offsets = rec_get_offsets(rec, index, offsets, true, + offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields, trx_id_col + 1, &heap); trx_id = rec_get_nth_field(rec, offsets, trx_id_col, &len); @@ -2175,7 +2580,7 @@ rec_offs_make_nth_extern( const ulint n) { ut_ad(!rec_offs_nth_sql_null(offsets, n)); - set_type(rec_offs_base(offsets)[1 +n ], STORED_OFFPAGE); + set_type(rec_offs_base(offsets)[1 + n], STORED_OFFPAGE); } #ifdef WITH_WSREP # include "ha_prototypes.h" @@ -2202,7 +2607,8 @@ wsrep_rec_get_foreign_key( ut_ad(index_ref); rec_offs_init(offsets_); - offsets = rec_get_offsets(rec, index_for, offsets_, true, + offsets = rec_get_offsets(rec, index_for, offsets_, + index_for->n_core_fields, ULINT_UNDEFINED, &heap); ut_ad(rec_offs_validate(rec, NULL, offsets)); @@ -2221,6 +2627,7 @@ wsrep_rec_get_foreign_key( dict_index_get_nth_field(index_ref, i); const dict_col_t* col_r = dict_field_get_col(field_r); + ut_ad(!rec_offs_nth_default(offsets, i)); data = rec_get_nth_field(rec, offsets, i, &len); if (key_len + ((len != UNIV_SQL_NULL) ? len + 1 : 1) > *buf_len) { diff --git a/storage/innobase/row/row0ext.cc b/storage/innobase/row/row0ext.cc index 32e9aad9896..5892cbf31fe 100644 --- a/storage/innobase/row/row0ext.cc +++ b/storage/innobase/row/row0ext.cc @@ -71,7 +71,7 @@ row_ext_cache_fill( } else { /* Fetch at most ext->max_len of the column. The column should be non-empty. However, - trx_rollback_or_clean_all_recovered() may try to + trx_rollback_all_recovered() may try to access a half-deleted BLOB if the server previously crashed during the execution of btr_free_externally_stored_field(). */ diff --git a/storage/innobase/row/row0ftsort.cc b/storage/innobase/row/row0ftsort.cc index 2ca930e0934..3d9bc6f0ab1 100644 --- a/storage/innobase/row/row0ftsort.cc +++ b/storage/innobase/row/row0ftsort.cc @@ -65,15 +65,15 @@ integer value) dict_index_t* row_merge_create_fts_sort_index( /*============================*/ - dict_index_t* index, /*!< in: Original FTS index - based on which this sort index - is created */ - const dict_table_t* table, /*!< in: table that FTS index - is being created on */ - ibool* opt_doc_id_size) - /*!< out: whether to use 4 bytes - instead of 8 bytes integer to - store Doc ID during sort */ + dict_index_t* index, /*!< in: Original FTS index + based on which this sort index + is created */ + dict_table_t* table, /*!< in,out: table that FTS index + is being created on */ + ibool* opt_doc_id_size) + /*!< out: whether to use 4 bytes + instead of 8 bytes integer to + store Doc ID during sort */ { dict_index_t* new_index; dict_field_t* field; @@ -81,11 +81,9 @@ row_merge_create_fts_sort_index( CHARSET_INFO* charset; // FIXME: This name shouldn't be hard coded here. - new_index = dict_mem_index_create( - index->table->name.m_name, "tmp_fts_idx", 0, DICT_FTS, 3); + new_index = dict_mem_index_create(table, "tmp_fts_idx", DICT_FTS, 3); new_index->id = index->id; - new_index->table = (dict_table_t*) table; new_index->n_uniq = FTS_NUM_FIELDS_SORT; new_index->n_def = FTS_NUM_FIELDS_SORT; new_index->cached = TRUE; @@ -105,7 +103,7 @@ row_merge_create_fts_sort_index( ? 
DATA_VARCHAR : DATA_VARMYSQL; field->col->mbminlen = idx_field->col->mbminlen; field->col->mbmaxlen = idx_field->col->mbmaxlen; - field->col->len = HA_FT_MAXCHARLEN * field->col->mbmaxlen; + field->col->len = HA_FT_MAXCHARLEN * unsigned(field->col->mbmaxlen); field->fixed_len = 0; @@ -161,25 +159,26 @@ row_merge_create_fts_sort_index( return(new_index); } -/*********************************************************************//** -Initialize FTS parallel sort structures. + +/** Initialize FTS parallel sort structures. +@param[in] trx transaction +@param[in,out] dup descriptor of FTS index being created +@param[in] new_table table where indexes are created +@param[in] opt_doc_id_size whether to use 4 bytes instead of 8 bytes + integer to store Doc ID during sort +@param[in] old_page_size page size of the old table during alter +@param[out] psort parallel sort info to be instantiated +@param[out] merge parallel merge info to be instantiated @return TRUE if all successful */ ibool row_fts_psort_info_init( -/*====================*/ - trx_t* trx, /*!< in: transaction */ - row_merge_dup_t* dup, /*!< in,own: descriptor of - FTS index being created */ - const dict_table_t* new_table,/*!< in: table on which indexes are - created */ - ibool opt_doc_id_size, - /*!< in: whether to use 4 bytes - instead of 8 bytes integer to - store Doc ID during sort */ - fts_psort_t** psort, /*!< out: parallel sort info to be - instantiated */ - fts_psort_t** merge) /*!< out: parallel merge info - to be instantiated */ + trx_t* trx, + row_merge_dup_t* dup, + const dict_table_t* new_table, + ibool opt_doc_id_size, + const page_size_t old_page_size, + fts_psort_t** psort, + fts_psort_t** merge) { ulint i; ulint j; @@ -212,6 +211,7 @@ row_fts_psort_info_init( common_info->dup = dup; common_info->new_table = (dict_table_t*) new_table; + common_info->old_page_size = old_page_size; common_info->trx = trx; common_info->all_info = psort_info; common_info->sort_event = os_event_create(0); @@ -247,7 +247,7 @@ row_fts_psort_info_init( dup->index); if (row_merge_file_create(psort_info[j].merge_file[i], - path) < 0) { + path) == OS_FILE_CLOSED) { goto func_exit; } @@ -407,9 +407,9 @@ row_merge_fts_doc_add_word_for_parser( ut_ad(t_ctx); str.f_str = (byte*)(word); - str.f_len = word_len; + str.f_len = ulint(word_len); str.f_n_char = fts_get_token_size( - (CHARSET_INFO*)param->cs, word, word_len); + (CHARSET_INFO*)param->cs, word, ulint(word_len)); /* JAN: TODO: MySQL 5.7 FTS ut_ad(boolean_info->position >= 0); @@ -663,7 +663,7 @@ row_merge_fts_doc_tokenize( MySQL 5.7 changed the fulltext parser plugin interface by adding MYSQL_FTPARSER_BOOLEAN_INFO::position. Below we assume that the field is always 0. */ - unsigned pos = t_ctx->init_pos; + ulint pos = t_ctx->init_pos; byte position[4]; if (parser == NULL) { pos += t_ctx->processed_len + inc - str.f_len; @@ -756,7 +756,7 @@ It also performs the initial in memory sort of the parsed records. 
@return OS_THREAD_DUMMY_RETURN */ static os_thread_ret_t -fts_parallel_tokenization( +DECLARE_THREAD(fts_parallel_tokenization)( /*======================*/ void* arg) /*!< in: psort_info for the thread */ { @@ -768,7 +768,7 @@ fts_parallel_tokenization( merge_file_t** merge_file; row_merge_block_t** block; row_merge_block_t** crypt_block; - int tmpfd[FTS_NUM_AUX_INDEX]; + pfs_os_file_t tmpfd[FTS_NUM_AUX_INDEX]; ulint mycount[FTS_NUM_AUX_INDEX]; ib_uint64_t total_rec = 0; ulint num_doc_processed = 0; @@ -805,7 +805,8 @@ fts_parallel_tokenization( block = psort_info->merge_block; crypt_block = psort_info->crypt_block; - const page_size_t& page_size = dict_table_page_size(table); + const page_size_t old_page_size = + psort_info->psort_common->old_page_size; row_merge_fts_get_next_doc_item(psort_info, &doc_item); @@ -835,7 +836,7 @@ loop: doc.text.f_str = btr_copy_externally_stored_field( &doc.text.f_len, data, - page_size, data_len, blob_heap); + old_page_size, data_len, blob_heap); } else { doc.text.f_str = data; doc.text.f_len = data_len; @@ -898,7 +899,7 @@ loop: merge_file[t_ctx.buf_used]->offset++, block[t_ctx.buf_used], crypt_block[t_ctx.buf_used], - table->space)) { + table->space_id)) { error = DB_TEMP_FILE_WRITE_FAIL; goto func_exit; } @@ -992,19 +993,19 @@ exit: merge_file[i]->offset++, block[i], crypt_block[i], - table->space)) { + table->space_id)) { error = DB_TEMP_FILE_WRITE_FAIL; goto func_exit; } -#ifdef HAVE_valgrind_or_MSAN +#ifdef HAVE_valgrind MEM_UNDEFINED(block[i], srv_sort_buf_size); if (crypt_block[i]) { MEM_UNDEFINED(crypt_block[i], srv_sort_buf_size); } -#endif /* HAVE_valgrind_or_MSAN */ +#endif /* HAVE_valgrind */ } buf[i] = row_merge_buf_empty(buf[i]); @@ -1022,7 +1023,7 @@ exit: } tmpfd[i] = row_merge_file_create_low(path); - if (tmpfd[i] < 0) { + if (tmpfd[i] == OS_FILE_CLOSED) { error = DB_OUT_OF_MEMORY; goto func_exit; } @@ -1031,15 +1032,15 @@ exit: psort_info->psort_common->dup, merge_file[i], block[i], &tmpfd[i], false, 0.0/* pct_progress */, 0.0/* pct_cost */, - crypt_block[i], table->space); + crypt_block[i], table->space_id); if (error != DB_SUCCESS) { - close(tmpfd[i]); + os_file_close(tmpfd[i]); goto func_exit; } total_rec += merge_file[i]->n_rec; - close(tmpfd[i]); + os_file_close(tmpfd[i]); } func_exit: @@ -1097,7 +1098,7 @@ Function performs the merge and insertion of the sorted records. @return OS_THREAD_DUMMY_RETURN */ static os_thread_ret_t -fts_parallel_merge( +DECLARE_THREAD(fts_parallel_merge)( /*===============*/ void* arg) /*!< in: parallel merge info */ { @@ -1128,7 +1129,7 @@ row_fts_start_parallel_merge( /*=========================*/ fts_psort_t* merge_info) /*!< in: parallel sort info */ { - int i = 0; + ulint i = 0; /* Kick off merge/insert threads */ for (i = 0; i < FTS_NUM_AUX_INDEX; i++) { @@ -1368,10 +1369,10 @@ row_fts_insert_tuple( Propagate a newly added record up one level in the selection tree @return parent where this value propagated to */ static -int +ulint row_fts_sel_tree_propagate( /*=======================*/ - int propogated, /*<! in: tree node propagated */ + ulint propogated, /*<! in: tree node propagated */ int* sel_tree, /*<! in: selection tree */ const mrec_t** mrec, /*<! in: sort record */ rec_offs** offsets, /*<! 
in: record offsets */ @@ -1410,7 +1411,7 @@ row_fts_sel_tree_propagate( sel_tree[parent] = selected; - return(static_cast<int>(parent)); + return parent; } /*********************************************************************//** @@ -1430,8 +1431,8 @@ row_fts_sel_tree_update( ulint i; for (i = 1; i <= height; i++) { - propagated = static_cast<ulint>(row_fts_sel_tree_propagate( - static_cast<int>(propagated), sel_tree, mrec, offsets, index)); + propagated = row_fts_sel_tree_propagate( + propagated, sel_tree, mrec, offsets, index); } return(sel_tree[0]); @@ -1511,7 +1512,7 @@ row_fts_build_sel_tree( { ulint treelevel = 1; ulint num = 2; - int i = 0; + ulint i = 0; ulint start; /* No need to build selection tree if we only have two merge threads */ @@ -1526,14 +1527,15 @@ row_fts_build_sel_tree( start = (ulint(1) << treelevel) - 1; - for (i = 0; i < (int) fts_sort_pll_degree; i++) { - sel_tree[i + start] = i; + for (i = 0; i < fts_sort_pll_degree; i++) { + sel_tree[i + start] = int(i); } - for (i = static_cast<int>(treelevel) - 1; i >= 0; i--) { + i = treelevel; + do { row_fts_build_sel_tree_level( - sel_tree, static_cast<ulint>(i), mrec, offsets, index); - } + sel_tree, --i, mrec, offsets, index); + } while (i > 0); return(treelevel); } @@ -1563,7 +1565,7 @@ row_fts_merge_insert( ib_alloc_t* heap_alloc; ulint i; mrec_buf_t** buf; - int* fd; + pfs_os_file_t* fd; byte** block; byte** crypt_block; const mrec_t** mrec; @@ -1572,18 +1574,17 @@ row_fts_merge_insert( ulint height; ulint start; fts_psort_insert_t ins_ctx; - ulint count_diag = 0; + uint64_t count_diag = 0; fts_table_t fts_table; char aux_table_name[MAX_FULL_NAME_LEN]; dict_table_t* aux_table; dict_index_t* aux_index; trx_t* trx; - byte sys_buf[DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN]; /* We use the insert query graph as the dummy graph needed in the row module call */ - trx = trx_allocate_for_background(); + trx = trx_create(); trx_start_if_not_started(trx, true); trx->op_info = "inserting index entries"; @@ -1600,7 +1601,7 @@ row_fts_merge_insert( heap, sizeof(*offsets) * fts_sort_pll_degree); buf = (mrec_buf_t**) mem_heap_alloc( heap, sizeof(*buf) * fts_sort_pll_degree); - fd = (int*) mem_heap_alloc(heap, sizeof(*fd) * fts_sort_pll_degree); + fd = (pfs_os_file_t*) mem_heap_alloc(heap, sizeof(*fd) * fts_sort_pll_degree); block = (byte**) mem_heap_alloc( heap, sizeof(*block) * fts_sort_pll_degree); crypt_block = (byte**) mem_heap_alloc( @@ -1633,7 +1634,7 @@ row_fts_merge_insert( buf[i] = static_cast<mrec_buf_t*>( mem_heap_alloc(heap, sizeof *buf[i])); - count_diag += (int) psort_info[i].merge_file[id]->n_rec; + count_diag += psort_info[i].merge_file[id]->n_rec; } if (UNIV_UNLIKELY(fts_enable_diag_print)) { @@ -1669,6 +1670,11 @@ row_fts_merge_insert( dict_table_close(aux_table, FALSE, FALSE); aux_index = dict_table_get_first_index(aux_table); + ut_ad(!aux_index->is_instant()); + /* row_merge_write_fts_node() depends on the correct value */ + ut_ad(aux_index->n_core_null_bytes + == UT_BITS_IN_BYTES(aux_index->n_nullable)); + /* Create bulk load instance */ ins_ctx.btr_bulk = UT_NEW_NOKEY( BtrBulk(aux_index, trx, psort_info[0].psort_common->trx @@ -1680,17 +1686,14 @@ row_fts_merge_insert( dict_index_get_n_fields(aux_index)); /* Set TRX_ID and ROLL_PTR */ - trx_write_trx_id(sys_buf, trx->id); - trx_write_roll_ptr(&sys_buf[DATA_TRX_ID_LEN], - 1ULL << ROLL_PTR_INSERT_FLAG_POS); dfield_set_data(dtuple_get_nth_field(ins_ctx.tuple, 2), - sys_buf, DATA_TRX_ID_LEN); + &reset_trx_id, DATA_TRX_ID_LEN); 
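/* Illustrative aside, not part of this changeset: a minimal standalone
   sketch of the byte layout that reset_trx_id is assumed to carry, going by
   the row_ins_alloc_sys_fields() hunk later in this diff (DB_TRX_ID zeroed,
   DB_ROLL_PTR carrying only the insert-flag bit, 1ULL << 55, in its most
   significant byte).  All names below are local to the sketch, not InnoDB
   declarations. */
#include <cstring>

constexpr unsigned TRX_ID_LEN   = 6;	/* on-disk size of DB_TRX_ID */
constexpr unsigned ROLL_PTR_LEN = 7;	/* on-disk size of DB_ROLL_PTR */

/* 6 zero bytes for DB_TRX_ID, then a 7-byte DB_ROLL_PTR whose top bit
   (the "insert" flag) is the only bit set: 0x80 00 00 00 00 00 00 */
constexpr unsigned char fresh_sys_fields[TRX_ID_LEN + ROLL_PTR_LEN] = {
	0, 0, 0, 0, 0, 0,		/* DB_TRX_ID = 0 */
	0x80, 0, 0, 0, 0, 0, 0		/* DB_ROLL_PTR = insert flag only */
};

/* Stamping both system columns of a record image is then a plain copy. */
inline void stamp_sys_fields(unsigned char* trx_id_field)
{
	std::memcpy(trx_id_field, fresh_sys_fields, sizeof fresh_sys_fields);
}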
dfield_set_data(dtuple_get_nth_field(ins_ctx.tuple, 3), - &sys_buf[DATA_TRX_ID_LEN], DATA_ROLL_PTR_LEN); + &reset_trx_id[DATA_TRX_ID_LEN], DATA_ROLL_PTR_LEN); ut_d(ins_ctx.aux_index_id = id); - const ulint space = table->space; + const ulint space = table->space_id; for (i = 0; i < fts_sort_pll_degree; i++) { if (psort_info[i].merge_file[id]->n_rec == 0) { @@ -1717,7 +1720,7 @@ row_fts_merge_insert( height = row_fts_build_sel_tree(sel_tree, (const mrec_t **) mrec, offsets, index); - start = (1 << height) - 1; + start = (1U << height) - 1; /* Fetch sorted records from sort buffer and insert them into corresponding FTS index auxiliary tables */ @@ -1800,7 +1803,7 @@ exit: error = ins_ctx.btr_bulk->finish(error); UT_DELETE(ins_ctx.btr_bulk); - trx_free_for_background(trx); + trx->free(); mem_heap_free(heap); diff --git a/storage/innobase/row/row0import.cc b/storage/innobase/row/row0import.cc index fa7db1e27b8..d3ca5579d17 100644 --- a/storage/innobase/row/row0import.cc +++ b/storage/innobase/row/row0import.cc @@ -34,12 +34,12 @@ Created 2012-02-08 by Sunny Bains. #include "dict0load.h" #include "ibuf0ibuf.h" #include "pars0pars.h" -#include "row0upd.h" #include "row0sel.h" #include "row0mysql.h" #include "srv0start.h" #include "row0quiesce.h" #include "fil0pagecompress.h" +#include "trx0undo.h" #ifdef HAVE_LZO #include "lzo/lzo1x.h" #endif @@ -357,11 +357,11 @@ class AbstractCallback public: /** Constructor @param trx covering transaction */ - AbstractCallback(trx_t* trx) + AbstractCallback(trx_t* trx, ulint space_id) : m_page_size(0, 0, false), m_trx(trx), - m_space(ULINT_UNDEFINED), + m_space(space_id), m_xdes(), m_xdes_page_no(ULINT_UNDEFINED), m_space_flags(ULINT_UNDEFINED) UNIV_NOTHROW { } @@ -411,14 +411,13 @@ public: Called for every page in the tablespace. If the page was not updated then its state must be set to BUF_PAGE_NOT_USED. For compressed tables the page descriptor memory will be at offset: - block->frame + UNIV_PAGE_SIZE; + block->frame + srv_page_size; @param block block read from file, note it is not from the buffer pool @retval DB_SUCCESS or error code. 
*/ virtual dberr_t operator()(buf_block_t* block) UNIV_NOTHROW = 0; - /** - @return the space id of the tablespace */ - virtual ulint get_space_id() const UNIV_NOTHROW = 0; + /** @return the tablespace identifier */ + ulint get_space_id() const { return m_space; } bool is_interrupted() const { return trx_is_interrupted(m_trx); } @@ -570,7 +569,7 @@ AbstractCallback::init( ib::error() << "Page size " << m_page_size.physical() << " of ibd file is not the same as the server page" - " size " << univ_page_size.physical(); + " size " << srv_page_size; return(DB_CORRUPTION); @@ -583,10 +582,11 @@ AbstractCallback::init( return(DB_CORRUPTION); } - ut_a(m_space == ULINT_UNDEFINED); - m_size = mach_read_from_4(page + FSP_SIZE); - m_space = mach_read_from_4(page + FSP_HEADER_OFFSET + FSP_SPACE_ID); + if (m_space == ULINT_UNDEFINED) { + m_space = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SPACE_ID + + page); + } return set_current_xdes(0, page); } @@ -615,19 +615,12 @@ struct FetchIndexRootPages : public AbstractCallback { @param table table definition in server .*/ FetchIndexRootPages(const dict_table_t* table, trx_t* trx) : - AbstractCallback(trx), + AbstractCallback(trx, ULINT_UNDEFINED), m_table(table) UNIV_NOTHROW { } /** Destructor */ virtual ~FetchIndexRootPages() UNIV_NOTHROW { } - /** - @retval the space id of the tablespace being iterated over */ - virtual ulint get_space_id() const UNIV_NOTHROW - { - return(m_space); - } - /** Called for each block as it is read from the file. @param block block to convert, it is not from the buffer pool. @retval DB_SUCCESS or error code. */ @@ -786,8 +779,23 @@ class PageConverter : public AbstractCallback { public: /** Constructor @param cfg config of table being imported. + @param space_id tablespace identifier @param trx transaction covering the import */ - PageConverter(row_import* cfg, trx_t* trx) UNIV_NOTHROW; + PageConverter(row_import* cfg, ulint space_id, trx_t* trx) + : + AbstractCallback(trx, space_id), + m_cfg(cfg), + m_index(cfg->m_indexes), + m_current_lsn(log_get_lsn()), + m_page_zip_ptr(0), + m_rec_iter(), + m_offsets_(), m_offsets(m_offsets_), + m_heap(0), + m_cluster_index(dict_table_get_first_index(cfg->m_table)) + { + ut_ad(m_current_lsn); + rec_offs_init(m_offsets_); + } virtual ~PageConverter() UNIV_NOTHROW { @@ -796,13 +804,6 @@ public: } } - /** - @retval the server space id of the tablespace being iterated over */ - virtual ulint get_space_id() const UNIV_NOTHROW - { - return(m_cfg->m_table->space); - } - /** Called for each block as it is read from the file. @param block block to convert, it is not from the buffer pool. @retval DB_SUCCESS or error code. */ @@ -862,17 +863,14 @@ private: /** Purge delete-marked records, only if it is possible to do so without re-organising the B+tree. - @param offsets current row offsets. @retval true if purged */ - bool purge(const rec_offs* offsets) UNIV_NOTHROW; + bool purge() UNIV_NOTHROW; /** Adjust the BLOB references and sys fields for the current record. - @param index the index being converted @param rec record to update @param offsets column offsets for the record @return DB_SUCCESS or error code. */ dberr_t adjust_cluster_record( - const dict_index_t* index, rec_t* rec, const rec_offs* offsets) UNIV_NOTHROW; @@ -1363,8 +1361,6 @@ row_import::set_root_by_name() UNIV_NOTHROW /* We've already checked that it exists. */ ut_a(index != 0); - /* Set the root page number and space id. 
*/ - index->space = m_table->space; index->page = cfg_index->m_page_no; } } @@ -1424,7 +1420,6 @@ row_import::set_root_by_heuristic() UNIV_NOTHROW cfg_index[i].m_srv_index = index; - index->space = m_table->space; index->page = cfg_index[i].m_page_no; ++i; @@ -1479,6 +1474,13 @@ IndexPurge::open() UNIV_NOTHROW btr_pcur_open_at_index_side( true, m_index, BTR_MODIFY_LEAF, &m_pcur, true, 0, &m_mtr); + btr_pcur_move_to_next_user_rec(&m_pcur, &m_mtr); + if (rec_is_metadata(btr_pcur_get_rec(&m_pcur), m_index)) { + ut_ad(btr_pcur_is_on_user_rec(&m_pcur)); + /* Skip the metadata pseudo-record. */ + } else { + btr_pcur_move_to_prev_on_page(&m_pcur); + } } /** @@ -1523,7 +1525,7 @@ IndexPurge::next() UNIV_NOTHROW ut_ad(m_pcur.latch_mode == BTR_MODIFY_LEAF); do { if (btr_pcur_is_after_last_on_page(&m_pcur)) { - if (btr_pcur_is_after_last_in_tree(&m_pcur, &m_mtr)) { + if (btr_pcur_is_after_last_in_tree(&m_pcur)) { return DB_END_OF_INDEX; } @@ -1625,28 +1627,6 @@ IndexPurge::purge() UNIV_NOTHROW btr_pcur_restore_position(BTR_MODIFY_LEAF, &m_pcur, &m_mtr); } -/** Constructor -@param cfg config of table being imported. -@param trx transaction covering the import */ -inline -PageConverter::PageConverter( - row_import* cfg, - trx_t* trx) - : - AbstractCallback(trx), - m_cfg(cfg), - m_index(cfg->m_indexes), - m_current_lsn(log_get_lsn()), - m_page_zip_ptr(0), - m_rec_iter(), - m_offsets_(), m_offsets(m_offsets_), - m_heap(0), - m_cluster_index(dict_table_get_first_index(cfg->m_table)) UNIV_NOTHROW -{ - ut_a(m_current_lsn > 0); - rec_offs_init(m_offsets_); -} - /** Adjust the BLOB reference for a single column that is externally stored @param rec record to update @param offsets column offsets for the record @@ -1679,7 +1659,7 @@ PageConverter::adjust_cluster_index_blob_column( return(DB_CORRUPTION); } - field += BTR_EXTERN_SPACE_ID - BTR_EXTERN_FIELD_REF_SIZE + len; + field += len - (BTR_EXTERN_FIELD_REF_SIZE - BTR_EXTERN_SPACE_ID); mach_write_to_4(field, get_space_id()); @@ -1750,11 +1730,8 @@ PageConverter::adjust_cluster_index_blob_ref( /** Purge delete-marked records, only if it is possible to do so without re-organising the B+tree. -@param offsets current row offsets. @return true if purge succeeded */ -inline -bool -PageConverter::purge(const rec_offs* offsets) UNIV_NOTHROW +inline bool PageConverter::purge() UNIV_NOTHROW { const dict_index_t* index = m_index->m_srv_index; @@ -1778,7 +1755,6 @@ PageConverter::purge(const rec_offs* offsets) UNIV_NOTHROW inline dberr_t PageConverter::adjust_cluster_record( - const dict_index_t* index, rec_t* rec, const rec_offs* offsets) UNIV_NOTHROW { @@ -1789,10 +1765,20 @@ PageConverter::adjust_cluster_record( /* Reset DB_TRX_ID and DB_ROLL_PTR. Normally, these fields are only written in conjunction with other changes to the record. */ - - row_upd_rec_sys_fields( - rec, m_page_zip_ptr, m_cluster_index, m_offsets, - m_trx, roll_ptr_t(1) << ROLL_PTR_INSERT_FLAG_POS); + ulint trx_id_pos = m_cluster_index->n_uniq + ? 
m_cluster_index->n_uniq : 1; + if (m_page_zip_ptr) { + page_zip_write_trx_id_and_roll_ptr( + m_page_zip_ptr, rec, m_offsets, trx_id_pos, + 0, roll_ptr_t(1) << ROLL_PTR_INSERT_FLAG_POS, + NULL); + } else { + ulint len; + byte* ptr = rec_get_nth_field( + rec, m_offsets, trx_id_pos, &len); + ut_ad(len == DATA_TRX_ID_LEN); + memcpy(ptr, reset_trx_id, sizeof reset_trx_id); + } } return(err); @@ -1825,14 +1811,14 @@ PageConverter::update_records( if (deleted || clust_index) { m_offsets = rec_get_offsets( - rec, m_index->m_srv_index, m_offsets, true, + rec, m_index->m_srv_index, m_offsets, + m_index->m_srv_index->n_core_fields, ULINT_UNDEFINED, &m_heap); } if (clust_index) { - dberr_t err = adjust_cluster_record( - m_index->m_srv_index, rec, m_offsets); + dberr_t err = adjust_cluster_record(rec, m_offsets); if (err != DB_SUCCESS) { return(err); @@ -1846,7 +1832,7 @@ PageConverter::update_records( /* A successful purge will move the cursor to the next record. */ - if (!purge(m_offsets)) { + if (!purge()) { m_rec_iter.next(); } @@ -1924,8 +1910,24 @@ PageConverter::update_index_page( page, m_page_zip_ptr, m_index->m_srv_index->id, 0); if (dict_index_is_clust(m_index->m_srv_index)) { - if (block->page.id.page_no() == m_index->m_srv_index->page) { + dict_index_t* index = const_cast<dict_index_t*>( + m_index->m_srv_index); + if (block->page.id.page_no() == index->page) { /* Preserve the PAGE_ROOT_AUTO_INC. */ + if (index->table->supports_instant()) { + if (btr_cur_instant_root_init(index, page)) { + return(DB_CORRUPTION); + } + + /* Provisionally set all instantly + added columns to be DEFAULT NULL. */ + for (unsigned i = index->n_core_fields; + i < index->n_fields; i++) { + dict_col_t* col = index->fields[i].col; + col->def_val.len = UNIV_SQL_NULL; + col->def_val.data = NULL; + } + } } else { /* Clear PAGE_MAX_TRX_ID so that it can be used for other purposes in the future. IMPORT @@ -2028,6 +2030,8 @@ PageConverter::update_page( return(DB_CORRUPTION); } + /* fall through */ + case FIL_PAGE_TYPE_INSTANT: /* This is on every page in the tablespace. */ mach_write_to_4( get_frame(block) @@ -2142,12 +2146,13 @@ row_import_discard_changes( index = UT_LIST_GET_NEXT(indexes, index)) { index->page = FIL_NULL; - index->space = FIL_NULL; } table->file_unreadable = true; - - fil_close_tablespace(trx, table->space); + if (table->space) { + fil_close_tablespace(trx, table->space_id); + table->space = NULL; + } } /*****************************************************************//** @@ -2174,7 +2179,7 @@ row_import_cleanup( row_mysql_unlock_data_dictionary(trx); - trx_free_for_mysql(trx); + trx->free(); prebuilt->trx->op_info = ""; @@ -2219,8 +2224,6 @@ static MY_ATTRIBUTE((nonnull, warn_unused_result)) dberr_t row_import_adjust_root_pages_of_secondary_indexes( /*==============================================*/ - row_prebuilt_t* prebuilt, /*!< in/out: prebuilt from - handler */ trx_t* trx, /*!< in: transaction used for the import */ dict_table_t* table, /*!< in: table the indexes @@ -2244,7 +2247,6 @@ row_import_adjust_root_pages_of_secondary_indexes( ut_a(!dict_index_is_clust(index)); if (!(index->type & DICT_CORRUPT) - && index->space != FIL_NULL && index->page != FIL_NULL) { /* Update the Btree segment headers for index node and @@ -2359,7 +2361,14 @@ row_import_set_sys_max_row_id( rec = btr_pcur_get_rec(&pcur); /* Check for empty table. */ - if (!page_rec_is_infimum(rec)) { + if (page_rec_is_infimum(rec)) { + /* The table is empty. 
*/ + err = DB_SUCCESS; + } else if (rec_is_metadata(rec, index)) { + /* The clustered index contains the metadata record only, + that is, the table is empty. */ + err = DB_SUCCESS; + } else { ulint len; const byte* field; mem_heap_t* heap = NULL; @@ -2369,7 +2378,8 @@ row_import_set_sys_max_row_id( rec_offs_init(offsets_); offsets = rec_get_offsets( - rec, index, offsets_, true, ULINT_UNDEFINED, &heap); + rec, index, offsets_, index->n_core_fields, + ULINT_UNDEFINED, &heap); field = rec_get_nth_field( rec, offsets, @@ -2386,9 +2396,6 @@ row_import_set_sys_max_row_id( if (heap != NULL) { mem_heap_free(heap); } - } else { - /* The table is empty. */ - err = DB_SUCCESS; } btr_pcur_close(&pcur); @@ -2475,8 +2482,7 @@ row_import_cfg_read_index_fields( /*=============================*/ FILE* file, /*!< in: file to write to */ THD* thd, /*!< in/out: session */ - row_index_t* index, /*!< Index being read in */ - row_import* cfg) /*!< in/out: meta-data read */ + row_index_t* index) /*!< Index being read in */ { byte row[sizeof(ib_uint32_t) * 3]; ulint n_fields = index->m_n_fields; @@ -2696,8 +2702,7 @@ row_import_read_index_data( return(err); } - err = row_import_cfg_read_index_fields( - file, thd, cfg_index, cfg); + err = row_import_cfg_read_index_fields(file, thd, cfg_index); if (err != DB_SUCCESS) { return(err); @@ -3027,14 +3032,14 @@ row_import_read_v1( const ulint logical_page_size = mach_read_from_4(ptr); ptr += sizeof(ib_uint32_t); - if (logical_page_size != univ_page_size.logical()) { + if (logical_page_size != srv_page_size) { ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_TABLE_SCHEMA_MISMATCH, "Tablespace to be imported has a different" " page size than this server. Server page size" - " is " ULINTPF ", whereas tablespace page size" + " is %lu, whereas tablespace page size" " is " ULINTPF, - univ_page_size.logical(), + srv_page_size, logical_page_size); return(DB_ERROR); @@ -3073,7 +3078,6 @@ static MY_ATTRIBUTE((nonnull, warn_unused_result)) dberr_t row_import_read_meta_data( /*======================*/ - dict_table_t* table, /*!< in: table */ FILE* file, /*!< in: File to read from */ THD* thd, /*!< in: session */ row_import& cfg) /*!< out: contents of the .cfg file */ @@ -3147,7 +3151,7 @@ row_import_read_cfg( cfg.m_missing = false; - err = row_import_read_meta_data(table, file, thd, cfg); + err = row_import_read_meta_data(file, thd, cfg); fclose(file); } @@ -3166,6 +3170,8 @@ row_import_update_index_root(trx_t* trx, dict_table_t* table, bool reset) que_t* graph = 0; dberr_t err = DB_SUCCESS; + ut_ad(reset || table->space->id == table->space_id); + static const char sql[] = { "PROCEDURE UPDATE_INDEX_ROOT() IS\n" "BEGIN\n" @@ -3201,7 +3207,7 @@ row_import_update_index_root(trx_t* trx, dict_table_t* table, bool reset) mach_write_to_4( reinterpret_cast<byte*>(&space), - reset ? FIL_NULL : index->space); + reset ? FIL_NULL : index->table->space_id); mach_write_to_8( reinterpret_cast<byte*>(&index_id), @@ -3300,22 +3306,13 @@ row_import_set_discarded( return(FALSE); } -/*****************************************************************//** -Update the DICT_TF2_DISCARDED flag in SYS_TABLES. -@return DB_SUCCESS or error code. 
*/ -dberr_t -row_import_update_discarded_flag( -/*=============================*/ - trx_t* trx, /*!< in/out: transaction that - covers the update */ - table_id_t table_id, /*!< in: Table for which we want - to set the root table->flags2 */ - bool discarded, /*!< in: set MIX_LEN column bit - to discarded, if true */ - bool dict_locked) /*!< in: set to true if the - caller already owns the - dict_sys_t:: mutex. */ - +/** Update the DICT_TF2_DISCARDED flag in SYS_TABLES.MIX_LEN. +@param[in,out] trx dictionary transaction +@param[in] table_id table identifier +@param[in] discarded whether to set or clear the flag +@return DB_SUCCESS or error code */ +dberr_t row_import_update_discarded_flag(trx_t* trx, table_id_t table_id, + bool discarded) { pars_info_t* info; discard_t discard; @@ -3354,7 +3351,7 @@ row_import_update_discarded_flag( pars_info_bind_function( info, "my_func", row_import_set_discarded, &discard); - dberr_t err = que_eval_sql(info, sql, !dict_locked, trx); + dberr_t err = que_eval_sql(info, sql, false, trx); ut_a(discard.n_recs == 1); ut_a(discard.flags2 != ULINT32_UNDEFINED); @@ -3704,7 +3701,7 @@ not_encrypted: } } - if (page_compressed && punch_hole && srv_use_trim) { + if (page_compressed && punch_hole) { err = fil_import_compress_fwrite( iter, write_request, offset, writeptr, n_bytes, !updated); @@ -3802,8 +3799,8 @@ fil_tablespace_iterate( We allocate an extra page in case it is a compressed table. One page is to ensure alignement. */ - void* page_ptr = ut_malloc_nokey(3 * UNIV_PAGE_SIZE); - byte* page = static_cast<byte*>(ut_align(page_ptr, UNIV_PAGE_SIZE)); + void* page_ptr = ut_malloc_nokey(3U << srv_page_size_shift); + byte* page = static_cast<byte*>(ut_align(page_ptr, srv_page_size)); buf_block_t* block = reinterpret_cast<buf_block_t*> (ut_zalloc_nokey(sizeof *block)); @@ -3819,7 +3816,7 @@ fil_tablespace_iterate( request.disable_partial_io_warnings(); err = os_file_read_no_error_handling(request, file, page, 0, - UNIV_PAGE_SIZE, 0); + srv_page_size, 0); if (err == DB_SUCCESS) { err = callback.init(file_size, block); @@ -3858,23 +3855,24 @@ fil_tablespace_iterate( /* Add an extra page for compressed page scratch area. */ void* io_buffer = ut_malloc_nokey( - (2 + iter.n_io_buffers) * UNIV_PAGE_SIZE); + (2 + iter.n_io_buffers) << srv_page_size_shift); iter.io_buffer = static_cast<byte*>( - ut_align(io_buffer, UNIV_PAGE_SIZE)); + ut_align(io_buffer, srv_page_size)); void* crypt_io_buffer = NULL; if (iter.crypt_data) { crypt_io_buffer = ut_malloc_nokey( - (2 + iter.n_io_buffers) * UNIV_PAGE_SIZE); + (2 + iter.n_io_buffers) + << srv_page_size_shift); iter.crypt_io_buffer = static_cast<byte*>( - ut_align(crypt_io_buffer, UNIV_PAGE_SIZE)); + ut_align(crypt_io_buffer, srv_page_size)); } if (block->page.zip.ssize) { ut_ad(iter.n_io_buffers == 1); block->frame = iter.io_buffer; - block->page.zip.data = block->frame + UNIV_PAGE_SIZE; + block->page.zip.data = block->frame + srv_page_size; } err = fil_iterate(iter, block, callback); @@ -3926,17 +3924,18 @@ row_import_for_mysql( /* The caller assured that this is not read_only_mode and that no temorary tablespace is being imported. 
*/ ut_ad(!srv_read_only_mode); - ut_ad(!dict_table_is_temporary(table)); + ut_ad(!table->is_temporary()); - ut_a(table->space); + ut_ad(table->space_id); + ut_ad(table->space_id < SRV_LOG_SPACE_FIRST_ID); ut_ad(prebuilt->trx); - ut_a(!table->is_readable()); + ut_ad(!table->is_readable()); - ibuf_delete_for_discarded_space(table->space); + ibuf_delete_for_discarded_space(table->space_id); trx_start_if_not_started(prebuilt->trx, true); - trx = trx_allocate_for_mysql(); + trx = trx_create(); /* So that the table is not DROPped during recovery. */ trx_set_dict_operation(trx, TRX_DICT_OP_INDEX); @@ -3954,14 +3953,13 @@ row_import_for_mysql( /* Assign an undo segment for the transaction, so that the transaction will be recovered after a crash. */ - mutex_enter(&trx->undo_mutex); - /* TODO: Do not write any undo log for the IMPORT cleanup. */ - trx_undo_t** pundo = &trx->rsegs.m_redo.update_undo; - err = trx_undo_assign_undo(trx, trx->rsegs.m_redo.rseg, pundo, - TRX_UNDO_UPDATE); - - mutex_exit(&trx->undo_mutex); + { + mtr_t mtr; + mtr.start(); + trx_undo_assign(trx, &err, &mtr); + mtr.commit(); + } DBUG_EXECUTE_IF("ib_import_undo_assign_failure", err = DB_TOO_MANY_CONCURRENT_TRXS;); @@ -3970,7 +3968,7 @@ row_import_for_mysql( return(row_import_cleanup(prebuilt, trx, err)); - } else if (trx->rsegs.m_redo.update_undo == 0) { + } else if (trx->rsegs.m_redo.undo == 0) { err = DB_TOO_MANY_CONCURRENT_TRXS; return(row_import_cleanup(prebuilt, trx, err)); @@ -4058,7 +4056,7 @@ row_import_for_mysql( /* Iterate over all the pages and do the sanity checking and the conversion required to import the tablespace. */ - PageConverter converter(&cfg, trx); + PageConverter converter(&cfg, table->space_id, trx); /* Set the IO buffer size in pages. */ @@ -4134,18 +4132,19 @@ row_import_for_mysql( have an x-lock on dict_operation_lock and dict_sys->mutex. The tablespace is initially opened as a temporary one, because we will not be writing any redo log for it before we have invoked - fil_space_set_imported() to declare it a persistent tablespace. */ + fil_space_t::set_imported() to declare it a persistent tablespace. */ ulint fsp_flags = dict_tf_to_fsp_flags(table->flags); - err = fil_ibd_open( - true, true, FIL_TYPE_IMPORT, table->space, - fsp_flags, table->name.m_name, filepath); + table->space = fil_ibd_open( + true, true, FIL_TYPE_IMPORT, table->space_id, + fsp_flags, table->name, filepath, &err); + ut_ad((table->space == NULL) == (err != DB_SUCCESS)); DBUG_EXECUTE_IF("ib_import_open_tablespace_failure", - err = DB_TABLESPACE_NOT_FOUND;); + err = DB_TABLESPACE_NOT_FOUND; table->space = NULL;); - if (err != DB_SUCCESS) { + if (!table->space) { row_mysql_unlock_data_dictionary(trx); ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_ERROR, @@ -4212,7 +4211,7 @@ row_import_for_mysql( during the page conversion phase. 
*/ err = row_import_adjust_root_pages_of_secondary_indexes( - prebuilt, trx, table, cfg); + trx, table, cfg); DBUG_EXECUTE_IF("ib_import_sec_root_adjust_failure", err = DB_CORRUPTION;); @@ -4241,7 +4240,7 @@ row_import_for_mysql( { FlushObserver observer(prebuilt->table->space, trx, NULL); - buf_LRU_flush_or_remove_pages(prebuilt->table->space, + buf_LRU_flush_or_remove_pages(prebuilt->table->space_id, &observer); if (observer.is_interrupted()) { @@ -4252,7 +4251,7 @@ row_import_for_mysql( } ib::info() << "Phase IV - Flush complete"; - fil_space_set_imported(prebuilt->table->space); + prebuilt->table->space->set_imported(); /* The dictionary latches will be released in in row_import_cleanup() after the transaction commit, for both success and error. */ @@ -4266,8 +4265,7 @@ row_import_for_mysql( return(row_import_error(prebuilt, trx, err)); } - /* Update the table's discarded flag, unset it. */ - err = row_import_update_discarded_flag(trx, table->id, false, true); + err = row_import_update_discarded_flag(trx, table->id, false); if (err != DB_SUCCESS) { return(row_import_error(prebuilt, trx, err)); diff --git a/storage/innobase/row/row0ins.cc b/storage/innobase/row/row0ins.cc index 4dc9c66a536..512d373bbfa 100644 --- a/storage/innobase/row/row0ins.cc +++ b/storage/innobase/row/row0ins.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2016, 2020, MariaDB Corporation. +Copyright (c) 2016, 2021, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -61,40 +61,6 @@ check. If you make a change in this module make sure that no codepath is introduced where a call to log_free_check() is bypassed. */ -/*********************************************************************//** -Creates an insert node struct. -@return own: insert node struct */ -ins_node_t* -ins_node_create( -/*============*/ - ulint ins_type, /*!< in: INS_VALUES, ... */ - dict_table_t* table, /*!< in: table where to insert */ - mem_heap_t* heap) /*!< in: mem heap where created */ -{ - ins_node_t* node; - - node = new (static_cast<ins_node_t*>( - mem_heap_alloc(heap, sizeof(ins_node_t)))) ins_node_t; - - node->common.type = QUE_NODE_INSERT; - - node->ins_type = ins_type; - - node->state = INS_NODE_SET_IX_LOCK; - node->table = table; - node->index = NULL; - - node->select = NULL; - - node->trx_id = 0; - - node->entry_sys_heap = mem_heap_create(128); - - node->magic_n = INS_NODE_MAGIC_N; - - return(node); -} - /** Create an row template for each index of a table. */ static void ins_node_create_entry_list(ins_node_t *node) { @@ -138,6 +104,8 @@ row_ins_alloc_sys_fields( memset(node->sys_buf, 0, sizeof node->sys_buf); /* Assign DB_ROLL_PTR to 1 << ROLL_PTR_INSERT_FLAG_POS */ node->sys_buf[DATA_ROW_ID_LEN + DATA_TRX_ID_LEN] = 0x80; + ut_ad(!memcmp(node->sys_buf + DATA_ROW_ID_LEN, reset_trx_id, + sizeof reset_trx_id)); /* 1. 
Populate row-id */ col = dict_table_get_sys_col(table, DATA_ROW_ID); @@ -346,7 +314,8 @@ row_ins_clust_index_entry_by_modify( } if (mode != BTR_MODIFY_TREE) { - ut_ad((mode & ~BTR_ALREADY_S_LATCHED) == BTR_MODIFY_LEAF); + ut_ad((mode & ulint(~BTR_ALREADY_S_LATCHED)) + == BTR_MODIFY_LEAF); /* Try optimistic updating of the record, keeping changes within the page */ @@ -412,7 +381,7 @@ row_ins_cascade_ancestor_updates_table( upd_node = static_cast<upd_node_t*>(parent); - if (upd_node->table == table && upd_node->is_delete == FALSE) { + if (upd_node->table == table && !upd_node->is_delete) { return(TRUE); } @@ -538,6 +507,8 @@ row_ins_cascade_calc_update_vec( ufield->exp = NULL; ufield->new_val = parent_ufield->new_val; + dfield_get_type(&ufield->new_val)->prtype |= + col->prtype & DATA_VERSIONED; ufield_len = dfield_get_len(&ufield->new_val); /* Clear the "external storage" flag */ @@ -744,8 +715,6 @@ row_ins_foreign_trx_print( heap_size = mem_heap_get_size(trx->lock.lock_heap); lock_mutex_exit(); - trx_sys_mutex_enter(); - mutex_enter(&dict_foreign_err_mutex); rewind(dict_foreign_err_file); ut_print_timestamp(dict_foreign_err_file); @@ -754,8 +723,6 @@ row_ins_foreign_trx_print( trx_print_low(dict_foreign_err_file, trx, 600, n_rec_locks, n_trx_locks, heap_size); - trx_sys_mutex_exit(); - ut_ad(mutex_own(&dict_foreign_err_mutex)); } @@ -894,8 +861,7 @@ row_ins_invalidate_query_cache( const char* name) /*!< in: table name prefixed with database name and a '/' character */ { - ulint len = strlen(name) + 1; - innobase_invalidate_query_cache(thr_get_trx(thr), name, len); + innobase_invalidate_query_cache(thr_get_trx(thr), name); } @@ -920,7 +886,7 @@ row_ins_foreign_fill_virtual( rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; rec_offs_init(offsets_); const rec_offs* offsets = - rec_get_offsets(rec, index, offsets_, true, + rec_get_offsets(rec, index, offsets_, index->n_core_fields, ULINT_UNDEFINED, &cascade->heap); TABLE* mysql_table= NULL; upd_t* update = cascade->update; @@ -1095,8 +1061,13 @@ row_ins_foreign_check_on_constraint( cascade = node->cascade_node; cascade->table = table; cascade->foreign = foreign; - if (!(cascade->is_delete = node->is_delete - && (foreign->type & DICT_FOREIGN_ON_DELETE_CASCADE))) { + + if (node->is_delete + && (foreign->type & DICT_FOREIGN_ON_DELETE_CASCADE)) { + cascade->is_delete = PLAIN_DELETE; + } else { + cascade->is_delete = NO_DELETE; + if (foreign->n_fields > cascade->update_n_fields) { /* We have to make the update vector longer */ @@ -1241,10 +1212,8 @@ row_ins_foreign_check_on_constraint( update->info_bits = 0; update->n_fields = foreign->n_fields; -#ifdef HAVE_valgrind_or_MSAN MEM_UNDEFINED(update->fields, update->n_fields * sizeof *update->fields); -#endif /* HAVE_valgrind_or_MSAN */ for (ulint i = 0; i < foreign->n_fields; i++) { upd_field_t* ufield = &update->fields[i]; @@ -1277,8 +1246,9 @@ row_ins_foreign_check_on_constraint( goto nonstandard_exit_func; } } - } else if (table->fts && cascade->is_delete + } else if (table->fts && cascade->is_delete == PLAIN_DELETE && foreign->affects_fulltext()) { + /* DICT_FOREIGN_ON_DELETE_CASCADE case */ fts_trx_add_op(trx, table, doc_id, FTS_DELETE, NULL); } @@ -1332,6 +1302,15 @@ row_ins_foreign_check_on_constraint( } } + if (table->versioned() && cascade->is_delete != PLAIN_DELETE + && cascade->update->affects_versioned()) { + ut_ad(!cascade->historical_heap); + cascade->historical_heap = mem_heap_create(srv_page_size); + cascade->historical_row = row_build( + ROW_COPY_DATA, clust_index, clust_rec, NULL, table, + 
NULL, NULL, NULL, cascade->historical_heap); + } + /* Store pcur position and initialize or store the cascade node pcur stored position */ @@ -1362,9 +1341,7 @@ row_ins_foreign_check_on_constraint( /* Release the data dictionary latch for a while, so that we do not starve other threads from doing CREATE TABLE etc. if we have a huge - cascaded operation running. The counter n_foreign_key_checks_running - will prevent other users from dropping or ALTERing the table when we - release the latch. */ + cascaded operation running. */ row_mysql_unfreeze_data_dictionary(thr_get_trx(thr)); @@ -1519,8 +1496,14 @@ row_ins_check_foreign_constraint( /* If any of the foreign key fields in entry is SQL NULL, we suppress the foreign key check: this is compatible with Oracle, for example */ - for (ulint i = 0; i < foreign->n_fields; i++) { - if (dfield_is_null(dtuple_get_nth_field(entry, i))) { + for (ulint i = 0; i < entry->n_fields; i++) { + dfield_t* field = dtuple_get_nth_field(entry, i); + if (i < foreign->n_fields && dfield_is_null(field)) { + goto exit_func; + } + /* System Versioning: if row_end != Inf, we + suppress the foreign key check */ + if (field->type.vers_sys_end() && field->vers_history_row()) { goto exit_func; } } @@ -1528,7 +1511,8 @@ row_ins_check_foreign_constraint( if (que_node_get_type(thr->run_node) == QUE_NODE_UPDATE) { upd_node = static_cast<upd_node_t*>(thr->run_node); - if (!(upd_node->is_delete) && upd_node->foreign == foreign) { + if (upd_node->is_delete != PLAIN_DELETE + && upd_node->foreign == foreign) { /* If a cascaded update is done as defined by a foreign key constraint, do not check that constraint for the child row. In ON UPDATE CASCADE @@ -1549,6 +1533,19 @@ row_ins_check_foreign_constraint( } } + if (que_node_get_type(thr->run_node) == QUE_NODE_INSERT) { + ins_node_t* insert_node = + static_cast<ins_node_t*>(thr->run_node); + dict_table_t* table = insert_node->index->table; + if (table->versioned()) { + dfield_t* row_end = dtuple_get_nth_field( + insert_node->row, table->vers_end); + if (row_end->vers_history_row()) { + goto exit_func; + } + } + } + if (check_ref) { check_table = foreign->referenced_table; check_index = foreign->referenced_index; @@ -1559,8 +1556,7 @@ row_ins_check_foreign_constraint( if (check_table == NULL || !check_table->is_readable() - || check_index == NULL - || fil_space_get(check_table->space)->is_being_truncated) { + || check_index == NULL) { FILE* ef = dict_foreign_err_file; std::string fk_str; @@ -1644,7 +1640,8 @@ row_ins_check_foreign_constraint( continue; } - offsets = rec_get_offsets(rec, check_index, offsets, true, + offsets = rec_get_offsets(rec, check_index, offsets, + check_index->n_core_fields, ULINT_UNDEFINED, &heap); if (page_rec_is_supremum(rec)) { @@ -1669,6 +1666,23 @@ row_ins_check_foreign_constraint( cmp = cmp_dtuple_rec(entry, rec, offsets); if (cmp == 0) { + if (check_table->versioned()) { + bool history_row = false; + + if (check_index->is_primary()) { + history_row = check_index-> + vers_history_row(rec, offsets); + } else if (check_index-> + vers_history_row(rec, history_row)) + { + break; + } + + if (history_row) { + continue; + } + } + if (rec_get_deleted_flag(rec, rec_offs_comp(offsets))) { /* In delete-marked records, DB_TRX_ID must @@ -1833,9 +1847,7 @@ do_possible_lock_wait: thr->lock_state = QUE_THR_LOCK_ROW; - /* To avoid check_table being dropped, increment counter */ - my_atomic_addlint( - &check_table->n_foreign_key_checks_running, 1); + check_table->inc_fk_checks(); lock_wait_suspend_thread(thr); @@ -1849,8 
+1861,7 @@ do_possible_lock_wait: err = DB_LOCK_WAIT; } - my_atomic_addlint(&check_table->n_foreign_key_checks_running, - -1); + check_table->dec_fk_checks(); } exit_func: @@ -1973,9 +1984,7 @@ row_ins_check_foreign_constraints( } if (referenced_table) { - my_atomic_addlint( - &foreign->foreign_table - ->n_foreign_key_checks_running, 1); + foreign->foreign_table->inc_fk_checks(); } /* NOTE that if the thread ends up waiting for a lock @@ -1987,9 +1996,7 @@ row_ins_check_foreign_constraints( TRUE, foreign, table, ref_tuple, thr); if (referenced_table) { - my_atomic_addlint( - &foreign->foreign_table - ->n_foreign_key_checks_running, -1); + foreign->foreign_table->dec_fk_checks(); } if (got_s_lock) { @@ -2132,7 +2139,8 @@ row_ins_scan_sec_index_for_duplicate( continue; } - offsets = rec_get_offsets(rec, index, offsets, true, + offsets = rec_get_offsets(rec, index, offsets, + index->n_core_fields, ULINT_UNDEFINED, &offsets_heap); if (flags & BTR_NO_LOCKING_FLAG) { @@ -2234,8 +2242,14 @@ row_ins_duplicate_online( return(DB_SUCCESS); } - if (fields == n_uniq + 2) { - /* rec is an exact match of entry. */ + ulint trx_id_len; + + if (fields == n_uniq + 2 + && memcmp(rec_get_nth_field(rec, offsets, n_uniq, &trx_id_len), + reset_trx_id, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)) { + ut_ad(trx_id_len == DATA_TRX_ID_LEN); + /* rec is an exact match of entry, and DB_TRX_ID belongs + to a transaction that started after our ALTER TABLE. */ return(DB_SUCCESS_LOCKED_REC); } @@ -2260,8 +2274,11 @@ row_ins_duplicate_error_in_clust_online( dberr_t err = DB_SUCCESS; const rec_t* rec = btr_cur_get_rec(cursor); + ut_ad(!cursor->index->is_instant()); + if (cursor->low_match >= n_uniq && !page_rec_is_infimum(rec)) { - *offsets = rec_get_offsets(rec, cursor->index, *offsets, true, + *offsets = rec_get_offsets(rec, cursor->index, *offsets, + cursor->index->n_fields, ULINT_UNDEFINED, heap); err = row_ins_duplicate_online(n_uniq, entry, rec, *offsets); if (err != DB_SUCCESS) { @@ -2272,7 +2289,8 @@ row_ins_duplicate_error_in_clust_online( rec = page_rec_get_next_const(btr_cur_get_rec(cursor)); if (cursor->up_match >= n_uniq && !page_rec_is_supremum(rec)) { - *offsets = rec_get_offsets(rec, cursor->index, *offsets, true, + *offsets = rec_get_offsets(rec, cursor->index, *offsets, + cursor->index->n_fields, ULINT_UNDEFINED, heap); err = row_ins_duplicate_online(n_uniq, entry, rec, *offsets); } @@ -2328,7 +2346,7 @@ row_ins_duplicate_error_in_clust( if (!page_rec_is_infimum(rec)) { offsets = rec_get_offsets(rec, cursor->index, offsets, - true, + cursor->index->n_core_fields, ULINT_UNDEFINED, &heap); /* We set a lock on the possible duplicate: this @@ -2371,6 +2389,18 @@ row_ins_duplicate_error_in_clust( duplicate: trx->error_info = cursor->index; err = DB_DUPLICATE_KEY; + if (cursor->index->table->versioned() + && entry->vers_history_row()) + { + ulint trx_id_len; + byte *trx_id = rec_get_nth_field( + rec, offsets, n_unique, + &trx_id_len); + ut_ad(trx_id_len == DATA_TRX_ID_LEN); + if (trx->id == trx_read_trx_id(trx_id)) { + err = DB_FOREIGN_DUPLICATE_KEY; + } + } goto func_exit; } } @@ -2382,7 +2412,7 @@ duplicate: if (!page_rec_is_supremum(rec)) { offsets = rec_get_offsets(rec, cursor->index, offsets, - true, + cursor->index->n_core_fields, ULINT_UNDEFINED, &heap); if (trx->duplicates) { @@ -2493,13 +2523,13 @@ row_ins_index_entry_big_rec( if (index->table->is_temporary()) { mtr.set_log_mode(MTR_LOG_NO_REDO); } else { - mtr.set_named_space(index->space); + index->set_modified(mtr); } btr_pcur_open(index, entry, PAGE_CUR_LE, 
BTR_MODIFY_TREE, &pcur, &mtr); rec = btr_pcur_get_rec(&pcur); - offsets = rec_get_offsets(rec, index, offsets, true, + offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields, ULINT_UNDEFINED, heap); DEBUG_SYNC_C_IF_THD(thd, "before_row_ins_extern"); @@ -2564,7 +2594,7 @@ row_ins_clust_index_entry_low( mtr_start(&mtr); - if (dict_table_is_temporary(index->table)) { + if (index->table->is_temporary()) { /* Disable REDO logging as the lifetime of temp-tables is limited to server or connection lifetime and so REDO information is not needed on restart for recovery. @@ -2573,14 +2603,15 @@ row_ins_clust_index_entry_low( ut_ad(flags & BTR_NO_LOCKING_FLAG); ut_ad(!dict_index_is_online_ddl(index)); ut_ad(!index->table->persistent_autoinc); + ut_ad(!index->is_instant()); mtr.set_log_mode(MTR_LOG_NO_REDO); } else { - mtr.set_named_space(index->space); + index->set_modified(mtr); if (mode == BTR_MODIFY_LEAF && dict_index_is_online_ddl(index)) { mode = BTR_MODIFY_LEAF_ALREADY_S_LATCHED; - mtr_s_lock(dict_index_get_lock(index), &mtr); + mtr_s_lock_index(index, &mtr); } if (unsigned ai = index->table->persistent_autoinc) { @@ -2624,6 +2655,39 @@ row_ins_clust_index_entry_low( } #endif /* UNIV_DEBUG */ + if (UNIV_UNLIKELY(entry->info_bits != 0)) { + ut_ad(entry->info_bits == REC_INFO_METADATA); + ut_ad(flags == BTR_NO_LOCKING_FLAG); + ut_ad(index->is_instant()); + ut_ad(!dict_index_is_online_ddl(index)); + + const rec_t* rec = btr_cur_get_rec(cursor); + + switch (rec_get_info_bits(rec, page_rec_is_comp(rec)) + & (REC_INFO_MIN_REC_FLAG | REC_INFO_DELETED_FLAG)) { + case REC_INFO_MIN_REC_FLAG: + thr_get_trx(thr)->error_info = index; + err = DB_DUPLICATE_KEY; + goto err_exit; + case REC_INFO_MIN_REC_FLAG | REC_INFO_DELETED_FLAG: + /* The metadata record never carries the delete-mark + in MariaDB Server 10.3. + If a table loses its 'instantness', it happens + by the rollback of this first-time insert, or + by a call to btr_page_empty() on the root page + when the table becomes empty. 
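The switch above classifies the record found at the insert position by its info bits when the tuple being inserted is the hidden metadata record of an instant ALTER TABLE. A stand-alone sketch of that dispatch, with illustrative flag values rather than the ones from rem0rec.h, is:

#include <cstdint>

enum metadata_insert_outcome {
	PROCEED_WITH_INSERT, REPORT_DUPLICATE, REPORT_CORRUPTION };

/* Illustrative flag values; the real constants live in rem0rec.h. */
static const uint8_t MIN_REC_FLAG = 0x10;
static const uint8_t DELETED_FLAG = 0x20;

static metadata_insert_outcome classify_metadata_insert(uint8_t rec_info_bits)
{
	switch (rec_info_bits & (MIN_REC_FLAG | DELETED_FLAG)) {
	case MIN_REC_FLAG:
		/* A metadata record already exists: duplicate. */
		return REPORT_DUPLICATE;
	case MIN_REC_FLAG | DELETED_FLAG:
		/* A delete-marked metadata record must not exist. */
		return REPORT_CORRUPTION;
	default:
		/* No metadata record was found: go ahead with the insert. */
		return PROCEED_WITH_INSERT;
	}
}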
*/ + err = DB_CORRUPTION; + goto err_exit; + default: + ut_ad(!row_ins_must_modify_rec(cursor)); + goto do_insert; + } + } + + if (rec_is_metadata(btr_cur_get_rec(cursor), index)) { + goto do_insert; + } + if (n_uniq && (cursor->up_match >= n_uniq || cursor->low_match >= n_uniq)) { @@ -2681,10 +2745,12 @@ err_exit: mtr_commit(&mtr); mem_heap_free(entry_heap); } else { + if (index->is_instant()) entry->trim(*index); +do_insert: rec_t* insert_rec; if (mode != BTR_MODIFY_TREE) { - ut_ad((mode & ~BTR_ALREADY_S_LATCHED) + ut_ad((mode & ulint(~BTR_ALREADY_S_LATCHED)) == BTR_MODIFY_LEAF); err = btr_cur_optimistic_insert( flags, cursor, &offsets, &offsets_heap, @@ -2765,12 +2831,12 @@ row_ins_sec_mtr_start_and_check_if_aborted( ulint search_mode) { ut_ad(!dict_index_is_clust(index)); - ut_ad(mtr->is_named_space(index->space)); + ut_ad(mtr->is_named_space(index->table->space)); const mtr_log_t log_mode = mtr->get_log_mode(); - mtr_start(mtr); - mtr->set_named_space(index->space); + mtr->start(); + index->set_modified(*mtr); mtr->set_log_mode(log_mode); if (!check) { @@ -2778,9 +2844,9 @@ row_ins_sec_mtr_start_and_check_if_aborted( } if (search_mode & BTR_ALREADY_S_LATCHED) { - mtr_s_lock(dict_index_get_lock(index), mtr); + mtr_s_lock_index(index, mtr); } else { - mtr_sx_lock(dict_index_get_lock(index), mtr); + mtr_sx_lock_index(index, mtr); } switch (index->online_status) { @@ -2849,7 +2915,7 @@ row_ins_sec_index_entry_low( ut_ad(flags & BTR_NO_LOCKING_FLAG); mtr.set_log_mode(MTR_LOG_NO_REDO); } else { - mtr.set_named_space(index->space); + index->set_modified(mtr); if (!dict_index_is_spatial(index)) { search_mode |= BTR_INSERT; } @@ -2866,9 +2932,9 @@ row_ins_sec_index_entry_low( DEBUG_SYNC_C("row_ins_sec_index_enter"); if (mode == BTR_MODIFY_LEAF) { search_mode |= BTR_ALREADY_S_LATCHED; - mtr_s_lock(dict_index_get_lock(index), &mtr); + mtr_s_lock_index(index, &mtr); } else { - mtr_sx_lock(dict_index_get_lock(index), &mtr); + mtr_sx_lock_index(index, &mtr); } if (row_log_online_op_try( @@ -2893,7 +2959,7 @@ row_ins_sec_index_entry_low( err = btr_cur_search_to_nth_level( index, 0, entry, PAGE_CUR_RTREE_INSERT, search_mode, - &cursor, 0, __FILE__, __LINE__, &mtr, 0); + &cursor, 0, __FILE__, __LINE__, &mtr); if (mode == BTR_MODIFY_LEAF && rtr_info.mbr_adj) { mtr_commit(&mtr); @@ -2902,13 +2968,13 @@ row_ins_sec_index_entry_low( index, false); rtr_info_update_btr(&cursor, &rtr_info); mtr_start(&mtr); - mtr.set_named_space(index->space); + index->set_modified(mtr); search_mode &= ulint(~BTR_MODIFY_LEAF); search_mode |= BTR_MODIFY_TREE; err = btr_cur_search_to_nth_level( index, 0, entry, PAGE_CUR_RTREE_INSERT, search_mode, - &cursor, 0, __FILE__, __LINE__, &mtr, 0); + &cursor, 0, __FILE__, __LINE__, &mtr); mode = BTR_MODIFY_TREE; } @@ -2920,7 +2986,7 @@ row_ins_sec_index_entry_low( err = btr_cur_search_to_nth_level( index, 0, entry, PAGE_CUR_LE, search_mode, - &cursor, 0, __FILE__, __LINE__, &mtr, 0); + &cursor, 0, __FILE__, __LINE__, &mtr); } if (err != DB_SUCCESS) { @@ -3014,7 +3080,7 @@ row_ins_sec_index_entry_low( index, 0, entry, PAGE_CUR_LE, (search_mode & ~(BTR_INSERT | BTR_IGNORE_SEC_UNIQUE)), - &cursor, 0, __FILE__, __LINE__, &mtr, 0); + &cursor, 0, __FILE__, __LINE__, &mtr); } if (row_ins_must_modify_rec(&cursor)) { @@ -3022,7 +3088,8 @@ row_ins_sec_index_entry_low( prefix, we must convert the insert into a modify of an existing record */ offsets = rec_get_offsets( - btr_cur_get_rec(&cursor), index, offsets, true, + btr_cur_get_rec(&cursor), index, offsets, + index->n_core_fields, 
ULINT_UNDEFINED, &offsets_heap); err = row_ins_sec_index_entry_by_modify( @@ -3031,7 +3098,7 @@ row_ins_sec_index_entry_low( if (err == DB_SUCCESS && dict_index_is_spatial(index) && rtr_info.mbr_adj) { - err = rtr_ins_enlarge_mbr(&cursor, thr, &mtr); + err = rtr_ins_enlarge_mbr(&cursor, &mtr); } } else { rec_t* insert_rec; @@ -3045,7 +3112,7 @@ row_ins_sec_index_entry_low( if (err == DB_SUCCESS && dict_index_is_spatial(index) && rtr_info.mbr_adj) { - err = rtr_ins_enlarge_mbr(&cursor, thr, &mtr); + err = rtr_ins_enlarge_mbr(&cursor, &mtr); } } else { ut_ad(mode == BTR_MODIFY_TREE); @@ -3070,7 +3137,7 @@ row_ins_sec_index_entry_low( if (err == DB_SUCCESS && dict_index_is_spatial(index) && rtr_info.mbr_adj) { - err = rtr_ins_enlarge_mbr(&cursor, thr, &mtr); + err = rtr_ins_enlarge_mbr(&cursor, &mtr); } } @@ -3123,9 +3190,13 @@ row_ins_clust_index_entry( n_uniq = dict_index_is_unique(index) ? index->n_uniq : 0; - ulint flags = dict_table_is_temporary(index->table) - ? BTR_NO_LOCKING_FLAG - : 0; + ulint flags = index->table->no_rollback() ? BTR_NO_ROLLBACK + : index->table->is_temporary() + ? BTR_NO_LOCKING_FLAG : 0; + const ulint orig_n_fields = entry->n_fields; + + /* Try first optimistic descent to the B-tree */ + log_free_check(); /* For intermediate table during copy alter table, skip the undo log and record lock checking for @@ -3142,6 +3213,8 @@ row_ins_clust_index_entry( flags, BTR_MODIFY_LEAF, index, n_uniq, entry, n_ext, thr); + entry->n_fields = orig_n_fields; + DEBUG_SYNC_C_IF_THD(thr_get_trx(thr)->mysql_thd, "after_row_ins_clust_index_entry_leaf"); @@ -3157,6 +3230,8 @@ row_ins_clust_index_entry( flags, BTR_MODIFY_TREE, index, n_uniq, entry, n_ext, thr); + entry->n_fields = orig_n_fields; + DBUG_RETURN(err); } @@ -3171,7 +3246,9 @@ row_ins_sec_index_entry( /*====================*/ dict_index_t* index, /*!< in: secondary index */ dtuple_t* entry, /*!< in/out: index entry to insert */ - que_thr_t* thr) /*!< in: query thread */ + que_thr_t* thr, /*!< in: query thread */ + bool check_foreign) /*!< in: true if check + foreign table is needed, false otherwise */ { dberr_t err; mem_heap_t* offsets_heap; @@ -3182,7 +3259,7 @@ row_ins_sec_index_entry( DBUG_SET("-d,row_ins_sec_index_entry_timeout"); return(DB_LOCK_WAIT);}); - if (!index->table->foreign_set.empty()) { + if (check_foreign && !index->table->foreign_set.empty()) { err = row_ins_check_foreign_constraints(index->table, index, false, entry, thr); if (err != DB_SUCCESS) { @@ -3199,7 +3276,7 @@ row_ins_sec_index_entry( /* Try first optimistic descent to the B-tree */ log_free_check(); - ulint flags = dict_table_is_temporary(index->table) + ulint flags = index->table->is_temporary() ? 
BTR_NO_LOCKING_FLAG : 0; @@ -3218,7 +3295,7 @@ row_ins_sec_index_entry( if (err == DB_FAIL) { mem_heap_empty(heap); - if (index->space == IBUF_SPACE_ID + if (index->table->space == fil_system.sys_space && !(index->type & (DICT_UNIQUE | DICT_SPATIAL))) { ibuf_free_excess_pages(); } @@ -3250,16 +3327,17 @@ row_ins_index_entry( dtuple_t* entry, /*!< in/out: index entry to insert */ que_thr_t* thr) /*!< in: query thread */ { - ut_ad(thr_get_trx(thr)->id != 0 || index->table->is_temporary()); + ut_ad(thr_get_trx(thr)->id || index->table->no_rollback() + || index->table->is_temporary()); DBUG_EXECUTE_IF("row_ins_index_entry_timeout", { DBUG_SET("-d,row_ins_index_entry_timeout"); return(DB_LOCK_WAIT);}); - if (dict_index_is_clust(index)) { - return(row_ins_clust_index_entry(index, entry, thr, 0)); + if (index->is_primary()) { + return row_ins_clust_index_entry(index, entry, thr, 0); } else { - return(row_ins_sec_index_entry(index, entry, thr)); + return row_ins_sec_index_entry(index, entry, thr); } } @@ -3330,7 +3408,7 @@ row_ins_index_entry_set_vals( col = ind_field->col; } - if (dict_col_is_virtual(col)) { + if (col->is_virtual()) { const dict_v_col_t* v_col = reinterpret_cast<const dict_v_col_t*>(col); ut_ad(dtuple_get_n_fields(row) @@ -3506,6 +3584,16 @@ row_ins_get_row_from_select( } } +inline +bool ins_node_t::vers_history_row() const +{ + if (!table->versioned()) + return false; + dfield_t* row_end = dtuple_get_nth_field(row, table->vers_end); + return row_end->vers_history_row(); +} + + /***********************************************************//** Inserts a row to a table. @return DB_SUCCESS if operation successfully completed, else error @@ -3544,12 +3632,31 @@ row_ins( ut_ad(node->state == INS_NODE_INSERT_ENTRIES); while (node->index != NULL) { - if (node->index->type != DICT_FTS) { + dict_index_t *index = node->index; + /* + We do not insert history rows into FTS_DOC_ID_INDEX because + it is unique by FTS_DOC_ID only and we do not want to add + row_end to unique key. Fulltext field works the way new + FTS_DOC_ID is created on every fulltext UPDATE, so holding only + FTS_DOC_ID for history is enough. + */ + const unsigned type = index->type; + if (index->type & DICT_FTS) { + } else if (!(type & DICT_UNIQUE) || index->n_uniq > 1 + || !node->vers_history_row()) { + dberr_t err = row_ins_index_entry_step(node, thr); if (err != DB_SUCCESS) { DBUG_RETURN(err); } + } else { + /* Unique indexes with system versioning must contain + the version end column. The only exception is a hidden + FTS_DOC_ID_INDEX that InnoDB may create on a hidden or + user-created FTS_DOC_ID column. */ + ut_ad(!strcmp(index->name, FTS_DOC_ID_INDEX_NAME)); + ut_ad(!strcmp(index->fields[0].name, FTS_DOC_ID_COL_NAME)); } node->index = dict_table_get_next_index(node->index); @@ -3590,8 +3697,6 @@ row_ins_step( trx = thr_get_trx(thr); - trx_start_if_not_started_xa(trx, true); - node = static_cast<ins_node_t*>(thr->run_node); ut_ad(que_node_get_type(node) == QUE_NODE_INSERT); @@ -3613,7 +3718,28 @@ row_ins_step( table during the search operation, and there is no need to set it again here. But we must write trx->id to node->sys_buf. */ - trx_write_trx_id(&node->sys_buf[DATA_TRX_ID_LEN], trx->id); + if (node->table->no_rollback()) { + /* No-rollback tables should only be written to by a + single thread at a time, but there can be multiple + concurrent readers. We must hold an open table handle. 
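Earlier in this hunk, the row_ins() loop skips a unique single-column index (in practice only FTS_DOC_ID_INDEX) when the row being inserted is a history row, because row_end is not part of that key. A simplified sketch of the filter, with plain stand-in types, is:

struct index_sketch {
	bool		is_fulltext;	/* DICT_FTS */
	bool		is_unique;	/* DICT_UNIQUE */
	unsigned	n_uniq;		/* columns in the unique key */
};

/* True if row_ins() should build and insert an entry for this index. */
static bool should_insert_into(const index_sketch& index, bool history_row)
{
	if (index.is_fulltext) {
		/* Fulltext indexes are maintained separately. */
		return false;
	}
	if (index.is_unique && index.n_uniq == 1 && history_row) {
		/* Expected to be FTS_DOC_ID_INDEX: history rows are not
		added, because row_end is not part of its key. */
		return false;
	}
	return true;
}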
*/ + DBUG_ASSERT(node->table->get_ref_count() > 0); + DBUG_ASSERT(node->ins_type == INS_DIRECT); + /* No-rollback tables can consist only of a single index. */ + DBUG_ASSERT(node->entry_list.size() == 1); + DBUG_ASSERT(UT_LIST_GET_LEN(node->table->indexes) == 1); + /* There should be no possibility for interruption and + restarting here. In theory, we could allow resumption + from the INS_NODE_INSERT_ENTRIES state here. */ + DBUG_ASSERT(node->state == INS_NODE_SET_IX_LOCK); + node->index = dict_table_get_first_index(node->table); + node->entry = node->entry_list.begin(); + node->state = INS_NODE_INSERT_ENTRIES; + goto do_insert; + } + + if (UNIV_LIKELY(!node->table->skip_alter_undo)) { + trx_write_trx_id(&node->sys_buf[DATA_TRX_ID_LEN], trx->id); + } if (node->state == INS_NODE_SET_IX_LOCK) { @@ -3666,7 +3792,7 @@ same_trx: return(thr); } - +do_insert: /* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */ err = row_ins(node, thr); diff --git a/storage/innobase/row/row0log.cc b/storage/innobase/row/row0log.cc index 1a7652350a9..6095d72bb86 100644 --- a/storage/innobase/row/row0log.cc +++ b/storage/innobase/row/row0log.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 2011, 2018, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2020, MariaDB Corporation. +Copyright (c) 2017, 2021, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -38,6 +38,7 @@ Created 2011-05-26 Marko Makela #include "ut0stage.h" #include "trx0rec.h" +#include <sql_class.h> #include <algorithm> #include <map> @@ -70,6 +71,7 @@ enum row_op { /** Log block for modifications during online ALTER TABLE */ struct row_log_buf_t { byte* block; /*!< file block buffer */ + size_t size; /*!< length of block in bytes */ ut_new_pfx_t block_pfx; /*!< opaque descriptor of "block". Set by ut_allocator::allocate_large() and fed to ut_allocator::deallocate_large(). */ @@ -169,7 +171,7 @@ When head.blocks == tail.blocks, the reader will access tail.block directly. When also head.bytes == tail.bytes, both counts will be reset to 0 and the file will be truncated. */ struct row_log_t { - int fd; /*!< file descriptor */ + pfs_os_file_t fd; /*!< file descriptor */ ib_mutex_t mutex; /*!< mutex protecting error, max_trx and tail */ page_no_map* blobs; /*!< map of page numbers of off-page columns @@ -181,12 +183,27 @@ struct row_log_t { index that is being created online */ bool same_pk;/*!< whether the definition of the PRIMARY KEY has remained the same */ - const dtuple_t* add_cols; - /*!< default values of added columns, or NULL */ + const dtuple_t* defaults; + /*!< default values of added, changed columns, + or NULL */ const ulint* col_map;/*!< mapping of old column numbers to new ones, or NULL if !table */ dberr_t error; /*!< error that occurred during online table rebuild */ + /** The transaction ID of the ALTER TABLE transaction. Any + concurrent DML would necessarily be logged with a larger + transaction ID, because ha_innobase::prepare_inplace_alter_table() + acts as a barrier that ensures that any concurrent transaction + that operates on the table would have been started after + ha_innobase::prepare_inplace_alter_table() returns and before + ha_innobase::commit_inplace_alter_table(commit=true) is invoked. 
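The remainder of this comment (just below) explains that DB_TRX_ID values older than min_trx must be normalized to reset_trx_id[]. A small stand-alone sketch of that rule, using plain byte arrays and the reset pattern asserted in row_ins_alloc_sys_fields() earlier in this patch (all-zero DB_TRX_ID, then a DB_ROLL_PTR with only the insert flag set), is:

#include <cstdint>
#include <cstring>

static const unsigned	TRX_ID_LEN = 6;
static const unsigned	ROLL_PTR_LEN = 7;

/* Matches the layout checked against reset_trx_id[] in
row_ins_alloc_sys_fields(). */
static const unsigned char reset_sys[TRX_ID_LEN + ROLL_PTR_LEN] = {
	0, 0, 0, 0, 0, 0, 0x80, 0, 0, 0, 0, 0, 0};

/* Read the big-endian 48-bit DB_TRX_ID. */
static uint64_t read_trx_id(const unsigned char* p)
{
	uint64_t id = 0;
	for (unsigned i = 0; i < TRX_ID_LEN; i++) {
		id = (id << 8) | p[i];
	}
	return id;
}

/* Replace DB_TRX_ID,DB_ROLL_PTR with the reset pattern when the logged
value predates the ALTER TABLE transaction, as the logging code below does. */
static void normalize_sys_fields(unsigned char* sys, uint64_t min_trx)
{
	if (read_trx_id(sys) < min_trx) {
		std::memcpy(sys, reset_sys, sizeof reset_sys);
	}
}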
+ + Due to the nondeterministic nature of purge and due to the + possibility of upgrading from an earlier version of MariaDB + or MySQL, it is possible that row_log_table_low() would be + fed DB_TRX_ID that precedes than min_trx. We must normalize + such references to reset_trx_id[]. */ + trx_id_t min_trx; trx_id_t max_trx;/*!< biggest observed trx_id in row_log_online_op(); protected by mutex and index->lock S-latch, @@ -204,24 +221,55 @@ struct row_log_t { decryption or NULL */ const char* path; /*!< where to create temporary file during log operation */ + /** the number of core fields in the clustered index of the + source table; before row_log_table_apply() completes, the + table could be emptied, so that table->is_instant() no longer holds, + but all log records must be in the "instant" format. */ + unsigned n_core_fields; + /** the default values of non-core fields when the operation started */ + dict_col_t::def_t* non_core_fields; + bool allow_not_null; /*!< Whether the alter ignore is being + used or if the sql mode is non-strict mode; + if not, NULL values will not be converted to + defaults */ + const TABLE* old_table; /*< Use old table in case of error. */ + + uint64_t n_rows; /*< Number of rows read from the table */ + /** Determine whether the log should be in the 'instant ADD' format + @param[in] index the clustered index of the source table + @return whether to use the 'instant ADD COLUMN' format */ + bool is_instant(const dict_index_t* index) const + { + ut_ad(table); + ut_ad(n_core_fields <= index->n_fields); + return n_core_fields != index->n_fields; + } + + const byte* instant_field_value(ulint n, ulint* len) const + { + ut_ad(n >= n_core_fields); + const dict_col_t::def_t& d= non_core_fields[n - n_core_fields]; + *len = d.len; + return static_cast<const byte*>(d.data); + } }; /** Create the file or online log if it does not exist. @param[in,out] log online rebuild log @return true if success, false if not */ static MY_ATTRIBUTE((warn_unused_result)) -int +pfs_os_file_t row_log_tmpfile( row_log_t* log) { DBUG_ENTER("row_log_tmpfile"); - if (log->fd < 0) { + if (log->fd == OS_FILE_CLOSED) { log->fd = row_merge_file_create_low(log->path); DBUG_EXECUTE_IF("row_log_tmpfile_fail", - if (log->fd > 0) + if (log->fd != OS_FILE_CLOSED) row_merge_file_destroy_low(log->fd); - log->fd = -1;); - if (log->fd >= 0) { + log->fd = OS_FILE_CLOSED;); + if (log->fd != OS_FILE_CLOSED) { MONITOR_ATOMIC_INC(MONITOR_ALTER_TABLE_LOG_FILES); } } @@ -250,6 +298,7 @@ row_log_block_allocate( if (log_buf.block == NULL) { DBUG_RETURN(false); } + log_buf.size = srv_sort_buf_size; } DBUG_RETURN(true); } @@ -264,7 +313,7 @@ row_log_block_free( DBUG_ENTER("row_log_block_free"); if (log_buf.block != NULL) { ut_allocator<byte>(mem_key_row_log_buf).deallocate_large( - log_buf.block, &log_buf.block_pfx); + log_buf.block, &log_buf.block_pfx, log_buf.size); log_buf.block = NULL; } DBUG_VOID_RETURN; @@ -302,7 +351,7 @@ row_log_online_op( row_merge_buf_encode(), because here we do not encode extra_size+1 (and reserve 0 as the end-of-chunk marker). 
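The log records appended below carry a variable-width extra_size prefix: one byte when the value fits in seven bits, otherwise two bytes. Only the one-byte branch is visible in this hunk; the two-byte layout shown here is an assumption for illustration:

#include <cassert>

/* Append the extra_size prefix: one byte if it fits in 7 bits, otherwise a
marker byte carrying the high bits followed by the low byte (assumed layout
for the two-byte case). */
static unsigned char* put_extra_size(unsigned char* b, unsigned extra_size)
{
	if (extra_size < 0x80) {
		*b++ = static_cast<unsigned char>(extra_size);
	} else {
		assert(extra_size < 0x8000);
		*b++ = static_cast<unsigned char>(0x80 | (extra_size >> 8));
		*b++ = static_cast<unsigned char>(extra_size);
	}
	return b;
}

/* Read the prefix back, mirroring put_extra_size(). */
static const unsigned char* get_extra_size(const unsigned char* p,
					   unsigned* extra_size)
{
	unsigned v = *p++;
	if (v >= 0x80) {
		v = (v & 0x7fU) << 8;
		v |= *p++;
	}
	*extra_size = v;
	return p;
}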
*/ - size = rec_get_converted_size_temp( + size = rec_get_converted_size_temp<false>( index, tuple->fields, tuple->n_fields, &extra_size); ut_ad(size >= extra_size); ut_ad(size <= sizeof log->tail.buf); @@ -323,9 +372,7 @@ row_log_online_op( goto err_exit; } -#ifdef HAVE_valgrind_or_MSAN MEM_UNDEFINED(log->tail.buf, sizeof log->tail.buf); -#endif /* HAVE_valgrind_or_MSAN */ ut_ad(log->tail.bytes < srv_sort_buf_size); avail_size = srv_sort_buf_size - log->tail.bytes; @@ -352,7 +399,7 @@ row_log_online_op( *b++ = (byte) extra_size; } - rec_convert_dtuple_to_temp( + rec_convert_dtuple_to_temp<false>( b + extra_size, index, tuple->fields, tuple->n_fields); b += size; @@ -377,7 +424,7 @@ row_log_online_op( MEM_CHECK_DEFINED(buf, srv_sort_buf_size); - if (row_log_tmpfile(log) < 0) { + if (row_log_tmpfile(log) == OS_FILE_CLOSED) { log->error = DB_OUT_OF_MEMORY; goto err_exit; } @@ -397,11 +444,12 @@ row_log_online_op( } log->tail.blocks++; - if (DB_SUCCESS != os_file_write_int_fd( + if (os_file_write( request, "(modification log)", log->fd, - buf, byte_offset, srv_sort_buf_size)) { + buf, byte_offset, srv_sort_buf_size) + != DB_SUCCESS) { write_failed: /* We set the flag directly instead of invoking dict_set_corrupted_index_cache_only(index) here, @@ -409,10 +457,8 @@ write_failed: index->type |= DICT_CORRUPT; } -#ifdef HAVE_valgrind_or_MSAN MEM_UNDEFINED(log->tail.block, srv_sort_buf_size); MEM_UNDEFINED(buf, srv_sort_buf_size); -#endif /* HAVE_valgrind_or_MSAN */ memcpy(log->tail.block, log->tail.buf + avail_size, mrec_size - avail_size); @@ -422,9 +468,7 @@ write_failed: ut_ad(b == log->tail.block + log->tail.bytes); } -#ifdef HAVE_valgrind_or_MSAN MEM_UNDEFINED(log->tail.buf, sizeof log->tail.buf); -#endif /* HAVE_valgrind_or_MSAN */ err_exit: mutex_exit(&log->mutex); } @@ -456,9 +500,7 @@ row_log_table_open( { mutex_enter(&log->mutex); -#ifdef HAVE_valgrind_or_MSAN MEM_UNDEFINED(log->tail.buf, sizeof log->tail.buf); -#endif /* HAVE_valgrind_or_MSAN */ if (log->error != DB_SUCCESS) { err_exit: @@ -520,7 +562,7 @@ row_log_table_close_func( MEM_CHECK_DEFINED(buf, srv_sort_buf_size); - if (row_log_tmpfile(log) < 0) { + if (row_log_tmpfile(log) == OS_FILE_CLOSED) { log->error = DB_OUT_OF_MEMORY; goto err_exit; } @@ -531,7 +573,7 @@ row_log_table_close_func( if (!log_tmp_block_encrypt( log->tail.block, srv_sort_buf_size, log->crypt_tail, byte_offset, - index->table->space)) { + index->table->space_id)) { log->error = DB_DECRYPTION_FAILED; goto err_exit; } @@ -541,18 +583,18 @@ row_log_table_close_func( } log->tail.blocks++; - if (DB_SUCCESS != os_file_write_int_fd( + if (os_file_write( request, "(modification log)", log->fd, - buf, byte_offset, srv_sort_buf_size)) { + buf, byte_offset, srv_sort_buf_size) + != DB_SUCCESS) { write_failed: log->error = DB_ONLINE_LOG_TOO_BIG; } -#ifdef HAVE_valgrind_or_MSAN + MEM_UNDEFINED(log->tail.block, srv_sort_buf_size); MEM_UNDEFINED(buf, srv_sort_buf_size); -#endif /* HAVE_valgrind_or_MSAN */ memcpy(log->tail.block, log->tail.buf + avail, size - avail); log->tail.bytes = size - avail; } else { @@ -561,9 +603,7 @@ write_failed: } log->tail.total += size; -#ifdef HAVE_valgrind_or_MSAN MEM_UNDEFINED(log->tail.buf, sizeof log->tail.buf); -#endif /* HAVE_valgrind_or_MSAN */ err_exit: mutex_exit(&log->mutex); @@ -634,6 +674,7 @@ row_log_table_delete( ut_ad(dict_index_is_clust(new_index)); ut_ad(!dict_index_is_online_ddl(new_index)); + ut_ad(index->online_log->min_trx); /* Create the tuple PRIMARY KEY,DB_TRX_ID,DB_ROLL_PTR in new_table. 
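The ROW_T_DELETE payload assembled just below consists of an opcode byte, the one-byte extra size of the old PRIMARY KEY prefix, and the converted PRIMARY KEY,DB_TRX_ID,DB_ROLL_PTR record. A simplified writer, with a placeholder opcode value and trivial buffer handling instead of the reserved log-block slice used by the real code, might look like:

#include <cstddef>
#include <cstdint>
#include <vector>

static const uint8_t ROW_T_DELETE_SKETCH = 0x44;	/* placeholder opcode */

/* Assemble [opcode][old_pk_extra_size][old PK,DB_TRX_ID,DB_ROLL_PTR record]. */
static std::vector<uint8_t> make_delete_record(const uint8_t* old_pk_rec,
					       size_t old_pk_size,
					       uint8_t old_pk_extra_size)
{
	std::vector<uint8_t> out;
	out.push_back(ROW_T_DELETE_SKETCH);
	out.push_back(old_pk_extra_size);
	out.insert(out.end(), old_pk_rec, old_pk_rec + old_pk_size);
	return out;
}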
*/ if (index->online_log->same_pk) { @@ -644,8 +685,9 @@ row_log_table_delete( fields of the record. */ heap = mem_heap_create( DATA_TRX_ID_LEN - + DTUPLE_EST_ALLOC(new_index->n_uniq + 2)); - old_pk = tuple = dtuple_create(heap, new_index->n_uniq + 2); + + DTUPLE_EST_ALLOC(unsigned(new_index->n_uniq) + 2)); + old_pk = tuple = dtuple_create( + heap, unsigned(new_index->n_uniq) + 2); dict_index_copy_types(tuple, new_index, tuple->n_fields); dtuple_set_n_fields_cmp(tuple, new_index->n_uniq); @@ -660,16 +702,27 @@ row_log_table_delete( dfield_set_data(dfield, field, len); } - if (sys) { - dfield_set_data( - dtuple_get_nth_field(tuple, - new_index->n_uniq), - sys, DATA_TRX_ID_LEN); - dfield_set_data( - dtuple_get_nth_field(tuple, - new_index->n_uniq + 1), - sys + DATA_TRX_ID_LEN, DATA_ROLL_PTR_LEN); + dfield_t* db_trx_id = dtuple_get_nth_field( + tuple, new_index->n_uniq); + + const bool replace_sys_fields + = sys + || trx_read_trx_id(static_cast<byte*>(db_trx_id->data)) + < index->online_log->min_trx; + + if (replace_sys_fields) { + if (!sys || trx_read_trx_id(sys) + < index->online_log->min_trx) { + sys = reset_trx_id; + } + + dfield_set_data(db_trx_id, sys, DATA_TRX_ID_LEN); + dfield_set_data(db_trx_id + 1, sys + DATA_TRX_ID_LEN, + DATA_ROLL_PTR_LEN); } + + ut_d(trx_id_check(db_trx_id->data, + index->online_log->min_trx)); } else { /* The PRIMARY KEY has changed. Translate the tuple. */ old_pk = row_log_table_get_pk( @@ -688,7 +741,7 @@ row_log_table_delete( old_pk, old_pk->n_fields - 2)->len); ut_ad(DATA_ROLL_PTR_LEN == dtuple_get_nth_field( old_pk, old_pk->n_fields - 1)->len); - old_pk_size = rec_get_converted_size_temp( + old_pk_size = rec_get_converted_size_temp<false>( new_index, old_pk->fields, old_pk->n_fields, &old_pk_extra_size); ut_ad(old_pk_extra_size < 0x100); @@ -701,7 +754,7 @@ row_log_table_delete( *b++ = ROW_T_DELETE; *b++ = static_cast<byte>(old_pk_extra_size); - rec_convert_dtuple_to_temp( + rec_convert_dtuple_to_temp<false>( b + old_pk_extra_size, new_index, old_pk->fields, old_pk->n_fields); @@ -742,21 +795,23 @@ row_log_table_low_redundant( ulint avail_size; mem_heap_t* heap = NULL; dtuple_t* tuple; + const ulint n_fields = rec_get_n_fields_old(rec); ut_ad(!page_is_comp(page_align(rec))); - ut_ad(dict_index_get_n_fields(index) == rec_get_n_fields_old(rec)); + ut_ad(index->n_fields >= n_fields); + ut_ad(index->n_fields == n_fields || index->is_instant()); ut_ad(dict_tf2_is_valid(index->table->flags, index->table->flags2)); ut_ad(!dict_table_is_comp(index->table)); /* redundant row format */ ut_ad(dict_index_is_clust(new_index)); - heap = mem_heap_create(DTUPLE_EST_ALLOC(index->n_fields)); - tuple = dtuple_create(heap, index->n_fields); - dict_index_copy_types(tuple, index, index->n_fields); + heap = mem_heap_create(DTUPLE_EST_ALLOC(n_fields)); + tuple = dtuple_create(heap, n_fields); + dict_index_copy_types(tuple, index, n_fields); dtuple_set_n_fields_cmp(tuple, dict_index_get_n_unique(index)); if (rec_get_1byte_offs_flag(rec)) { - for (ulint i = 0; i < index->n_fields; i++) { + for (ulint i = 0; i < n_fields; i++) { dfield_t* dfield; ulint len; const void* field; @@ -767,7 +822,7 @@ row_log_table_low_redundant( dfield_set_data(dfield, field, len); } } else { - for (ulint i = 0; i < index->n_fields; i++) { + for (ulint i = 0; i < n_fields; i++) { dfield_t* dfield; ulint len; const void* field; @@ -783,8 +838,28 @@ row_log_table_low_redundant( } } - size = rec_get_converted_size_temp( - index, tuple->fields, tuple->n_fields, &extra_size); + dfield_t* db_trx_id = 
dtuple_get_nth_field(tuple, index->n_uniq); + ut_ad(dfield_get_len(db_trx_id) == DATA_TRX_ID_LEN); + ut_ad(dfield_get_len(db_trx_id + 1) == DATA_ROLL_PTR_LEN); + + if (trx_read_trx_id(static_cast<const byte*> + (dfield_get_data(db_trx_id))) + < index->online_log->min_trx) { + dfield_set_data(db_trx_id, reset_trx_id, DATA_TRX_ID_LEN); + dfield_set_data(db_trx_id + 1, reset_trx_id + DATA_TRX_ID_LEN, + DATA_ROLL_PTR_LEN); + } + + const bool is_instant = index->online_log->is_instant(index); + rec_comp_status_t status = is_instant + ? REC_STATUS_COLUMNS_ADDED : REC_STATUS_ORDINARY; + + size = rec_get_converted_size_temp<true>( + index, tuple->fields, tuple->n_fields, &extra_size, status); + if (is_instant) { + size++; + extra_size++; + } mrec_size = ROW_LOG_HEADER_SIZE + size + (extra_size >= 0x80); @@ -799,7 +874,7 @@ row_log_table_low_redundant( ut_ad(DATA_ROLL_PTR_LEN == dtuple_get_nth_field( old_pk, old_pk->n_fields - 1)->len); - old_pk_size = rec_get_converted_size_temp( + old_pk_size = rec_get_converted_size_temp<false>( new_index, old_pk->fields, old_pk->n_fields, &old_pk_extra_size); ut_ad(old_pk_extra_size < 0x100); @@ -808,15 +883,19 @@ row_log_table_low_redundant( if (byte* b = row_log_table_open(index->online_log, mrec_size, &avail_size)) { - *b++ = insert ? ROW_T_INSERT : ROW_T_UPDATE; + if (insert) { + *b++ = ROW_T_INSERT; + } else { + *b++ = ROW_T_UPDATE; - if (old_pk_size) { - *b++ = static_cast<byte>(old_pk_extra_size); + if (old_pk_size) { + *b++ = static_cast<byte>(old_pk_extra_size); - rec_convert_dtuple_to_temp( - b + old_pk_extra_size, new_index, - old_pk->fields, old_pk->n_fields); - b += old_pk_size; + rec_convert_dtuple_to_temp<false>( + b + old_pk_extra_size, new_index, + old_pk->fields, old_pk->n_fields); + b += old_pk_size; + } } if (extra_size < 0x80) { @@ -827,8 +906,17 @@ row_log_table_low_redundant( *b++ = static_cast<byte>(extra_size); } - rec_convert_dtuple_to_temp( - b + extra_size, index, tuple->fields, tuple->n_fields); + if (status == REC_STATUS_COLUMNS_ADDED) { + ut_ad(is_instant); + if (n_fields <= index->online_log->n_core_fields) { + status = REC_STATUS_ORDINARY; + } + *b = status; + } + + rec_convert_dtuple_to_temp<true>( + b + extra_size, index, tuple->fields, tuple->n_fields, + status); b += size; row_log_table_close(index, b, mrec_size, avail_size); @@ -852,26 +940,39 @@ row_log_table_low( const dtuple_t* old_pk) /*!< in: old PRIMARY KEY value (if !insert and a PRIMARY KEY is being created) */ { - ulint omit_size; ulint old_pk_size; ulint old_pk_extra_size; ulint extra_size; ulint mrec_size; ulint avail_size; const dict_index_t* new_index; + row_log_t* log = index->online_log; - new_index = dict_table_get_first_index(index->online_log->table); + new_index = dict_table_get_first_index(log->table); ut_ad(dict_index_is_clust(index)); ut_ad(dict_index_is_clust(new_index)); ut_ad(!dict_index_is_online_ddl(new_index)); ut_ad(rec_offs_validate(rec, index, offsets)); ut_ad(rec_offs_n_fields(offsets) == dict_index_get_n_fields(index)); - ut_ad(rec_offs_size(offsets) <= sizeof index->online_log->tail.buf); + ut_ad(rec_offs_size(offsets) <= sizeof log->tail.buf); ut_ad(rw_lock_own_flagged( &index->lock, RW_LOCK_FLAG_S | RW_LOCK_FLAG_X | RW_LOCK_FLAG_SX)); - ut_ad(fil_page_get_type(page_align(rec)) == FIL_PAGE_INDEX); +#ifdef UNIV_DEBUG + switch (fil_page_get_type(page_align(rec))) { + case FIL_PAGE_INDEX: + break; + case FIL_PAGE_TYPE_INSTANT: + ut_ad(index->is_instant()); + ut_ad(!page_has_siblings(page_align(rec))); + ut_ad(page_get_page_no(page_align(rec)) 
== index->page); + break; + default: + ut_ad(!"wrong page type"); + } +#endif /* UNIV_DEBUG */ + ut_ad(!rec_is_metadata(rec, index)); ut_ad(page_rec_is_leaf(rec)); ut_ad(!page_is_comp(page_align(rec)) == !rec_offs_comp(offsets)); /* old_pk=row_log_table_get_pk() [not needed in INSERT] is a prefix @@ -882,7 +983,7 @@ row_log_table_low( if (index->online_status != ONLINE_INDEX_CREATION || (index->type & DICT_CORRUPT) || index->table->corrupted - || index->online_log->error != DB_SUCCESS) { + || log->error != DB_SUCCESS) { return; } @@ -893,16 +994,38 @@ row_log_table_low( } ut_ad(page_is_comp(page_align(rec))); - ut_ad(rec_get_status(rec) == REC_STATUS_ORDINARY); - - omit_size = REC_N_NEW_EXTRA_BYTES; - - extra_size = rec_offs_extra_size(offsets) - omit_size; + ut_ad(rec_get_status(rec) == REC_STATUS_ORDINARY + || rec_get_status(rec) == REC_STATUS_COLUMNS_ADDED); + + const ulint omit_size = REC_N_NEW_EXTRA_BYTES; + + const ulint rec_extra_size = rec_offs_extra_size(offsets) - omit_size; + const bool is_instant = log->is_instant(index); + extra_size = rec_extra_size + is_instant; + + unsigned fake_extra_size = 0; + byte fake_extra_buf[3]; + if (is_instant && UNIV_UNLIKELY(!index->is_instant())) { + /* The source table was emptied after ALTER TABLE + started, and it was converted to non-instant format. + Because row_log_table_apply_op() expects to find + all records to be logged in the same way, we will + be unable to copy the rec_extra_size bytes from the + record header, but must convert them here. */ + unsigned n_add = index->n_fields - 1 - log->n_core_fields; + fake_extra_size = rec_get_n_add_field_len(n_add); + ut_ad(fake_extra_size == 1 || fake_extra_size == 2); + extra_size += fake_extra_size; + byte* fake_extra = fake_extra_buf + fake_extra_size; + rec_set_n_add_field(fake_extra, n_add); + ut_ad(fake_extra == fake_extra_buf); + } mrec_size = ROW_LOG_HEADER_SIZE - + (extra_size >= 0x80) + rec_offs_size(offsets) - omit_size; + + (extra_size >= 0x80) + rec_offs_size(offsets) - omit_size + + is_instant + fake_extra_size; - if (insert || index->online_log->same_pk) { + if (insert || log->same_pk) { ut_ad(!old_pk); old_pk_extra_size = old_pk_size = 0; } else { @@ -913,24 +1036,27 @@ row_log_table_low( ut_ad(DATA_ROLL_PTR_LEN == dtuple_get_nth_field( old_pk, old_pk->n_fields - 1)->len); - old_pk_size = rec_get_converted_size_temp( + old_pk_size = rec_get_converted_size_temp<false>( new_index, old_pk->fields, old_pk->n_fields, &old_pk_extra_size); ut_ad(old_pk_extra_size < 0x100); mrec_size += 1/*old_pk_extra_size*/ + old_pk_size; } - if (byte* b = row_log_table_open(index->online_log, - mrec_size, &avail_size)) { - *b++ = insert ? ROW_T_INSERT : ROW_T_UPDATE; + if (byte* b = row_log_table_open(log, mrec_size, &avail_size)) { + if (insert) { + *b++ = ROW_T_INSERT; + } else { + *b++ = ROW_T_UPDATE; - if (old_pk_size) { - *b++ = static_cast<byte>(old_pk_extra_size); + if (old_pk_size) { + *b++ = static_cast<byte>(old_pk_extra_size); - rec_convert_dtuple_to_temp( - b + old_pk_extra_size, new_index, - old_pk->fields, old_pk->n_fields); - b += old_pk_size; + rec_convert_dtuple_to_temp<false>( + b + old_pk_extra_size, new_index, + old_pk->fields, old_pk->n_fields); + b += old_pk_size; + } } if (extra_size < 0x80) { @@ -941,9 +1067,27 @@ row_log_table_low( *b++ = static_cast<byte>(extra_size); } - memcpy(b, rec - rec_offs_extra_size(offsets), extra_size); - b += extra_size; + if (is_instant) { + *b++ = fake_extra_size + ? 
REC_STATUS_COLUMNS_ADDED + : rec_get_status(rec); + } else { + ut_ad(rec_get_status(rec) == REC_STATUS_ORDINARY); + } + + memcpy(b, rec - rec_extra_size - omit_size, rec_extra_size); + b += rec_extra_size; + memcpy(b, fake_extra_buf + 1, fake_extra_size); + b += fake_extra_size; + ulint len; + ulint trx_id_offs = rec_get_nth_field_offs( + offsets, index->n_uniq, &len); + ut_ad(len == DATA_TRX_ID_LEN); memcpy(b, rec, rec_offs_data_size(offsets)); + if (trx_read_trx_id(b + trx_id_offs) < log->min_trx) { + memcpy(b + trx_id_offs, + reset_trx_id, sizeof reset_trx_id); + } b += rec_offs_data_size(offsets); row_log_table_close(index, b, mrec_size, avail_size); @@ -990,7 +1134,6 @@ row_log_table_get_pk_old_col( } /** Maps an old table column of a PRIMARY KEY column. -@param[in] col old table column (before ALTER TABLE) @param[in] ifield clustered index field in the new table (after ALTER TABLE) @param[in,out] dfield clustered index tuple field in the new table @@ -1001,12 +1144,12 @@ table @param[in] i rec field corresponding to col @param[in] page_size page size of the old table @param[in] max_len maximum length of dfield +@param[in] log row log for the table @retval DB_INVALID_NULL if a NULL value is encountered @retval DB_TOO_BIG_INDEX_COL if the maximum prefix length is exceeded */ static dberr_t row_log_table_get_pk_col( - const dict_col_t* col, const dict_field_t* ifield, dfield_t* dfield, mem_heap_t* heap, @@ -1014,15 +1157,32 @@ row_log_table_get_pk_col( const rec_offs* offsets, ulint i, const page_size_t& page_size, - ulint max_len) + ulint max_len, + const row_log_t* log) { const byte* field; ulint len; field = rec_get_nth_field(rec, offsets, i, &len); + if (len == UNIV_SQL_DEFAULT) { + field = log->instant_field_value(i, &len); + } + if (len == UNIV_SQL_NULL) { - return(DB_INVALID_NULL); + if (!log->allow_not_null) { + return(DB_INVALID_NULL); + } + + unsigned col_no= ifield->col->ind; + ut_ad(col_no < log->defaults->n_fields); + + field = static_cast<const byte*>( + log->defaults->fields[col_no].data); + if (!field) { + return(DB_INVALID_NULL); + } + len = log->defaults->fields[col_no].len; } if (rec_offs_nth_extern(offsets, i)) { @@ -1082,6 +1242,7 @@ row_log_table_get_pk( ut_ad(log); ut_ad(log->table); + ut_ad(log->min_trx); if (log->same_pk) { /* The PRIMARY KEY columns are unchanged. */ @@ -1097,7 +1258,8 @@ row_log_table_get_pk( if (!offsets) { offsets = rec_get_offsets( - rec, index, NULL, true, + rec, index, NULL, + index->n_core_fields, pos + 1, heap); } @@ -1106,8 +1268,13 @@ row_log_table_get_pk( ut_ad(len == DATA_TRX_ID_LEN); } - memcpy(sys, rec + trx_id_offs, - DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + const byte* ptr = trx_read_trx_id(rec + trx_id_offs) + < log->min_trx + ? 
reset_trx_id + : rec + trx_id_offs; + + memcpy(sys, ptr, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + ut_d(trx_id_check(sys, log->min_trx)); } return(NULL); @@ -1128,7 +1295,7 @@ row_log_table_get_pk( if (!offsets) { size += (1 + REC_OFFS_HEADER_SIZE - + index->n_fields) + + unsigned(index->n_fields)) * sizeof *offsets; } @@ -1142,7 +1309,8 @@ row_log_table_get_pk( } if (!offsets) { - offsets = rec_get_offsets(rec, index, NULL, true, + offsets = rec_get_offsets(rec, index, NULL, + index->n_core_fields, ULINT_UNDEFINED, heap); } @@ -1179,8 +1347,8 @@ row_log_table_get_pk( } log->error = row_log_table_get_pk_col( - col, ifield, dfield, *heap, - rec, offsets, i, page_size, max_len); + ifield, dfield, *heap, + rec, offsets, i, page_size, max_len, log); if (log->error != DB_SUCCESS) { err_exit: @@ -1195,10 +1363,10 @@ err_exit: /* No matching column was found in the old table, so this must be an added column. Copy the default value. */ - ut_ad(log->add_cols); + ut_ad(log->defaults); dfield_copy(dfield, dtuple_get_nth_field( - log->add_cols, col_no)); + log->defaults, col_no)); mbminlen = dfield->type.mbminlen; mbmaxlen = dfield->type.mbmaxlen; prtype = dfield->type.prtype; @@ -1226,7 +1394,13 @@ err_exit: /* Copy the fields, because the fields will be updated or the record may be moved somewhere else in the B-tree as part of the upcoming operation. */ - if (sys) { + if (trx_read_trx_id(trx_roll) < log->min_trx) { + trx_roll = reset_trx_id; + if (sys) { + memcpy(sys, trx_roll, + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + } + } else if (sys) { memcpy(sys, trx_roll, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); trx_roll = sys; @@ -1237,6 +1411,8 @@ err_exit: DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)); } + ut_d(trx_id_check(trx_roll, log->min_trx)); + dfield_set_data(dtuple_get_nth_field(tuple, new_n_uniq), trx_roll, DATA_TRX_ID_LEN); dfield_set_data(dtuple_get_nth_field(tuple, new_n_uniq + 1), @@ -1350,20 +1526,20 @@ row_log_table_apply_convert_mrec( const mrec_t* mrec, /*!< in: merge record */ dict_index_t* index, /*!< in: index of mrec */ const rec_offs* offsets, /*!< in: offsets of mrec */ - const row_log_t* log, /*!< in: rebuild context */ + row_log_t* log, /*!< in: rebuild context */ mem_heap_t* heap, /*!< in/out: memory heap */ - trx_id_t trx_id, /*!< in: DB_TRX_ID of mrec */ dberr_t* error) /*!< out: DB_SUCCESS or DB_MISSING_HISTORY or reason of failure */ { dtuple_t* row; + log->n_rows++; *error = DB_SUCCESS; /* This is based on row_build(). */ - if (log->add_cols) { - row = dtuple_copy(log->add_cols, heap); + if (log->defaults) { + row = dtuple_copy(log->defaults, heap); /* dict_table_copy_types() would set the fields to NULL */ for (ulint i = 0; i < dict_table_get_n_cols(log->table); i++) { dict_col_copy_type( @@ -1441,6 +1617,9 @@ blob_done: rw_lock_x_unlock(dict_index_get_lock(index)); } else { data = rec_get_nth_field(mrec, offsets, i, &len); + if (len == UNIV_SQL_DEFAULT) { + data = log->instant_field_value(i, &len); + } dfield_set_data(dfield, data, len); } @@ -1481,9 +1660,23 @@ blob_done: if ((new_col->prtype & DATA_NOT_NULL) && dfield_is_null(dfield)) { - /* We got a NULL value for a NOT NULL column. */ - *error = DB_INVALID_NULL; - return(NULL); + + const dfield_t& default_field + = log->defaults->fields[col_no]; + + Field* field = log->old_table->field[col->ind]; + + field->set_warning(Sql_condition::WARN_LEVEL_WARN, + WARN_DATA_TRUNCATED, 1, + ulong(log->n_rows)); + + if (!log->allow_not_null) { + /* We got a NULL value for a NOT NULL column. 
*/ + *error = DB_INVALID_NULL; + return NULL; + } + + *dfield = default_field; } /* Adjust the DATA_NOT_NULL flag in the parsed row. */ @@ -1506,7 +1699,6 @@ row_log_table_apply_insert_low( que_thr_t* thr, /*!< in: query graph */ const dtuple_t* row, /*!< in: table row in the old table definition */ - trx_id_t trx_id, /*!< in: trx_id of the row */ mem_heap_t* offsets_heap, /*!< in/out: memory heap that can be emptied */ mem_heap_t* heap, /*!< in/out: memory heap */ @@ -1520,7 +1712,6 @@ row_log_table_apply_insert_low( ulint n_index = 0; ut_ad(dtuple_validate(row)); - ut_ad(trx_id); DBUG_LOG("ib_alter_table", "insert table " << index->table->id << " (index " @@ -1559,7 +1750,8 @@ row_log_table_apply_insert_low( entry = row_build_index_entry(row, NULL, index, heap); error = row_ins_sec_index_entry_low( flags, BTR_MODIFY_TREE, - index, offsets_heap, heap, entry, trx_id, thr); + index, offsets_heap, heap, entry, + thr_get_trx(thr)->id, thr); if (error != DB_SUCCESS) { if (error == DB_DUPLICATE_KEY) { @@ -1585,14 +1777,13 @@ row_log_table_apply_insert( mem_heap_t* offsets_heap, /*!< in/out: memory heap that can be emptied */ mem_heap_t* heap, /*!< in/out: memory heap */ - row_merge_dup_t* dup, /*!< in/out: for reporting + row_merge_dup_t* dup) /*!< in/out: for reporting duplicate key errors */ - trx_id_t trx_id) /*!< in: DB_TRX_ID of mrec */ { - const row_log_t*log = dup->index->online_log; + row_log_t*log = dup->index->online_log; dberr_t error; const dtuple_t* row = row_log_table_apply_convert_mrec( - mrec, dup->index, offsets, log, heap, trx_id, &error); + mrec, dup->index, offsets, log, heap, &error); switch (error) { case DB_MISSING_HISTORY: @@ -1609,13 +1800,14 @@ row_log_table_apply_insert( break; default: ut_ad(0); + /* fall through */ case DB_INVALID_NULL: ut_ad(row == NULL); return(error); } error = row_log_table_apply_insert_low( - thr, row, trx_id, offsets_heap, heap, dup); + thr, row, offsets_heap, heap, dup); if (error != DB_SUCCESS) { /* Report the erroneous row using the new version of the table. */ @@ -1674,8 +1866,8 @@ row_log_table_apply_delete_low( const dtuple_t* entry = row_build_index_entry( row, ext, index, heap); - mtr_start(mtr); - mtr->set_named_space(index->space); + mtr->start(); + index->set_modified(*mtr); btr_pcur_open(index, entry, PAGE_CUR_LE, BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE, pcur, mtr); @@ -1702,14 +1894,14 @@ flag_ok: found, because new_table is being modified by this thread only, and all indexes should be updated in sync. 
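The conversion code in row_log_table_apply_convert_mrec() above substitutes the column default for a NULL value in a NOT NULL target column only in ALTER IGNORE or non-strict mode (log->allow_not_null), raising a WARN_DATA_TRUNCATED warning; otherwise it fails with DB_INVALID_NULL. A simplified model of that decision, with stand-in types, is:

#include <optional>
#include <string>

enum convert_outcome { VALUE_COPIED, DEFAULT_SUBSTITUTED, NULL_REJECTED };

/* Decide what to store in a NOT NULL target column. */
static convert_outcome convert_not_null_field(
	const std::optional<std::string>&	value,
	const std::optional<std::string>&	column_default,
	bool					allow_not_null,
	std::string*				out)
{
	if (value) {
		*out = *value;
		return VALUE_COPIED;
	}
	if (!allow_not_null || !column_default) {
		/* Reported as DB_INVALID_NULL in the patch. */
		return NULL_REJECTED;
	}
	*out = *column_default;
	return DEFAULT_SUBSTITUTED;
}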
*/ - mtr_commit(mtr); + mtr->commit(); return(DB_INDEX_CORRUPT); } btr_cur_pessimistic_delete(&error, FALSE, btr_pcur_get_btr_cur(pcur), BTR_CREATE_FLAG, false, mtr); - mtr_commit(mtr); + mtr->commit(); } return(error); @@ -1757,7 +1949,7 @@ row_log_table_apply_delete( } mtr_start(&mtr); - mtr.set_named_space(index->space); + index->set_modified(mtr); btr_pcur_open(index, old_pk, PAGE_CUR_LE, BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE, &pcur, &mtr); @@ -1790,7 +1982,8 @@ all_done: return(DB_SUCCESS); } - offsets = rec_get_offsets(btr_pcur_get_rec(&pcur), index, NULL, true, + offsets = rec_get_offsets(btr_pcur_get_rec(&pcur), index, NULL, + index->n_core_fields, ULINT_UNDEFINED, &offsets_heap); #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG ut_a(!rec_offs_any_null_extern(btr_pcur_get_rec(&pcur), offsets)); @@ -1807,6 +2000,8 @@ all_done: = rec_get_nth_field(btr_pcur_get_rec(&pcur), offsets, trx_id_col, &len); ut_ad(len == DATA_TRX_ID_LEN); + ut_d(trx_id_check(rec_trx_id, log->min_trx)); + ut_d(trx_id_check(mrec_trx_id, log->min_trx)); ut_ad(rec_get_nth_field(mrec, moffsets, trx_id_col + 1, &len) == mrec_trx_id + DATA_TRX_ID_LEN); @@ -1853,13 +2048,12 @@ row_log_table_apply_update( mem_heap_t* heap, /*!< in/out: memory heap */ row_merge_dup_t* dup, /*!< in/out: for reporting duplicate key errors */ - trx_id_t trx_id, /*!< in: DB_TRX_ID of mrec */ const dtuple_t* old_pk) /*!< in: PRIMARY KEY and DB_TRX_ID,DB_ROLL_PTR of the old value, or PRIMARY KEY if same_pk */ { - const row_log_t*log = dup->index->online_log; + row_log_t* log = dup->index->online_log; const dtuple_t* row; dict_index_t* index = dict_table_get_first_index(log->table); mtr_t mtr; @@ -1874,7 +2068,7 @@ row_log_table_apply_update( + (log->same_pk ? 0 : 2)); row = row_log_table_apply_convert_mrec( - mrec, dup->index, offsets, log, heap, trx_id, &error); + mrec, dup->index, offsets, log, heap, &error); switch (error) { case DB_MISSING_HISTORY: @@ -1896,13 +2090,14 @@ row_log_table_apply_update( break; default: ut_ad(0); + /* fall through */ case DB_INVALID_NULL: ut_ad(row == NULL); return(error); } mtr_start(&mtr); - mtr.set_named_space(index->space); + index->set_modified(mtr); btr_pcur_open(index, old_pk, PAGE_CUR_LE, BTR_MODIFY_TREE, &pcur, &mtr); #ifdef UNIV_DEBUG @@ -1962,7 +2157,7 @@ row_log_table_apply_update( ROW_T_UPDATE or ROW_T_DELETE will delete it. */ mtr_commit(&mtr); error = row_log_table_apply_insert_low( - thr, row, trx_id, offsets_heap, heap, dup); + thr, row, offsets_heap, heap, dup); } else { /* Some BLOBs are missing, so we are interpreting this ROW_T_UPDATE as ROW_T_DELETE (see *1). @@ -1986,28 +2181,28 @@ func_exit_committed: /* Prepare to update (or delete) the record. */ rec_offs* cur_offsets = rec_get_offsets( - btr_pcur_get_rec(&pcur), index, NULL, true, + btr_pcur_get_rec(&pcur), index, NULL, index->n_core_fields, ULINT_UNDEFINED, &offsets_heap); if (!log->same_pk) { /* Only update the record if DB_TRX_ID,DB_ROLL_PTR match what was buffered. 
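When the PRIMARY KEY definition has changed (!log->same_pk), the code below applies a buffered ROW_T_UPDATE only if the record still carries the DB_TRX_ID,DB_ROLL_PTR that was logged with it; a mismatch means a later logged operation owns that version of the row. A minimal sketch of the guard, using InnoDB's 6+7 byte system-field width and plain pointers, is:

#include <cstring>

static const unsigned SYS_FIELDS_LEN = 6 + 7;	/* DB_TRX_ID + DB_ROLL_PTR */

/* True if the record still carries the system fields that were logged with
the ROW_T_UPDATE, so the buffered update may be applied to it. */
static bool update_applies(const unsigned char* rec_sys,
			   const unsigned char* logged_sys)
{
	return std::memcmp(rec_sys, logged_sys, SYS_FIELDS_LEN) == 0;
}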
*/ ulint len; - const void* rec_trx_id + const byte* rec_trx_id = rec_get_nth_field(btr_pcur_get_rec(&pcur), cur_offsets, index->n_uniq, &len); + const dfield_t* old_pk_trx_id + = dtuple_get_nth_field(old_pk, index->n_uniq); ut_ad(len == DATA_TRX_ID_LEN); - ut_ad(dtuple_get_nth_field(old_pk, index->n_uniq)->len - == DATA_TRX_ID_LEN); - ut_ad(dtuple_get_nth_field(old_pk, index->n_uniq + 1)->len - == DATA_ROLL_PTR_LEN); - ut_ad(DATA_TRX_ID_LEN + static_cast<const char*>( - dtuple_get_nth_field(old_pk, - index->n_uniq)->data) - == dtuple_get_nth_field(old_pk, - index->n_uniq + 1)->data); - if (memcmp(rec_trx_id, - dtuple_get_nth_field(old_pk, index->n_uniq)->data, + ut_d(trx_id_check(rec_trx_id, log->min_trx)); + ut_ad(old_pk_trx_id->len == DATA_TRX_ID_LEN); + ut_ad(old_pk_trx_id[1].len == DATA_ROLL_PTR_LEN); + ut_ad(DATA_TRX_ID_LEN + + static_cast<const char*>(old_pk_trx_id->data) + == old_pk_trx_id[1].data); + ut_d(trx_id_check(old_pk_trx_id->data, log->min_trx)); + + if (memcmp(rec_trx_id, old_pk_trx_id->data, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)) { /* The ROW_T_UPDATE was logged for a different DB_TRX_ID,DB_ROLL_PTR. This is possible if an @@ -2090,7 +2285,7 @@ func_exit_committed: if (error == DB_SUCCESS) { error = row_log_table_apply_insert_low( - thr, row, trx_id, offsets_heap, heap, dup); + thr, row, offsets_heap, heap, dup); } goto func_exit_committed; @@ -2165,7 +2360,7 @@ func_exit_committed: } mtr_start(&mtr); - mtr.set_named_space(index->space); + index->set_modified(mtr); if (ROW_FOUND != row_search_index_entry( index, entry, BTR_MODIFY_TREE, &pcur, &mtr)) { @@ -2189,7 +2384,7 @@ func_exit_committed: BTR_CREATE_FLAG | BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG | BTR_KEEP_SYS_FLAG, BTR_MODIFY_TREE, index, offsets_heap, heap, - entry, trx_id, thr); + entry, thr_get_trx(thr)->id, thr); /* Report correct index name for duplicate key error. */ if (error == DB_DUPLICATE_KEY) { @@ -2197,7 +2392,7 @@ func_exit_committed: } mtr_start(&mtr); - mtr.set_named_space(index->space); + index->set_modified(mtr); } goto func_exit; @@ -2212,8 +2407,6 @@ const mrec_t* row_log_table_apply_op( /*===================*/ que_thr_t* thr, /*!< in: query graph */ - ulint trx_id_col, /*!< in: position of - DB_TRX_ID in old index */ ulint new_trx_id_col, /*!< in: position of DB_TRX_ID in new index */ row_merge_dup_t* dup, /*!< in/out: for reporting @@ -2245,6 +2438,7 @@ row_log_table_apply_op( return(NULL); } + const bool is_instant = log->is_instant(dup->index); const mrec_t* const mrec_start = mrec; switch (*mrec++) { @@ -2264,28 +2458,29 @@ row_log_table_apply_op( mrec += extra_size; + ut_ad(extra_size || !is_instant); + if (mrec > mrec_end) { return(NULL); } rec_offs_set_n_fields(offsets, dup->index->n_fields); - rec_init_offsets_temp(mrec, dup->index, offsets); + rec_init_offsets_temp(mrec, dup->index, offsets, + log->n_core_fields, log->non_core_fields, + is_instant + ? static_cast<rec_comp_status_t>( + *(mrec - extra_size)) + : REC_STATUS_ORDINARY); next_mrec = mrec + rec_offs_data_size(offsets); if (next_mrec > mrec_end) { return(NULL); } else { - log->head.total += next_mrec - mrec_start; - - ulint len; - const byte* db_trx_id - = rec_get_nth_field( - mrec, offsets, trx_id_col, &len); - ut_ad(len == DATA_TRX_ID_LEN); + log->head.total += ulint(next_mrec - mrec_start); *error = row_log_table_apply_insert( thr, mrec, offsets, offsets_heap, - heap, dup, trx_read_trx_id(db_trx_id)); + heap, dup); } break; @@ -2302,14 +2497,18 @@ row_log_table_apply_op( For fixed-length PRIMARY key columns, it is 0. 
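row_log_table_apply_op() repeatedly checks that the whole record fits in the buffered block and returns NULL otherwise, so the caller can read more of the log before retrying. A stripped-down sketch of that bounded-parsing pattern, with plain stand-in types, is:

#include <cstddef>

struct parsed_rec_sketch {
	const unsigned char*	body;
	size_t			size;
};

/* Return the position after the record, or NULL if it does not fit in the
buffered block and the caller has to read more of the log first. */
static const unsigned char* try_parse(const unsigned char*	mrec,
				      const unsigned char*	mrec_end,
				      size_t			extra_size,
				      size_t			data_size,
				      parsed_rec_sketch*	out)
{
	const unsigned char*	body = mrec + extra_size;
	const unsigned char*	next = body + data_size;

	if (next > mrec_end) {
		return NULL;
	}

	out->body = body;
	out->size = data_size;
	return next;
}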
*/ mrec += extra_size; - rec_offs_set_n_fields(offsets, new_index->n_uniq + 2); + /* The ROW_T_DELETE record was converted by + rec_convert_dtuple_to_temp() using new_index. */ + ut_ad(!new_index->is_instant()); + rec_offs_set_n_fields(offsets, + unsigned(new_index->n_uniq) + 2); rec_init_offsets_temp(mrec, new_index, offsets); next_mrec = mrec + rec_offs_data_size(offsets); if (next_mrec > mrec_end) { return(NULL); } - log->head.total += next_mrec - mrec_start; + log->head.total += ulint(next_mrec - mrec_start); *error = row_log_table_apply_delete( new_trx_id_col, @@ -2325,7 +2524,7 @@ row_log_table_apply_op( is not changed, the log will only contain DB_TRX_ID,new_row. */ - if (dup->index->online_log->same_pk) { + if (log->same_pk) { ut_ad(new_index->n_uniq == dup->index->n_uniq); extra_size = *mrec++; @@ -2339,12 +2538,20 @@ row_log_table_apply_op( mrec += extra_size; + ut_ad(extra_size || !is_instant); + if (mrec > mrec_end) { return(NULL); } rec_offs_set_n_fields(offsets, dup->index->n_fields); - rec_init_offsets_temp(mrec, dup->index, offsets); + rec_init_offsets_temp(mrec, dup->index, offsets, + log->n_core_fields, + log->non_core_fields, + is_instant + ? static_cast<rec_comp_status_t>( + *(mrec - extra_size)) + : REC_STATUS_ORDINARY); next_mrec = mrec + rec_offs_data_size(offsets); @@ -2382,7 +2589,11 @@ row_log_table_apply_op( /* Get offsets for PRIMARY KEY, DB_TRX_ID, DB_ROLL_PTR. */ - rec_offs_set_n_fields(offsets, new_index->n_uniq + 2); + /* The old_pk prefix was converted by + rec_convert_dtuple_to_temp() using new_index. */ + ut_ad(!new_index->is_instant()); + rec_offs_set_n_fields(offsets, + unsigned(new_index->n_uniq) + 2); rec_init_offsets_temp(mrec, new_index, offsets); next_mrec = mrec + rec_offs_data_size(offsets); @@ -2392,7 +2603,8 @@ row_log_table_apply_op( /* Copy the PRIMARY KEY fields and DB_TRX_ID, DB_ROLL_PTR from mrec to old_pk. */ - old_pk = dtuple_create(heap, new_index->n_uniq + 2); + old_pk = dtuple_create( + heap, unsigned(new_index->n_uniq) + 2); dict_index_copy_types(old_pk, new_index, old_pk->n_fields); @@ -2428,12 +2640,20 @@ row_log_table_apply_op( mrec += extra_size; + ut_ad(extra_size || !is_instant); + if (mrec > mrec_end) { return(NULL); } rec_offs_set_n_fields(offsets, dup->index->n_fields); - rec_init_offsets_temp(mrec, dup->index, offsets); + rec_init_offsets_temp(mrec, dup->index, offsets, + log->n_core_fields, + log->non_core_fields, + is_instant + ? static_cast<rec_comp_status_t>( + *(mrec - extra_size)) + : REC_STATUS_ORDINARY); next_mrec = mrec + rec_offs_data_size(offsets); @@ -2443,21 +2663,12 @@ row_log_table_apply_op( } ut_ad(next_mrec <= mrec_end); - log->head.total += next_mrec - mrec_start; + log->head.total += ulint(next_mrec - mrec_start); dtuple_set_n_fields_cmp(old_pk, new_index->n_uniq); - { - ulint len; - const byte* db_trx_id - = rec_get_nth_field( - mrec, offsets, trx_id_col, &len); - ut_ad(len == DATA_TRX_ID_LEN); - *error = row_log_table_apply_update( - thr, new_trx_id_col, - mrec, offsets, offsets_heap, - heap, dup, trx_read_trx_id(db_trx_id), old_pk); - } - + *error = row_log_table_apply_update( + thr, new_trx_id_col, + mrec, offsets, offsets_heap, heap, dup, old_pk); break; } @@ -2482,10 +2693,8 @@ row_log_progress_inc_per_block() /* We must increment the progress once per page (as in univ_page_size, usually 16KiB). One block here is srv_sort_buf_size (usually 1MiB). 
*/ - const ulint pages_per_block = std::max( - static_cast<unsigned long>( - srv_sort_buf_size / univ_page_size.physical()), - 1UL); + const ulint pages_per_block = std::max<ulint>( + ulint(srv_sort_buf_size >> srv_page_size_shift), 1); /* Multiply by an artificial factor of 6 to even the pace with the rest of the ALTER TABLE phases, they process page_size amount @@ -2553,8 +2762,6 @@ row_log_table_apply_ops( const ulint i = 1 + REC_OFFS_HEADER_SIZE + ut_max(dict_index_get_n_fields(index), dict_index_get_n_unique(new_index) + 2); - const ulint trx_id_col = dict_col_get_clust_pos( - dict_table_get_sys_col(index->table, DATA_TRX_ID), index); const ulint new_trx_id_col = dict_col_get_clust_pos( dict_table_get_sys_col(new_table, DATA_TRX_ID), new_index); trx_t* trx = thr_get_trx(thr); @@ -2564,21 +2771,20 @@ row_log_table_apply_ops( ut_ad(trx->mysql_thd); ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_X)); ut_ad(!dict_index_is_online_ddl(new_index)); - ut_ad(trx_id_col > 0); - ut_ad(trx_id_col != ULINT_UNDEFINED); + ut_ad(dict_col_get_clust_pos( + dict_table_get_sys_col(index->table, DATA_TRX_ID), index) + != ULINT_UNDEFINED); ut_ad(new_trx_id_col > 0); ut_ad(new_trx_id_col != ULINT_UNDEFINED); -#ifdef HAVE_valgrind_or_MSAN MEM_UNDEFINED(&mrec_end, sizeof mrec_end); -#endif /* HAVE_valgrind_or_MSAN */ offsets = static_cast<rec_offs*>(ut_malloc_nokey(i * sizeof *offsets)); rec_offs_set_n_alloc(offsets, i); rec_offs_set_n_fields(offsets, dict_index_get_n_fields(index)); - heap = mem_heap_create(UNIV_PAGE_SIZE); - offsets_heap = mem_heap_create(UNIV_PAGE_SIZE); + heap = mem_heap_create(srv_page_size); + offsets_heap = mem_heap_create(srv_page_size); has_index_lock = true; next_block: @@ -2670,12 +2876,12 @@ all_done: IORequest request(IORequest::READ); byte* buf = index->online_log->head.block; - if (DB_SUCCESS != os_file_read_no_error_handling_int_fd( + if (os_file_read_no_error_handling( request, index->online_log->fd, - buf, ofs, srv_sort_buf_size)) { + buf, ofs, srv_sort_buf_size, 0) != DB_SUCCESS) { ib::error() << "Unable to read temporary file" - " for table " << index->table_name; + " for table " << index->table->name; goto corruption; } @@ -2723,9 +2929,9 @@ all_done: ut_ad(mrec_end < (&index->online_log->head.buf)[1]); memcpy((mrec_t*) mrec_end, next_mrec, - (&index->online_log->head.buf)[1] - mrec_end); + ulint((&index->online_log->head.buf)[1] - mrec_end)); mrec = row_log_table_apply_op( - thr, trx_id_col, new_trx_id_col, + thr, new_trx_id_col, dup, &error, offsets_heap, heap, index->online_log->head.buf, (&index->online_log->head.buf)[1], offsets); @@ -2740,7 +2946,7 @@ all_done: it should proceed beyond the old end of the buffer. 
*/ ut_a(mrec > mrec_end); - index->online_log->head.bytes = mrec - mrec_end; + index->online_log->head.bytes = ulint(mrec - mrec_end); next_mrec += index->online_log->head.bytes; } @@ -2831,7 +3037,7 @@ all_done: } next_mrec = row_log_table_apply_op( - thr, trx_id_col, new_trx_id_col, + thr, new_trx_id_col, dup, &error, offsets_heap, heap, mrec, mrec_end, offsets); @@ -2856,7 +3062,8 @@ process_next_block: goto next_block; } else if (next_mrec != NULL) { ut_ad(next_mrec < next_mrec_end); - index->online_log->head.bytes += next_mrec - mrec; + index->online_log->head.bytes + += ulint(next_mrec - mrec); } else if (has_index_lock) { /* When mrec is within tail.block, it should be a complete record, because we are holding @@ -2868,8 +3075,8 @@ process_next_block: goto unexpected_eof; } else { memcpy(index->online_log->head.buf, mrec, - mrec_end - mrec); - mrec_end += index->online_log->head.buf - mrec; + ulint(mrec_end - mrec)); + mrec_end += ulint(index->online_log->head.buf - mrec); mrec = index->online_log->head.buf; goto process_next_block; } @@ -2896,13 +3103,15 @@ func_exit: @param[in,out] stage performance schema accounting object, used by ALTER TABLE. stage->begin_phase_log_table() will be called initially and then stage->inc() will be called for each block of log that is applied. +@param[in] new_table Altered table @return DB_SUCCESS, or error code on failure */ dberr_t row_log_table_apply( que_thr_t* thr, dict_table_t* old_table, struct TABLE* table, - ut_stage_alter_t* stage) + ut_stage_alter_t* stage, + dict_table_t* new_table) { dberr_t error; dict_index_t* clust_index; @@ -2916,6 +3125,10 @@ row_log_table_apply( ut_ad(!rw_lock_own(&dict_operation_lock, RW_LOCK_S)); clust_index = dict_table_get_first_index(old_table); + if (clust_index->online_log->n_rows == 0) { + clust_index->online_log->n_rows = new_table->stat_n_rows; + } + rw_lock_x_lock(dict_index_get_lock(clust_index)); if (!clust_index->online_log) { @@ -2953,17 +3166,21 @@ for online creation. 
bool row_log_allocate( /*=============*/ + const trx_t* trx, /*!< in: the ALTER TABLE transaction */ dict_index_t* index, /*!< in/out: index */ dict_table_t* table, /*!< in/out: new table being rebuilt, or NULL when creating a secondary index */ bool same_pk,/*!< in: whether the definition of the PRIMARY KEY has remained the same */ - const dtuple_t* add_cols, + const dtuple_t* defaults, /*!< in: default values of - added columns, or NULL */ + added, changed columns, or NULL */ const ulint* col_map,/*!< in: mapping of old column numbers to new ones, or NULL if !table */ - const char* path) /*!< in: where to create temporary file */ + const char* path, /*!< in: where to create temporary file */ + const TABLE* old_table, /*!< in: table definition before alter */ + const bool allow_not_null) /*!< in: allow null to not-null + conversion */ { row_log_t* log; DBUG_ENTER("row_log_allocate"); @@ -2973,8 +3190,10 @@ row_log_allocate( ut_ad(!table || index->table != table); ut_ad(same_pk || table); ut_ad(!table || col_map); - ut_ad(!add_cols || col_map); + ut_ad(!defaults || col_map); ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_X)); + ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE)); + ut_ad(trx->id); log = static_cast<row_log_t*>(ut_malloc_nokey(sizeof *log)); @@ -2982,15 +3201,16 @@ row_log_allocate( DBUG_RETURN(false); } - log->fd = -1; + log->fd = OS_FILE_CLOSED; mutex_create(LATCH_ID_INDEX_ONLINE_LOG, &log->mutex); log->blobs = NULL; log->table = table; log->same_pk = same_pk; - log->add_cols = add_cols; + log->defaults = defaults; log->col_map = col_map; log->error = DB_SUCCESS; + log->min_trx = trx->id; log->max_trx = 0; log->tail.blocks = log->tail.bytes = 0; log->tail.total = 0; @@ -2999,6 +3219,23 @@ row_log_allocate( log->head.blocks = log->head.bytes = 0; log->head.total = 0; log->path = path; + log->n_core_fields = index->n_core_fields; + ut_ad(!table || log->is_instant(index) == index->is_instant()); + log->allow_not_null = allow_not_null; + log->old_table = old_table; + log->n_rows = 0; + + if (table && index->is_instant()) { + const unsigned n = log->n_core_fields; + log->non_core_fields = UT_NEW_ARRAY_NOKEY( + dict_col_t::def_t, index->n_fields - n); + for (unsigned i = n; i < index->n_fields; i++) { + log->non_core_fields[i - n] + = index->fields[i].col->def_val; + } + } else { + log->non_core_fields = NULL; + } dict_index_set_online_status(index, ONLINE_INDEX_CREATION); @@ -3032,6 +3269,7 @@ row_log_free( MONITOR_ATOMIC_DEC(MONITOR_ONLINE_CREATE_INDEX); UT_DELETE(log->blobs); + UT_DELETE_ARRAY(log->non_core_fields); row_log_block_free(log->tail); row_log_block_free(log->head); row_merge_file_destroy_low(log->fd); @@ -3103,7 +3341,7 @@ row_log_apply_op_low( << rec_printer(entry).str()); mtr_start(&mtr); - mtr.set_named_space(index->space); + index->set_modified(mtr); /* We perform the pessimistic variant of the operations if we already hold index->lock exclusively. First, search the @@ -3115,7 +3353,7 @@ row_log_apply_op_low( ? BTR_MODIFY_TREE : BTR_MODIFY_LEAF, &cursor, 0, __FILE__, __LINE__, - &mtr, 0); + &mtr); ut_ad(dict_index_get_n_unique(index) > 0); /* This test is somewhat similar to row_ins_must_modify_rec(), @@ -3160,11 +3398,11 @@ row_log_apply_op_low( Lock the index tree exclusively. 
*/ mtr_commit(&mtr); mtr_start(&mtr); - mtr.set_named_space(index->space); + index->set_modified(mtr); btr_cur_search_to_nth_level( index, 0, entry, PAGE_CUR_LE, BTR_MODIFY_TREE, &cursor, 0, - __FILE__, __LINE__, &mtr, 0); + __FILE__, __LINE__, &mtr); /* No other thread than the current one is allowed to modify the index tree. @@ -3263,11 +3501,11 @@ insert_the_rec: Lock the index tree exclusively. */ mtr_commit(&mtr); mtr_start(&mtr); - mtr.set_named_space(index->space); + index->set_modified(mtr); btr_cur_search_to_nth_level( index, 0, entry, PAGE_CUR_LE, BTR_MODIFY_TREE, &cursor, 0, - __FILE__, __LINE__, &mtr, 0); + __FILE__, __LINE__, &mtr); } /* We already determined that the @@ -3447,16 +3685,15 @@ row_log_apply_ops( ut_ad(!index->is_committed()); ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_X)); ut_ad(index->online_log); -#ifdef HAVE_valgrind_or_MSAN + MEM_UNDEFINED(&mrec_end, sizeof mrec_end); -#endif /* HAVE_valgrind_or_MSAN */ offsets = static_cast<rec_offs*>(ut_malloc_nokey(i * sizeof *offsets)); rec_offs_set_n_alloc(offsets, i); rec_offs_set_n_fields(offsets, dict_index_get_n_fields(index)); - offsets_heap = mem_heap_create(UNIV_PAGE_SIZE); - heap = mem_heap_create(UNIV_PAGE_SIZE); + offsets_heap = mem_heap_create(srv_page_size); + heap = mem_heap_create(srv_page_size); has_index_lock = true; next_block: @@ -3540,9 +3777,9 @@ all_done: byte* buf = index->online_log->head.block; - if (DB_SUCCESS != os_file_read_no_error_handling_int_fd( + if (os_file_read_no_error_handling( request, index->online_log->fd, - buf, ofs, srv_sort_buf_size)) { + buf, ofs, srv_sort_buf_size, 0) != DB_SUCCESS) { ib::error() << "Unable to read temporary file" " for index " << index->name; @@ -3583,7 +3820,7 @@ all_done: ut_ad(mrec_end < (&index->online_log->head.buf)[1]); memcpy((mrec_t*) mrec_end, next_mrec, - (&index->online_log->head.buf)[1] - mrec_end); + ulint((&index->online_log->head.buf)[1] - mrec_end)); mrec = row_log_apply_op( index, dup, &error, offsets_heap, heap, has_index_lock, index->online_log->head.buf, @@ -3599,7 +3836,7 @@ all_done: it should proceed beyond the old end of the buffer. */ ut_a(mrec > mrec_end); - index->online_log->head.bytes = mrec - mrec_end; + index->online_log->head.bytes = ulint(mrec - mrec_end); next_mrec += index->online_log->head.bytes; } @@ -3697,7 +3934,8 @@ process_next_block: goto next_block; } else if (next_mrec != NULL) { ut_ad(next_mrec < next_mrec_end); - index->online_log->head.bytes += next_mrec - mrec; + index->online_log->head.bytes + += ulint(next_mrec - mrec); } else if (has_index_lock) { /* When mrec is within tail.block, it should be a complete record, because we are holding @@ -3709,8 +3947,8 @@ process_next_block: goto unexpected_eof; } else { memcpy(index->online_log->head.buf, mrec, - mrec_end - mrec); - mrec_end += index->online_log->head.buf - mrec; + ulint(mrec_end - mrec)); + mrec_end += ulint(index->online_log->head.buf - mrec); mrec = index->online_log->head.buf; goto process_next_block; } @@ -3784,7 +4022,7 @@ row_log_apply( } if (error != DB_SUCCESS) { - ut_a(!dict_table_is_discarded(index->table)); + ut_ad(index->table->space); /* We set the flag directly instead of invoking dict_set_corrupted_index_cache_only(index) here, because the index is not "public" yet. 
*/ @@ -3805,3 +4043,9 @@ row_log_apply( DBUG_RETURN(error); } + +unsigned row_log_get_n_core_fields(const dict_index_t *index) +{ + ut_ad(index->online_log); + return index->online_log->n_core_fields; +} diff --git a/storage/innobase/row/row0merge.cc b/storage/innobase/row/row0merge.cc index e3b3f2c2762..f77eae9a76d 100644 --- a/storage/innobase/row/row0merge.cc +++ b/storage/innobase/row/row0merge.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 2005, 2017, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2014, 2020, MariaDB Corporation. +Copyright (c) 2014, 2021, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -24,7 +24,7 @@ New index creation routines using a merge sort Created 12/4/2005 Jan Lindstrom Completed by Sunny Bains and Marko Makela *******************************************************/ -#include <my_config.h> +#include <my_global.h> #include <log.h> #include <sql_class.h> #include <math.h> @@ -134,7 +134,7 @@ public: ut_ad(dict_index_is_spatial(m_index)); DBUG_EXECUTE_IF("row_merge_instrument_log_check_flush", - log_sys->check_flush_or_checkpoint = true; + log_sys.check_flush_or_checkpoint = true; ); for (idx_tuple_vec::iterator it = m_dtuple_vec->begin(); @@ -143,7 +143,7 @@ public: dtuple = *it; ut_ad(dtuple); - if (log_sys->check_flush_or_checkpoint) { + if (log_sys.check_flush_or_checkpoint) { if (scan_mtr->is_active()) { btr_pcur_move_to_prev_on_page(pcur); btr_pcur_store_position(pcur, scan_mtr); @@ -154,7 +154,7 @@ public: } mtr.start(); - mtr.set_named_space(m_index->space); + m_index->set_modified(mtr); ins_cur.index = m_index; rtr_init_rtr_info(&rtr_info, false, &ins_cur, m_index, @@ -165,7 +165,7 @@ public: PAGE_CUR_RTREE_INSERT, BTR_MODIFY_LEAF, &ins_cur, 0, __FILE__, __LINE__, - &mtr, 0); + &mtr); /* It need to update MBR in parent entry, so change search mode to BTR_MODIFY_TREE */ @@ -176,12 +176,12 @@ public: m_index, false); rtr_info_update_btr(&ins_cur, &rtr_info); mtr_start(&mtr); - mtr.set_named_space(m_index->space); + m_index->set_modified(mtr); btr_cur_search_to_nth_level( m_index, 0, dtuple, PAGE_CUR_RTREE_INSERT, BTR_MODIFY_TREE, &ins_cur, 0, - __FILE__, __LINE__, &mtr, 0); + __FILE__, __LINE__, &mtr); } error = btr_cur_optimistic_insert( @@ -192,7 +192,7 @@ public: ut_ad(!big_rec); mtr.commit(); mtr.start(); - mtr.set_named_space(m_index->space); + m_index->set_modified(mtr); rtr_clean_rtr_info(&rtr_info, true); rtr_init_rtr_info(&rtr_info, false, @@ -204,7 +204,7 @@ public: PAGE_CUR_RTREE_INSERT, BTR_MODIFY_TREE, &ins_cur, 0, - __FILE__, __LINE__, &mtr, 0); + __FILE__, __LINE__, &mtr); error = btr_cur_pessimistic_insert( flag, &ins_cur, &ins_offsets, @@ -220,7 +220,7 @@ public: if (error == DB_SUCCESS) { if (rtr_info.mbr_adj) { error = rtr_ins_enlarge_mbr( - &ins_cur, NULL, &mtr); + &ins_cur, &mtr); } if (error == DB_SUCCESS) { @@ -278,7 +278,7 @@ dberr_t row_merge_insert_index_tuples( dict_index_t* index, const dict_table_t* old_table, - int fd, + const pfs_os_file_t& fd, row_merge_block_t* block, const row_merge_buf_t* row_buf, BtrBulk* btr_bulk, @@ -308,7 +308,7 @@ row_merge_buf_encode( ulint size; ulint extra_size; - size = rec_get_converted_size_temp( + size = rec_get_converted_size_temp<false>( index, entry->fields, n_fields, &extra_size); ut_ad(size >= extra_size); @@ -321,7 +321,7 @@ row_merge_buf_encode( *(*b)++ = (byte) 
(extra_size + 1); } - rec_convert_dtuple_to_temp(*b + extra_size, index, + rec_convert_dtuple_to_temp<false>(*b + extra_size, index, entry->fields, n_fields); *b += size; @@ -548,7 +548,7 @@ row_merge_buf_add( mem_heap_alloc(buf->heap, n_fields * sizeof *entry->fields)); data_size = 0; - extra_size = UT_BITS_IN_BYTES(index->n_nullable); + extra_size = UT_BITS_IN_BYTES(unsigned(index->n_nullable)); ifield = dict_index_get_nth_field(index, 0); @@ -796,7 +796,7 @@ row_merge_buf_add( ulint size; ulint extra; - size = rec_get_converted_size_temp( + size = rec_get_converted_size_temp<false>( index, entry->fields, n_fields, &extra); ut_ad(data_size + extra_size == size); @@ -812,9 +812,9 @@ row_merge_buf_add( /* Record size can exceed page size while converting to redundant row format. But there is assert - ut_ad(size < UNIV_PAGE_SIZE) in rec_offs_data_size(). + ut_ad(size < srv_page_size) in rec_offs_data_size(). It may hit the assert before attempting to insert the row. */ - if (conv_heap != NULL && data_size > UNIV_PAGE_SIZE) { + if (conv_heap != NULL && data_size > srv_page_size) { *err = DB_TOO_BIG_RECORD; } @@ -1023,11 +1023,11 @@ row_merge_buf_write( ut_a(b < &block[srv_sort_buf_size]); ut_a(b == &block[0] + buf->total_size); *b++ = 0; -#ifdef HAVE_valgrind_or_MSAN +#ifdef HAVE_valgrind /* The rest of the block is uninitialized. Initialize it to avoid bogus warnings. */ memset(b, 0xff, &block[srv_sort_buf_size] - b); -#endif /* HAVE_valgrind_or_MSAN */ +#endif /* HAVE_valgrind */ DBUG_LOG("ib_merge_sort", "write " << reinterpret_cast<const void*>(b) << ',' << of->fd << ',' << of->offset << " EOF"); @@ -1072,7 +1072,7 @@ row_merge_heap_create( bool row_merge_read( /*===========*/ - int fd, /*!< in: file descriptor */ + const pfs_os_file_t& fd, /*!< in: file descriptor */ ulint offset, /*!< in: offset where to read in number of row_merge_block_t elements */ @@ -1087,9 +1087,8 @@ row_merge_read( DBUG_EXECUTE_IF("row_merge_read_failure", DBUG_RETURN(FALSE);); IORequest request(IORequest::READ); - const bool success = DB_SUCCESS - == os_file_read_no_error_handling_int_fd( - request, fd, buf, ofs, srv_sort_buf_size); + const bool success = DB_SUCCESS == os_file_read_no_error_handling( + request, fd, buf, ofs, srv_sort_buf_size, 0); /* If encryption is enabled decrypt buffer */ if (success && log_tmp_is_encrypted()) { @@ -1123,7 +1122,7 @@ UNIV_INTERN bool row_merge_write( /*============*/ - int fd, /*!< in: file descriptor */ + const pfs_os_file_t& fd, /*!< in: file descriptor */ ulint offset, /*!< in: offset where to write, in number of row_merge_block_t elements */ const void* buf, /*!< in: data */ @@ -1152,7 +1151,7 @@ row_merge_write( } IORequest request(IORequest::WRITE); - const bool success = DB_SUCCESS == os_file_write_int_fd( + const bool success = DB_SUCCESS == os_file_write( request, "(merge)", fd, out_buf, ofs, buf_len); #ifdef POSIX_FADV_DONTNEED @@ -1174,7 +1173,7 @@ row_merge_read_rec( mrec_buf_t* buf, /*!< in/out: secondary buffer */ const byte* b, /*!< in: pointer to record */ const dict_index_t* index, /*!< in: index of the record */ - int fd, /*!< in: file descriptor */ + const pfs_os_file_t& fd, /*!< in: file descriptor */ ulint* foffs, /*!< in/out: file offset */ const mrec_t** mrec, /*!< out: pointer to merge record, or NULL on end of list @@ -1238,7 +1237,7 @@ err_exit: to the auxiliary buffer and handle this as a special case. 
*/ - avail_size = &block[srv_sort_buf_size] - b; + avail_size = ulint(&block[srv_sort_buf_size] - b); ut_ad(avail_size < sizeof *buf); memcpy(*buf, b, avail_size); @@ -1293,7 +1292,7 @@ err_exit: /* The record spans two blocks. Copy it to buf. */ b -= extra_size + data_size; - avail_size = &block[srv_sort_buf_size] - b; + avail_size = ulint(&block[srv_sort_buf_size] - b); memcpy(*buf, b, avail_size); *mrec = *buf + extra_size; @@ -1332,7 +1331,7 @@ row_merge_write_rec_low( ulint e, /*!< in: encoded extra_size */ #ifndef DBUG_OFF ulint size, /*!< in: total size to write */ - int fd, /*!< in: file descriptor */ + const pfs_os_file_t& fd, /*!< in: file descriptor */ ulint foffs, /*!< in: file offset */ #endif /* !DBUG_OFF */ const mrec_t* mrec, /*!< in: record to write */ @@ -1361,7 +1360,7 @@ row_merge_write_rec_low( } memcpy(b, mrec - rec_offs_extra_size(offsets), rec_offs_size(offsets)); - DBUG_ASSERT(b + rec_offs_size(offsets) == end); + DBUG_SLOW_ASSERT(b + rec_offs_size(offsets) == end); DBUG_VOID_RETURN; } @@ -1375,7 +1374,7 @@ row_merge_write_rec( row_merge_block_t* block, /*!< in/out: file buffer */ mrec_buf_t* buf, /*!< in/out: secondary buffer */ byte* b, /*!< in: pointer to end of block */ - int fd, /*!< in: file descriptor */ + const pfs_os_file_t& fd, /*!< in: file descriptor */ ulint* foffs, /*!< in/out: file offset */ const mrec_t* mrec, /*!< in: record to write */ const rec_offs* offsets,/*!< in: offsets of mrec */ @@ -1404,7 +1403,7 @@ row_merge_write_rec( if (UNIV_UNLIKELY(b + size >= &block[srv_sort_buf_size])) { /* The record spans two blocks. Copy it to the temporary buffer first. */ - avail_size = &block[srv_sort_buf_size] - b; + avail_size = ulint(&block[srv_sort_buf_size] - b); row_merge_write_rec_low(buf[0], extra_size, size, fd, *foffs, @@ -1421,9 +1420,7 @@ row_merge_write_rec( return(NULL); } -#ifdef HAVE_valgrind_or_MSAN MEM_UNDEFINED(&block[0], srv_sort_buf_size); -#endif /* HAVE_valgrind_or_MSAN */ /* Copy the rest. */ b = &block[0]; @@ -1447,7 +1444,7 @@ row_merge_write_eof( /*================*/ row_merge_block_t* block, /*!< in/out: file buffer */ byte* b, /*!< in: pointer to end of block */ - int fd, /*!< in: file descriptor */ + const pfs_os_file_t& fd, /*!< in: file descriptor */ ulint* foffs, /*!< in/out: file offset */ row_merge_block_t* crypt_block, /*!< in: crypt buf or NULL */ ulint space) /*!< in: space id */ @@ -1474,57 +1471,55 @@ row_merge_write_eof( DBUG_RETURN(NULL); } -#ifdef HAVE_valgrind_or_MSAN MEM_UNDEFINED(&block[0], srv_sort_buf_size); -#endif DBUG_RETURN(&block[0]); } /** Create a temporary file if it has not been created already. @param[in,out] tmpfd temporary file handle @param[in] path location for creating temporary file -@return file descriptor, or -1 on failure */ +@return true on success, false on error */ static MY_ATTRIBUTE((warn_unused_result)) -int +bool row_merge_tmpfile_if_needed( - int* tmpfd, + pfs_os_file_t* tmpfd, const char* path) { - if (*tmpfd < 0) { + if (*tmpfd == OS_FILE_CLOSED) { *tmpfd = row_merge_file_create_low(path); - if (*tmpfd >= 0) { + if (*tmpfd != OS_FILE_CLOSED) { MONITOR_ATOMIC_INC(MONITOR_ALTER_TABLE_SORT_FILES); } } - return(*tmpfd); + return(*tmpfd != OS_FILE_CLOSED); } /** Create a temporary file for merge sort if it was not created already. 
@param[in,out] file merge file structure @param[in] nrec number of records in the file @param[in] path location for creating temporary file -@return file descriptor, or -1 on failure */ +@return true on success, false on error */ static MY_ATTRIBUTE((warn_unused_result)) -int +bool row_merge_file_create_if_needed( merge_file_t* file, - int* tmpfd, + pfs_os_file_t* tmpfd, ulint nrec, const char* path) { - ut_ad(file->fd < 0 || *tmpfd >=0); - if (file->fd < 0 && row_merge_file_create(file, path) >= 0) { + ut_ad(file->fd == OS_FILE_CLOSED || *tmpfd != OS_FILE_CLOSED); + if (file->fd == OS_FILE_CLOSED && row_merge_file_create(file, path)!= OS_FILE_CLOSED) { MONITOR_ATOMIC_INC(MONITOR_ALTER_TABLE_SORT_FILES); - if (row_merge_tmpfile_if_needed(tmpfd, path) < 0) { - return(-1); + if (!row_merge_tmpfile_if_needed(tmpfd, path) ) { + return(false); } file->n_rec = nrec; } - ut_ad(file->fd < 0 || *tmpfd >=0); - return(file->fd); + ut_ad(file->fd == OS_FILE_CLOSED || *tmpfd != OS_FILE_CLOSED); + return(file->fd != OS_FILE_CLOSED); } /** Copy the merge data tuple from another merge data tuple. @@ -1654,7 +1649,7 @@ containing the index entries for the indexes to be built. @param[in] files temporary files @param[in] key_numbers MySQL key numbers to create @param[in] n_index number of indexes to create -@param[in] add_cols default values of added columns, or NULL +@param[in] defaults default values of added, changed columns, or NULL @param[in] add_v newly added virtual columns along with indexes @param[in] col_map mapping of old column numbers to new ones, or NULL if old_table == new_table @@ -1672,6 +1667,7 @@ stage->inc() will be called for each page read. @param[in,out] crypt_block crypted file buffer @param[in] eval_table mysql table used to evaluate virtual column value, see innobase_get_computed_value(). +@param[in] allow_not_null allow null to not-null conversion @return DB_SUCCESS or error */ static MY_ATTRIBUTE((warn_unused_result)) dberr_t @@ -1679,7 +1675,7 @@ row_merge_read_clustered_index( trx_t* trx, struct TABLE* table, const dict_table_t* old_table, - const dict_table_t* new_table, + dict_table_t* new_table, bool online, dict_index_t** index, dict_index_t* fts_sort_idx, @@ -1687,18 +1683,19 @@ row_merge_read_clustered_index( merge_file_t* files, const ulint* key_numbers, ulint n_index, - const dtuple_t* add_cols, + const dtuple_t* defaults, const dict_add_v_col_t* add_v, const ulint* col_map, ulint add_autoinc, ib_sequence_t& sequence, row_merge_block_t* block, bool skip_pk_sort, - int* tmpfd, + pfs_os_file_t* tmpfd, ut_stage_alter_t* stage, double pct_cost, row_merge_block_t* crypt_block, - struct TABLE* eval_table) + struct TABLE* eval_table, + bool allow_not_null) { dict_index_t* clust_index; /* Clustered index */ mem_heap_t* row_heap = NULL;/* Heap memory to create @@ -1731,11 +1728,17 @@ row_merge_read_clustered_index( double curr_progress = 0.0; ib_uint64_t read_rows = 0; ib_uint64_t table_total_rows = 0; + char new_sys_trx_start[8]; + char new_sys_trx_end[8]; + byte any_autoinc_data[8] = {0}; + bool vers_update_trt = false; DBUG_ENTER("row_merge_read_clustered_index"); ut_ad((old_table == new_table) == !col_map); - ut_ad(!add_cols || col_map); + ut_ad(!defaults || col_map); + ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE)); + ut_ad(trx->id); table_total_rows = dict_table_get_n_rows(old_table); if(table_total_rows == 0) { @@ -1831,9 +1834,27 @@ row_merge_read_clustered_index( based on that. 
*/ clust_index = dict_table_get_first_index(old_table); + const ulint old_trx_id_col = DATA_TRX_ID - DATA_N_SYS_COLS + + ulint(old_table->n_cols); + ut_ad(old_table->cols[old_trx_id_col].mtype == DATA_SYS); + ut_ad(old_table->cols[old_trx_id_col].prtype + == (DATA_TRX_ID | DATA_NOT_NULL)); + ut_ad(old_table->cols[old_trx_id_col + 1].mtype == DATA_SYS); + ut_ad(old_table->cols[old_trx_id_col + 1].prtype + == (DATA_ROLL_PTR | DATA_NOT_NULL)); + const ulint new_trx_id_col = col_map + ? col_map[old_trx_id_col] : old_trx_id_col; btr_pcur_open_at_index_side( true, clust_index, BTR_SEARCH_LEAF, &pcur, true, 0, &mtr); + btr_pcur_move_to_next_user_rec(&pcur, &mtr); + if (rec_is_metadata(btr_pcur_get_rec(&pcur), clust_index)) { + ut_ad(btr_pcur_is_on_user_rec(&pcur)); + /* Skip the metadata pseudo-record. */ + } else { + ut_ad(!clust_index->is_instant()); + btr_pcur_move_to_prev_on_page(&pcur); + } if (old_table != new_table) { /* The table is being rebuilt. Identify the columns @@ -1885,6 +1906,10 @@ row_merge_read_clustered_index( prev_fields = NULL; } + mach_write_to_8(new_sys_trx_start, trx->id); + mach_write_to_8(new_sys_trx_end, TRX_ID_MAX); + uint64_t n_rows = 0; + /* Scan the clustered index. */ for (;;) { /* Do not continue if table pages are still encrypted */ @@ -1895,6 +1920,7 @@ row_merge_read_clustered_index( } const rec_t* rec; + trx_id_t rec_trx_id; rec_offs* offsets; dtuple_t* row; row_ext_t* ext; @@ -1924,15 +1950,6 @@ row_merge_read_clustered_index( } } -#ifdef DBUG_OFF -# define dbug_run_purge false -#else /* DBUG_OFF */ - bool dbug_run_purge = false; -#endif /* DBUG_OFF */ - DBUG_EXECUTE_IF( - "ib_purge_on_create_index_page_switch", - dbug_run_purge = true;); - /* Insert the cached spatial index rows. */ err = row_merge_spatial_rows( trx->id, sp_tuples, num_spatial, @@ -1946,8 +1963,8 @@ row_merge_read_clustered_index( goto scan_next; } - if (dbug_run_purge - || dict_index_get_lock(clust_index)->waiters) { + if (my_atomic_load32_explicit(&clust_index->lock.waiters, + MY_MEMORY_ORDER_RELAXED)) { /* There are waiters on the clustered index tree lock, likely the purge thread. Store and restore the cursor @@ -1968,18 +1985,6 @@ row_merge_read_clustered_index( btr_pcur_store_position(&pcur, &mtr); mtr_commit(&mtr); - if (dbug_run_purge) { - /* This is for testing - purposes only (see - DBUG_EXECUTE_IF above). We - signal the purge thread and - hope that the purge batch will - complete before we execute - btr_pcur_restore_position(). */ - trx_purge_run(); - os_thread_sleep(1000000); - } - /* Give the waiters a chance to proceed. */ os_thread_yield(); scan_next: @@ -2033,8 +2038,11 @@ end_of_index: rec = page_cur_get_rec(cur); if (online) { - offsets = rec_get_offsets(rec, clust_index, NULL, true, + offsets = rec_get_offsets(rec, clust_index, NULL, + clust_index->n_core_fields, ULINT_UNDEFINED, &row_heap); + rec_trx_id = row_get_rec_trx_id(rec, clust_index, + offsets); /* Perform a REPEATABLE READ. @@ -2055,33 +2063,45 @@ end_of_index: ONLINE_INDEX_COMPLETE state between the time the DML thread has updated the clustered index but has not yet accessed secondary index. 
*/ - ut_ad(MVCC::is_view_active(trx->read_view)); + ut_ad(trx->read_view.is_open()); + ut_ad(rec_trx_id != trx->id); - if (!trx->read_view->changes_visible( - row_get_rec_trx_id( - rec, clust_index, offsets), - old_table->name)) { + if (!trx->read_view.changes_visible( + rec_trx_id, old_table->name)) { rec_t* old_vers; row_vers_build_for_consistent_read( rec, &mtr, clust_index, &offsets, - trx->read_view, &row_heap, + &trx->read_view, &row_heap, row_heap, &old_vers, NULL); - rec = old_vers; - - if (!rec) { + if (!old_vers) { continue; } + + /* The old version must necessarily be + in the "prehistory", because the + exclusive lock in + ha_innobase::prepare_inplace_alter_table() + forced the completion of any transactions + that accessed this table. */ + ut_ad(row_get_rec_trx_id(old_vers, clust_index, + offsets) < trx->id); + + rec = old_vers; + rec_trx_id = 0; } if (rec_get_deleted_flag( rec, dict_table_is_comp(old_table))) { /* In delete-marked records, DB_TRX_ID must - always refer to an existing undo log record. */ - ut_ad(row_get_rec_trx_id(rec, clust_index, - offsets)); + always refer to an existing undo log record. + Above, we did reset rec_trx_id = 0 + for rec = old_vers.*/ + ut_ad(rec == page_cur_get_rec(cur) + ? rec_trx_id + : !rec_trx_id); /* This record was deleted in the latest committed version, or it was deleted and then reinserted-by-update before purge @@ -2094,19 +2114,38 @@ end_of_index: rec, dict_table_is_comp(old_table))) { /* In delete-marked records, DB_TRX_ID must always refer to an existing undo log record. */ - ut_ad(rec_get_trx_id(rec, clust_index)); + ut_d(rec_trx_id = rec_get_trx_id(rec, clust_index)); + ut_ad(rec_trx_id); + /* This must be a purgeable delete-marked record, + and the transaction that delete-marked the record + must have been committed before this + !online ALTER TABLE transaction. */ + ut_ad(rec_trx_id < trx->id); /* Skip delete-marked records. Skipping delete-marked records will make the created indexes unuseable for transactions whose read views were created before the index - creation completed, but preserving the history - would make it tricky to detect duplicate - keys. */ + creation completed, but an attempt to preserve + the history would make it tricky to detect + duplicate keys. */ continue; } else { - offsets = rec_get_offsets(rec, clust_index, NULL, true, + offsets = rec_get_offsets(rec, clust_index, NULL, + clust_index->n_core_fields, ULINT_UNDEFINED, &row_heap); + /* This is a locking ALTER TABLE. + + If we are not rebuilding the table, the + DB_TRX_ID does not matter, as it is not being + written to any secondary indexes; see + if (old_table == new_table) below. + + If we are rebuilding the table, the + DB_TRX_ID,DB_ROLL_PTR should be reset, because + there will be no history available. 
*/ + ut_ad(rec_get_trx_id(rec, clust_index) < trx->id); + rec_trx_id = 0; } /* When !online, we are holding a lock on old_table, preventing @@ -2118,19 +2157,35 @@ end_of_index: row = row_build_w_add_vcol(ROW_COPY_POINTERS, clust_index, rec, offsets, new_table, - add_cols, add_v, col_map, &ext, + defaults, add_v, col_map, &ext, row_heap); ut_ad(row); for (ulint i = 0; i < n_nonnull; i++) { - const dfield_t* field = &row->fields[nonnull[i]]; + dfield_t* field = &row->fields[nonnull[i]]; ut_ad(dfield_get_type(field)->prtype & DATA_NOT_NULL); if (dfield_is_null(field)) { - err = DB_INVALID_NULL; - trx->error_key_num = 0; - goto func_exit; + + Field* null_field = + table->field[nonnull[i]]; + + null_field->set_warning( + Sql_condition::WARN_LEVEL_WARN, + WARN_DATA_TRUNCATED, 1, + ulong(n_rows + 1)); + + if (!allow_not_null) { + err = DB_INVALID_NULL; + trx->error_key_num = 0; + goto func_exit; + } + + const dfield_t& default_field + = defaults->fields[nonnull[i]]; + + *field = default_field; } } @@ -2141,13 +2196,62 @@ end_of_index: doc_id = 0; } + ut_ad(row->fields[new_trx_id_col].type.mtype == DATA_SYS); + ut_ad(row->fields[new_trx_id_col].type.prtype + == (DATA_TRX_ID | DATA_NOT_NULL)); + ut_ad(row->fields[new_trx_id_col].len == DATA_TRX_ID_LEN); + ut_ad(row->fields[new_trx_id_col + 1].type.mtype == DATA_SYS); + ut_ad(row->fields[new_trx_id_col + 1].type.prtype + == (DATA_ROLL_PTR | DATA_NOT_NULL)); + ut_ad(row->fields[new_trx_id_col + 1].len == DATA_ROLL_PTR_LEN); + + if (old_table == new_table) { + /* Do not bother touching DB_TRX_ID,DB_ROLL_PTR + because they are not going to be written into + secondary indexes. */ + } else if (rec_trx_id < trx->id) { + /* Reset the DB_TRX_ID,DB_ROLL_PTR of old rows + for which history is not going to be + available after the rebuild operation. + This essentially mimics row_purge_reset_trx_id(). */ + row->fields[new_trx_id_col].data + = const_cast<byte*>(reset_trx_id); + row->fields[new_trx_id_col + 1].data + = const_cast<byte*>(reset_trx_id + + DATA_TRX_ID_LEN); + } + if (add_autoinc != ULINT_UNDEFINED) { ut_ad(add_autoinc < dict_table_get_n_user_cols(new_table)); + bool history_row = false; + if (new_table->versioned()) { + const dfield_t* dfield = dtuple_get_nth_field( + row, new_table->vers_end); + history_row = dfield->vers_history_row(); + } + dfield_t* dfield = dtuple_get_nth_field(row, add_autoinc); + + if (new_table->versioned()) { + if (history_row) { + if (dfield_get_type(dfield)->prtype & DATA_NOT_NULL) { + err = DB_UNSUPPORTED; + my_error(ER_UNSUPPORTED_EXTENSION, MYF(0), + old_table->name.m_name); + goto func_exit; + } + dfield_set_null(dfield); + } else { + // set not null + ulint len = dfield_get_type(dfield)->len; + dfield_set_data(dfield, any_autoinc_data, len); + } + } + if (dfield_is_null(dfield)) { goto write_buffers; } @@ -2193,10 +2297,26 @@ end_of_index: } } + if (old_table->versioned()) { + if (!new_table->versioned() + && clust_index->vers_history_row(rec, offsets)) { + continue; + } + } else if (new_table->versioned()) { + dfield_t* start = + dtuple_get_nth_field(row, new_table->vers_start); + dfield_t* end = + dtuple_get_nth_field(row, new_table->vers_end); + dfield_set_data(start, new_sys_trx_start, 8); + dfield_set_data(end, new_sys_trx_end, 8); + vers_update_trt = true; + } + write_buffers: /* Build all entries for all the indexes to be created in a single scan of the clustered index. 
*/ + n_rows++; ulint s_idx_cnt = 0; bool skip_sort = skip_pk_sort && dict_index_is_clust(merge_buf[0]->index); @@ -2226,6 +2346,11 @@ write_buffers: continue; } + ut_ad(!row + || !dict_index_is_clust(buf->index) + || trx_id_check(row->fields[new_trx_id_col].data, + trx->id)); + merge_file_t* file = &files[k++]; if (UNIV_LIKELY @@ -2390,12 +2515,13 @@ write_buffers: err = row_merge_insert_index_tuples( index[i], old_table, - -1, NULL, buf, clust_btr_bulk, + OS_FILE_CLOSED, NULL, buf, + clust_btr_bulk, table_total_rows, curr_progress, pct_cost, crypt_block, - new_table->space); + new_table->space_id); if (row == NULL) { err = clust_btr_bulk->finish( @@ -2482,7 +2608,7 @@ write_buffers: we can insert directly into the index without temporary file if clustered index does not uses temporary file. */ - if (row == NULL && file->fd == -1 + if (row == NULL && file->fd == OS_FILE_CLOSED && !clust_temp_file) { DBUG_EXECUTE_IF( "row_merge_write_failure", @@ -2502,12 +2628,13 @@ write_buffers: err = row_merge_insert_index_tuples( index[i], old_table, - -1, NULL, buf, &btr_bulk, + OS_FILE_CLOSED, NULL, buf, + &btr_bulk, table_total_rows, curr_progress, pct_cost, crypt_block, - new_table->space); + new_table->space_id); err = btr_bulk.finish(err); @@ -2519,9 +2646,9 @@ write_buffers: break; } } else { - if (row_merge_file_create_if_needed( + if (!row_merge_file_create_if_needed( file, tmpfd, - buf->n_tuples, path) < 0) { + buf->n_tuples, path)) { err = DB_OUT_OF_MEMORY; trx->error_key_num = i; break; @@ -2541,16 +2668,14 @@ write_buffers: if (!row_merge_write( file->fd, file->offset++, block, crypt_block, - new_table->space)) { + new_table->space_id)) { err = DB_TEMP_FILE_WRITE_FAIL; trx->error_key_num = i; break; } -#ifdef HAVE_valgrind_or_MSAN MEM_UNDEFINED( &block[0], srv_sort_buf_size); -#endif /* HAVE_valgrind_or_MSAN */ } } merge_buf[i] = row_merge_buf_empty(buf); @@ -2581,6 +2706,10 @@ write_buffers: } if (row == NULL) { + if (old_table != new_table) { + new_table->stat_n_rows = n_rows; + } + goto all_done; } @@ -2735,6 +2864,15 @@ wait_again: } } + if (vers_update_trt) { + trx_mod_table_time_t& time = + trx->mod_tables + .insert(trx_mod_tables_t::value_type( + const_cast<dict_table_t*>(new_table), 0)) + .first->second; + time.set_versioned(0); + } + trx->op_info = ""; DBUG_RETURN(err); @@ -2791,10 +2929,10 @@ wait_again: @param[in,out] foffs1 offset of second source list in the file @param[in,out] of output file @param[in,out] stage performance schema accounting object, used by -@param[in,out] crypt_block encryption buffer -@param[in] space tablespace ID for encryption ALTER TABLE. If not NULL stage->inc() will be called for each record processed. +@param[in,out] crypt_block encryption buffer +@param[in] space tablespace ID for encryption @return DB_SUCCESS or error code */ static MY_ATTRIBUTE((warn_unused_result)) dberr_t @@ -2805,7 +2943,7 @@ row_merge_blocks( ulint* foffs0, ulint* foffs1, merge_file_t* of, - ut_stage_alter_t* stage, + ut_stage_alter_t* stage MY_ATTRIBUTE((unused)), row_merge_block_t* crypt_block, ulint space) { @@ -2913,10 +3051,10 @@ done1: @param[in,out] foffs0 input file offset @param[in,out] of output file @param[in,out] stage performance schema accounting object, used by -@param[in,out] crypt_block encryption buffer -@param[in] space tablespace ID for encryption ALTER TABLE. If not NULL stage->inc() will be called for each record processed. 
+@param[in,out] crypt_block encryption buffer +@param[in] space tablespace ID for encryption @return TRUE on success, FALSE on failure */ static MY_ATTRIBUTE((warn_unused_result)) ibool @@ -2926,7 +3064,7 @@ row_merge_blocks_copy( row_merge_block_t* block, ulint* foffs0, merge_file_t* of, - ut_stage_alter_t* stage, + ut_stage_alter_t* stage MY_ATTRIBUTE((unused)), row_merge_block_t* crypt_block, ulint space) { @@ -3017,7 +3155,7 @@ row_merge( const row_merge_dup_t* dup, merge_file_t* file, row_merge_block_t* block, - int* tmpfd, + pfs_os_file_t* tmpfd, ulint* num_run, ulint* run_offset, ut_stage_alter_t* stage, @@ -3057,9 +3195,7 @@ row_merge( foffs0 = 0; foffs1 = ihalf; -#ifdef HAVE_valgrind_or_MSAN MEM_UNDEFINED(run_offset, *num_run * sizeof *run_offset); -#endif /* HAVE_valgrind_or_MSAN */ for (; foffs0 < ihalf && foffs1 < file->offset; foffs0++, foffs1++) { @@ -3140,9 +3276,7 @@ row_merge( *tmpfd = file->fd; *file = of; -#ifdef HAVE_valgrind_or_MSAN MEM_UNDEFINED(&block[0], 3 * srv_sort_buf_size); -#endif /* HAVE_valgrind_or_MSAN */ return(DB_SUCCESS); } @@ -3163,7 +3297,7 @@ row_merge_sort( const row_merge_dup_t* dup, merge_file_t* file, row_merge_block_t* block, - int* tmpfd, + pfs_os_file_t* tmpfd, const bool update_progress, /*!< in: update progress status variable or not */ @@ -3372,7 +3506,7 @@ dberr_t row_merge_insert_index_tuples( dict_index_t* index, const dict_table_t* old_table, - int fd, + const pfs_os_file_t& fd, row_merge_block_t* block, const row_merge_buf_t* row_buf, BtrBulk* btr_bulk, @@ -3424,7 +3558,7 @@ row_merge_insert_index_tuples( } if (row_buf != NULL) { - ut_ad(fd == -1); + ut_ad(fd == OS_FILE_CLOSED); ut_ad(block == NULL); DBUG_EXECUTE_IF("row_merge_read_failure", error = DB_CORRUPTION; @@ -3917,7 +4051,7 @@ row_merge_drop_temp_indexes(void) /* Load the table definitions that contain partially defined indexes, so that the data dictionary information can be checked when accessing the tablename.ibd files. */ - trx = trx_allocate_for_background(); + trx = trx_create(); trx->op_info = "dropping partially created indexes"; row_mysql_lock_data_dictionary(trx); /* Ensure that this transaction will be rolled back and locks @@ -3940,7 +4074,7 @@ row_merge_drop_temp_indexes(void) trx_commit_for_mysql(trx); row_mysql_unlock_data_dictionary(trx); - trx_free_for_background(trx); + trx->free(); } @@ -3948,15 +4082,15 @@ row_merge_drop_temp_indexes(void) UNIV_PFS_IO defined, register the file descriptor with Performance Schema. @param[in] path location for creating temporary merge files, or NULL @return File descriptor */ -int +pfs_os_file_t row_merge_file_create_low( const char* path) { - int fd; #ifdef UNIV_PFS_IO /* This temp file open does not go through normal file APIs, add instrumentation to register with performance schema */ + struct PSI_file_locker* locker; PSI_file_locker_state state; if (!path) { path = mysql_tmpdir; @@ -3966,27 +4100,21 @@ row_merge_file_create_low( ut_malloc_nokey(strlen(path) + sizeof label)); strcpy(name, path); strcat(name, label); - PSI_file_locker* locker = PSI_FILE_CALL(get_thread_file_name_locker)( - &state, innodb_temp_file_key, PSI_FILE_OPEN, - path ? name : label, &locker); - if (locker != NULL) { - PSI_FILE_CALL(start_file_open_wait)(locker, - __FILE__, - __LINE__); - } + + register_pfs_file_open_begin( + &state, locker, innodb_temp_file_key, + PSI_FILE_CREATE, path ? 
name : label, __FILE__, __LINE__); + #endif - fd = innobase_mysql_tmpfile(path); + pfs_os_file_t fd = innobase_mysql_tmpfile(path); #ifdef UNIV_PFS_IO - if (locker != NULL) { - PSI_FILE_CALL(end_file_open_wait_and_bind_to_descriptor)( - locker, fd); - } + register_pfs_file_open_end(locker, fd, + (fd == OS_FILE_CLOSED)?NULL:&fd); ut_free(name); #endif - if (fd < 0) { + if (fd == OS_FILE_CLOSED) { ib::error() << "Cannot create temporary merge file"; - return(-1); } return(fd); } @@ -3995,8 +4123,8 @@ row_merge_file_create_low( /** Create a merge file in the given location. @param[out] merge_file merge file structure @param[in] path location for creating temporary file, or NULL -@return file descriptor, or -1 on failure */ -int +@return file descriptor, or OS_FILE_CLOSED on error */ +pfs_os_file_t row_merge_file_create( merge_file_t* merge_file, const char* path) @@ -4005,7 +4133,7 @@ row_merge_file_create( merge_file->offset = 0; merge_file->n_rec = 0; - if (merge_file->fd >= 0) { + if (merge_file->fd != OS_FILE_CLOSED) { if (srv_disable_sort_file_cache) { os_file_set_nocache(merge_file->fd, "row0merge.cc", "sort"); @@ -4020,26 +4148,11 @@ if UNIV_PFS_IO is defined. */ void row_merge_file_destroy_low( /*=======================*/ - int fd) /*!< in: merge file descriptor */ + const pfs_os_file_t& fd) /*!< in: merge file descriptor */ { -#ifdef UNIV_PFS_IO - struct PSI_file_locker* locker = NULL; - PSI_file_locker_state state; - locker = PSI_FILE_CALL(get_thread_file_descriptor_locker)( - &state, fd, PSI_FILE_CLOSE); - if (locker != NULL) { - PSI_FILE_CALL(start_file_wait)( - locker, 0, __FILE__, __LINE__); - } -#endif - if (fd >= 0) { - close(fd); + if (fd != OS_FILE_CLOSED) { + os_file_close(fd); } -#ifdef UNIV_PFS_IO - if (locker != NULL) { - PSI_FILE_CALL(end_file_wait)(locker, 0); - } -#endif } /*********************************************************************//** Destroy a merge file. */ @@ -4050,9 +4163,9 @@ row_merge_file_destroy( { ut_ad(!srv_read_only_mode); - if (merge_file->fd != -1) { + if (merge_file->fd != OS_FILE_CLOSED) { row_merge_file_destroy_low(merge_file->fd); - merge_file->fd = -1; + merge_file->fd = OS_FILE_CLOSED; } } @@ -4171,19 +4284,9 @@ row_make_new_pathname( dict_table_t* table, /*!< in: table to be renamed */ const char* new_name) /*!< in: new name */ { - char* new_path; - char* old_path; - - ut_ad(!is_system_tablespace(table->space)); - - old_path = fil_space_get_first_path(table->space); - ut_a(old_path); - - new_path = os_file_make_new_pathname(old_path, new_name); - - ut_free(old_path); - - return(new_path); + ut_ad(!is_system_tablespace(table->space_id)); + return os_file_make_new_pathname(table->space->chain.start->name, + new_name); } /*********************************************************************//** @@ -4235,8 +4338,7 @@ row_merge_rename_tables_dict( renamed is a single-table tablespace, which must be implicitly renamed along with the table. */ if (err == DB_SUCCESS - && dict_table_is_file_per_table(old_table) - && fil_space_get(old_table->space) != NULL) { + && old_table->space_id) { /* Make pathname to update SYS_DATAFILES. 
*/ char* tmp_path = row_make_new_pathname(old_table, tmp_name); @@ -4245,7 +4347,7 @@ row_merge_rename_tables_dict( pars_info_add_str_literal(info, "tmp_name", tmp_name); pars_info_add_str_literal(info, "tmp_path", tmp_path); pars_info_add_int4_literal(info, "old_space", - (lint) old_table->space); + old_table->space_id); err = que_eval_sql(info, "PROCEDURE RENAME_OLD_SPACE () IS\n" @@ -4276,7 +4378,7 @@ row_merge_rename_tables_dict( old_table->name.m_name); pars_info_add_str_literal(info, "old_path", old_path); pars_info_add_int4_literal(info, "new_space", - (lint) new_table->space); + new_table->space_id); err = que_eval_sql(info, "PROCEDURE RENAME_NEW_SPACE () IS\n" @@ -4292,9 +4394,9 @@ row_merge_rename_tables_dict( ut_free(old_path); } - if (err == DB_SUCCESS && dict_table_is_discarded(new_table)) { + if (err == DB_SUCCESS && (new_table->flags2 & DICT_TF2_DISCARDED)) { err = row_import_update_discarded_flag( - trx, new_table->id, true, true); + trx, new_table->id, true); } trx->op_info = ""; @@ -4302,54 +4404,7 @@ row_merge_rename_tables_dict( return(err); } -/** Create and execute a query graph for creating an index. -@param[in,out] trx trx -@param[in,out] table table -@param[in,out] index index -@param[in] add_v new virtual columns added along with add index call -@return DB_SUCCESS or error code */ -MY_ATTRIBUTE((nonnull(1,2), warn_unused_result)) -static -dberr_t -row_merge_create_index_graph( - trx_t* trx, - dict_table_t* table, - dict_index_t*& index, - const dict_add_v_col_t* add_v) -{ - ind_node_t* node; /*!< Index creation node */ - mem_heap_t* heap; /*!< Memory heap */ - que_thr_t* thr; /*!< Query thread */ - dberr_t err; - - DBUG_ENTER("row_merge_create_index_graph"); - - ut_ad(trx); - ut_ad(table); - ut_ad(index); - - heap = mem_heap_create(512); - - index->table = table; - node = ind_create_graph_create(index, heap, add_v); - thr = pars_complete_graph_for_exec(node, trx, heap, NULL); - - ut_a(thr == que_fork_start_command( - static_cast<que_fork_t*>(que_node_get_parent(thr)))); - - que_run_threads(thr); - - err = trx->error_state; - - index = node->index; - - que_graph_free((que_t*) que_node_get_parent(thr)); - - DBUG_RETURN(err); -} - /** Create the index and load in to the dictionary. -@param[in,out] trx trx (sets error_state) @param[in,out] table the index is on this table @param[in] index_def the index definition @param[in] add_v new virtual columns added along with add @@ -4357,13 +4412,11 @@ row_merge_create_index_graph( @return index, or NULL on error */ dict_index_t* row_merge_create_index( - trx_t* trx, dict_table_t* table, const index_def_t* index_def, const dict_add_v_col_t* add_v) { dict_index_t* index; - dberr_t err; ulint n_fields = index_def->n_fields; ulint i; ulint n_add_vcol = 0; @@ -4376,11 +4429,8 @@ row_merge_create_index( a persistent operation. We pass 0 as the space id, and determine at a lower level the space id where to store the table. */ - index = dict_mem_index_create(table->name.m_name, index_def->name, - 0, index_def->ind_type, n_fields); - - ut_a(index); - + index = dict_mem_index_create(table, index_def->name, + index_def->ind_type, n_fields); index->set_committed(index_def->rebuild); for (i = 0; i < n_fields; i++) { @@ -4406,26 +4456,8 @@ row_merge_create_index( dict_mem_index_add_field(index, name, ifield->prefix_len); } - ut_d(const dict_index_t* const index_template = index); - /* Add the index to SYS_INDEXES, using the index prototype. 
*/ - err = row_merge_create_index_graph(trx, table, index, add_v); - - if (err == DB_SUCCESS) { - ut_ad(index != index_template); - index->parser = index_def->parser; - if (n_add_vcol) { - index->assign_new_v_col(n_add_vcol); - } - /* Note the id of the transaction that created this - index, we use it to restrict readers from accessing - this index, to ensure read consistency. */ - ut_ad(index->trx_id == trx->id); - } else { - ut_ad(!index || index == index_template); - if (index) { - dict_mem_index_free(index); - } - index = NULL; + if (n_add_vcol) { + index->assign_new_v_col(n_add_vcol); } DBUG_RETURN(index); @@ -4446,10 +4478,10 @@ row_merge_is_index_usable( } return(!index->is_corrupted() - && (dict_table_is_temporary(index->table) + && (index->table->is_temporary() || index->trx_id == 0 - || !MVCC::is_view_active(trx->read_view) - || trx->read_view->changes_visible( + || !trx->read_view.is_open() + || trx->read_view.changes_visible( index->trx_id, index->table->name))); } @@ -4481,7 +4513,7 @@ the flushing of such pages to the data files was completed. @param[in] index an index tree on which redo logging was disabled */ void row_merge_write_redo(const dict_index_t* index) { - ut_ad(!dict_table_is_temporary(index->table)); + ut_ad(!index->table->is_temporary()); ut_ad(!(index->type & (DICT_SPATIAL | DICT_FTS))); mtr_t mtr; @@ -4489,7 +4521,7 @@ void row_merge_write_redo(const dict_index_t* index) byte* log_ptr = mlog_open(&mtr, 11 + 8); log_ptr = mlog_write_initial_log_record_low( MLOG_INDEX_LOAD, - index->space, index->page, log_ptr, &mtr); + index->table->space_id, index->page, log_ptr, &mtr); mach_write_to_8(log_ptr, index->id); mlog_close(&mtr, log_ptr + 8); mtr.commit(); @@ -4508,7 +4540,7 @@ old_table unless creating a PRIMARY KEY @param[in] n_indexes size of indexes[] @param[in,out] table MySQL table, for reporting erroneous key value if applicable -@param[in] add_cols default values of added columns, or NULL +@param[in] defaults default values of added, changed columns, or NULL @param[in] col_map mapping of old column numbers to new ones, or NULL if old_table == new_table @param[in] add_autoinc number of added AUTO_INCREMENT columns, or @@ -4522,6 +4554,7 @@ this function and it will be passed to other functions for further accounting. @param[in] add_v new virtual columns added along with indexes @param[in] eval_table mysql table used to evaluate virtual column value, see innobase_get_computed_value(). 
+@param[in] allow_not_null allow the conversion from null to not-null @return DB_SUCCESS or error code */ dberr_t row_merge_build_indexes( @@ -4533,24 +4566,26 @@ row_merge_build_indexes( const ulint* key_numbers, ulint n_indexes, struct TABLE* table, - const dtuple_t* add_cols, + const dtuple_t* defaults, const ulint* col_map, ulint add_autoinc, ib_sequence_t& sequence, bool skip_pk_sort, ut_stage_alter_t* stage, const dict_add_v_col_t* add_v, - struct TABLE* eval_table) + struct TABLE* eval_table, + bool allow_not_null) { merge_file_t* merge_files; row_merge_block_t* block; ut_new_pfx_t block_pfx; + size_t block_size; ut_new_pfx_t crypt_pfx; row_merge_block_t* crypt_block = NULL; ulint i; ulint j; dberr_t error; - int tmpfd = -1; + pfs_os_file_t tmpfd = OS_FILE_CLOSED; dict_index_t* fts_sort_idx = NULL; fts_psort_t* psort_info = NULL; fts_psort_t* merge_info = NULL; @@ -4567,7 +4602,7 @@ row_merge_build_indexes( ut_ad(!srv_read_only_mode); ut_ad((old_table == new_table) == !col_map); - ut_ad(!add_cols || col_map); + ut_ad(!defaults || col_map); stage->begin_phase_read_pk(skip_pk_sort && new_table != old_table ? n_indexes - 1 @@ -4580,7 +4615,8 @@ row_merge_build_indexes( /* This will allocate "3 * srv_sort_buf_size" elements of type row_merge_block_t. The latter is defined as byte. */ - block = alloc.allocate_large(3 * srv_sort_buf_size, &block_pfx); + block_size = 3 * srv_sort_buf_size; + block = alloc.allocate_large(block_size, &block_pfx); if (block == NULL) { DBUG_RETURN(DB_OUT_OF_MEMORY); @@ -4591,7 +4627,7 @@ row_merge_build_indexes( if (log_tmp_is_encrypted()) { crypt_block = static_cast<row_merge_block_t*>( - alloc.allocate_large(3 * srv_sort_buf_size, + alloc.allocate_large(block_size, &crypt_pfx)); if (crypt_block == NULL) { @@ -4617,7 +4653,7 @@ row_merge_build_indexes( merge file descriptor */ for (i = 0; i < n_merge_files; i++) { - merge_files[i].fd = -1; + merge_files[i].fd = OS_FILE_CLOSED; merge_files[i].offset = 0; merge_files[i].n_rec = 0; } @@ -4647,6 +4683,7 @@ row_merge_build_indexes( created */ if (!row_fts_psort_info_init( trx, dup, new_table, opt_doc_id_size, + dict_table_page_size(old_table), &psort_info, &merge_info)) { error = DB_CORRUPTION; goto func_exit; @@ -4658,10 +4695,6 @@ row_merge_build_indexes( } } - /* Reset the MySQL row buffer that is used when reporting - duplicate keys. 
*/ - innobase_rec_reset(table); - if (global_system_variables.log_warnings > 2) { sql_print_information("InnoDB: Online DDL : Start reading" " clustered index of the table" @@ -4688,9 +4721,9 @@ row_merge_build_indexes( error = row_merge_read_clustered_index( trx, table, old_table, new_table, online, indexes, fts_sort_idx, psort_info, merge_files, key_numbers, - n_indexes, add_cols, add_v, col_map, add_autoinc, + n_indexes, defaults, add_v, col_map, add_autoinc, sequence, block, skip_pk_sort, &tmpfd, stage, - pct_cost, crypt_block, eval_table); + pct_cost, crypt_block, eval_table, allow_not_null); stage->end_phase_read_pk(); @@ -4794,7 +4827,7 @@ wait_again: #ifdef FTS_INTERNAL_DIAG_PRINT DEBUG_FTS_SORT_PRINT("FTS_SORT: Complete Insert\n"); #endif - } else if (merge_files[k].fd >= 0) { + } else if (merge_files[k].fd != OS_FILE_CLOSED) { char buf[NAME_LEN + 1]; row_merge_dup_t dup = { sort_idx, table, col_map, 0}; @@ -4827,7 +4860,8 @@ wait_again: trx, &dup, &merge_files[k], block, &tmpfd, true, pct_progress, pct_cost, - crypt_block, new_table->space, stage); + crypt_block, new_table->space_id, + stage); pct_progress += pct_cost; @@ -4865,7 +4899,8 @@ wait_again: merge_files[k].fd, block, NULL, &btr_bulk, merge_files[k].n_rec, pct_progress, pct_cost, - crypt_block, new_table->space, stage); + crypt_block, new_table->space_id, + stage); error = btr_bulk.finish(error); @@ -4957,10 +4992,10 @@ func_exit: ut_free(merge_files); - alloc.deallocate_large(block, &block_pfx); + alloc.deallocate_large(block, &block_pfx, block_size); if (crypt_block) { - alloc.deallocate_large(crypt_block, &crypt_pfx); + alloc.deallocate_large(crypt_block, &crypt_pfx, block_size); } DICT_TF2_FLAG_UNSET(new_table, DICT_TF2_FTS_ADD_DOC_ID); diff --git a/storage/innobase/row/row0mysql.cc b/storage/innobase/row/row0mysql.cc index c2f9186d408..0b28fe03c41 100644 --- a/storage/innobase/row/row0mysql.cc +++ b/storage/innobase/row/row0mysql.cc @@ -38,7 +38,6 @@ Created 9/17/2000 Heikki Tuuri #include "dict0dict.h" #include "dict0load.h" #include "dict0priv.h" -#include "dict0boot.h" #include "dict0stats.h" #include "dict0stats_bg.h" #include "dict0defrag_bg.h" @@ -483,7 +482,7 @@ row_mysql_store_col_in_innobase_format( case 4: /* space=0x00000020 */ /* Trim "half-chars", just in case. */ - col_len &= ~3; + col_len &= ~3U; while (col_len >= 4 && ptr[col_len - 4] == 0x00 @@ -496,7 +495,7 @@ row_mysql_store_col_in_innobase_format( case 2: /* space=0x0020 */ /* Trim "half-chars", just in case. */ - col_len &= ~1; + col_len &= ~1U; while (col_len >= 2 && ptr[col_len - 2] == 0x00 && ptr[col_len - 1] == 0x20) { @@ -690,6 +689,7 @@ row_mysql_handle_errors( dberr_t err; DBUG_ENTER("row_mysql_handle_errors"); + DEBUG_SYNC_C("row_mysql_handle_errors"); handle_new_error: err = trx->error_state; @@ -779,6 +779,12 @@ handle_new_error: << FK_MAX_CASCADE_DEL << ". Please drop excessive" " foreign constraints and try again"; goto rollback_to_savept; + case DB_UNSUPPORTED: + ib::error() << "Cannot delete/update rows with cascading" + " foreign key constraints in timestamp-based temporal" + " table. 
Please drop excessive" + " foreign constraints and try again"; + goto rollback_to_savept; default: ib::fatal() << "Unknown error " << err; } @@ -1292,7 +1298,7 @@ row_mysql_get_table_status( bool push_warning = true) { dberr_t err; - if (fil_space_t* space = fil_space_acquire_silent(table->space)) { + if (const fil_space_t* space = table->space) { if (space->crypt_data && space->crypt_data->is_encrypted()) { // maybe we cannot access the table due to failing // to decrypt @@ -1314,8 +1320,6 @@ row_mysql_get_table_status( err = DB_CORRUPTION; } - - fil_space_release(space); } else { ib::error() << ".ibd file is missing for table " << table->name; @@ -1332,7 +1336,8 @@ row_mysql_get_table_status( dberr_t row_insert_for_mysql( const byte* mysql_rec, - row_prebuilt_t* prebuilt) + row_prebuilt_t* prebuilt, + ins_mode_t ins_mode) { trx_savept_t savept; que_thr_t* thr; @@ -1350,7 +1355,7 @@ row_insert_for_mysql( ut_a(prebuilt->magic_n == ROW_PREBUILT_ALLOCATED); ut_a(prebuilt->magic_n2 == ROW_PREBUILT_ALLOCATED); - if (dict_table_is_discarded(prebuilt->table)) { + if (!prebuilt->table->space) { ib::error() << "The table " << prebuilt->table->name << " doesn't have a corresponding tablespace, it was" @@ -1380,7 +1385,9 @@ row_insert_for_mysql( row_mysql_delay_if_needed(); - trx_start_if_not_started_xa(trx, true); + if (!table->no_rollback()) { + trx_start_if_not_started_xa(trx, true); + } row_get_prebuilt_insert_row(prebuilt); node = prebuilt->ins_node; @@ -1388,6 +1395,10 @@ row_insert_for_mysql( row_mysql_convert_row_to_innobase(node->row, prebuilt, mysql_rec, &blob_heap); + if (ins_mode != ROW_INS_NORMAL) { + node->vers_update_end(prebuilt, ins_mode == ROW_INS_HISTORICAL); + } + savept = trx_savept_take(trx); thr = que_fork_get_first_thr(prebuilt->ins_graph); @@ -1573,8 +1584,8 @@ row_create_update_node_for_mysql( node = upd_node_create(heap); - node->in_mysql_interface = TRUE; - node->is_delete = FALSE; + node->in_mysql_interface = true; + node->is_delete = NO_DELETE; node->searched_update = FALSE; node->select = NULL; node->pcur = btr_pcur_create_for_mysql(); @@ -1669,7 +1680,7 @@ row_fts_update_or_delete( ut_a(dict_table_has_fts_index(prebuilt->table)); /* Deletes are simple; get them out of the way first. */ - if (node->is_delete) { + if (node->is_delete == PLAIN_DELETE) { /* A delete affects all FTS indexes, so we pass NULL */ fts_trx_add_op(trx, table, old_doc_id, FTS_DELETE, NULL); } else { @@ -1768,7 +1779,9 @@ row_update_for_mysql(row_prebuilt_t* prebuilt) init_fts_doc_id_for_ref(table, &fk_depth); - trx_start_if_not_started_xa(trx, true); + if (!table->no_rollback()) { + trx_start_if_not_started_xa(trx, true); + } if (dict_table_is_referenced_by_foreign_key(table)) { /* Share lock the data dictionary to prevent any @@ -1783,17 +1796,16 @@ row_update_for_mysql(row_prebuilt_t* prebuilt) } node = prebuilt->upd_node; - const bool is_delete = node->is_delete; + const bool is_delete = node->is_delete == PLAIN_DELETE; ut_ad(node->table == table); clust_index = dict_table_get_first_index(table); - if (prebuilt->pcur->btr_cur.index == clust_index) { - btr_pcur_copy_stored_position(node->pcur, prebuilt->pcur); - } else { - btr_pcur_copy_stored_position(node->pcur, - prebuilt->clust_pcur); - } + btr_pcur_copy_stored_position(node->pcur, + prebuilt->pcur->btr_cur.index + == clust_index + ? 
prebuilt->pcur + : prebuilt->clust_pcur); ut_a(node->pcur->rel_pos == BTR_PCUR_ON); @@ -1814,6 +1826,16 @@ row_update_for_mysql(row_prebuilt_t* prebuilt) que_thr_move_to_run_state_for_mysql(thr, trx); + ut_ad(!prebuilt->versioned_write || node->table->versioned()); + + if (prebuilt->versioned_write) { + if (node->is_delete == VERSIONED_DELETE) { + node->vers_make_delete(trx); + } else if (node->update->affects_versioned()) { + node->vers_make_update(trx); + } + } + for (;;) { thr->run_node = node; thr->prev_node = node; @@ -1864,9 +1886,9 @@ row_update_for_mysql(row_prebuilt_t* prebuilt) } bool update_statistics; - ut_ad(node->is_delete == is_delete); + ut_ad(is_delete == (node->is_delete == PLAIN_DELETE)); - if (/*node->*/is_delete) { + if (is_delete) { /* Not protected by dict_sys->mutex for performance reasons, we would rather get garbage in stat_n_rows (which is just an estimate anyway) than protecting the following code @@ -2001,7 +2023,8 @@ row_unlock_for_mysql( rec_offs* offsets = offsets_; rec_offs_init(offsets_); - offsets = rec_get_offsets(rec, index, offsets, true, + offsets = rec_get_offsets(rec, index, offsets, + index->n_core_fields, ULINT_UNDEFINED, &heap); rec_trx_id = row_get_rec_trx_id(rec, index, offsets); @@ -2074,6 +2097,135 @@ row_mysql_unfreeze_data_dictionary( trx->dict_operation_lock_mode = 0; } +/** Write query start time as SQL field data to a buffer. Needed by InnoDB. +@param thd Thread object +@param buf Buffer to hold start time data */ +void thd_get_query_start_data(THD *thd, char *buf); + +/** Insert history row when evaluating foreign key referential action. + +1. Create new dtuple_t 'row' from node->historical_row; +2. Update its row_end to current timestamp; +3. Insert it to a table; +4. Update table statistics. + +This is used in UPDATE CASCADE/SET NULL of a system versioned referenced table. + +node->historical_row: dtuple_t containing pointers of row changed by refertial +action. 
+ +@param[in] thr current query thread +@param[in] node a node which just updated a row in a foreign table +@return DB_SUCCESS or some error */ +static dberr_t row_update_vers_insert(que_thr_t* thr, upd_node_t* node) +{ + trx_t* trx = thr_get_trx(thr); + dfield_t* row_end; + char row_end_data[8]; + dict_table_t* table = node->table; + page_size_t page_size= dict_table_page_size(table); + ut_ad(table->versioned()); + + dtuple_t* row; + const ulint n_cols = dict_table_get_n_cols(table); + const ulint n_v_cols = dict_table_get_n_v_cols(table); + + ut_ad(n_cols == dtuple_get_n_fields(node->historical_row)); + ut_ad(n_v_cols == dtuple_get_n_v_fields(node->historical_row)); + + row = dtuple_create_with_vcol(node->historical_heap, n_cols, n_v_cols); + + dict_table_copy_types(row, table); + + ins_node_t* insert_node = + ins_node_create(INS_DIRECT, table, node->historical_heap); + + if (!insert_node) { + trx->error_state = DB_OUT_OF_MEMORY; + goto exit; + } + + insert_node->common.parent = thr; + ins_node_set_new_row(insert_node, row); + + ut_ad(n_cols > DATA_N_SYS_COLS); + // Exclude DB_ROW_ID, DB_TRX_ID, DB_ROLL_PTR + for (ulint i = 0; i < n_cols - DATA_N_SYS_COLS; i++) { + dfield_t *src= dtuple_get_nth_field(node->historical_row, i); + dfield_t *dst= dtuple_get_nth_field(row, i); + dfield_copy(dst, src); + if (dfield_is_ext(src)) { + byte *field_data + = static_cast<byte*>(dfield_get_data(src)); + ulint ext_len; + ulint field_len = dfield_get_len(src); + + ut_a(field_len >= BTR_EXTERN_FIELD_REF_SIZE); + + ut_a(memcmp(field_data + field_len + - BTR_EXTERN_FIELD_REF_SIZE, + field_ref_zero, + BTR_EXTERN_FIELD_REF_SIZE)); + + byte *data = btr_copy_externally_stored_field( + &ext_len, field_data, page_size, field_len, + node->historical_heap); + dfield_set_data(dst, data, ext_len); + } + } + + for (ulint i = 0; i < n_v_cols; i++) { + dfield_t *dst= dtuple_get_nth_v_field(row, i); + dfield_t *src= dtuple_get_nth_v_field(node->historical_row, i); + dfield_copy(dst, src); + } + + node->historical_row = NULL; + + row_end = dtuple_get_nth_field(row, table->vers_end); + if (dict_table_get_nth_col(table, table->vers_end)->vers_native()) { + mach_write_to_8(row_end_data, trx->id); + dfield_set_data(row_end, row_end_data, 8); + } else { + thd_get_query_start_data(trx->mysql_thd, row_end_data); + dfield_set_data(row_end, row_end_data, 7); + } + + for (;;) { + thr->run_node = insert_node; + thr->prev_node = insert_node; + + row_ins_step(thr); + + switch (trx->error_state) { + case DB_LOCK_WAIT: + que_thr_stop_for_mysql(thr); + lock_wait_suspend_thread(thr); + + if (trx->error_state == DB_SUCCESS) { + continue; + } + + /* fall through */ + default: + /* Other errors are handled for the parent node. */ + thr->fk_cascade_depth = 0; + goto exit; + + case DB_SUCCESS: + srv_stats.n_rows_inserted.inc( + static_cast<size_t>(trx->id)); + dict_stats_update_if_needed(table, *trx); + goto exit; + } + } +exit: + que_graph_free_recursive(insert_node); + mem_heap_free(node->historical_heap); + node->historical_heap = NULL; + return trx->error_state; +} + /**********************************************************************//** Does a cascaded delete or set null in a foreign key operation. 
@return error code or DB_SUCCESS */ @@ -2095,6 +2247,18 @@ row_update_cascade_for_mysql( const trx_t* trx = thr_get_trx(thr); + if (table->versioned()) { + if (node->is_delete == PLAIN_DELETE) { + node->vers_make_delete(trx); + } else if (node->update->affects_versioned()) { + dberr_t err = row_update_vers_insert(thr, node); + if (err != DB_SUCCESS) { + return err; + } + node->vers_make_update(trx); + } + } + for (;;) { thr->run_node = node; thr->prev_node = node; @@ -2126,7 +2290,7 @@ row_update_cascade_for_mysql( thr->fk_cascade_depth = 0; bool stats; - if (node->is_delete) { + if (node->is_delete == PLAIN_DELETE) { /* Not protected by dict_sys->mutex for performance reasons, we would rather @@ -2274,18 +2438,10 @@ err_exit: /* Update SYS_TABLESPACES and SYS_DATAFILES if a new file-per-table tablespace was created. */ if (err == DB_SUCCESS && dict_table_is_file_per_table(table)) { - - ut_ad(dict_table_is_file_per_table(table)); - - char* path; - path = fil_space_get_first_path(table->space); - err = dict_replace_tablespace_in_dictionary( - table->space, table->name.m_name, - fil_space_get_flags(table->space), - path, trx); - - ut_free(path); + table->space_id, table->name.m_name, + table->space->flags, + table->space->chain.start->name, trx); if (err != DB_SUCCESS) { @@ -2320,11 +2476,9 @@ err_exit: /* We already have .ibd file here. it should be deleted. */ if (dict_table_is_file_per_table(table) - && fil_delete_tablespace(table->space) != DB_SUCCESS) { - - ib::error() << "Not able to delete tablespace " - << table->space << " of table " - << table->name << "!"; + && fil_delete_tablespace(table->space_id) != DB_SUCCESS) { + ib::error() << "Cannot delete the file of table " + << table->name; } /* fall through */ @@ -2367,31 +2521,11 @@ row_create_index_for_mysql( dberr_t err; ulint i; ulint len; - char* table_name; - char* index_name; - dict_table_t* table = NULL; - ibool is_fts; - - trx->op_info = "creating index"; - - /* Copy the table name because we may want to drop the - table later, after the index object is freed (inside - que_run_threads()) and thus index->table_name is not available. */ - table_name = mem_strdup(index->table_name); - index_name = mem_strdup(index->name); - - is_fts = (index->type == DICT_FTS); + dict_table_t* table = index->table; ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_X)); ut_ad(mutex_own(&dict_sys->mutex)); - table = dict_table_open_on_name(table_name, TRUE, TRUE, - DICT_ERR_IGNORE_NONE); - - if (!dict_table_is_temporary(table)) { - trx_start_if_not_started_xa(trx, true); - } - for (i = 0; i < index->n_def; i++) { /* Check that prefix_len and actual length < DICT_MAX_INDEX_COL_LEN */ @@ -2409,25 +2543,26 @@ row_create_index_for_mysql( /* Column or prefix length exceeds maximum column length */ if (len > (ulint) DICT_MAX_FIELD_LEN_BY_FORMAT(table)) { - err = DB_TOO_BIG_INDEX_COL; - dict_mem_index_free(index); - goto error_handling; + return DB_TOO_BIG_INDEX_COL; } } - trx_set_dict_operation(trx, TRX_DICT_OP_TABLE); + trx->op_info = "creating index"; /* For temp-table we avoid insertion into SYSTEM TABLES to maintain performance and so we have separate path that directly just updates dictonary cache. */ - if (!dict_table_is_temporary(table)) { + if (!table->is_temporary()) { + trx_start_if_not_started_xa(trx, true); + trx_set_dict_operation(trx, TRX_DICT_OP_TABLE); /* Note that the space id where we store the index is inherited from the table in dict_build_index_def_step() in dict0crea.cc. 
*/ heap = mem_heap_create(512); - node = ind_create_graph_create(index, heap, NULL); + node = ind_create_graph_create(index, table->name.m_name, + heap); thr = pars_complete_graph_for_exec(node, trx, heap, NULL); @@ -2439,48 +2574,38 @@ row_create_index_for_mysql( err = trx->error_state; + index = node->index; + + ut_ad(!index == (err != DB_SUCCESS)); + que_graph_free((que_t*) que_node_get_parent(thr)); + + if (index && (index->type & DICT_FTS)) { + err = fts_create_index_tables(trx, index, table->id); + } } else { dict_build_index_def(table, index, trx); - err = dict_index_add_to_cache(table, index, FIL_NULL); + err = dict_index_add_to_cache(index, FIL_NULL); ut_ad((index == NULL) == (err != DB_SUCCESS)); + if (UNIV_LIKELY(err == DB_SUCCESS)) { + ut_ad(!index->is_instant()); + index->n_core_null_bytes = UT_BITS_IN_BYTES( + unsigned(index->n_nullable)); - if (err != DB_SUCCESS) { - goto error_handling; - } - - index->table = table; - - err = dict_create_index_tree_in_mem(index, trx); + err = dict_create_index_tree_in_mem(index, trx); #ifdef BTR_CUR_HASH_ADAPT - ut_ad(!index->search_info->ref_count); + ut_ad(!index->search_info->ref_count); #endif /* BTR_CUR_HASH_ADAPT */ - if (err != DB_SUCCESS) { - dict_index_remove_from_cache(table, index); + if (err != DB_SUCCESS) { + dict_index_remove_from_cache(table, index); + } } } - /* Create the index specific FTS auxiliary tables. */ - if (err == DB_SUCCESS && is_fts) { - dict_index_t* idx; - - idx = dict_table_get_index_on_name(table, index_name); - - ut_ad(idx); - err = fts_create_index_tables_low( - trx, idx, table->name.m_name, table->id); - } - -error_handling: - dict_table_close(table, TRUE, FALSE); - trx->op_info = ""; - ut_free(table_name); - ut_free(index_name); - return(err); } @@ -2501,7 +2626,7 @@ row_drop_table_for_mysql_in_background( dberr_t error; trx_t* trx; - trx = trx_allocate_for_background(); + trx = trx_create(); /* If the original transaction was dropping a table referenced by foreign keys, we must set the following to be able to drop the @@ -2515,7 +2640,7 @@ row_drop_table_for_mysql_in_background( trx_commit_for_mysql(trx); - trx_free_for_background(trx); + trx->free(); return(error); } @@ -2583,7 +2708,7 @@ skip: goto next; } - if (!srv_fast_shutdown && !trx_sys_any_active_transactions()) { + if (!srv_fast_shutdown && !trx_sys.any_active_transactions()) { lock_mutex_enter(); skip = UT_LIST_GET_LEN(table->locks) != 0; lock_mutex_exit(); @@ -2639,7 +2764,7 @@ row_mysql_drop_garbage_tables() mem_heap_t* heap = mem_heap_create(FN_REFLEN); btr_pcur_t pcur; mtr_t mtr; - trx_t* trx = trx_allocate_for_background(); + trx_t* trx = trx_create(); trx->op_info = "dropping garbage tables"; row_mysql_lock_data_dictionary(trx); @@ -2680,9 +2805,8 @@ row_mysql_drop_garbage_tables() if (dict_load_table(table_name, DICT_ERR_IGNORE_DROP)) { - row_drop_table_for_mysql( - table_name, trx, - SQLCOM_DROP_TABLE); + row_drop_table_for_mysql(table_name, trx, + SQLCOM_DROP_TABLE); trx_commit_for_mysql(trx); } @@ -2697,7 +2821,7 @@ row_mysql_drop_garbage_tables() btr_pcur_close(&pcur); mtr.commit(); row_mysql_unlock_data_dictionary(trx); - trx_free_for_background(trx); + trx->free(); mem_heap_free(heap); } @@ -2758,9 +2882,6 @@ row_mysql_table_id_reassign( dict_hdr_get_new_id(new_id, NULL, NULL, table, false); - /* Remove all locks except the table-level S and X locks. 
*/ - lock_remove_all_on_table(table, FALSE); - pars_info_add_ull_literal(info, "old_id", table->id); pars_info_add_ull_literal(info, "new_id", *new_id); @@ -2810,8 +2931,8 @@ row_discard_tablespace_begin( if (table) { dict_stats_wait_bg_to_stop_using_table(table, trx); - ut_a(!is_system_tablespace(table->space)); - ut_a(table->n_foreign_key_checks_running == 0); + ut_a(!is_system_tablespace(table->space_id)); + ut_ad(!table->n_foreign_key_checks_running); } return(table); @@ -2930,19 +3051,15 @@ row_discard_tablespace( their operations. 3) Insert buffer: we remove all entries for the tablespace in - the insert buffer tree. + the insert buffer tree. */ - 4) FOREIGN KEY operations: if table->n_foreign_key_checks_running > 0, - we do not allow the discard. */ - - ibuf_delete_for_discarded_space(table->space); + ibuf_delete_for_discarded_space(table->space_id); table_id_t new_id; /* Set the TABLESPACE DISCARD flag in the table definition on disk. */ - err = row_import_update_discarded_flag( - trx, table->id, true, true); + err = row_import_update_discarded_flag(trx, table->id, true); if (err != DB_SUCCESS) { return(err); @@ -2972,50 +3089,42 @@ row_discard_tablespace( } /* Discard the physical file that is used for the tablespace. */ - - err = fil_discard_tablespace(table->space); - + err = fil_delete_tablespace(table->space_id); switch (err) { - case DB_SUCCESS: case DB_IO_ERROR: + ib::warn() << "ALTER TABLE " << table->name + << " DISCARD TABLESPACE failed to delete file"; + break; case DB_TABLESPACE_NOT_FOUND: - /* All persistent operations successful, update the - data dictionary memory cache. */ - - table->file_unreadable = true; - - table->flags2 |= DICT_TF2_DISCARDED; - - dict_table_change_id_in_cache(table, new_id); - - /* Reset the root page numbers. */ - - for (dict_index_t* index = UT_LIST_GET_FIRST(table->indexes); - index != 0; - index = UT_LIST_GET_NEXT(indexes, index)) { - - index->page = FIL_NULL; - index->space = FIL_NULL; - } - - /* If the tablespace did not already exist or we couldn't - write to it, we treat that as a successful DISCARD. It is - unusable anyway. */ - - err = DB_SUCCESS; + ib::warn() << "ALTER TABLE " << table->name + << " DISCARD TABLESPACE failed to find tablespace"; + break; + case DB_SUCCESS: break; - default: - /* We need to rollback the disk changes, something failed. */ + ut_error; + } - trx->error_state = DB_SUCCESS; + /* All persistent operations successful, update the + data dictionary memory cache. */ - trx_rollback_to_savepoint(trx, NULL); + table->file_unreadable = true; + table->space = NULL; + table->flags2 |= DICT_TF2_DISCARDED; + dict_table_change_id_in_cache(table, new_id); - trx->error_state = DB_SUCCESS; + dict_index_t* index = UT_LIST_GET_FIRST(table->indexes); + if (index) index->remove_instant(); + + /* Reset the root page numbers. */ + for (; index; index = UT_LIST_GET_NEXT(indexes, index)) { + index->page = FIL_NULL; } - return(err); + /* If the tablespace did not already exist or we couldn't + write to it, we treat that as a successful DISCARD. It is + unusable anyway. 
*/ + return DB_SUCCESS; } /*********************************************************************//** @@ -3038,14 +3147,14 @@ row_discard_tablespace_for_mysql( if (table == 0) { err = DB_TABLE_NOT_FOUND; - } else if (dict_table_is_temporary(table)) { + } else if (table->is_temporary()) { ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_ERROR, ER_CANNOT_DISCARD_TEMPORARY_TABLE); err = DB_ERROR; - } else if (table->space == TRX_SYS_SPACE) { + } else if (table->space_id == TRX_SYS_SPACE) { char table_name[MAX_FULL_NAME_LEN + 1]; innobase_format_name( @@ -3057,19 +3166,9 @@ row_discard_tablespace_for_mysql( err = DB_ERROR; - } else if (table->n_foreign_key_checks_running > 0) { - char table_name[MAX_FULL_NAME_LEN + 1]; - - innobase_format_name( - table_name, sizeof(table_name), - table->name.m_name); - - ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_ERROR, - ER_DISCARD_FK_CHECKS_RUNNING, table_name); - - err = DB_ERROR; - } else { + ut_ad(!table->n_foreign_key_checks_running); + bool fts_exist = (dict_table_has_fts_index(table) || DICT_TF2_FLAG_IS_SET( table, DICT_TF2_FTS_HAS_DOC_ID)); @@ -3210,7 +3309,7 @@ row_drop_table_from_cache( trx_t* trx) { dberr_t err = DB_SUCCESS; - ut_ad(!dict_table_is_temporary(table)); + ut_ad(!table->is_temporary()); /* Remove the pointer to this table object from the list of modified tables by the transaction because the object @@ -3229,46 +3328,7 @@ row_drop_table_from_cache( return(err); } -/** Drop a single-table tablespace as part of dropping or renaming a table. -This deletes the fil_space_t if found and the file on disk. -@param[in] space_id Tablespace ID -@param[in] tablename Table name, same as the tablespace name -@param[in] filepath File path of tablespace to delete -@param[in] table_flags table flags -@return error code or DB_SUCCESS */ -UNIV_INLINE -dberr_t -row_drop_single_table_tablespace( - ulint space_id, - const char* tablename, - const char* filepath, - ulint table_flags) -{ - dberr_t err = DB_SUCCESS; - - /* If the tablespace is not in the cache, just delete the file. */ - if (!fil_space_for_table_exists_in_mem( - space_id, tablename, table_flags)) { - - /* Force a delete of any discarded or temporary files. */ - fil_delete_file(filepath); - - ib::info() << "Removed datafile " << filepath - << " for table " << tablename; - } else if (fil_delete_tablespace(space_id) != DB_SUCCESS) { - - ib::error() << "We removed the InnoDB internal data" - " dictionary entry of table " << tablename - << " but we are not able to delete the tablespace " - << space_id << " file " << filepath << "!"; - - err = DB_ERROR; - } - - return(err); -} - -/** Drop a table. +/** Drop a table for MySQL. If the data dictionary was not already locked by the transaction, the transaction will be committed. Otherwise, the data dictionary will remain locked. @@ -3291,10 +3351,11 @@ row_drop_table_for_mysql( dberr_t err; dict_foreign_t* foreign; dict_table_t* table; - char* filepath = NULL; char* tablename = NULL; bool locked_dictionary = false; pars_info_t* info = NULL; + mem_heap_t* heap = NULL; + DBUG_ENTER("row_drop_table_for_mysql"); DBUG_PRINT("row_drop_table_for_mysql", ("table: '%s'", name)); @@ -3333,14 +3394,30 @@ row_drop_table_for_mysql( DBUG_RETURN(DB_TABLE_NOT_FOUND); } - /* This function is called recursively via fts_drop_tables(). 
*/ - if (!trx_is_started(trx)) { + const bool is_temp_name = strstr(table->name.m_name, + "/" TEMP_FILE_PREFIX); - if (!dict_table_is_temporary(table)) { - trx_start_for_ddl(trx, TRX_DICT_OP_TABLE); - } else { - trx_set_dict_operation(trx, TRX_DICT_OP_TABLE); + if (table->is_temporary()) { + ut_ad(table->space == fil_system.temp_space); + for (dict_index_t* index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + btr_free(page_id_t(SRV_TMP_SPACE_ID, index->page), + univ_page_size); } + /* Remove the pointer to this table object from the list + of modified tables by the transaction because the object + is going to be destroyed below. */ + trx->mod_tables.erase(table); + table->release(); + dict_table_remove_from_cache(table); + err = DB_SUCCESS; + goto funct_exit_all_freed; + } + + /* This function is called recursively via fts_drop_tables(). */ + if (!trx_is_started(trx)) { + trx_start_for_ddl(trx, TRX_DICT_OP_TABLE); } /* Turn on this drop bit before we could release the dictionary @@ -3370,23 +3447,13 @@ row_drop_table_for_mysql( row_mysql_lock_data_dictionary(trx); } - /* Do not bother to deal with persistent stats for temp - tables since we know temp tables do not use persistent - stats. */ - if (!dict_table_is_temporary(table)) { - dict_stats_wait_bg_to_stop_using_table( - table, trx); - } + dict_stats_wait_bg_to_stop_using_table(table, trx); } /* make sure background stats thread is not running on the table */ ut_ad(!(table->stats_bg_flag & BG_STAT_IN_PROGRESS)); - const bool is_temp_name = strstr(table->name.m_name, - "/" TEMP_FILE_PREFIX); - mem_heap_t* heap = NULL; - - if (!dict_table_is_temporary(table)) { - if (table->space != TRX_SYS_SPACE) { + if (!table->no_rollback()) { + if (table->space != fil_system.sys_space) { /* Delete the link file if used. */ if (DICT_TF_HAS_DATA_DIR(table->flags)) { RemoteDatafile::delete_link_file(name); @@ -3531,9 +3598,8 @@ defer: case TRX_DICT_OP_INDEX: /* If the transaction was previously flagged as TRX_DICT_OP_INDEX, we should be dropping auxiliary - tables for full-text indexes or temp tables. */ - ut_ad(strstr(table->name.m_name, "/FTS_") - || strstr(table->name.m_name, TEMP_TABLE_PATH_PREFIX)); + tables for full-text indexes. */ + ut_ad(strstr(table->name.m_name, "/FTS_")); } /* Mark all indexes unavailable in the data dictionary cache @@ -3562,180 +3628,163 @@ defer: rw_lock_x_unlock(dict_index_get_lock(index)); } - if (!table->is_temporary()) { - /* We use the private SQL parser of Innobase to generate the - query graphs needed in deleting the dictionary data from system - tables in Innobase. Deleting a row from SYS_INDEXES table also - frees the file segments of the B-tree associated with the - index. */ + /* Deleting a row from SYS_INDEXES table will invoke + dict_drop_index_tree(). 
*/ + info = pars_info_create(); - info = pars_info_create(); + pars_info_add_str_literal(info, "name", name); + + if (sqlcom != SQLCOM_TRUNCATE + && strchr(name, '/') + && dict_table_get_low("SYS_FOREIGN") + && dict_table_get_low("SYS_FOREIGN_COLS")) { + err = que_eval_sql( + info, + "PROCEDURE DROP_FOREIGN_PROC () IS\n" + "fid CHAR;\n" - pars_info_add_str_literal(info, "name", name); + "DECLARE CURSOR fk IS\n" + "SELECT ID FROM SYS_FOREIGN\n" + "WHERE FOR_NAME = :name\n" + "AND TO_BINARY(FOR_NAME) = TO_BINARY(:name)\n" + "FOR UPDATE;\n" - if (sqlcom != SQLCOM_TRUNCATE - && strchr(name, '/') - && dict_table_get_low("SYS_FOREIGN") - && dict_table_get_low("SYS_FOREIGN_COLS")) { + "BEGIN\n" + "OPEN fk;\n" + "WHILE 1 = 1 LOOP\n" + " FETCH fk INTO fid;\n" + " IF (SQL % NOTFOUND) THEN RETURN; END IF;\n" + " DELETE FROM SYS_FOREIGN_COLS WHERE ID=fid;\n" + " DELETE FROM SYS_FOREIGN WHERE ID=fid;\n" + "END LOOP;\n" + "CLOSE fk;\n" + "END;\n", FALSE, trx); + if (err == DB_SUCCESS) { + info = pars_info_create(); + pars_info_add_str_literal(info, "name", name); + goto do_drop; + } + } else { +do_drop: + if (dict_table_get_low("SYS_VIRTUAL")) { err = que_eval_sql( info, - "PROCEDURE DROP_FOREIGN_PROC () IS\n" - "fid CHAR;\n" - - "DECLARE CURSOR fk IS\n" - "SELECT ID FROM SYS_FOREIGN\n" - "WHERE FOR_NAME = :name\n" - "AND TO_BINARY(FOR_NAME) = TO_BINARY(:name)\n" - "FOR UPDATE;\n" + "PROCEDURE DROP_VIRTUAL_PROC () IS\n" + "tid CHAR;\n" "BEGIN\n" - "OPEN fk;\n" - "WHILE 1 = 1 LOOP\n" - " FETCH fk INTO fid;\n" - " IF (SQL % NOTFOUND) THEN RETURN; END IF;\n" - " DELETE FROM SYS_FOREIGN_COLS WHERE ID=fid;\n" - " DELETE FROM SYS_FOREIGN WHERE ID=fid;\n" - "END LOOP;\n" - "CLOSE fk;\n" + "SELECT ID INTO tid FROM SYS_TABLES\n" + "WHERE NAME = :name FOR UPDATE;\n" + "IF (SQL % NOTFOUND) THEN RETURN;" + " END IF;\n" + "DELETE FROM SYS_VIRTUAL" + " WHERE TABLE_ID = tid;\n" "END;\n", FALSE, trx); if (err == DB_SUCCESS) { info = pars_info_create(); - pars_info_add_str_literal(info, "name", name); - goto do_drop; + pars_info_add_str_literal( + info, "name", name); } } else { -do_drop: - if (dict_table_get_low("SYS_VIRTUAL")) { - err = que_eval_sql( - info, - "PROCEDURE DROP_VIRTUAL_PROC () IS\n" - "tid CHAR;\n" - - "BEGIN\n" - "SELECT ID INTO tid FROM SYS_TABLES\n" - "WHERE NAME = :name FOR UPDATE;\n" - "IF (SQL % NOTFOUND) THEN RETURN;" - " END IF;\n" - "DELETE FROM SYS_VIRTUAL" - " WHERE TABLE_ID = tid;\n" - "END;\n", FALSE, trx); - if (err == DB_SUCCESS) { - info = pars_info_create(); - pars_info_add_str_literal( - info, "name", name); - } - } else { - err = DB_SUCCESS; - } + err = DB_SUCCESS; + } - err = err == DB_SUCCESS ? que_eval_sql( - info, - "PROCEDURE DROP_TABLE_PROC () IS\n" - "tid CHAR;\n" - "iid CHAR;\n" + err = err == DB_SUCCESS ? 
que_eval_sql( + info, + "PROCEDURE DROP_TABLE_PROC () IS\n" + "tid CHAR;\n" + "iid CHAR;\n" + + "DECLARE CURSOR cur_idx IS\n" + "SELECT ID FROM SYS_INDEXES\n" + "WHERE TABLE_ID = tid FOR UPDATE;\n" + + "BEGIN\n" + "SELECT ID INTO tid FROM SYS_TABLES\n" + "WHERE NAME = :name FOR UPDATE;\n" + "IF (SQL % NOTFOUND) THEN RETURN; END IF;\n" + + "OPEN cur_idx;\n" + "WHILE 1 = 1 LOOP\n" + " FETCH cur_idx INTO iid;\n" + " IF (SQL % NOTFOUND) THEN EXIT; END IF;\n" + " DELETE FROM SYS_FIELDS\n" + " WHERE INDEX_ID = iid;\n" + " DELETE FROM SYS_INDEXES\n" + " WHERE ID = iid AND TABLE_ID = tid;\n" + "END LOOP;\n" + "CLOSE cur_idx;\n" + + "DELETE FROM SYS_COLUMNS WHERE TABLE_ID=tid;\n" + "DELETE FROM SYS_TABLES WHERE NAME=:name;\n" - "DECLARE CURSOR cur_idx IS\n" - "SELECT ID FROM SYS_INDEXES\n" - "WHERE TABLE_ID = tid FOR UPDATE;\n" + "END;\n", FALSE, trx) : err; + if (err == DB_SUCCESS && table->space + && dict_table_get_low("SYS_TABLESPACES") + && dict_table_get_low("SYS_DATAFILES")) { + info = pars_info_create(); + pars_info_add_int4_literal(info, "id", + lint(table->space_id)); + err = que_eval_sql( + info, + "PROCEDURE DROP_SPACE_PROC () IS\n" "BEGIN\n" - "SELECT ID INTO tid FROM SYS_TABLES\n" - "WHERE NAME = :name FOR UPDATE;\n" - "IF (SQL % NOTFOUND) THEN RETURN; END IF;\n" - - "OPEN cur_idx;\n" - "WHILE 1 = 1 LOOP\n" - " FETCH cur_idx INTO iid;\n" - " IF (SQL % NOTFOUND) THEN EXIT; END IF;\n" - " DELETE FROM SYS_FIELDS\n" - " WHERE INDEX_ID = iid;\n" - " DELETE FROM SYS_INDEXES\n" - " WHERE ID = iid AND TABLE_ID = tid;\n" - "END LOOP;\n" - "CLOSE cur_idx;\n" - - "DELETE FROM SYS_COLUMNS WHERE TABLE_ID=tid;\n" - "DELETE FROM SYS_TABLES WHERE NAME=:name;\n" - - "END;\n", FALSE, trx) : err; - - if (err == DB_SUCCESS && table->space - && dict_table_get_low("SYS_TABLESPACES") - && dict_table_get_low("SYS_DATAFILES")) { - info = pars_info_create(); - pars_info_add_int4_literal(info, "id", - lint(table->space)); - err = que_eval_sql( - info, - "PROCEDURE DROP_SPACE_PROC () IS\n" - "BEGIN\n" - "DELETE FROM SYS_TABLESPACES\n" - "WHERE SPACE = :id;\n" - "DELETE FROM SYS_DATAFILES\n" - "WHERE SPACE = :id;\n" - "END;\n", FALSE, trx); - } - } - } else { - page_no = page_nos; - for (dict_index_t* index = dict_table_get_first_index(table); - index != NULL; - index = dict_table_get_next_index(index)) { - /* remove the index object associated. */ - dict_drop_index_tree_in_mem(index, *page_no++); + "DELETE FROM SYS_TABLESPACES\n" + "WHERE SPACE = :id;\n" + "DELETE FROM SYS_DATAFILES\n" + "WHERE SPACE = :id;\n" + "END;\n", FALSE, trx); } - trx->mod_tables.erase(table); - dict_table_remove_from_cache(table); - err = DB_SUCCESS; - goto funct_exit; } switch (err) { - ulint space_id; - bool is_discarded; - ulint table_flags; - + fil_space_t* space; + char* filepath; case DB_SUCCESS: - space_id = table->space; - is_discarded = dict_table_is_discarded(table); - table_flags = table->flags; - ut_ad(!dict_table_is_temporary(table)); - - err = row_drop_ancillary_fts_tables(table, trx); - if (err != DB_SUCCESS) { - break; + if (!table->no_rollback()) { + err = row_drop_ancillary_fts_tables(table, trx); + if (err != DB_SUCCESS) { + break; + } } + space = table->space; + ut_ad(!space || space->id == table->space_id); /* Determine the tablespace filename before we drop - dict_table_t. Free this memory before returning. */ + dict_table_t. 
*/ if (DICT_TF_HAS_DATA_DIR(table->flags)) { dict_get_and_save_data_dir_path(table, true); - ut_ad(table->data_dir_path - || dict_table_is_discarded(table)); - filepath = fil_make_filepath( + ut_ad(table->data_dir_path || !space); + filepath = space ? NULL : fil_make_filepath( table->data_dir_path, table->name.m_name, IBD, table->data_dir_path != NULL); } else { - filepath = fil_make_filepath( + filepath = space ? NULL : fil_make_filepath( NULL, table->name.m_name, IBD, false); } /* Free the dict_table_t object. */ err = row_drop_table_from_cache(tablename, table, trx); if (err != DB_SUCCESS) { + ut_free(filepath); break; } /* Do not attempt to drop known-to-be-missing tablespaces, nor the system tablespace. */ - if (is_discarded || is_system_tablespace(space_id)) { + if (!space) { + fil_delete_file(filepath); + ut_free(filepath); break; } - /* We can now drop the single-table tablespace. */ - err = row_drop_single_table_tablespace( - space_id, tablename, filepath, table_flags); + ut_ad(!filepath); + + if (space->id != TRX_SYS_SPACE) { + err = fil_delete_tablespace(space->id); + } break; case DB_OUT_OF_FILE_SPACE: @@ -3798,8 +3847,7 @@ funct_exit: mem_heap_free(heap); } - ut_free(filepath); - +funct_exit_all_freed: if (locked_dictionary) { if (trx_is_started(trx)) { @@ -3986,8 +4034,7 @@ loop: << table->name << ".frm' was lost."; } - if (!table->is_readable() - && !fil_space_get(table->space)) { + if (!table->is_readable() && !table->space) { ib::warn() << "Missing .ibd file for table " << table->name << "."; } @@ -4229,9 +4276,8 @@ row_rename_table_for_mysql( err = DB_TABLE_NOT_FOUND; goto funct_exit; - } else if (!table->is_readable() - && fil_space_get(table->space) == NULL - && !dict_table_is_discarded(table)) { + } else if (!table->is_readable() && !table->space + && !(table->flags2 & DICT_TF2_DISCARDED)) { err = DB_TABLE_NOT_FOUND; @@ -4276,7 +4322,7 @@ row_rename_table_for_mysql( goto funct_exit; } - if (!table->is_temporary() && srv_safe_truncate) { + if (!table->is_temporary()) { err = trx_undo_report_rename(trx, table); if (err != DB_SUCCESS) { @@ -4301,25 +4347,25 @@ row_rename_table_for_mysql( "END;\n" , FALSE, trx); + ut_ad(err != DB_DUPLICATE_KEY); + /* SYS_TABLESPACES and SYS_DATAFILES need to be updated if the table is in a single-table tablespace. */ if (err != DB_SUCCESS || !dict_table_is_file_per_table(table)) { - } else if (char* old_path = fil_space_get_first_path(table->space)) { - char* new_path = os_file_make_new_pathname(old_path, new_name); - + } else if (table->space) { /* If old path and new path are the same means tablename has not changed and only the database name holding the table has changed so we need to make the complete filepath again. */ - if (!dict_tables_have_same_db(old_name, new_name)) { - ut_free(new_path); - new_path = fil_make_filepath(NULL, new_name, IBD, false); - } + char* new_path = dict_tables_have_same_db(old_name, new_name) + ? 
os_file_make_new_pathname( + table->space->chain.start->name, new_name) + : fil_make_filepath(NULL, new_name, IBD, false); info = pars_info_create(); pars_info_add_str_literal(info, "new_table_name", new_name); pars_info_add_str_literal(info, "new_path_name", new_path); - pars_info_add_int4_literal(info, "space_id", table->space); + pars_info_add_int4_literal(info, "space_id", table->space_id); err = que_eval_sql(info, "PROCEDURE RENAME_SPACE () IS\n" @@ -4333,7 +4379,6 @@ row_rename_table_for_mysql( "END;\n" , FALSE, trx); - ut_free(old_path); ut_free(new_path); } if (err != DB_SUCCESS) { @@ -4473,7 +4518,7 @@ row_rename_table_for_mysql( if (err == DB_SUCCESS && (dict_table_has_fts_index(table) - || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) + || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) && !dict_tables_have_same_db(old_name, new_name)) { err = fts_rename_aux_tables(table, new_name, trx); if (err != DB_TABLE_NOT_FOUND) { @@ -4606,7 +4651,7 @@ funct_exit: && table != NULL && (table->space != 0)) { char* orig_name = table->name.m_name; - trx_t* trx_bg = trx_allocate_for_background(); + trx_t* trx_bg = trx_create(); /* If the first fts_rename fails, the trx would be rolled back and committed, we can't use it any more, @@ -4630,7 +4675,7 @@ funct_exit: trx_bg->dict_operation_lock_mode = 0; trx_commit_for_mysql(trx_bg); - trx_free_for_background(trx_bg); + trx_bg->free(); } if (table != NULL) { @@ -4701,7 +4746,8 @@ row_scan_index_for_mysql( return(DB_SUCCESS); } - ulint bufsize = ut_max(UNIV_PAGE_SIZE, prebuilt->mysql_row_len); + ulint bufsize = std::max<ulint>(srv_page_size, + prebuilt->mysql_row_len); buf = static_cast<byte*>(ut_malloc_nokey(bufsize)); heap = mem_heap_create(100); @@ -4750,7 +4796,7 @@ func_exit: rec = buf + mach_read_from_4(buf); - offsets = rec_get_offsets(rec, index, offsets_, true, + offsets = rec_get_offsets(rec, index, offsets_, index->n_core_fields, ULINT_UNDEFINED, &heap); if (prev_entry != NULL) { diff --git a/storage/innobase/row/row0purge.cc b/storage/innobase/row/row0purge.cc index 024625e9e0a..6a7f5921aa3 100644 --- a/storage/innobase/row/row0purge.cc +++ b/storage/innobase/row/row0purge.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1997, 2017, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2020, MariaDB Corporation. +Copyright (c) 2017, 2021, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -118,7 +118,7 @@ row_purge_remove_clust_if_poss_low( log_free_check(); mtr_start(&mtr); - mtr.set_named_space(index->space); + index->set_modified(mtr); if (!row_purge_reposition_pcur(mode, node, &mtr)) { /* The record was already removed. */ @@ -127,8 +127,8 @@ row_purge_remove_clust_if_poss_low( rec = btr_pcur_get_rec(&node->pcur); - offsets = rec_get_offsets( - rec, index, offsets_, true, ULINT_UNDEFINED, &heap); + offsets = rec_get_offsets(rec, index, offsets_, index->n_core_fields, + ULINT_UNDEFINED, &heap); if (node->roll_ptr != row_get_rec_roll_ptr(rec, index, offsets)) { /* Someone else has modified the record later: do not remove */ @@ -247,7 +247,7 @@ static bool row_purge_restore_vsec_cur( bool is_tree) { sec_mtr->start(); - sec_mtr->set_named_space(index->space); + index->set_modified(*sec_mtr); return btr_pcur_restore_position( is_tree ? 
BTR_PURGE_TREE : BTR_PURGE_LEAF, @@ -384,14 +384,14 @@ row_purge_remove_sec_if_poss_tree( enum row_search_result search_result; log_free_check(); - mtr_start(&mtr); - mtr.set_named_space(index->space); + mtr.start(); + index->set_modified(mtr); if (!index->is_committed()) { /* The index->online_status may change if the index is or was being created online, but not committed yet. It is protected by index->lock. */ - mtr_sx_lock(dict_index_get_lock(index), &mtr); + mtr_sx_lock_index(index, &mtr); if (dict_index_is_online_ddl(index)) { /* Online secondary index creation will not @@ -486,9 +486,9 @@ row_purge_remove_sec_if_poss_tree( } func_exit: - btr_pcur_close(&pcur); + btr_pcur_close(&pcur); // FIXME: need this? func_exit_no_pcur: - mtr_commit(&mtr); + mtr.commit(); return(success); } @@ -514,9 +514,9 @@ row_purge_remove_sec_if_poss_leaf( log_free_check(); ut_ad(index->table == node->table); - ut_ad(!dict_table_is_temporary(index->table)); - mtr_start(&mtr); - mtr.set_named_space(index->space); + ut_ad(!index->table->is_temporary()); + mtr.start(); + index->set_modified(mtr); if (!index->is_committed()) { /* For uncommitted spatial index, we also skip the purge. */ @@ -527,7 +527,7 @@ row_purge_remove_sec_if_poss_leaf( /* The index->online_status may change if the the index is or was being created online, but not committed yet. It is protected by index->lock. */ - mtr_s_lock(dict_index_get_lock(index), &mtr); + mtr_s_lock_index(index, &mtr); if (dict_index_is_online_ddl(index)) { /* Online secondary index creation will not @@ -631,7 +631,7 @@ row_purge_remove_sec_if_poss_leaf( ->page.id); btr_pcur_close(&pcur); - mtr_commit(&mtr); + mtr.commit(); return(success); } } @@ -657,9 +657,9 @@ row_purge_remove_sec_if_poss_leaf( /* The deletion was buffered. */ case ROW_NOT_FOUND: /* The index entry does not exist, nothing to do. */ - btr_pcur_close(&pcur); + btr_pcur_close(&pcur); // FIXME: do we need these? when is btr_cur->rtr_info set? func_exit_no_pcur: - mtr_commit(&mtr); + mtr.commit(); return(success); } @@ -781,6 +781,74 @@ row_purge_del_mark( return(row_purge_remove_clust_if_poss(node)); } +/** Reset DB_TRX_ID, DB_ROLL_PTR of a clustered index record +whose old history can no longer be observed. +@param[in,out] node purge node +@param[in,out] mtr mini-transaction (will be started and committed) */ +static void row_purge_reset_trx_id(purge_node_t* node, mtr_t* mtr) +{ + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_S) + || node->vcol_info.is_used()); + /* Reset DB_TRX_ID, DB_ROLL_PTR for old records. */ + mtr->start(); + + if (row_purge_reposition_pcur(BTR_MODIFY_LEAF, node, mtr)) { + dict_index_t* index = dict_table_get_first_index( + node->table); + ulint trx_id_pos = index->n_uniq ? index->n_uniq : 1; + rec_t* rec = btr_pcur_get_rec(&node->pcur); + mem_heap_t* heap = NULL; + /* Reserve enough offsets for the PRIMARY KEY and 2 columns + so that we can access DB_TRX_ID, DB_ROLL_PTR. 
*/ + rec_offs offsets_[REC_OFFS_HEADER_SIZE + MAX_REF_PARTS + 2]; + rec_offs_init(offsets_); + rec_offs* offsets = rec_get_offsets( + rec, index, offsets_, index->n_core_fields, + trx_id_pos + 2, &heap); + ut_ad(heap == NULL); + + ut_ad(dict_index_get_nth_field(index, trx_id_pos) + ->col->mtype == DATA_SYS); + ut_ad(dict_index_get_nth_field(index, trx_id_pos) + ->col->prtype == (DATA_TRX_ID | DATA_NOT_NULL)); + ut_ad(dict_index_get_nth_field(index, trx_id_pos + 1) + ->col->mtype == DATA_SYS); + ut_ad(dict_index_get_nth_field(index, trx_id_pos + 1) + ->col->prtype == (DATA_ROLL_PTR | DATA_NOT_NULL)); + + /* Only update the record if DB_ROLL_PTR matches (the + record has not been modified after this transaction + became purgeable) */ + if (node->roll_ptr + == row_get_rec_roll_ptr(rec, index, offsets)) { + ut_ad(!rec_get_deleted_flag(rec, + rec_offs_comp(offsets))); + DBUG_LOG("purge", "reset DB_TRX_ID=" + << ib::hex(row_get_rec_trx_id( + rec, index, offsets))); + + index->set_modified(*mtr); + if (page_zip_des_t* page_zip + = buf_block_get_page_zip( + btr_pcur_get_block(&node->pcur))) { + page_zip_write_trx_id_and_roll_ptr( + page_zip, rec, offsets, trx_id_pos, + 0, 1ULL << ROLL_PTR_INSERT_FLAG_POS, + mtr); + } else { + ulint len; + byte* ptr = rec_get_nth_field( + rec, offsets, trx_id_pos, &len); + ut_ad(len == DATA_TRX_ID_LEN); + mlog_write_string(ptr, reset_trx_id, + sizeof reset_trx_id, mtr); + } + } + } + + mtr->commit(); +} + /***********************************************************//** Purges an update of an existing record. Also purges an update of a delete marked record if that record contained an externally stored field. */ @@ -841,6 +909,8 @@ row_purge_upd_exist_or_extern_func( mem_heap_free(heap); skip_secondaries: + mtr_t mtr; + dict_index_t* index = dict_table_get_first_index(node->table); /* Free possible externally stored fields */ for (ulint i = 0; i < upd_get_n_fields(node->update); i++) { @@ -852,12 +922,10 @@ skip_secondaries: buf_block_t* block; ulint internal_offset; byte* data_field; - dict_index_t* index; ibool is_insert; ulint rseg_id; ulint page_no; ulint offset; - mtr_t mtr; /* We use the fact that new_val points to undo_rec and get thus the offset of @@ -865,32 +933,31 @@ skip_secondaries: can calculate from node->roll_ptr the file address of the new_val data */ - internal_offset - = ((const byte*) - dfield_get_data(&ufield->new_val)) - - undo_rec; + internal_offset = ulint( + static_cast<const byte*> + (dfield_get_data(&ufield->new_val)) + - undo_rec); - ut_a(internal_offset < UNIV_PAGE_SIZE); + ut_a(internal_offset < srv_page_size); trx_undo_decode_roll_ptr(node->roll_ptr, &is_insert, &rseg_id, &page_no, &offset); - rseg = trx_sys->rseg_array[rseg_id]; + rseg = trx_sys.rseg_array[rseg_id]; ut_a(rseg != NULL); ut_ad(rseg->id == rseg_id); ut_ad(rseg->is_persistent()); - mtr_start(&mtr); + mtr.start(); /* We have to acquire an SX-latch to the clustered index tree (exclude other tree changes) */ - index = dict_table_get_first_index(node->table); - mtr_sx_lock(dict_index_get_lock(index), &mtr); + mtr_sx_lock_index(index, &mtr); - mtr.set_named_space(index->space); + index->set_modified(mtr); /* NOTE: we must also acquire an X-latch to the root page of the tree. 
We will need it when we @@ -904,7 +971,7 @@ skip_secondaries: btr_root_get(index, &mtr); block = buf_page_get( - page_id_t(rseg->space, page_no), + page_id_t(rseg->space->id, page_no), univ_page_size, RW_X_LATCH, &mtr); buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE); @@ -919,9 +986,11 @@ skip_secondaries: data_field + dfield_get_len(&ufield->new_val) - BTR_EXTERN_FIELD_REF_SIZE, NULL, NULL, NULL, 0, false, &mtr); - mtr_commit(&mtr); + mtr.commit(); } } + + row_purge_reset_trx_id(node, &mtr); } #ifdef UNIV_DEBUG @@ -962,17 +1031,32 @@ row_purge_parse_undo_rec( node->rec_type = type; - if (type == TRX_UNDO_UPD_DEL_REC && !*updated_extern) { -skip: - node->table = NULL; + switch (type) { + case TRX_UNDO_RENAME_TABLE: return false; - } + case TRX_UNDO_INSERT_METADATA: + case TRX_UNDO_INSERT_REC: + /* These records do not store any transaction identifier. - ptr = trx_undo_update_rec_get_sys_cols(ptr, &node->trx_id, &roll_ptr, - &info_bits); + FIXME: Update SYS_TABLES.ID on both DISCARD TABLESPACE + and IMPORT TABLESPACE to get rid of the repeated lookups! */ + node->trx_id = TRX_ID_MAX; + break; + default: +#ifdef UNIV_DEBUG + ut_ad(!"unknown undo log record type"); + return false; + case TRX_UNDO_UPD_DEL_REC: + case TRX_UNDO_UPD_EXIST_REC: + case TRX_UNDO_DEL_MARK_REC: +#endif /* UNIV_DEBUG */ + ptr = trx_undo_update_rec_get_sys_cols(ptr, &node->trx_id, + &roll_ptr, &info_bits); + break; + } if (node->is_skipped(table_id)) { - goto skip; + return false; } /* Prevent DROP TABLE etc. from running when we are doing the purge @@ -991,14 +1075,21 @@ try_again: goto err_exit; } - ut_ad(!dict_table_is_temporary(node->table)); + ut_ad(!node->table->is_temporary()); if (!fil_table_accessible(node->table)) { goto inaccessible; } - if (node->table->n_v_cols && !node->table->vc_templ - && dict_table_has_indexed_v_cols(node->table)) { + switch (type) { + case TRX_UNDO_INSERT_METADATA: + case TRX_UNDO_INSERT_REC: + break; + default: + if (!node->table->n_v_cols || node->table->vc_templ + || !dict_table_has_indexed_v_cols(node->table)) { + break; + } /* Need server fully up for virtual column computation */ if (!mysqld_server_started) { @@ -1029,29 +1120,27 @@ inaccessible: if (!trx_id) { trx_id = TRX_ID_MAX; } -close_exit: + dict_table_close(node->table, FALSE, FALSE); node->table = NULL; err_exit: rw_lock_s_unlock(&dict_operation_lock); - if (table_id) { - node->skip(table_id, trx_id); - } + node->skip(table_id, trx_id); return(false); } - if (type == TRX_UNDO_UPD_EXIST_REC - && (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) - && !*updated_extern) { - - /* Purge requires no changes to indexes: we may return */ - table_id = 0; - goto close_exit; + if (type == TRX_UNDO_INSERT_METADATA) { + node->ref = &trx_undo_metadata; + return(true); } ptr = trx_undo_rec_get_row_ref(ptr, clust_index, &(node->ref), node->heap); + if (type == TRX_UNDO_INSERT_REC) { + return(true); + } + ptr = trx_undo_update_rec_get_update(ptr, clust_index, type, node->trx_id, roll_ptr, info_bits, @@ -1060,10 +1149,13 @@ err_exit: /* Read to the partial row the fields that occur in indexes */ if (!(node->cmpl_info & UPD_NODE_NO_ORD_CHANGE)) { + ut_ad(!(node->update->info_bits & REC_INFO_MIN_REC_FLAG)); ptr = trx_undo_rec_get_partial_row( ptr, clust_index, node->update, &node->row, type == TRX_UNDO_UPD_DEL_REC, node->heap); + } else if (node->update->info_bits & REC_INFO_MIN_REC_FLAG) { + node->ref = &trx_undo_metadata; } return(true); @@ -1107,8 +1199,14 @@ row_purge_record_func( MONITOR_INC(MONITOR_N_DEL_ROW_PURGE); } break; + case 
TRX_UNDO_INSERT_METADATA: + case TRX_UNDO_INSERT_REC: + node->roll_ptr |= 1ULL << ROLL_PTR_INSERT_FLAG_POS; + /* fall through */ default: if (!updated_extern) { + mtr_t mtr; + row_purge_reset_trx_id(node, &mtr); break; } /* fall through */ @@ -1263,7 +1361,7 @@ purge_node_t::validate_pcur() dict_index_t* clust_index = pcur.btr_cur.index; rec_offs* offsets = rec_get_offsets( - pcur.old_rec, clust_index, NULL, true, + pcur.old_rec, clust_index, NULL, pcur.old_n_core_fields, pcur.old_n_fields, &heap); /* Here we are comparing the purge ref record and the stored initial diff --git a/storage/innobase/row/row0quiesce.cc b/storage/innobase/row/row0quiesce.cc index b80b387b0fa..94a372bd046 100644 --- a/storage/innobase/row/row0quiesce.cc +++ b/storage/innobase/row/row0quiesce.cc @@ -142,7 +142,7 @@ row_quiesce_write_indexes( mach_write_to_8(ptr, index->id); ptr += sizeof(index_id_t); - mach_write_to_4(ptr, index->space); + mach_write_to_4(ptr, table->space_id); ptr += sizeof(ib_uint32_t); mach_write_to_4(ptr, index->page); @@ -240,7 +240,7 @@ row_quiesce_write_table( This field is also redundant, because the lengths are a property of the character set encoding, which in turn is encodedin prtype above. */ - mach_write_to_4(ptr, col->mbmaxlen * 5 + col->mbminlen); + mach_write_to_4(ptr, ulint(col->mbmaxlen * 5 + col->mbminlen)); ptr += sizeof(ib_uint32_t); mach_write_to_4(ptr, col->ind); @@ -391,7 +391,7 @@ row_quiesce_write_header( byte* ptr = row; /* Write the system page size. */ - mach_write_to_4(ptr, UNIV_PAGE_SIZE); + mach_write_to_4(ptr, srv_page_size); ptr += sizeof(ib_uint32_t); /* Write the table->flags. */ @@ -518,15 +518,15 @@ row_quiesce_table_start( ut_a(trx->mysql_thd != 0); - ut_ad(fil_space_get(table->space) != NULL); + ut_ad(table->space != NULL); ib::info() << "Sync to disk of " << table->name << " started."; if (srv_undo_sources) { - trx_purge_stop(); + purge_sys.stop(); } for (ulint count = 0; - ibuf_merge_space(table->space) != 0 + ibuf_merge_space(table->space_id) != 0 && !trx_is_interrupted(trx); ++count) { if (!(count % 20)) { @@ -538,7 +538,8 @@ row_quiesce_table_start( if (!trx_is_interrupted(trx)) { { FlushObserver observer(table->space, trx, NULL); - buf_LRU_flush_or_remove_pages(table->space, &observer); + buf_LRU_flush_or_remove_pages(table->space_id, + &observer); } if (trx_is_interrupted(trx)) { @@ -605,7 +606,7 @@ row_quiesce_table_complete( } if (srv_undo_sources) { - trx_purge_run(); + purge_sys.resume(); } dberr_t err = row_quiesce_set_state(table, QUIESCE_NONE, trx); @@ -631,13 +632,13 @@ row_quiesce_set_state( return(DB_UNSUPPORTED); - } else if (dict_table_is_temporary(table)) { + } else if (table->is_temporary()) { ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_WARN, ER_CANNOT_DISCARD_TEMPORARY_TABLE); return(DB_UNSUPPORTED); - } else if (table->space == TRX_SYS_SPACE) { + } else if (table->space_id == TRX_SYS_SPACE) { char table_name[MAX_FULL_NAME_LEN + 1]; diff --git a/storage/innobase/row/row0row.cc b/storage/innobase/row/row0row.cc index 774794be225..6f34c9fcc9b 100644 --- a/storage/innobase/row/row0row.cc +++ b/storage/innobase/row/row0row.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2018, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2018, 2020, MariaDB Corporation. +Copyright (c) 2018, 2021, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -40,11 +40,141 @@ Created 4/20/1996 Heikki Tuuri #include "row0ext.h" #include "row0upd.h" #include "rem0cmp.h" -#include "read0read.h" #include "ut0mem.h" #include "gis0geo.h" #include "row0mysql.h" +/** Build a spatial index key. +@param[in] index spatial index +@param[in] ext externally stored column prefixes, or NULL +@param[in,out] dfield field of the tuple to be copied +@param[in] dfield2 field of the tuple to copy +@param[in] flag ROW_BUILD_NORMAL, ROW_BUILD_FOR_PURGE or + ROW_BUILD_FOR_UNDO +@param[in,out] heap memory heap from which the memory + of the field entry is allocated. +@retval false if undo log is logged before spatial index creation. */ +static bool row_build_spatial_index_key( + const dict_index_t* index, + const row_ext_t* ext, + dfield_t* dfield, + const dfield_t* dfield2, + ulint flag, + mem_heap_t* heap) +{ + double* mbr; + + dfield_copy(dfield, dfield2); + dfield->type.prtype |= DATA_GIS_MBR; + + /* Allocate memory for mbr field */ + mbr = static_cast<double*>(mem_heap_alloc(heap, DATA_MBR_LEN)); + + /* Set mbr field data. */ + dfield_set_data(dfield, mbr, DATA_MBR_LEN); + + const fil_space_t* space = index->table->space; + + if (UNIV_UNLIKELY(!dfield2->data || !space)) { + /* FIXME: dfield contains uninitialized data, + but row_build_index_entry_low() will not return NULL. + This bug is inherited from MySQL 5.7.5 + commit b66ad511b61fffe75c58d0a607cdb837c6e6c821. */ + return true; + } + + const byte* dptr = NULL; + ulint dlen = 0; + ulint flen = 0; + double tmp_mbr[SPDIMS * 2]; + mem_heap_t* temp_heap = NULL; + + if (!dfield_is_ext(dfield2)) { + dptr = static_cast<const byte*>(dfield_get_data(dfield2)); + dlen = dfield_get_len(dfield2); + goto write_mbr; + } + + if (flag == ROW_BUILD_FOR_PURGE) { + const byte* ptr = static_cast<const byte*>( + dfield_get_data(dfield2)); + + switch (dfield_get_spatial_status(dfield2)) { + case SPATIAL_ONLY: + ut_ad(dfield_get_len(dfield2) == DATA_MBR_LEN); + break; + + case SPATIAL_MIXED: + ptr += dfield_get_len(dfield2); + break; + + case SPATIAL_UNKNOWN: + ut_ad(0); + /* fall through */ + case SPATIAL_NONE: + /* Undo record is logged before + spatial index is created.*/ + return false; + } + + memcpy(mbr, ptr, DATA_MBR_LEN); + return true; + } + + if (flag == ROW_BUILD_FOR_UNDO + && dict_table_has_atomic_blobs(index->table)) { + /* For ROW_FORMAT=DYNAMIC or COMPRESSED, a prefix of + off-page records is stored in the undo log record (for + any column prefix indexes). For SPATIAL INDEX, we + must ignore this prefix. The full column value is + stored in the BLOB. For non-spatial index, we would + have already fetched a necessary prefix of the BLOB, + available in the "ext" parameter. + + Here, for SPATIAL INDEX, we are fetching the full + column, which is potentially wasting a lot of I/O, + memory, and possibly involving a concurrency problem, + similar to ones that existed before the introduction + of row_ext_t. + + MDEV-11657 FIXME: write the MBR directly to the undo + log record, and avoid recomputing it here! 
*/ + flen = BTR_EXTERN_FIELD_REF_SIZE; + ut_ad(dfield_get_len(dfield2) >= BTR_EXTERN_FIELD_REF_SIZE); + dptr = static_cast<const byte*>(dfield_get_data(dfield2)) + + dfield_get_len(dfield2) + - BTR_EXTERN_FIELD_REF_SIZE; + } else { + flen = dfield_get_len(dfield2); + dptr = static_cast<const byte*>(dfield_get_data(dfield2)); + } + + temp_heap = mem_heap_create(1000); + + dptr = btr_copy_externally_stored_field( + &dlen, dptr, ext ? ext->page_size : page_size_t(space->flags), + flen, temp_heap); + +write_mbr: + if (dlen <= GEO_DATA_HEADER_SIZE) { + for (uint i = 0; i < SPDIMS; i += 2) { + tmp_mbr[i] = DBL_MAX; + tmp_mbr[i + 1] = -DBL_MAX; + } + } else { + rtree_mbr_from_wkb(dptr + GEO_DATA_HEADER_SIZE, + uint(dlen - GEO_DATA_HEADER_SIZE), + SPDIMS, tmp_mbr); + } + + dfield_write_mbr(dfield, tmp_mbr); + if (temp_heap) { + mem_heap_free(temp_heap); + } + + return true; +} + /*****************************************************************//** When an insert or purge to a table is performed, this function builds the entry to be inserted into or purged from an index on the table. @@ -58,8 +188,8 @@ row_build_index_entry_low( inserted or purged */ const row_ext_t* ext, /*!< in: externally stored column prefixes, or NULL */ - dict_index_t* index, /*!< in: index on the table */ - mem_heap_t* heap, /*!< in: memory heap from which + const dict_index_t* index, /*!< in: index on the table */ + mem_heap_t* heap, /*!< in,out: memory heap from which the memory for the index entry is allocated */ ulint flag) /*!< in: ROW_BUILD_NORMAL, @@ -112,11 +242,10 @@ row_build_index_entry_low( col_no = dict_col_get_no(col); dfield = dtuple_get_nth_field(entry, i); } -#if DATA_MISSING != 0 -# error "DATA_MISSING != 0" -#endif - if (dict_col_is_virtual(col)) { + compile_time_assert(DATA_MISSING == 0); + + if (col->is_virtual()) { const dict_v_col_t* v_col = reinterpret_cast<const dict_v_col_t*>(col); @@ -149,119 +278,11 @@ row_build_index_entry_low( /* Special handle spatial index, set the first field which is for store MBR. */ if (dict_index_is_spatial(index) && i == 0) { - double* mbr; - - dfield_copy(dfield, dfield2); - dfield->type.prtype |= DATA_GIS_MBR; - - /* Allocate memory for mbr field */ - ulint mbr_len = DATA_MBR_LEN; - mbr = static_cast<double*>(mem_heap_alloc(heap, mbr_len)); - - /* Set mbr field data. */ - dfield_set_data(dfield, mbr, mbr_len); - - if (dfield2->data) { - const uchar* dptr = NULL; - ulint dlen = 0; - ulint flen = 0; - double tmp_mbr[SPDIMS * 2]; - mem_heap_t* temp_heap = NULL; - - if (dfield_is_ext(dfield2)) { - if (flag == ROW_BUILD_FOR_PURGE) { - const byte* ptr = NULL; - - spatial_status_t spatial_status; - spatial_status = - dfield_get_spatial_status( - dfield2); - - switch (spatial_status) { - case SPATIAL_ONLY: - ptr = static_cast<const byte*>( - dfield_get_data( - dfield2)); - ut_ad(dfield_get_len(dfield2) - == DATA_MBR_LEN); - break; - - case SPATIAL_MIXED: - ptr = static_cast<const byte*>( - dfield_get_data( - dfield2)) - + dfield_get_len( - dfield2); - break; - - case SPATIAL_UNKNOWN: - ut_ad(0); - /* fall through */ - case SPATIAL_NONE: - /* Undo record is logged before - spatial index is created.*/ - return(NULL); - } - - memcpy(mbr, ptr, DATA_MBR_LEN); - continue; - } - - if (flag == ROW_BUILD_FOR_UNDO - && dict_table_get_format(index->table) - >= UNIV_FORMAT_B) { - /* For build entry for undo, and - the table is Barrcuda, we need - to skip the prefix data. 
*/ - flen = BTR_EXTERN_FIELD_REF_SIZE; - ut_ad(dfield_get_len(dfield2) >= - BTR_EXTERN_FIELD_REF_SIZE); - dptr = static_cast<const byte*>( - dfield_get_data(dfield2)) - + dfield_get_len(dfield2) - - BTR_EXTERN_FIELD_REF_SIZE; - } else { - flen = dfield_get_len(dfield2); - dptr = static_cast<const byte*>( - dfield_get_data(dfield2)); - } - - temp_heap = mem_heap_create(1000); - - const page_size_t page_size - = (ext != NULL) - ? ext->page_size - : dict_table_page_size( - index->table); - - dptr = btr_copy_externally_stored_field( - &dlen, dptr, - page_size, - flen, - temp_heap); - } else { - dptr = static_cast<const uchar*>( - dfield_get_data(dfield2)); - dlen = dfield_get_len(dfield2); - - } - - if (dlen <= GEO_DATA_HEADER_SIZE) { - for (uint i = 0; i < SPDIMS; ++i) { - tmp_mbr[i * 2] = DBL_MAX; - tmp_mbr[i * 2 + 1] = -DBL_MAX; - } - } else { - rtree_mbr_from_wkb(dptr + GEO_DATA_HEADER_SIZE, - static_cast<uint>(dlen - - GEO_DATA_HEADER_SIZE), - SPDIMS, tmp_mbr); - } - dfield_write_mbr(dfield, tmp_mbr); - if (temp_heap) { - mem_heap_free(temp_heap); - } + if (!row_build_spatial_index_key( + index, ext, dfield, dfield2, flag, heap)) { + return NULL; } + continue; } @@ -273,6 +294,8 @@ row_build_index_entry_low( continue; } + ut_ad(!(index->type & DICT_FTS)); + if ((!ind_field || ind_field->prefix_len == 0) && (!dfield_is_ext(dfield) || dict_index_is_clust(index))) { @@ -286,11 +309,11 @@ row_build_index_entry_low( /* If the column is stored externally (off-page) in the clustered index, it must be an ordering field in - the secondary index. In the Antelope format, only - prefix-indexed columns may be stored off-page in the - clustered index record. In the Barracuda format, also - fully indexed long CHAR or VARCHAR columns may be - stored off-page. */ + the secondary index. If !atomic_blobs, the only way + we may have a secondary index pointing to a clustered + index record with an off-page column is when it is a + column prefix index. If atomic_blobs, also fully + indexed long columns may be stored off-page. */ ut_ad(col->ord_part); if (ext && !col->is_virtual()) { @@ -305,9 +328,8 @@ row_build_index_entry_low( } if (ind_field->prefix_len == 0) { - /* In the Barracuda format - (ROW_FORMAT=DYNAMIC or - ROW_FORMAT=COMPRESSED), we can have a + /* If ROW_FORMAT=DYNAMIC or + ROW_FORMAT=COMPRESSED, we can have a secondary index on an entire column that is stored off-page in the clustered index. As this is not a @@ -317,11 +339,12 @@ row_build_index_entry_low( continue; } } else if (dfield_is_ext(dfield)) { - /* This table is either in Antelope format + /* This table is either in (ROW_FORMAT=REDUNDANT or ROW_FORMAT=COMPACT) or a purge record where the ordered part of the field is not external. - In Antelope, the maximum column prefix + In ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT, + the maximum column prefix index length is 767 bytes, and the clustered index record contains a 768-byte prefix of each off-page column. */ @@ -356,7 +379,7 @@ addition of new virtual columns. 
of an index, or NULL if index->table should be consulted instead -@param[in] add_cols default values of added columns, or NULL +@param[in] defaults default values of added/changed columns, or NULL @param[in] add_v new virtual columns added along with new indexes @param[in] col_map mapping of old column @@ -374,7 +397,7 @@ row_build_low( const rec_t* rec, const rec_offs* offsets, const dict_table_t* col_table, - const dtuple_t* add_cols, + const dtuple_t* defaults, const dict_add_v_col_t* add_v, const ulint* col_map, row_ext_t** ext, @@ -395,11 +418,12 @@ row_build_low( ut_ad(rec != NULL); ut_ad(heap != NULL); ut_ad(dict_index_is_clust(index)); - ut_ad(!trx_sys_mutex_own()); + ut_ad(!mutex_own(&trx_sys.mutex)); ut_ad(!col_map || col_table); if (!offsets) { - offsets = rec_get_offsets(rec, index, offsets_, true, + offsets = rec_get_offsets(rec, index, offsets_, + index->n_core_fields, ULINT_UNDEFINED, &tmp_heap); } else { ut_ad(rec_offs_validate(rec, index, offsets)); @@ -414,8 +438,9 @@ row_build_low( times, and the cursor restore can happen multiple times for single insert or update statement. */ ut_a(!rec_offs_any_null_extern(rec, offsets) - || trx_rw_is_active(row_get_rec_trx_id(rec, index, offsets), - NULL, false)); + || trx_sys.is_registered(current_trx(), + row_get_rec_trx_id(rec, index, + offsets))); #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ if (type != ROW_COPY_POINTERS) { @@ -435,17 +460,17 @@ row_build_low( } /* Avoid a debug assertion in rec_offs_validate(). */ - rec_offs_make_valid(copy, index, const_cast<rec_offs*>(offsets)); + rec_offs_make_valid(copy, index, true, const_cast<rec_offs*>(offsets)); if (!col_table) { ut_ad(!col_map); - ut_ad(!add_cols); + ut_ad(!defaults); col_table = index->table; } - if (add_cols) { + if (defaults) { ut_ad(col_map); - row = dtuple_copy(add_cols, heap); + row = dtuple_copy(defaults, heap); /* dict_table_copy_types() would set the fields to NULL */ for (ulint i = 0; i < dict_table_get_n_cols(col_table); i++) { dict_col_copy_type( @@ -505,10 +530,14 @@ row_build_low( } dfield_t* dfield = dtuple_get_nth_field(row, col_no); - - const byte* field = rec_get_nth_field( + const void* field = rec_get_nth_field( copy, offsets, i, &len); - + if (len == UNIV_SQL_DEFAULT) { + field = index->instant_field_value(i, &len); + if (field && type != ROW_COPY_POINTERS) { + field = mem_heap_dup(heap, field, len); + } + } dfield_set_data(dfield, field, len); if (rec_offs_nth_extern(offsets, i)) { @@ -525,7 +554,7 @@ row_build_low( } } - rec_offs_make_valid(rec, index, const_cast<rec_offs*>(offsets)); + rec_offs_make_valid(rec, index, true, const_cast<rec_offs*>(offsets)); ut_ad(dtuple_check_typed(row)); @@ -588,9 +617,9 @@ row_build( of an index, or NULL if index->table should be consulted instead */ - const dtuple_t* add_cols, + const dtuple_t* defaults, /*!< in: default values of - added columns, or NULL */ + added and changed columns, or NULL */ const ulint* col_map,/*!< in: mapping of old column numbers to new ones, or NULL */ row_ext_t** ext, /*!< out, own: cache of @@ -600,7 +629,7 @@ row_build( the memory needed is allocated */ { return(row_build_low(type, index, rec, offsets, col_table, - add_cols, NULL, col_map, ext, heap)); + defaults, NULL, col_map, ext, heap)); } /** An inverse function to row_build_index_entry. Builds a row from a @@ -616,7 +645,7 @@ addition of new virtual columns. 
of an index, or NULL if index->table should be consulted instead -@param[in] add_cols default values of added columns, or NULL +@param[in] defaults default values of added, changed columns, or NULL @param[in] add_v new virtual columns added along with new indexes @param[in] col_map mapping of old column @@ -633,28 +662,33 @@ row_build_w_add_vcol( const rec_t* rec, const rec_offs* offsets, const dict_table_t* col_table, - const dtuple_t* add_cols, + const dtuple_t* defaults, const dict_add_v_col_t* add_v, const ulint* col_map, row_ext_t** ext, mem_heap_t* heap) { return(row_build_low(type, index, rec, offsets, col_table, - add_cols, add_v, col_map, ext, heap)); + defaults, add_v, col_map, ext, heap)); } -/*******************************************************************//** -Converts an index record to a typed data tuple. +/** Convert an index record to a data tuple. +@tparam def whether the index->instant_field_value() needs to be accessed +@param[in] rec index record +@param[in] index index +@param[in] offsets rec_get_offsets(rec, index) +@param[out] n_ext number of externally stored columns +@param[in,out] heap memory heap for allocations @return index entry built; does not set info_bits, and the data fields in the entry will point directly to rec */ +template<bool def> +static inline dtuple_t* -row_rec_to_index_entry_low( -/*=======================*/ - const rec_t* rec, /*!< in: record in the index */ - const dict_index_t* index, /*!< in: index */ - const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */ - mem_heap_t* heap) /*!< in: memory heap from which - the memory needed is allocated */ +row_rec_to_index_entry_impl( + const rec_t* rec, + const dict_index_t* index, + const rec_offs* offsets, + mem_heap_t* heap) { dtuple_t* entry; dfield_t* dfield; @@ -666,6 +700,7 @@ row_rec_to_index_entry_low( ut_ad(rec != NULL); ut_ad(heap != NULL); ut_ad(index != NULL); + ut_ad(def || !rec_offs_any_default(offsets)); /* Because this function may be invoked by row0merge.cc on a record whose header is in different format, the check @@ -688,7 +723,9 @@ row_rec_to_index_entry_low( for (i = 0; i < rec_len; i++) { dfield = dtuple_get_nth_field(entry, i); - field = rec_get_nth_field(rec, offsets, i, &len); + field = def + ? rec_get_nth_cfield(rec, index, offsets, i, &len) + : rec_get_nth_field(rec, offsets, i, &len); dfield_set_data(dfield, field, len); @@ -698,10 +735,24 @@ row_rec_to_index_entry_low( } ut_ad(dtuple_check_typed(entry)); - return(entry); } +/** Convert an index record to a data tuple. +@param[in] rec index record +@param[in] index index +@param[in] offsets rec_get_offsets(rec, index) +@param[in,out] heap memory heap for allocations */ +dtuple_t* +row_rec_to_index_entry_low( + const rec_t* rec, + const dict_index_t* index, + const rec_offs* offsets, + mem_heap_t* heap) +{ + return row_rec_to_index_entry_impl<false>(rec, index, offsets, heap); +} + /*******************************************************************//** Converts an index record to a typed data tuple. NOTE that externally stored (often big) fields are NOT copied to heap. 
@@ -730,10 +781,12 @@ row_rec_to_index_entry( copy_rec = rec_copy(buf, rec, offsets); - rec_offs_make_valid(copy_rec, index, const_cast<rec_offs*>(offsets)); - entry = row_rec_to_index_entry_low( + rec_offs_make_valid(copy_rec, index, true, + const_cast<rec_offs*>(offsets)); + entry = row_rec_to_index_entry_impl<true>( copy_rec, index, offsets, heap); - rec_offs_make_valid(rec, index, const_cast<rec_offs*>(offsets)); + rec_offs_make_valid(rec, index, true, + const_cast<rec_offs*>(offsets)); dtuple_set_info_bits(entry, rec_get_info_bits(rec, rec_offs_comp(offsets))); @@ -784,7 +837,7 @@ row_build_row_ref( ut_ad(heap != NULL); ut_ad(!dict_index_is_clust(index)); - offsets = rec_get_offsets(rec, index, offsets, true, + offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields, ULINT_UNDEFINED, &tmp_heap); /* Secondary indexes must not contain externally stored columns. */ ut_ad(!rec_offs_any_extern(offsets)); @@ -796,8 +849,7 @@ row_build_row_ref( mem_heap_alloc(heap, rec_offs_size(offsets))); rec = rec_copy(buf, rec, offsets); - /* Avoid a debug assertion in rec_offs_validate(). */ - rec_offs_make_valid(rec, index, offsets); + rec_offs_make_valid(rec, index, true, offsets); } table = index->table; @@ -817,6 +869,7 @@ row_build_row_ref( ut_a(pos != ULINT_UNDEFINED); + ut_ad(!rec_offs_nth_default(offsets, pos)); field = rec_get_nth_field(rec, offsets, pos, &len); dfield_set_data(dfield, field, len); @@ -871,9 +924,8 @@ row_build_row_ref_in_tuple( held as long as the row reference is used! */ const dict_index_t* index, /*!< in: secondary index */ - rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) + rec_offs* offsets)/*!< in: rec_get_offsets(rec, index) or NULL */ - trx_t* trx) /*!< in: transaction */ { const dict_index_t* clust_index; dfield_t* dfield; @@ -894,7 +946,8 @@ row_build_row_ref_in_tuple( ut_ad(clust_index); if (!offsets) { - offsets = rec_get_offsets(rec, index, offsets_, true, + offsets = rec_get_offsets(rec, index, offsets_, + index->n_core_fields, ULINT_UNDEFINED, &heap); } else { ut_ad(rec_offs_validate(rec, index, offsets)); @@ -915,6 +968,7 @@ row_build_row_ref_in_tuple( ut_a(pos != ULINT_UNDEFINED); + ut_ad(!rec_offs_nth_default(offsets, pos)); field = rec_get_nth_field(rec, offsets, pos, &len); dfield_set_data(dfield, field, len); @@ -971,11 +1025,29 @@ row_search_on_row_ref( index = dict_table_get_first_index(table); - ut_a(dtuple_get_n_fields(ref) == dict_index_get_n_unique(index)); - - if (btr_pcur_open(index, ref, PAGE_CUR_LE, mode, pcur, mtr) - != DB_SUCCESS) { - return FALSE; + if (UNIV_UNLIKELY(ref->info_bits != 0)) { + ut_ad(ref->info_bits == REC_INFO_METADATA); + ut_ad(ref->n_fields <= index->n_uniq); + if (btr_pcur_open_at_index_side( + true, index, mode, pcur, true, 0, mtr) + != DB_SUCCESS + || !btr_pcur_move_to_next_user_rec(pcur, mtr)) { + return FALSE; + } + /* We do not necessarily have index->is_instant() here, + because we could be executing a rollback of an + instant ADD COLUMN operation. The function + rec_is_metadata() asserts index->is_instant(); + we do not want to call it here. 
*/ + return rec_get_info_bits(btr_pcur_get_rec(pcur), + dict_table_is_comp(index->table)) + & REC_INFO_MIN_REC_FLAG; + } else { + ut_a(ref->n_fields == index->n_uniq); + if (btr_pcur_open(index, ref, PAGE_CUR_LE, mode, pcur, mtr) + != DB_SUCCESS) { + return FALSE; + } } low_match = btr_pcur_get_low_match(pcur); @@ -1131,7 +1203,7 @@ row_raw_format_int( value = mach_read_int_type( (const byte*) data, data_len, unsigned_type); - ret = snprintf( + ret = (ulint) snprintf( buf, buf_size, unsigned_type ? "%llu" : "%lld", (longlong) value)+1; } else { @@ -1221,6 +1293,8 @@ row_raw_format( ulint ret; ibool format_in_hex; + ut_ad(data_len != UNIV_SQL_DEFAULT); + if (buf_size == 0) { return(0); diff --git a/storage/innobase/row/row0sel.cc b/storage/innobase/row/row0sel.cc index 864f7bd54ab..248a6592913 100644 --- a/storage/innobase/row/row0sel.cc +++ b/storage/innobase/row/row0sel.cc @@ -51,7 +51,6 @@ Created 12/19/1997 Heikki Tuuri #include "pars0sym.h" #include "pars0pars.h" #include "row0mysql.h" -#include "read0read.h" #include "buf0lru.h" #include "srv0srv.h" #include "srv0mon.h" @@ -106,10 +105,10 @@ row_sel_sec_rec_is_for_blob( ulint len; byte buf[REC_VERSION_56_MAX_INDEX_COL_LEN]; - /* This function should never be invoked on an Antelope format - table, because they should always contain enough prefix in the - clustered index record. */ - ut_ad(dict_table_get_format(table) >= UNIV_FORMAT_B); + /* This function should never be invoked on tables in + ROW_FORMAT=REDUNDANT or ROW_FORMAT=COMPACT, because they + should always contain enough prefix in the clustered index record. */ + ut_ad(dict_table_has_atomic_blobs(table)); ut_a(clust_len >= BTR_EXTERN_FIELD_REF_SIZE); ut_ad(prefix_len >= sec_len); ut_ad(prefix_len > 0); @@ -125,7 +124,7 @@ row_sel_sec_rec_is_for_blob( } len = btr_copy_externally_stored_field_prefix( - buf, prefix_len, dict_tf_get_page_size(table->flags), + buf, prefix_len, page_size_t(table->space->flags), clust_field, clust_len); if (len == 0) { @@ -201,9 +200,11 @@ row_sel_sec_rec_is_for_clust_rec( ib_vcol_row vc(heap); clust_offs = rec_get_offsets(clust_rec, clust_index, clust_offs, - true, ULINT_UNDEFINED, &heap); + clust_index->n_core_fields, + ULINT_UNDEFINED, &heap); sec_offs = rec_get_offsets(sec_rec, sec_index, sec_offs, - true, ULINT_UNDEFINED, &heap); + sec_index->n_fields, + ULINT_UNDEFINED, &heap); n = dict_index_get_n_ordering_defined_by_user(sec_index); @@ -218,7 +219,7 @@ row_sel_sec_rec_is_for_clust_rec( ifield = dict_index_get_nth_field(sec_index, i); col = dict_field_get_col(ifield); - is_virtual = dict_col_is_virtual(col); + is_virtual = col->is_virtual(); /* For virtual column, its value will need to be reconstructed from base column in cluster index */ @@ -254,9 +255,9 @@ row_sel_sec_rec_is_for_clust_rec( clust_field = static_cast<byte*>(vfield->data); } else { clust_pos = dict_col_get_clust_pos(col, clust_index); - - clust_field = rec_get_nth_field( - clust_rec, clust_offs, clust_pos, &clust_len); + clust_field = rec_get_nth_cfield( + clust_rec, clust_index, clust_offs, + clust_pos, &clust_len); } sec_field = rec_get_nth_field(sec_rec, sec_offs, i, &sec_len); @@ -305,8 +306,8 @@ row_sel_sec_rec_is_for_clust_rec( if (rec_offs_nth_extern(clust_offs, clust_pos)) { dptr = btr_copy_externally_stored_field( &clust_len, dptr, - dict_tf_get_page_size( - sec_index->table->flags), + page_size_t(clust_index->table->space + ->flags), len, heap); } @@ -510,8 +511,8 @@ row_sel_fetch_columns( if (field_no != ULINT_UNDEFINED) { - if 
(UNIV_UNLIKELY(rec_offs_nth_extern(offsets, - field_no))) { + if (UNIV_UNLIKELY(rec_offs_nth_extern( + offsets, field_no) != 0)) { /* Copy an externally stored field to the temporary heap, if possible. */ @@ -538,9 +539,8 @@ row_sel_fetch_columns( needs_copy = TRUE; } else { - data = rec_get_nth_field(rec, offsets, - field_no, &len); - + data = rec_get_nth_cfield(rec, index, offsets, + field_no, &len); needs_copy = column->copy_val; } @@ -805,7 +805,7 @@ row_sel_build_committed_vers_for_mysql( rec_offs_size(*offsets)); } - row_vers_build_for_semi_consistent_read( + row_vers_build_for_semi_consistent_read(prebuilt->trx, rec, mtr, clust_index, offsets, offset_heap, prebuilt->old_vers_heap, old_vers, vrow); } @@ -908,7 +908,9 @@ row_sel_get_clust_rec( offsets = rec_get_offsets(rec, btr_pcur_get_btr_cur(&plan->pcur)->index, - offsets, true, ULINT_UNDEFINED, &heap); + offsets, + btr_pcur_get_btr_cur(&plan->pcur)->index + ->n_core_fields, ULINT_UNDEFINED, &heap); row_build_row_ref_fast(plan->clust_ref, plan->clust_map, rec, offsets); @@ -943,7 +945,8 @@ row_sel_get_clust_rec( goto err_exit; } - offsets = rec_get_offsets(clust_rec, index, offsets, true, + offsets = rec_get_offsets(clust_rec, index, offsets, + index->n_core_fields, ULINT_UNDEFINED, &heap); if (!node->read_view) { @@ -974,11 +977,9 @@ row_sel_get_clust_rec( switch (err) { case DB_SUCCESS: case DB_SUCCESS_LOCKED_REC: -#ifdef HAVE_valgrind_or_MSAN /* Declare the variable uninitialized. It should be set to DB_SUCCESS at func_exit. */ MEM_UNDEFINED(&err, sizeof err); -#endif /* HAVE_valgrind_or_MSAN */ break; default: goto err_exit; @@ -1119,13 +1120,14 @@ re_scan: } mutex_exit(&match->rtr_match_mutex); + /* MDEV-14059 FIXME: why re-latch the block? + pcur is already positioned on it! */ ulint page_no = page_get_page_no( - btr_pcur_get_page(pcur)); - page_id_t page_id(dict_index_get_space(index), - page_no); + btr_pcur_get_page(pcur)); cur_block = buf_page_get_gen( - page_id, dict_table_page_size(index->table), + page_id_t(index->table->space_id, page_no), + page_size_t(index->table->space->flags), RW_X_LATCH, NULL, BUF_GET, __FILE__, __LINE__, mtr, &err); } else { @@ -1164,7 +1166,8 @@ re_scan: rec = btr_pcur_get_rec(pcur); my_offsets = offsets_; - my_offsets = rec_get_offsets(rec, index, my_offsets, true, + my_offsets = rec_get_offsets(rec, index, my_offsets, + index->n_fields, ULINT_UNDEFINED, &heap); /* No match record */ @@ -1187,7 +1190,7 @@ re_scan: rtr_rec_t* rtr_rec = &(*it); my_offsets = rec_get_offsets( - rtr_rec->r_rec, index, my_offsets, true, + rtr_rec->r_rec, index, my_offsets, index->n_fields, ULINT_UNDEFINED, &heap); err = lock_sec_rec_read_check_and_lock( @@ -1280,11 +1283,8 @@ static void row_sel_open_pcur( /*==============*/ - plan_t* plan, /*!< in: table plan */ -#ifdef BTR_CUR_HASH_ADAPT - ulint has_search_latch, -#endif - mtr_t* mtr) /*!< in: mtr */ + plan_t* plan, /*!< in: table plan */ + mtr_t* mtr) /*!< in/out: mini-transaction */ { dict_index_t* index; func_node_t* cond; @@ -1327,7 +1327,7 @@ row_sel_open_pcur( btr_pcur_open_with_no_init(index, plan->tuple, plan->mode, BTR_SEARCH_LEAF, &plan->pcur, - has_search_latch, mtr); + NULL, mtr); } else { /* Open the cursor to the start or the end of the index (FALSE: no init) */ @@ -1343,11 +1343,6 @@ row_sel_open_pcur( plan->pcur_is_open = TRUE; } -#ifndef BTR_CUR_HASH_ADAPT -# define row_sel_open_pcur(plan, has_search_latch, mtr) \ - row_sel_open_pcur(plan, mtr) -#endif /* !BTR_CUR_HASH_ADAPT */ - 
/*********************************************************************//** Restores a stored pcur position to a table index. @return TRUE if the cursor should be moved to the next record after we @@ -1469,33 +1464,20 @@ row_sel_try_search_shortcut( sel_node_t* node, /*!< in: select node for a consistent read */ plan_t* plan, /*!< in: plan for a unique search in clustered index */ - ibool search_latch_locked, - /*!< in: whether the search holds latch on - search system. */ mtr_t* mtr) /*!< in: mtr */ { - dict_index_t* index; - rec_t* rec; - mem_heap_t* heap = NULL; - rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; - rec_offs* offsets = offsets_; - ulint ret; - rec_offs_init(offsets_); - - index = plan->index; + dict_index_t* index = plan->index; ut_ad(node->read_view); ut_ad(plan->unique_search); ut_ad(!plan->must_get_clust); - ut_ad(!search_latch_locked - || rw_lock_own(btr_get_search_latch(index), RW_LOCK_S)); - row_sel_open_pcur(plan, search_latch_locked, mtr); - - rec = btr_pcur_get_rec(&(plan->pcur)); + row_sel_open_pcur(plan, mtr); - if (!page_rec_is_user_rec(rec)) { + const rec_t* rec = btr_pcur_get_rec(&(plan->pcur)); + if (!page_rec_is_user_rec(rec) || rec_is_metadata(rec, index)) { +retry: return(SEL_RETRY); } @@ -1506,36 +1488,33 @@ row_sel_try_search_shortcut( fields in the user record matched to the search tuple */ if (btr_pcur_get_up_match(&(plan->pcur)) < plan->n_exact_match) { - +exhausted: return(SEL_EXHAUSTED); } /* This is a non-locking consistent read: if necessary, fetch a previous version of the record */ - offsets = rec_get_offsets(rec, index, offsets, true, + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields, ULINT_UNDEFINED, &heap); if (dict_index_is_clust(index)) { if (!lock_clust_rec_cons_read_sees(rec, index, offsets, node->read_view)) { - ret = SEL_RETRY; - goto func_exit; + goto retry; } } else if (!srv_read_only_mode && !lock_sec_rec_cons_read_sees( rec, index, node->read_view)) { - - ret = SEL_RETRY; - goto func_exit; + goto retry; } - /* Test the deleted flag. */ - if (rec_get_deleted_flag(rec, dict_table_is_comp(plan->table))) { - - ret = SEL_EXHAUSTED; - goto func_exit; + goto exhausted; } /* Fetch the columns needed in test conditions. 
The index @@ -1549,20 +1528,17 @@ row_sel_try_search_shortcut( /* Test the rest of search conditions */ if (!row_sel_test_other_conds(plan)) { - - ret = SEL_EXHAUSTED; - goto func_exit; + goto exhausted; } ut_ad(plan->pcur.latch_mode == BTR_SEARCH_LEAF); plan->n_rows_fetched++; - ret = SEL_FOUND; -func_exit: + if (UNIV_LIKELY_NULL(heap)) { mem_heap_free(heap); } - return(ret); + return(SEL_FOUND); } #endif /* BTR_CUR_HASH_ADAPT */ @@ -1633,10 +1609,6 @@ table_loop: plan = sel_node_get_nth_plan(node, node->fetch_table); index = plan->index; -#ifdef BTR_CUR_HASH_ADAPT - ulint has_search_latch = 0; - rw_lock_t* const latch = btr_get_search_latch(index); -#endif /* BTR_CUR_HASH_ADAPT */ if (plan->n_rows_prefetched > 0) { sel_dequeue_prefetched_row(plan); @@ -1661,29 +1633,14 @@ table_loop: #ifdef BTR_CUR_HASH_ADAPT if (consistent_read && plan->unique_search && !plan->pcur_is_open && !plan->must_get_clust) { - if (!has_search_latch) { - has_search_latch = RW_S_LATCH; - rw_lock_s_lock(latch); - } else if (rw_lock_get_writer(latch) == RW_LOCK_X_WAIT) { - /* There is an x-latch request waiting: release the - s-latch for a moment; as an s-latch here is often - kept for some 10 searches before being released, - a waiting x-latch request would block other threads - from acquiring an s-latch for a long time, lowering - performance significantly in multiprocessors. */ - rw_lock_s_unlock(latch); - rw_lock_s_lock(latch); - } - - switch (row_sel_try_search_shortcut(node, plan, - has_search_latch, - &mtr)) { + switch (row_sel_try_search_shortcut(node, plan, &mtr)) { case SEL_FOUND: goto next_table; case SEL_EXHAUSTED: goto table_exhausted; default: ut_ad(0); + /* fall through */ case SEL_RETRY: break; } @@ -1693,18 +1650,12 @@ table_loop: mtr.commit(); mtr.start(); } - - if (has_search_latch) { - has_search_latch = 0; - rw_lock_s_unlock(latch); - } #endif /* BTR_CUR_HASH_ADAPT */ if (!plan->pcur_is_open) { /* Evaluate the expressions to build the search tuple and open the cursor */ - - row_sel_open_pcur(plan, has_search_latch, &mtr); + row_sel_open_pcur(plan, &mtr); cursor_just_opened = TRUE; @@ -1764,7 +1715,7 @@ rec_loop: trx = thr_get_trx(thr); offsets = rec_get_offsets(next_rec, index, offsets, - true, + index->n_core_fields, ULINT_UNDEFINED, &heap); /* If innodb_locks_unsafe_for_binlog option is used @@ -1818,12 +1769,19 @@ skip_lock: goto next_rec; } + if (rec_is_metadata(rec, index)) { + /* Skip the metadata pseudo-record. */ + cost_counter++; + goto next_rec; + } + if (!consistent_read) { /* Try to place a lock on the index record */ ulint lock_type; trx_t* trx; - offsets = rec_get_offsets(rec, index, offsets, true, + offsets = rec_get_offsets(rec, index, offsets, + index->n_core_fields, ULINT_UNDEFINED, &heap); trx = thr_get_trx(thr); @@ -1910,7 +1868,7 @@ skip_lock: /* PHASE 3: Get previous version in a consistent read */ cons_read_requires_clust_rec = FALSE; - offsets = rec_get_offsets(rec, index, offsets, true, + offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields, ULINT_UNDEFINED, &heap); if (consistent_read) { @@ -1941,7 +1899,8 @@ skip_lock: exhausted. 
*/ offsets = rec_get_offsets( - rec, index, offsets, true, + rec, index, offsets, + index->n_core_fields, ULINT_UNDEFINED, &heap); /* Fetch the columns needed in @@ -2101,10 +2060,6 @@ skip_lock: } next_rec: -#ifdef BTR_CUR_HASH_ADAPT - ut_ad(!has_search_latch); -#endif /* BTR_CUR_HASH_ADAPT */ - if (mtr_has_extra_clust_latch) { /* We must commit &mtr if we are moving to the next @@ -2142,9 +2097,6 @@ next_table: plan->cursor_at_end = TRUE; } else { -#ifdef BTR_CUR_HASH_ADAPT - ut_ad(!has_search_latch); -#endif /* BTR_CUR_HASH_ADAPT */ plan->stored_cursor_rec_processed = TRUE; btr_pcur_store_position(&(plan->pcur), &mtr); @@ -2235,9 +2187,6 @@ stop_for_a_while: inserted new records which should have appeared in the result set, which would result in the phantom problem. */ -#ifdef BTR_CUR_HASH_ADAPT - ut_ad(!has_search_latch); -#endif /* BTR_CUR_HASH_ADAPT */ plan->stored_cursor_rec_processed = FALSE; btr_pcur_store_position(&(plan->pcur), &mtr); @@ -2254,9 +2203,6 @@ commit_mtr_for_a_while: plan->stored_cursor_rec_processed = TRUE; -#ifdef BTR_CUR_HASH_ADAPT - ut_ad(!has_search_latch); -#endif /* BTR_CUR_HASH_ADAPT */ btr_pcur_store_position(&(plan->pcur), &mtr); mtr.commit(); @@ -2270,9 +2216,6 @@ lock_wait_or_error: /* See the note at stop_for_a_while: the same holds for this case */ ut_ad(!btr_pcur_is_before_first_on_page(&plan->pcur) || !node->asc); -#ifdef BTR_CUR_HASH_ADAPT - ut_ad(!has_search_latch); -#endif /* BTR_CUR_HASH_ADAPT */ plan->stored_cursor_rec_processed = FALSE; btr_pcur_store_position(&(plan->pcur), &mtr); @@ -2280,11 +2223,6 @@ lock_wait_or_error: mtr.commit(); func_exit: -#ifdef BTR_CUR_HASH_ADAPT - if (has_search_latch) { - rw_lock_s_unlock(latch); - } -#endif /* BTR_CUR_HASH_ADAPT */ ut_ad(!sync_check_iterate(dict_sync_check())); if (heap != NULL) { @@ -2329,15 +2267,11 @@ row_sel_step( plan_reset_cursor(sel_node_get_nth_plan(node, 0)); if (node->consistent_read) { + trx_t *trx = thr_get_trx(thr); /* Assign a read view for the query */ - trx_assign_read_view(thr_get_trx(thr)); - - if (thr_get_trx(thr)->read_view != NULL) { - node->read_view = thr_get_trx(thr)->read_view; - } else { - node->read_view = NULL; - } - + trx->read_view.open(trx); + node->read_view = trx->read_view.is_open() ? + &trx->read_view : NULL; } else { sym_node_t* table_node; lock_mode i_lock_mode; @@ -2552,8 +2486,7 @@ row_sel_convert_mysql_key_to_innobase( ulint buf_len, /*!< in: buffer length */ dict_index_t* index, /*!< in: index of the key value */ const byte* key_ptr, /*!< in: MySQL key value */ - ulint key_len, /*!< in: MySQL key value length */ - trx_t* trx) /*!< in: transaction */ + ulint key_len) /*!< in: MySQL key value length */ { byte* original_buf = buf; const byte* original_key_ptr = key_ptr; @@ -2640,8 +2573,8 @@ row_sel_convert_mysql_key_to_innobase( even though the actual value only takes data len bytes from the start. 
*/ - data_len = key_ptr[data_offset] - + 256 * key_ptr[data_offset + 1]; + data_len = ulint(key_ptr[data_offset]) + | ulint(key_ptr[data_offset + 1]) << 8; data_field_len = data_offset + 2 + field->prefix_len; @@ -2805,9 +2738,7 @@ row_sel_field_store_in_mysql_format_func( ut_ad(len != UNIV_SQL_NULL); MEM_CHECK_DEFINED(data, len); MEM_CHECK_ADDRESSABLE(dest, templ->mysql_col_len); -#ifdef HAVE_valgrind_or_MSAN MEM_UNDEFINED(dest, templ->mysql_col_len); -#endif /* HAVE_valgrind_or_MSAN */ switch (templ->type) { const byte* field_end; @@ -2882,7 +2813,8 @@ row_sel_field_store_in_mysql_format_func( } } - row_mysql_pad_col(templ->mbminlen, pad, field_end - pad); + row_mysql_pad_col(templ->mbminlen, pad, + ulint(field_end - pad)); break; case DATA_BLOB: @@ -2960,15 +2892,6 @@ row_sel_field_store_in_mysql_format_func( } } -#ifdef UNIV_DEBUG -/** Convert a field from Innobase format to MySQL format. */ -# define row_sel_store_mysql_field(m,p,r,i,o,f,t) \ - row_sel_store_mysql_field_func(m,p,r,i,o,f,t) -#else /* UNIV_DEBUG */ -/** Convert a field from Innobase format to MySQL format. */ -# define row_sel_store_mysql_field(m,p,r,i,o,f,t) \ - row_sel_store_mysql_field_func(m,p,r,o,f,t) -#endif /* UNIV_DEBUG */ /** Convert a field in the Innobase format to a field in the MySQL format. @param[out] mysql_rec record in the MySQL format @param[in,out] prebuilt prebuilt struct @@ -2983,13 +2906,11 @@ row_sel_field_store_in_mysql_format_func( */ static MY_ATTRIBUTE((warn_unused_result)) ibool -row_sel_store_mysql_field_func( +row_sel_store_mysql_field( byte* mysql_rec, row_prebuilt_t* prebuilt, const rec_t* rec, -#ifdef UNIV_DEBUG const dict_index_t* index, -#endif const rec_offs* offsets, ulint field_no, const mysql_row_templ_t*templ) @@ -3008,7 +2929,7 @@ row_sel_store_mysql_field_func( || field_no == templ->icp_rec_field_no); ut_ad(rec_offs_validate(rec, index, offsets)); - if (UNIV_UNLIKELY(rec_offs_nth_extern(offsets, field_no))) { + if (UNIV_UNLIKELY(rec_offs_nth_extern(offsets, field_no) != 0)) { mem_heap_t* heap; /* Copy an externally stored field to a temporary heap */ @@ -3018,12 +2939,12 @@ row_sel_store_mysql_field_func( if (DATA_LARGE_MTYPE(templ->type)) { if (prebuilt->blob_heap == NULL) { prebuilt->blob_heap = mem_heap_create( - UNIV_PAGE_SIZE); + srv_page_size); } heap = prebuilt->blob_heap; } else { - heap = mem_heap_create(UNIV_PAGE_SIZE); + heap = mem_heap_create(srv_page_size); } /* NOTE: if we are retrieving a big BLOB, we may @@ -3060,9 +2981,9 @@ row_sel_store_mysql_field_func( mem_heap_free(heap); } } else { - /* Field is stored in the row. */ - - data = rec_get_nth_field(rec, offsets, field_no, &len); + /* The field is stored in the index record, or + in the metadata for instant ADD COLUMN. 
*/ + data = rec_get_nth_cfield(rec, index, offsets, field_no, &len); if (len == UNIV_SQL_NULL) { /* MySQL assumes that the field for an SQL @@ -3096,7 +3017,7 @@ row_sel_store_mysql_field_func( if (prebuilt->blob_heap == NULL) { prebuilt->blob_heap = mem_heap_create( - UNIV_PAGE_SIZE); + srv_page_size); DBUG_PRINT("anna", ("blob_heap allocated: %p", prebuilt->blob_heap)); } @@ -3304,7 +3225,8 @@ class Row_sel_get_clust_rec_for_mysql ut_ad(rec_offs_validate(cached_clust_rec, index, offsets)); ut_ad(index->first_user_field() <= rec_offs_n_fields(offsets)); - ut_ad(vers_offs == rec_get_offsets(cached_old_vers, index, vers_offs, true, + ut_ad(vers_offs == rec_get_offsets(cached_old_vers, index, vers_offs, + index->n_core_fields, index->db_trx_id(), &heap)); ut_ad(!heap); for (unsigned n= index->db_trx_id(); n--; ) @@ -3372,7 +3294,7 @@ Row_sel_get_clust_rec_for_mysql::operator()( thd_get_thread_id(trx->mysql_thd)); row_build_row_ref_in_tuple(prebuilt->clust_ref, rec, - sec_index, *offsets, trx); + sec_index, *offsets); clust_index = dict_table_get_first_index(sec_index->table); @@ -3491,7 +3413,8 @@ Row_sel_get_clust_rec_for_mysql::operator()( goto func_exit; } - *offsets = rec_get_offsets(clust_rec, clust_index, *offsets, true, + *offsets = rec_get_offsets(clust_rec, clust_index, *offsets, + clust_index->n_core_fields, ULINT_UNDEFINED, offset_heap); if (prebuilt->select_lock_type != LOCK_NONE) { @@ -3525,7 +3448,7 @@ Row_sel_get_clust_rec_for_mysql::operator()( if (trx->isolation_level > TRX_ISO_READ_UNCOMMITTED && !lock_clust_rec_cons_read_sees( clust_rec, clust_index, *offsets, - trx_get_read_view(trx))) { + &trx->read_view)) { const buf_page_t& bpage = btr_pcur_get_block( prebuilt->clust_pcur)->page; @@ -3541,7 +3464,7 @@ Row_sel_get_clust_rec_for_mysql::operator()( /* The following call returns 'offsets' associated with 'old_vers' */ err = row_sel_build_prev_vers_for_mysql( - trx->read_view, clust_index, prebuilt, + &trx->read_view, clust_index, prebuilt, clust_rec, offsets, offset_heap, &old_vers, vrow, mtr); @@ -3565,7 +3488,8 @@ Row_sel_get_clust_rec_for_mysql::operator()( ut_d(check_eq(clust_index, *offsets)); *offsets = rec_get_offsets( old_vers, clust_index, *offsets, - true, ULINT_UNDEFINED, offset_heap); + clust_index->n_core_fields, + ULINT_UNDEFINED, offset_heap); } } @@ -3631,10 +3555,10 @@ err_exit: Restores cursor position after it has been stored. We have to take into account that the record cursor was positioned on may have been deleted. Then we may have to move the cursor one step up or down. -@return TRUE if we may need to process the record the cursor is now +@return true if we may need to process the record the cursor is now positioned on (i.e. we should not go to the next record yet) */ static -ibool +bool sel_restore_position_for_mysql( /*===========================*/ ibool* same_user_rec, /*!< out: TRUE if we were able to restore @@ -3674,21 +3598,28 @@ sel_restore_position_for_mysql( case BTR_PCUR_ON: if (!success && moves_up) { next: - btr_pcur_move_to_next(pcur, mtr); - return(TRUE); + if (btr_pcur_move_to_next(pcur, mtr) + && rec_is_metadata(btr_pcur_get_rec(pcur), + pcur->btr_cur.index)) { + btr_pcur_move_to_next(pcur, mtr); + } + + return true; } return(!success); case BTR_PCUR_AFTER_LAST_IN_TREE: case BTR_PCUR_BEFORE_FIRST_IN_TREE: - return(TRUE); + return true; case BTR_PCUR_AFTER: /* positioned to record after pcur->old_rec. 
*/ pcur->pos_state = BTR_PCUR_IS_POSITIONED; prev: - if (btr_pcur_is_on_user_rec(pcur) && !moves_up) { + if (btr_pcur_is_on_user_rec(pcur) && !moves_up + && !rec_is_metadata(btr_pcur_get_rec(pcur), + pcur->btr_cur.index)) { btr_pcur_move_to_prev(pcur, mtr); } - return(TRUE); + return true; case BTR_PCUR_BEFORE: /* For non optimistic restoration: The position is now set to the record before pcur->old_rec. @@ -3710,19 +3641,19 @@ prev: HANDLER READ idx PREV; */ goto prev; } - return(TRUE); + return true; case BTR_PCUR_IS_POSITIONED: if (moves_up && btr_pcur_is_on_user_rec(pcur)) { goto next; } - return(TRUE); + return true; case BTR_PCUR_WAS_POSITIONED: case BTR_PCUR_NOT_POSITIONED: break; } } ut_ad(0); - return(TRUE); + return true; } /********************************************************************//** @@ -3750,9 +3681,7 @@ row_sel_copy_cached_field_for_mysql( row_mysql_read_true_varchar( &len, cache, templ->mysql_length_bytes); len += templ->mysql_length_bytes; -#ifdef HAVE_valgrind_or_MSAN MEM_UNDEFINED(buf, templ->mysql_col_len); -#endif /* HAVE_valgrind_or_MSAN */ } else { len = templ->mysql_col_len; } @@ -3821,9 +3750,7 @@ row_sel_dequeue_cached_row_for_mysql( /* The record is long. Copy it field by field, in case there are some long VARCHAR column of which only a small length is being used. */ -#ifdef HAVE_valgrind_or_MSAN MEM_UNDEFINED(buf, prebuilt->mysql_prefix_len); -#endif /* HAVE_valgrind_or_MSAN */ /* First copy the NULL bits. */ ut_memcpy(buf, cached_rec, prebuilt->null_bitmap_len); @@ -3907,10 +3834,8 @@ row_sel_fetch_last_buf( } ut_ad(prebuilt->fetch_cache_first == 0); -#ifdef HAVE_valgrind_or_MSAN MEM_UNDEFINED(prebuilt->fetch_cache[prebuilt->n_fetch_cached], prebuilt->mysql_row_len); -#endif /* HAVE_valgrind_or_MSAN */ return(prebuilt->fetch_cache[prebuilt->n_fetch_cached]); } @@ -3962,12 +3887,15 @@ row_sel_try_search_shortcut_for_mysql( ut_ad(dict_index_is_clust(index)); ut_ad(!prebuilt->templ_contains_blob); + rw_lock_t* ahi_latch = btr_get_search_latch(index); + rw_lock_s_lock(ahi_latch); btr_pcur_open_with_no_init(index, search_tuple, PAGE_CUR_GE, - BTR_SEARCH_LEAF, pcur, RW_S_LATCH, mtr); + BTR_SEARCH_LEAF, pcur, ahi_latch, mtr); rec = btr_pcur_get_rec(pcur); - if (!page_rec_is_user_rec(rec)) { - + if (!page_rec_is_user_rec(rec) || rec_is_metadata(rec, index)) { +retry: + rw_lock_s_unlock(ahi_latch); return(SEL_RETRY); } @@ -3976,32 +3904,32 @@ row_sel_try_search_shortcut_for_mysql( fields in the user record matched to the search tuple */ if (btr_pcur_get_up_match(pcur) < dtuple_get_n_fields(search_tuple)) { - +exhausted: + rw_lock_s_unlock(ahi_latch); return(SEL_EXHAUSTED); } /* This is a non-locking consistent read: if necessary, fetch a previous version of the record */ - *offsets = rec_get_offsets(rec, index, *offsets, true, + *offsets = rec_get_offsets(rec, index, *offsets, index->n_core_fields, ULINT_UNDEFINED, heap); - if (!lock_clust_rec_cons_read_sees( - rec, index, *offsets, trx_get_read_view(trx))) { - - return(SEL_RETRY); + if (!lock_clust_rec_cons_read_sees(rec, index, *offsets, + &trx->read_view)) { + goto retry; } if (rec_get_deleted_flag(rec, dict_table_is_comp(index->table))) { /* In delete-marked records, DB_TRX_ID must always refer to an existing undo log record. 
*/ ut_ad(row_get_rec_trx_id(rec, index, *offsets)); - - return(SEL_EXHAUSTED); + goto exhausted; } *out_rec = rec; + rw_lock_s_unlock(ahi_latch); return(SEL_FOUND); } #endif /* BTR_CUR_HASH_ADAPT */ @@ -4113,9 +4041,12 @@ row_sel_fill_vrow( rec_offs_init(offsets_); ut_ad(!(*vrow)); + ut_ad(heap); + ut_ad(!dict_index_is_clust(index)); + ut_ad(!index->is_instant()); ut_ad(page_rec_is_leaf(rec)); - offsets = rec_get_offsets(rec, index, offsets, true, + offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields, ULINT_UNDEFINED, &heap); *vrow = dtuple_create_with_vcol( @@ -4126,18 +4057,18 @@ row_sel_fill_vrow( for (ulint i = 0; i < dict_index_get_n_fields(index); i++) { const dict_field_t* field; - const dict_col_t* col; + const dict_col_t* col; field = dict_index_get_nth_field(index, i); col = dict_field_get_col(field); - if (dict_col_is_virtual(col)) { + if (col->is_virtual()) { const byte* data; - ulint len; + ulint len; data = rec_get_nth_field(rec, offsets, i, &len); - const dict_v_col_t* vcol = reinterpret_cast< + const dict_v_col_t* vcol = reinterpret_cast< const dict_v_col_t*>(col); dfield_t* dfield = dtuple_get_nth_v_field( @@ -4295,15 +4226,16 @@ row_search_mvcc( ulint direction) { DBUG_ENTER("row_search_mvcc"); + DBUG_ASSERT(prebuilt->index->table == prebuilt->table); dict_index_t* index = prebuilt->index; - ibool comp = dict_table_is_comp(index->table); + ibool comp = dict_table_is_comp(prebuilt->table); const dtuple_t* search_tuple = prebuilt->search_tuple; btr_pcur_t* pcur = prebuilt->pcur; trx_t* trx = prebuilt->trx; dict_index_t* clust_index; que_thr_t* thr; - const rec_t* rec; + const rec_t* UNINIT_VAR(rec); dtuple_t* vrow = NULL; const rec_t* result_rec = NULL; const rec_t* clust_rec; @@ -4345,10 +4277,10 @@ row_search_mvcc( ut_ad(!sync_check_iterate(sync_check())); - if (dict_table_is_discarded(prebuilt->table)) { + if (!prebuilt->table->space) { DBUG_RETURN(DB_TABLESPACE_DELETED); } else if (!prebuilt->table->is_readable()) { - DBUG_RETURN(fil_space_get(prebuilt->table->space) + DBUG_RETURN(prebuilt->table->space ? DB_DECRYPTION_FAILED : DB_TABLESPACE_NOT_FOUND); } else if (!prebuilt->index_usable) { @@ -4364,7 +4296,7 @@ row_search_mvcc( && prebuilt->read_just_key; /* Reset the new record lock info if srv_locks_unsafe_for_binlog - is set or session is using a READ COMMITED isolation level. Then + is set or session is using a READ COMMITTED isolation level. Then we are able to remove the record locks set here on an individual row. */ prebuilt->new_rec_locks = 0; @@ -4496,28 +4428,18 @@ row_search_mvcc( && dict_index_is_clust(index) && !prebuilt->templ_contains_blob && !prebuilt->used_in_HANDLER - && (prebuilt->mysql_row_len < UNIV_PAGE_SIZE / 8)) { + && (prebuilt->mysql_row_len < srv_page_size / 8)) { + mode = PAGE_CUR_GE; - if (trx->mysql_n_tables_locked == 0 - && prebuilt->select_lock_type == LOCK_NONE + if (prebuilt->select_lock_type == LOCK_NONE && trx->isolation_level > TRX_ISO_READ_UNCOMMITTED - && MVCC::is_view_active(trx->read_view)) { + && trx->read_view.is_open()) { /* This is a SELECT query done as a consistent read, and the read view has already been allocated: let us try a search shortcut through the hash - index. - NOTE that we must also test that - mysql_n_tables_locked == 0, because this might - also be INSERT INTO ... SELECT ... or - CREATE TABLE ... SELECT ... . Our algorithm is - NOT prepared to inserts interleaved with the SELECT, - and if we try that, we can deadlock on the adaptive - hash index semaphore! 
*/ - - rw_lock_t* const latch = btr_get_search_latch(index); - rw_lock_s_lock(latch); + index. */ switch (row_sel_try_search_shortcut_for_mysql( &rec, prebuilt, &offsets, &heap, @@ -4537,7 +4459,7 @@ row_search_mvcc( case ICP_ABORTED_BY_USER: mtr_commit(&mtr); err = DB_INTERRUPTED; - goto unlock_and_exit; + goto func_exit; case ICP_ERROR: case ICP_NO_MATCH: case ICP_OUT_OF_RANGE: @@ -4568,19 +4490,18 @@ row_search_mvcc( shortcut_match: mtr.commit(); - err = DB_SUCCESS; - unlock_and_exit: /* NOTE that we do NOT store the cursor position */ - rw_lock_s_unlock(latch); + err = DB_SUCCESS; goto func_exit; case SEL_EXHAUSTED: shortcut_mismatch: mtr.commit(); - + /* NOTE that we do NOT store the cursor + position */ err = DB_RECORD_NOT_FOUND; - goto unlock_and_exit; + goto func_exit; case SEL_RETRY: break; @@ -4591,8 +4512,6 @@ row_search_mvcc( mtr.commit(); mtr.start(); - - rw_lock_s_unlock(latch); } } #endif /* BTR_CUR_HASH_ADAPT */ @@ -4607,17 +4526,19 @@ row_search_mvcc( thread that is currently serving the transaction. Because we are that thread, we can read trx->state without holding any mutex. */ - ut_ad(prebuilt->sql_stat_start || trx->state == TRX_STATE_ACTIVE); + ut_ad(prebuilt->sql_stat_start + || trx->state == TRX_STATE_ACTIVE + || (prebuilt->table->no_rollback() + && trx->state == TRX_STATE_NOT_STARTED)); ut_ad(!trx_is_started(trx) || trx->state == TRX_STATE_ACTIVE); ut_ad(prebuilt->sql_stat_start || prebuilt->select_lock_type != LOCK_NONE - || MVCC::is_view_active(trx->read_view) + || trx->read_view.is_open() + || prebuilt->table->no_rollback() || srv_read_only_mode); - trx_start_if_not_started(trx, false); - if (trx->isolation_level <= TRX_ISO_READ_COMMITTED && prebuilt->select_lock_type != LOCK_NONE && trx->mysql_thd != NULL @@ -4645,45 +4566,36 @@ row_search_mvcc( que_thr_move_to_run_state_for_mysql(thr, trx); - clust_index = dict_table_get_first_index(index->table); + clust_index = dict_table_get_first_index(prebuilt->table); /* Do some start-of-statement preparations */ - if (!prebuilt->sql_stat_start) { - /* No need to set an intention lock or assign a read view */ - - if (!MVCC::is_view_active(trx->read_view) - && !srv_read_only_mode - && prebuilt->select_lock_type == LOCK_NONE) { - - ib::error() << "MySQL is trying to perform a" - " consistent read but the read view is not" - " assigned!"; - trx_print(stderr, trx, 600); - fputc('\n', stderr); - ut_error; - } - } else if (prebuilt->select_lock_type == LOCK_NONE) { - /* This is a consistent read */ - /* Assign a read view for the query */ - - if (!srv_read_only_mode) { - trx_assign_read_view(trx); - } - + if (prebuilt->table->no_rollback()) { + /* NO_ROLLBACK tables do not support MVCC or locking. */ + prebuilt->select_lock_type = LOCK_NONE; prebuilt->sql_stat_start = FALSE; + } else if (!prebuilt->sql_stat_start) { + /* No need to set an intention lock or assign a read view */ + ut_a(prebuilt->select_lock_type != LOCK_NONE + || srv_read_only_mode || trx->read_view.is_open()); } else { + prebuilt->sql_stat_start = FALSE; + trx_start_if_not_started(trx, false); + + if (prebuilt->select_lock_type == LOCK_NONE) { + trx->read_view.open(trx); + } else { wait_table_again: - err = lock_table(0, index->table, - prebuilt->select_lock_type == LOCK_S - ? LOCK_IS : LOCK_IX, thr); + err = lock_table(0, prebuilt->table, + prebuilt->select_lock_type == LOCK_S + ? 
LOCK_IS : LOCK_IX, thr); - if (err != DB_SUCCESS) { + if (err != DB_SUCCESS) { - table_lock_waited = TRUE; - goto lock_table_wait; + table_lock_waited = TRUE; + goto lock_table_wait; + } } - prebuilt->sql_stat_start = FALSE; } /* Open or restore index cursor position */ @@ -4695,7 +4607,7 @@ wait_table_again: goto next_rec; } - ibool need_to_process = sel_restore_position_for_mysql( + bool need_to_process = sel_restore_position_for_mysql( &same_user_rec, BTR_SEARCH_LEAF, pcur, moves_up, &mtr); @@ -4777,7 +4689,7 @@ wait_table_again: const rec_t* next_rec = page_rec_get_next_const(rec); offsets = rec_get_offsets(next_rec, index, offsets, - true, + index->n_core_fields, ULINT_UNDEFINED, &heap); err = sel_set_rec_lock(pcur, next_rec, index, offsets, @@ -4861,7 +4773,8 @@ rec_loop: level we do not lock gaps. Supremum record is really a gap and therefore we do not set locks there. */ - offsets = rec_get_offsets(rec, index, offsets, true, + offsets = rec_get_offsets(rec, index, offsets, + index->n_core_fields, ULINT_UNDEFINED, &heap); err = sel_set_rec_lock(pcur, rec, index, offsets, @@ -4890,12 +4803,24 @@ rec_loop: corruption */ if (comp) { + if (rec_get_info_bits(rec, true) & REC_INFO_MIN_REC_FLAG) { + /* Skip the metadata pseudo-record. */ + ut_ad(index->is_instant()); + goto next_rec; + } + next_offs = rec_get_next_offs(rec, TRUE); if (UNIV_UNLIKELY(next_offs < PAGE_NEW_SUPREMUM)) { goto wrong_offs; } } else { + if (rec_get_info_bits(rec, false) & REC_INFO_MIN_REC_FLAG) { + /* Skip the metadata pseudo-record. */ + ut_ad(index->is_instant()); + goto next_rec; + } + next_offs = rec_get_next_offs(rec, FALSE); if (UNIV_UNLIKELY(next_offs < PAGE_OLD_SUPREMUM)) { @@ -4903,7 +4828,7 @@ rec_loop: } } - if (UNIV_UNLIKELY(next_offs >= UNIV_PAGE_SIZE - PAGE_DIR)) { + if (UNIV_UNLIKELY(next_offs >= srv_page_size - PAGE_DIR)) { wrong_offs: if (srv_force_recovery == 0 || moves_up == FALSE) { @@ -4952,7 +4877,7 @@ wrong_offs: ut_ad(fil_page_index_page_check(btr_pcur_get_page(pcur))); ut_ad(btr_page_get_index_id(btr_pcur_get_page(pcur)) == index->id); - offsets = rec_get_offsets(rec, index, offsets, true, + offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields, ULINT_UNDEFINED, &heap); if (UNIV_UNLIKELY(srv_force_recovery > 0)) { @@ -5081,7 +5006,7 @@ wrong_offs: existence with LOCK_REC_NOT_GAP. */ /* If innodb_locks_unsafe_for_binlog option is used - or this session is using a READ COMMITED isolation + or this session is using a READ COMMITTED isolation level we lock only the record, i.e., next-key locking is not used. */ @@ -5098,7 +5023,7 @@ wrong_offs: /* At most one transaction can be active for temporary table. */ - if (dict_table_is_temporary(clust_index->table)) { + if (clust_index->table->is_temporary()) { goto no_gap_lock; } @@ -5108,17 +5033,17 @@ wrong_offs: /* In delete-marked records, DB_TRX_ID must always refer to an existing undo log record. */ ut_ad(trx_id); - if (!trx_rw_is_active(trx_id, NULL, false)) { + if (!trx_sys.is_registered(trx, trx_id)) { /* The clustered index record was delete-marked in a committed transaction. Ignore the record. */ goto locks_ok_del_marked; } - } else if (trx_t* trx = row_vers_impl_x_locked( - rec, index, offsets)) { + } else if (trx_t* t = row_vers_impl_x_locked( + trx, rec, index, offsets)) { /* The record belongs to an active transaction. We must acquire a lock. 
*/ - trx->release_reference(); + t->release_reference(); } else { /* The secondary index record does not point to a delete-marked clustered index @@ -5215,7 +5140,8 @@ no_gap_lock: Do a normal locking read. */ offsets = rec_get_offsets( - rec, index, offsets, true, + rec, index, offsets, + index->n_core_fields, ULINT_UNDEFINED, &heap); goto locks_ok; case DB_DEADLOCK: @@ -5252,7 +5178,8 @@ no_gap_lock: /* This is a non-locking consistent read: if necessary, fetch a previous version of the record */ - if (trx->isolation_level == TRX_ISO_READ_UNCOMMITTED) { + if (trx->isolation_level == TRX_ISO_READ_UNCOMMITTED + || prebuilt->table->no_rollback()) { /* Do nothing: we let a non-locking SELECT read the latest version of the record */ @@ -5264,16 +5191,15 @@ no_gap_lock: high force recovery level set, we try to avoid crashes by skipping this lookup */ - if (srv_force_recovery < 5 - && !lock_clust_rec_cons_read_sees( - rec, index, offsets, - trx_get_read_view(trx))) { - + if (!lock_clust_rec_cons_read_sees( + rec, index, offsets, &trx->read_view)) { + ut_ad(srv_force_recovery + < SRV_FORCE_NO_UNDO_LOG_SCAN); rec_t* old_vers; /* The following call returns 'offsets' associated with 'old_vers' */ err = row_sel_build_prev_vers_for_mysql( - trx->read_view, clust_index, + &trx->read_view, clust_index, prebuilt, rec, &offsets, &heap, &old_vers, need_vrow ? &vrow : NULL, &mtr); @@ -5303,7 +5229,7 @@ no_gap_lock: if (!srv_read_only_mode && !lock_sec_rec_cons_read_sees( - rec, index, trx->read_view)) { + rec, index, &trx->read_view)) { /* We should look at the clustered index. However, as this is a non-locking read, we can skip the clustered index lookup if @@ -5589,7 +5515,7 @@ use_covering_index: /* We used 'offsets' for the clust rec, recalculate them for 'rec' */ offsets = rec_get_offsets(rec, index, offsets, - true, + index->n_core_fields, ULINT_UNDEFINED, &heap); result_rec = rec; @@ -5681,25 +5607,25 @@ next_rec: For R-tree spatial search, we also commit the mini-transaction each time */ - if (mtr_has_extra_clust_latch || spatial_search) { + if (spatial_search) { + /* No need to do store restore for R-tree */ + mtr.commit(); + mtr.start(); + mtr_has_extra_clust_latch = FALSE; + } else if (mtr_has_extra_clust_latch) { /* If we have extra cluster latch, we must commit mtr if we are moving to the next non-clustered index record, because we could break the latching order if we would access a different clustered index page right away without releasing the previous. */ - /* No need to do store restore for R-tree */ - if (!spatial_search) { - btr_pcur_store_position(pcur, &mtr); - } - + btr_pcur_store_position(pcur, &mtr); mtr.commit(); mtr_has_extra_clust_latch = FALSE; mtr.start(); - if (!spatial_search - && sel_restore_position_for_mysql(&same_user_rec, + if (sel_restore_position_for_mysql(&same_user_rec, BTR_SEARCH_LEAF, pcur, moves_up, &mtr)) { goto rec_loop; @@ -5720,8 +5646,7 @@ next_rec: ut_ad(pcur->latch_mode != BTR_NO_LATCHES); pcur->old_stored = false; if (btr_pcur_is_after_last_on_page(pcur)) { - if (btr_pcur_is_after_last_in_tree(pcur, - &mtr)) { + if (btr_pcur_is_after_last_in_tree(pcur)) { goto not_moved; } btr_pcur_move_to_next_page(pcur, &mtr); @@ -5830,7 +5755,14 @@ lock_table_wait: normal_return: /*-------------------------------------------------------------*/ - que_thr_stop_for_mysql_no_error(thr, trx); + { + /* handler_index_cond_check() may pull TR_table search + which initates another row_search_mvcc(). 
*/ + ulint n_active_thrs= trx->lock.n_active_thrs; + trx->lock.n_active_thrs= 1; + que_thr_stop_for_mysql_no_error(thr, trx); + trx->lock.n_active_thrs= n_active_thrs - 1; + } mtr.commit(); @@ -5967,7 +5899,8 @@ row_count_rtree_recs( prebuilt->search_tuple = entry; - ulint bufsize = ut_max(UNIV_PAGE_SIZE, prebuilt->mysql_row_len); + ulint bufsize = std::max<ulint>(srv_page_size, + prebuilt->mysql_row_len); buf = static_cast<byte*>(ut_malloc_nokey(bufsize)); ulint cnt = 1000; @@ -6035,7 +5968,7 @@ row_search_autoinc_read_column( rec_offs_init(offsets_); ut_ad(page_rec_is_leaf(rec)); - offsets = rec_get_offsets(rec, index, offsets, true, + offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields, col_no + 1, &heap); if (rec_offs_nth_sql_null(offsets, col_no)) { @@ -6089,6 +6022,9 @@ row_search_get_max_rec( btr_pcur_close(&pcur); + ut_ad(!rec + || !(rec_get_info_bits(rec, dict_table_is_comp(index->table)) + & (REC_INFO_MIN_REC_FLAG | REC_INFO_DELETED_FLAG))); return(rec); } diff --git a/storage/innobase/row/row0trunc.cc b/storage/innobase/row/row0trunc.cc index 618e161bee4..ee024e73b53 100644 --- a/storage/innobase/row/row0trunc.cc +++ b/storage/innobase/row/row0trunc.cc @@ -24,22 +24,19 @@ TRUNCATE implementation Created 2013-04-12 Sunny Bains *******************************************************/ -#include "row0mysql.h" +#include "row0trunc.h" +#include "btr0sea.h" #include "pars0pars.h" #include "btr0pcur.h" #include "dict0crea.h" -#include "dict0boot.h" -#include "dict0load.h" #include "dict0stats.h" #include "dict0stats_bg.h" #include "lock0lock.h" #include "fts0fts.h" -#include "srv0start.h" -#include "row0trunc.h" +#include "ibuf0ibuf.h" #include "os0file.h" #include "que0que.h" #include "trx0undo.h" -#include "btr0sea.h" /* FIXME: For temporary tables, use a simple approach of btr_free() and btr_create() of each index tree. */ @@ -52,8 +49,6 @@ bool truncate_t::s_fix_up_active = false; truncate_t::tables_t truncate_t::s_tables; truncate_t::truncated_tables_t truncate_t::s_truncated_tables; -static const byte magic[] = { 0x01, 0xf3, 0xa1, 0x20 }; - /** Iterator over the the raw records in an index, doesn't support MVCC. */ class IndexIterator { @@ -101,7 +96,7 @@ public: for (;;) { if (!btr_pcur_is_on_user_rec(&m_pcur) - || !callback.match(&m_mtr, &m_pcur)) { + || !callback.match(&m_pcur)) { /* The end of of the index has been reached. */ err = DB_END_OF_INDEX; @@ -200,10 +195,9 @@ public: } /** - @param mtr mini-transaction covering the iteration @param pcur persistent cursor used for iteration @return true if the table id column matches. */ - bool match(mtr_t* mtr, btr_pcur_t* pcur) const + bool match(btr_pcur_t* pcur) const { ulint len; const byte* field; @@ -245,301 +239,6 @@ protected: }; /** -Creates a TRUNCATE log record with space id, table name, data directory path, -tablespace flags, table format, index ids, index types, number of index fields -and index field information of the table. */ -class TruncateLogger : public Callback { - -public: - /** - Constructor - - @param table Table to truncate - @param flags tablespace falgs */ - TruncateLogger( - dict_table_t* table, - ulint flags, - table_id_t new_table_id) - : - Callback(table->id, false), - m_table(table), - m_flags(flags), - m_truncate(table->id, new_table_id, table->data_dir_path), - m_log_file_name() - { - /* Do nothing */ - } - - /** - Initialize Truncate Logger by constructing Truncate Log File Name. - - @return DB_SUCCESS or error code. */ - dberr_t init() - { - /* Construct log file name. 
*/ - ulint log_file_name_buf_sz = - strlen(srv_log_group_home_dir) - + (22 + 22 + sizeof "ib_trunc.log"); - - m_log_file_name = UT_NEW_ARRAY_NOKEY(char, log_file_name_buf_sz); - if (m_log_file_name == NULL) { - return(DB_OUT_OF_MEMORY); - } - memset(m_log_file_name, 0, log_file_name_buf_sz); - - strcpy(m_log_file_name, srv_log_group_home_dir); - ulint log_file_name_len = strlen(m_log_file_name); - if (m_log_file_name[log_file_name_len - 1] - != OS_PATH_SEPARATOR) { - - m_log_file_name[log_file_name_len] - = OS_PATH_SEPARATOR; - log_file_name_len = strlen(m_log_file_name); - } - - snprintf(m_log_file_name + log_file_name_len, - log_file_name_buf_sz - log_file_name_len, - "ib_%u_" IB_ID_FMT "_trunc.log", - m_table->space, m_table->id); - - return(DB_SUCCESS); - - } - - /** - Destructor */ - ~TruncateLogger() - { - if (m_log_file_name != NULL) { - bool exist; - os_file_delete_if_exists( - innodb_log_file_key, m_log_file_name, &exist); - UT_DELETE_ARRAY(m_log_file_name); - m_log_file_name = NULL; - } - } - - /** - @param mtr mini-transaction covering the read - @param pcur persistent cursor used for reading - @return DB_SUCCESS or error code */ - dberr_t operator()(mtr_t* mtr, btr_pcur_t* pcur); - - /** Called after iteratoring over the records. - @return true if invariant satisfied. */ - bool debug() const - { - /* We must find all the index entries on disk. */ - return(UT_LIST_GET_LEN(m_table->indexes) - == m_truncate.indexes()); - } - - /** - Write the TRUNCATE log - @return DB_SUCCESS or error code */ - dberr_t log() const - { - dberr_t err = DB_SUCCESS; - - if (m_log_file_name == 0) { - return(DB_ERROR); - } - - bool ret; - os_file_t handle = os_file_create( - innodb_log_file_key, m_log_file_name, - OS_FILE_CREATE, OS_FILE_NORMAL, - OS_LOG_FILE, srv_read_only_mode, &ret); - if (!ret) { - return(DB_IO_ERROR); - } - - - ulint sz = UNIV_PAGE_SIZE; - void* buf = ut_zalloc_nokey(sz + UNIV_PAGE_SIZE); - if (buf == 0) { - os_file_close(handle); - return(DB_OUT_OF_MEMORY); - } - - /* Align the memory for file i/o if we might have O_DIRECT set*/ - byte* log_buf = static_cast<byte*>( - ut_align(buf, UNIV_PAGE_SIZE)); - - lsn_t lsn = log_get_lsn(); - - /* Generally loop should exit in single go but - just for those 1% of rare cases we need to assume - corner case. */ - do { - /* First 4 bytes are reserved for magic number - which is currently 0. */ - err = m_truncate.write( - log_buf + 4, log_buf + sz - 4, - m_table->space, m_table->name.m_name, - m_flags, m_table->flags, lsn); - - DBUG_EXECUTE_IF("ib_err_trunc_oom_logging", - err = DB_FAIL;); - - if (err != DB_SUCCESS) { - ut_ad(err == DB_FAIL); - ut_free(buf); - sz *= 2; - buf = ut_zalloc_nokey(sz + UNIV_PAGE_SIZE); - DBUG_EXECUTE_IF("ib_err_trunc_oom_logging", - ut_free(buf); - buf = 0;); - if (buf == 0) { - os_file_close(handle); - return(DB_OUT_OF_MEMORY); - } - log_buf = static_cast<byte*>( - ut_align(buf, UNIV_PAGE_SIZE)); - } - - } while (err != DB_SUCCESS); - - dberr_t io_err; - - IORequest request(IORequest::WRITE); - - io_err = os_file_write( - request, m_log_file_name, handle, log_buf, 0, sz); - - if (io_err != DB_SUCCESS) { - - ib::error() - << "IO: Failed to write the file size to '" - << m_log_file_name << "'"; - - /* Preserve the original error code */ - if (err == DB_SUCCESS) { - err = io_err; - } - } - - os_file_flush(handle); - os_file_close(handle); - - ut_free(buf); - - /* Why we need MLOG_TRUNCATE when we have truncate_log for - recovery? - - truncate log can protect us if crash happens while truncate - is active. 
Once truncate is done truncate log is removed. - - If crash happens post truncate and system is yet to - checkpoint, on recovery we would see REDO records from action - before truncate (unless we explicitly checkpoint before - returning from truncate API. Costly alternative so rejected). - - These REDO records may reference a page that doesn't exist - post truncate so we need a mechanism to skip all such REDO - records. MLOG_TRUNCATE records space_id and lsn that exactly - serve the purpose. - - If checkpoint happens post truncate and crash happens post - this point then neither MLOG_TRUNCATE nor REDO record - from action before truncate are accessible. */ - if (!is_system_tablespace(m_table->space)) { - mtr_t mtr; - byte* log_ptr; - - mtr_start(&mtr); - - log_ptr = mlog_open(&mtr, 11 + 8); - log_ptr = mlog_write_initial_log_record_low( - MLOG_TRUNCATE, m_table->space, 0, - log_ptr, &mtr); - - mach_write_to_8(log_ptr, lsn); - log_ptr += 8; - - mlog_close(&mtr, log_ptr); - mtr_commit(&mtr); - } - - return(err); - } - - /** - Indicate completion of truncate log by writing magic-number. - File will be removed from the system but to protect against - unlink (File-System) anomalies we ensure we write magic-number. */ - void done() - { - if (m_log_file_name == 0) { - return; - } - - bool ret; - os_file_t handle = os_file_create_simple_no_error_handling( - innodb_log_file_key, m_log_file_name, - OS_FILE_OPEN, OS_FILE_READ_WRITE, - srv_read_only_mode, &ret); - DBUG_EXECUTE_IF("ib_err_trunc_writing_magic_number", - os_file_close(handle); - ret = false;); - if (!ret) { - ib::error() << "Failed to open truncate log file " - << m_log_file_name << "." - " If server crashes before truncate log is" - " removed make sure it is manually removed" - " before restarting server"; - os_file_delete(innodb_log_file_key, m_log_file_name); - return; - } - - if (os_file_write(IORequest(IORequest::WRITE), - m_log_file_name, handle, magic, 0, - sizeof magic) != DB_SUCCESS) { - ib::error() - << "IO: Failed to write the magic number to '" - << m_log_file_name << "'"; - } - - DBUG_EXECUTE_IF("ib_trunc_crash_after_updating_magic_no", - DBUG_SUICIDE();); - os_file_flush(handle); - os_file_close(handle); - DBUG_EXECUTE_IF("ib_trunc_crash_after_logging_complete", - log_buffer_flush_to_disk(); - os_thread_sleep(1000000); - DBUG_SUICIDE();); - os_file_delete(innodb_log_file_key, m_log_file_name); - } - -private: - /** Lookup the index using the index id. - @return index instance if found else NULL */ - const dict_index_t* find(index_id_t id) const - { - for (const dict_index_t* index = UT_LIST_GET_FIRST( - m_table->indexes); - index != NULL; - index = UT_LIST_GET_NEXT(indexes, index)) { - - if (index->id == id) { - return(index); - } - } - - return(NULL); - } - -private: - /** Table to be truncated */ - dict_table_t* m_table; - - /** Tablespace flags */ - ulint m_flags; - - /** Collect table to truncate information */ - truncate_t m_truncate; - - /** Truncate log file name. */ - char* m_log_file_name; -}; - -/** Scan to find out truncate log file from the given directory path. @param dir_path look for log directory in following path. 
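For reference, the magic-number check in the hunk below is equivalent to the byte array it replaces: assuming mach_read_from_4() reads four bytes most-significant-byte first (as the other mach_* helpers do), the removed magic[] bytes 0x01 0xf3 0xa1 0x20 decode to 0x01f3a120, which is 32743712. A minimal standalone C++ sketch of that arithmetic, with a local read_be32() helper standing in for mach_read_from_4():

#include <cassert>
#include <cstdint>

/* Stand-in for InnoDB's mach_read_from_4(): read 4 bytes,
most significant byte first. */
static uint32_t read_be32(const unsigned char* b)
{
	return (uint32_t(b[0]) << 24) | (uint32_t(b[1]) << 16)
		| (uint32_t(b[2]) << 8) | uint32_t(b[3]);
}

int main()
{
	/* The byte pattern formerly kept in the removed magic[] array. */
	const unsigned char magic[4] = { 0x01, 0xf3, 0xa1, 0x20 };

	/* 0x01f3a120 == 32743712, the literal now compared against
	mach_read_from_4(log_buf) in TruncateLogParser::parse(). */
	assert(read_be32(magic) == 32743712U);
	return 0;
}

The removed TruncateLogger::done() wrote this value at offset 0 of the ib_*_trunc.log file once a truncate had completed, so the parser keeps recognizing it and recovery can skip logs whose fix-up is no longer needed.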
@@ -629,8 +328,8 @@ TruncateLogParser::parse( return(DB_IO_ERROR); } - ulint sz = UNIV_PAGE_SIZE; - void* buf = ut_zalloc_nokey(sz + UNIV_PAGE_SIZE); + ulint sz = srv_page_size; + void* buf = ut_zalloc_nokey(sz + srv_page_size); if (buf == 0) { os_file_close(handle); return(DB_OUT_OF_MEMORY); @@ -639,7 +338,7 @@ TruncateLogParser::parse( IORequest request(IORequest::READ); /* Align the memory for file i/o if we might have O_DIRECT set*/ - byte* log_buf = static_cast<byte*>(ut_align(buf, UNIV_PAGE_SIZE)); + byte* log_buf = static_cast<byte*>(ut_align(buf, srv_page_size)); do { err = os_file_read(request, handle, log_buf, 0, sz); @@ -649,7 +348,7 @@ TruncateLogParser::parse( break; } - if (!memcmp(log_buf, magic, sizeof magic)) { + if (mach_read_from_4(log_buf) == 32743712) { /* Truncate action completed. Avoid parsing the file. */ os_file_close(handle); @@ -678,7 +377,7 @@ TruncateLogParser::parse( sz *= 2; - buf = ut_zalloc_nokey(sz + UNIV_PAGE_SIZE); + buf = ut_zalloc_nokey(sz + srv_page_size); if (buf == 0) { os_file_close(handle); @@ -689,7 +388,7 @@ TruncateLogParser::parse( } log_buf = static_cast<byte*>( - ut_align(buf, UNIV_PAGE_SIZE)); + ut_align(buf, srv_page_size)); } } while (err != DB_SUCCESS); @@ -741,68 +440,6 @@ TruncateLogParser::scan_and_parse( return(err); } -/** Callback to drop indexes during TRUNCATE */ -class DropIndex : public Callback { - -public: - /** - Constructor - - @param[in,out] table Table to truncate - @param[in,out] trx dictionary transaction - @param[in] noredo whether to disable redo logging */ - DropIndex(dict_table_t* table, trx_t* trx, bool noredo) - : Callback(table->id, noredo), m_trx(trx), m_table(table) {} - - /** - @param mtr mini-transaction covering the read - @param pcur persistent cursor used for reading - @return DB_SUCCESS or error code */ - dberr_t operator()(mtr_t* mtr, btr_pcur_t* pcur) const; - -private: - /** dictionary transaction */ - trx_t* const m_trx; - /** Table to be truncated */ - dict_table_t* const m_table; -}; - -/** Callback to create the indexes during TRUNCATE */ -class CreateIndex : public Callback { - -public: - /** - Constructor - - @param[in,out] table Table to truncate - @param[in] noredo whether to disable redo logging */ - CreateIndex(dict_table_t* table, bool noredo) - : - Callback(table->id, noredo), - m_table(table) - { - /* No op */ - } - - /** - Create the new index and update the root page number in the - SysIndex table. - - @param mtr mini-transaction covering the read - @param pcur persistent cursor used for reading - @return DB_SUCCESS or error code */ - dberr_t operator()(mtr_t* mtr, btr_pcur_t* pcur) const; - -private: - // Disably copying - CreateIndex(const CreateIndex&); - CreateIndex& operator=(const CreateIndex&); - -private: - /** Table to be truncated */ - dict_table_t* m_table; -}; - /** Check for presence of table-id in SYS_XXXX tables. */ class TableLocator : public Callback { @@ -828,15 +465,13 @@ public: /** Look for table-id in SYS_XXXX tables without loading the table. 
- @param mtr mini-transaction covering the read @param pcur persistent cursor used for reading - @return DB_SUCCESS or error code */ - dberr_t operator()(mtr_t* mtr, btr_pcur_t* pcur); - -private: - // Disably copying - TableLocator(const TableLocator&); - TableLocator& operator=(const TableLocator&); + @return DB_SUCCESS */ + dberr_t operator()(mtr_t*, btr_pcur_t*) + { + m_table_found = true; + return(DB_SUCCESS); + } private: /** Set to true if table is present */ @@ -844,488 +479,6 @@ private: }; /** -@param mtr mini-transaction covering the read -@param pcur persistent cursor used for reading -@return DB_SUCCESS or error code */ -dberr_t -TruncateLogger::operator()(mtr_t* mtr, btr_pcur_t* pcur) -{ - ulint len; - const byte* field; - rec_t* rec = btr_pcur_get_rec(pcur); - truncate_t::index_t index; - - field = rec_get_nth_field_old( - rec, DICT_FLD__SYS_INDEXES__TYPE, &len); - ut_ad(len == 4); - index.m_type = mach_read_from_4(field); - - field = rec_get_nth_field_old(rec, DICT_FLD__SYS_INDEXES__ID, &len); - ut_ad(len == 8); - index.m_id = mach_read_from_8(field); - - field = rec_get_nth_field_old( - rec, DICT_FLD__SYS_INDEXES__PAGE_NO, &len); - ut_ad(len == 4); - index.m_root_page_no = mach_read_from_4(field); - - /* For compressed tables we need to store extra meta-data - required during btr_create(). */ - if (FSP_FLAGS_GET_ZIP_SSIZE(m_flags)) { - - const dict_index_t* dict_index = find(index.m_id); - - if (dict_index != NULL) { - - dberr_t err = index.set(dict_index); - - if (err != DB_SUCCESS) { - m_truncate.clear(); - return(err); - } - - } else { - ib::warn() << "Index id " << index.m_id - << " not found"; - } - } - - m_truncate.add(index); - - return(DB_SUCCESS); -} - -/** -Drop an index in the table. - -@param mtr mini-transaction covering the read -@param pcur persistent cursor used for reading -@return DB_SUCCESS or error code */ -dberr_t -DropIndex::operator()(mtr_t* mtr, btr_pcur_t* pcur) const -{ - rec_t* rec = btr_pcur_get_rec(pcur); - - bool freed = dict_drop_index_tree(rec, pcur, m_trx, mtr); - -#ifdef UNIV_DEBUG - { - ulint len; - const byte* field; - ulint index_type; - - field = rec_get_nth_field_old( - btr_pcur_get_rec(pcur), DICT_FLD__SYS_INDEXES__TYPE, - &len); - ut_ad(len == 4); - - index_type = mach_read_from_4(field); - - if (index_type & DICT_CLUSTERED) { - /* Clustered index */ - DBUG_EXECUTE_IF("ib_trunc_crash_on_drop_of_clust_index", - log_buffer_flush_to_disk(); - os_thread_sleep(2000000); - DBUG_SUICIDE();); - } else if (index_type & DICT_UNIQUE) { - /* Unique index */ - DBUG_EXECUTE_IF("ib_trunc_crash_on_drop_of_uniq_index", - log_buffer_flush_to_disk(); - os_thread_sleep(2000000); - DBUG_SUICIDE();); - } else if (index_type == 0) { - /* Secondary index */ - DBUG_EXECUTE_IF("ib_trunc_crash_on_drop_of_sec_index", - log_buffer_flush_to_disk(); - os_thread_sleep(2000000); - DBUG_SUICIDE();); - } - } -#endif /* UNIV_DEBUG */ - - DBUG_EXECUTE_IF("ib_err_trunc_drop_index", - freed = false;); - - if (freed) { - - /* We will need to commit and restart the - mini-transaction in order to avoid deadlocks. - The dict_drop_index_tree() call has freed - a page in this mini-transaction, and the rest - of this loop could latch another index page.*/ - const mtr_log_t log_mode = mtr->get_log_mode(); - mtr_commit(mtr); - - mtr_start(mtr); - mtr->set_log_mode(log_mode); - - btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, mtr); - } else { - /* Check if the .ibd file is missing. 
*/ - bool found; - - fil_space_get_page_size(m_table->space, &found); - - DBUG_EXECUTE_IF("ib_err_trunc_drop_index", - found = false;); - - if (!found) { - return(DB_ERROR); - } - } - - return(DB_SUCCESS); -} - -/** -Create the new index and update the root page number in the -SysIndex table. - -@param mtr mini-transaction covering the read -@param pcur persistent cursor used for reading -@return DB_SUCCESS or error code */ -dberr_t -CreateIndex::operator()(mtr_t* mtr, btr_pcur_t* pcur) const -{ - ulint root_page_no; - - root_page_no = dict_recreate_index_tree(m_table, pcur, mtr); - -#ifdef UNIV_DEBUG - { - ulint len; - const byte* field; - ulint index_type; - - field = rec_get_nth_field_old( - btr_pcur_get_rec(pcur), DICT_FLD__SYS_INDEXES__TYPE, - &len); - ut_ad(len == 4); - - index_type = mach_read_from_4(field); - - if (index_type & DICT_CLUSTERED) { - /* Clustered index */ - DBUG_EXECUTE_IF( - "ib_trunc_crash_on_create_of_clust_index", - log_buffer_flush_to_disk(); - os_thread_sleep(2000000); - DBUG_SUICIDE();); - } else if (index_type & DICT_UNIQUE) { - /* Unique index */ - DBUG_EXECUTE_IF( - "ib_trunc_crash_on_create_of_uniq_index", - log_buffer_flush_to_disk(); - os_thread_sleep(2000000); - DBUG_SUICIDE();); - } else if (index_type == 0) { - /* Secondary index */ - DBUG_EXECUTE_IF( - "ib_trunc_crash_on_create_of_sec_index", - log_buffer_flush_to_disk(); - os_thread_sleep(2000000); - DBUG_SUICIDE();); - } - } -#endif /* UNIV_DEBUG */ - - DBUG_EXECUTE_IF("ib_err_trunc_create_index", - root_page_no = FIL_NULL;); - - if (root_page_no != FIL_NULL) { - ulint len; - byte* data = rec_get_nth_field_old( - btr_pcur_get_rec(pcur), - DICT_FLD__SYS_INDEXES__PAGE_NO, &len); - ut_ad(len == 4); - mlog_write_ulint(data, root_page_no, MLOG_4BYTES, mtr); - - /* We will need to commit and restart the - mini-transaction in order to avoid deadlocks. - The dict_create_index_tree() call has allocated - a page in this mini-transaction, and the rest of - this loop could latch another index page. */ - mtr_commit(mtr); - - mtr_start(mtr); - - btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, mtr); - - } else { - bool found; - fil_space_get_page_size(m_table->space, &found); - - DBUG_EXECUTE_IF("ib_err_trunc_create_index", - found = false;); - - if (!found) { - return(DB_ERROR); - } - } - - return(DB_SUCCESS); -} - -/** -Look for table-id in SYS_XXXX tables without loading the table. - -@param mtr mini-transaction covering the read -@param pcur persistent cursor used for reading -@return DB_SUCCESS */ -dberr_t -TableLocator::operator()(mtr_t* mtr, btr_pcur_t* pcur) -{ - m_table_found = true; - - return(DB_SUCCESS); -} - -/** -Rollback the transaction and release the index locks. -Drop indexes if table is corrupted so that drop/create -sequence works as expected. - -@param table table to truncate -@param trx transaction covering the TRUNCATE -@param new_id new table id that was suppose to get assigned - to the table if truncate executed successfully. 
-@param has_internal_doc_id indicate existence of fts index -@param no_redo if true, turn-off redo logging -@param corrupted table corrupted status -@param unlock_index if true then unlock indexes before action */ -static -void -row_truncate_rollback( - dict_table_t* table, - trx_t* trx, - table_id_t new_id, - bool has_internal_doc_id, - bool no_redo, - bool corrupted, - bool unlock_index) -{ - if (unlock_index) { - dict_table_x_unlock_indexes(table); - } - - trx->error_state = DB_SUCCESS; - - trx_rollback_to_savepoint(trx, NULL); - - trx->error_state = DB_SUCCESS; - - if (corrupted && !dict_table_is_temporary(table)) { - - /* Cleanup action to ensure we don't left over stale entries - if we are marking table as corrupted. This will ensure - it can be recovered using drop/create sequence. */ - dict_table_x_lock_indexes(table); - - DropIndex dropIndex(table, trx, no_redo); - - SysIndexIterator().for_each(dropIndex); - - dict_table_x_unlock_indexes(table); - - for (dict_index_t* index = UT_LIST_GET_FIRST(table->indexes); - index != NULL; - index = UT_LIST_GET_NEXT(indexes, index)) { - - dict_set_corrupted(index, trx, "TRUNCATE TABLE"); - } - - if (has_internal_doc_id) { - - ut_ad(!trx_is_started(trx)); - - table_id_t id = table->id; - - table->id = new_id; - - fts_drop_tables(trx, table); - - table->id = id; - - ut_ad(trx_is_started(trx)); - - trx_commit_for_mysql(trx); - } - - } else if (corrupted && dict_table_is_temporary(table)) { - - dict_table_x_lock_indexes(table); - - for (dict_index_t* index = UT_LIST_GET_FIRST(table->indexes); - index != NULL; - index = UT_LIST_GET_NEXT(indexes, index)) { - - dict_drop_index_tree_in_mem(index, index->page); - - index->page = FIL_NULL; - } - - dict_table_x_unlock_indexes(table); - } - - table->corrupted = corrupted; -} - -/** -Finish the TRUNCATE operations for both commit and rollback. - -@param table table being truncated -@param trx transaction covering the truncate -@param fsp_flags tablespace flags -@param logger table to truncate information logger -@param err status of truncate operation - -@return DB_SUCCESS or error code */ -static MY_ATTRIBUTE((warn_unused_result)) -dberr_t -row_truncate_complete( - dict_table_t* table, - trx_t* trx, - ulint fsp_flags, - TruncateLogger* &logger, - dberr_t err) -{ - bool is_file_per_table = dict_table_is_file_per_table(table); - - /* Add the table back to FTS optimize background thread. */ - if (table->fts) { - fts_optimize_add_table(table); - } - - row_mysql_unlock_data_dictionary(trx); - - DEBUG_SYNC_C("ib_trunc_table_trunc_completing"); - - if (!dict_table_is_temporary(table)) { - - DBUG_EXECUTE_IF("ib_trunc_crash_before_log_removal", - log_buffer_flush_to_disk(); - os_thread_sleep(500000); - DBUG_SUICIDE();); - - /* Note: We don't log-checkpoint instead we have written - a special REDO log record MLOG_TRUNCATE that is used to - avoid applying REDO records before truncate for crash - that happens post successful truncate completion. */ - - if (logger != NULL) { - logger->done(); - UT_DELETE(logger); - logger = NULL; - } - } - - /* If non-temp file-per-table tablespace... */ - if (is_file_per_table - && !dict_table_is_temporary(table) - && fsp_flags != ULINT_UNDEFINED) { - - /* This function will reset back the stop_new_ops - and is_being_truncated so that fil-ops can re-start. 
*/ - dberr_t err2 = truncate_t::truncate( - table->space, - table->data_dir_path, - table->name.m_name, fsp_flags, false); - - if (err2 != DB_SUCCESS) { - return(err2); - } - } - - if (err == DB_SUCCESS) { - dict_stats_update(table, DICT_STATS_EMPTY_TABLE); - } - - trx->op_info = ""; - - /* For temporary tables or if there was an error, we need to reset - the dict operation flags. */ - trx->ddl = false; - trx->dict_operation = TRX_DICT_OP_NONE; - - ut_ad(!trx_is_started(trx)); - - srv_wake_master_thread(); - - DBUG_EXECUTE_IF("ib_trunc_crash_after_truncate_done", - DBUG_SUICIDE();); - - return(err); -} - -/** -Handle FTS truncate issues. -@param table table being truncated -@param new_id new id for the table -@param trx transaction covering the truncate -@return DB_SUCCESS or error code. */ -static MY_ATTRIBUTE((warn_unused_result)) -dberr_t -row_truncate_fts( - dict_table_t* table, - table_id_t new_id, - trx_t* trx) -{ - dict_table_t fts_table; - - fts_table.id = new_id; - fts_table.name = table->name; - fts_table.flags2 = table->flags2; - fts_table.flags = table->flags; - fts_table.space = table->space; - - /* table->data_dir_path is used for FTS AUX table - creation. */ - if (DICT_TF_HAS_DATA_DIR(table->flags) - && table->data_dir_path == NULL) { - dict_get_and_save_data_dir_path(table, true); - ut_ad(table->data_dir_path != NULL); - } - - fts_table.data_dir_path = table->data_dir_path; - - dberr_t err; - - err = fts_create_common_tables( - trx, &fts_table, table->name.m_name, TRUE); - - for (ulint i = 0; - i < ib_vector_size(table->fts->indexes) && err == DB_SUCCESS; - i++) { - - dict_index_t* fts_index; - - fts_index = static_cast<dict_index_t*>( - ib_vector_getp(table->fts->indexes, i)); - - err = fts_create_index_tables_low( - trx, fts_index, table->name.m_name, new_id); - } - - DBUG_EXECUTE_IF("ib_err_trunc_during_fts_trunc", - err = DB_ERROR;); - - if (err != DB_SUCCESS) { - - trx->error_state = DB_SUCCESS; - trx_rollback_to_savepoint(trx, NULL); - trx->error_state = DB_SUCCESS; - - ib::error() << "Unable to truncate FTS index for table " - << table->name; - } else { - - ut_ad(trx_is_started(trx)); - } - - return(err); -} - -/** Update system table to reflect new table id. @param old_table_id old table id @param new_table_id new table id @@ -1403,7 +556,7 @@ row_truncate_update_sys_tables_during_fix_up( ibool reserve_dict_mutex, bool mark_index_corrupted) { - trx_t* trx = trx_allocate_for_background(); + trx_t* trx = trx_create(); trx_set_dict_operation(trx, TRX_DICT_OP_TABLE); @@ -1460,637 +613,228 @@ row_truncate_update_sys_tables_during_fix_up( } trx_commit_for_mysql(trx); - trx_free_for_background(trx); + trx->free(); return(err); } -/** -Truncate also results in assignment of new table id, update the system -SYSTEM TABLES with the new id. -@param table, table being truncated -@param new_id, new table id -@param has_internal_doc_id, has doc col (fts) -@param no_redo if true, turn-off redo logging -@param trx transaction handle -@return error code or DB_SUCCESS */ -static MY_ATTRIBUTE((warn_unused_result)) +/********************************************************//** +Recreates table indexes by applying +TRUNCATE log record during recovery. 
+@return DB_SUCCESS or error code */ +static dberr_t -row_truncate_update_system_tables( - dict_table_t* table, - table_id_t new_id, - bool has_internal_doc_id, - bool no_redo, - trx_t* trx) +fil_recreate_table( +/*===============*/ + ulint format_flags, /*!< in: page format */ + const char* name, /*!< in: table name */ + truncate_t& truncate) /*!< in: The information of + TRUNCATE log record */ { - dberr_t err = DB_SUCCESS; - - ut_a(!dict_table_is_temporary(table)); + ut_ad(!truncate_t::s_fix_up_active); + truncate_t::s_fix_up_active = true; - err = row_truncate_update_table_id(table->id, new_id, FALSE, trx); - - DBUG_EXECUTE_IF("ib_err_trunc_during_sys_table_update", - err = DB_ERROR;); + /* Step-1: Scan for active indexes from REDO logs and drop + all the indexes using low level function that take root_page_no + and space-id. */ + truncate.drop_indexes(fil_system.sys_space); + /* Step-2: Scan for active indexes and re-create them. */ + dberr_t err = truncate.create_indexes( + name, fil_system.sys_space, format_flags); if (err != DB_SUCCESS) { - - row_truncate_rollback( - table, trx, new_id, has_internal_doc_id, - no_redo, true, false); - - ib::error() << "Unable to assign a new identifier to table " - << table->name << " after truncating it. Marked the" - " table as corrupted. In-memory representation is now" - " different from the on-disk representation."; - err = DB_ERROR; - } else { - /* Drop the old FTS index */ - if (has_internal_doc_id) { - - ut_ad(trx_is_started(trx)); - - fts_drop_tables(trx, table); - - DBUG_EXECUTE_IF("ib_truncate_crash_while_fts_cleanup", - DBUG_SUICIDE();); - - ut_ad(trx_is_started(trx)); - } - - DBUG_EXECUTE_IF("ib_trunc_crash_after_fts_drop", - log_buffer_flush_to_disk(); - os_thread_sleep(2000000); - DBUG_SUICIDE();); - - dict_table_change_id_in_cache(table, new_id); - - /* Reset the Doc ID in cache to 0 */ - if (has_internal_doc_id && table->fts->cache != NULL) { - DBUG_EXECUTE_IF("ib_trunc_sleep_before_fts_cache_clear", - os_thread_sleep(10000000);); - - table->fts->dict_locked = true; - fts_update_next_doc_id(trx, table, 0); - fts_cache_clear(table->fts->cache); - fts_cache_init(table->fts->cache); - table->fts->dict_locked = false; - } + ib::info() << "Recovery failed for TRUNCATE TABLE '" + << name << "' within the system tablespace"; } - return(err); -} - -/** -Prepare for the truncate process. On success all of the table's indexes will -be locked in X mode. -@param table table to truncate -@param flags tablespace flags -@return error code or DB_SUCCESS */ -static MY_ATTRIBUTE((warn_unused_result)) -dberr_t -row_truncate_prepare(dict_table_t* table, ulint* flags) -{ - ut_ad(!dict_table_is_temporary(table)); - ut_ad(dict_table_is_file_per_table(table)); - - *flags = fil_space_get_flags(table->space); - - ut_ad(!dict_table_is_temporary(table)); - - dict_get_and_save_data_dir_path(table, true); - - if (*flags != ULINT_UNDEFINED) { - - dberr_t err = fil_prepare_for_truncate(table->space); - - if (err != DB_SUCCESS) { - return(err); - } - } + truncate_t::s_fix_up_active = false; - return(DB_SUCCESS); + return(err); } -/** -Do foreign key checks before starting TRUNCATE. -@param table table being truncated -@param trx transaction covering the truncate +/********************************************************//** +Recreates the tablespace and table indexes by applying +TRUNCATE log record during recovery. 
@return DB_SUCCESS or error code */ -static MY_ATTRIBUTE((warn_unused_result)) +static dberr_t -row_truncate_foreign_key_checks( - const dict_table_t* table, - const trx_t* trx) +fil_recreate_tablespace( +/*====================*/ + ulint space_id, /*!< in: space id */ + ulint format_flags, /*!< in: page format */ + ulint flags, /*!< in: tablespace flags */ + const char* name, /*!< in: table name */ + truncate_t& truncate, /*!< in: The information of + TRUNCATE log record */ + lsn_t recv_lsn) /*!< in: the end LSN of + the log record */ { - /* Check if the table is referenced by foreign key constraints from - some other table (not the table itself) */ - - dict_foreign_set::iterator it - = std::find_if(table->referenced_set.begin(), - table->referenced_set.end(), - dict_foreign_different_tables()); - - if (!srv_read_only_mode - && it != table->referenced_set.end() - && trx->check_foreigns) { - - dict_foreign_t* foreign = *it; - - FILE* ef = dict_foreign_err_file; - - /* We only allow truncating a referenced table if - FOREIGN_KEY_CHECKS is set to 0 */ + dberr_t err = DB_SUCCESS; + mtr_t mtr; - mutex_enter(&dict_foreign_err_mutex); + ut_ad(!truncate_t::s_fix_up_active); + truncate_t::s_fix_up_active = true; - rewind(ef); + /* Step-1: Invalidate buffer pool pages belonging to the tablespace + to re-create. */ + buf_LRU_flush_or_remove_pages(space_id, NULL); - ut_print_timestamp(ef); + /* Remove all insert buffer entries for the tablespace */ + ibuf_delete_for_discarded_space(space_id); - fputs(" Cannot truncate table ", ef); - ut_print_name(ef, trx, table->name.m_name); - fputs(" by DROP+CREATE\n" - "InnoDB: because it is referenced by ", ef); - ut_print_name(ef, trx, foreign->foreign_table_name); - putc('\n', ef); + /* Step-2: truncate tablespace (reset the size back to original or + default size) of tablespace. */ + err = truncate.truncate( + space_id, truncate.get_dir_path(), name, flags, true); - mutex_exit(&dict_foreign_err_mutex); + if (err != DB_SUCCESS) { + ib::info() << "Cannot access .ibd file for table '" + << name << "' with tablespace " << space_id + << " while truncating"; return(DB_ERROR); } - /* TODO: could we replace the counter n_foreign_key_checks_running - with lock checks on the table? Acquire here an exclusive lock on the - table, and rewrite lock0lock.cc and the lock wait in srv0srv.cc so that - they can cope with the table having been truncated here? Foreign key - checks take an IS or IX lock on the table. */ - - if (table->n_foreign_key_checks_running > 0) { - ib::warn() << "Cannot truncate table " << table->name - << " because there is a foreign key check running on" - " it."; - + fil_space_t* space = fil_space_acquire(space_id); + if (!space) { + ib::info() << "Missing .ibd file for table '" << name + << "' with tablespace " << space_id; return(DB_ERROR); } - return(DB_SUCCESS); -} + const page_size_t page_size(space->flags); -/** -Do some sanity checks before starting the actual TRUNCATE. -@param table table being truncated -@return DB_SUCCESS or error code */ -static MY_ATTRIBUTE((warn_unused_result)) -dberr_t -row_truncate_sanity_checks( - const dict_table_t* table) -{ - if (dict_table_is_discarded(table)) { + /* Step-3: Initialize Header. 
*/ + if (page_size.is_compressed()) { + byte* buf; + page_t* page; - return(DB_TABLESPACE_DELETED); + buf = static_cast<byte*>( + ut_zalloc_nokey(3U << srv_page_size_shift)); - } else if (!table->is_readable()) { - if (fil_space_get(table->space) == NULL) { - return(DB_TABLESPACE_NOT_FOUND); + /* Align the memory for file i/o */ + page = static_cast<byte*>(ut_align(buf, srv_page_size)); - } else { - return(DB_DECRYPTION_FAILED); - } - } else if (dict_table_is_corrupted(table)) { + flags |= FSP_FLAGS_PAGE_SSIZE(); - return(DB_TABLE_CORRUPT); - } + fsp_header_init_fields(page, space_id, flags); - return(DB_SUCCESS); -} + mach_write_to_4( + page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, space_id); + + page_zip_des_t page_zip; + page_zip_set_size(&page_zip, page_size.physical()); + page_zip.data = page + srv_page_size; -/** -Truncates a table for MySQL. -@param table table being truncated -@param trx transaction covering the truncate -@return error code or DB_SUCCESS */ -dberr_t row_truncate_table_for_mysql(dict_table_t* table, trx_t* trx) -{ - bool is_file_per_table = dict_table_is_file_per_table(table); - dberr_t err; #ifdef UNIV_DEBUG - ulint old_space = table->space; + page_zip.m_start = #endif /* UNIV_DEBUG */ - TruncateLogger* logger = NULL; - - /* Understanding the truncate flow. - - Step-1: Perform intiial sanity check to ensure table can be truncated. - This would include check for tablespace discard status, ibd file - missing, etc .... - - Step-2: Start transaction (only for non-temp table as temp-table don't - modify any data on disk doesn't need transaction object). - - Step-3: Validate ownership of needed locks (Exclusive lock). - Ownership will also ensure there is no active SQL queries, INSERT, - SELECT, ..... - - Step-4: Stop all the background process associated with table. - - Step-5: There are few foreign key related constraint under which - we can't truncate table (due to referential integrity unless it is - turned off). Ensure this condition is satisfied. - - Step-6: Truncate operation can be rolled back in case of error - till some point. Associate rollback segment to record undo log. - - Step-7: Generate new table-id. - Why we need new table-id ? - Purge and rollback case: we assign a new table id for the table. - Since purge and rollback look for the table based on the table id, - they see the table as 'dropped' and discard their operations. - - Step-8: Log information about tablespace which includes - table and index information. If there is a crash in the next step - then during recovery we will attempt to fixup the operation. - - Step-9: Drop all indexes (this include freeing of the pages - associated with them). - - Step-10: Re-create new indexes. - - Step-11: Update new table-id to in-memory cache (dictionary), - on-disk (INNODB_SYS_TABLES). INNODB_SYS_INDEXES also needs to - be updated to reflect updated root-page-no of new index created - and updated table-id. - - Step-12: Cleanup Stage. Reset auto-inc value to 1. - Release all the locks. - Commit the transaction. Update trx operation state. - - Notes: - - On error, log checkpoint is done followed writing of magic number to - truncate log file. If servers crashes after truncate, fix-up action - will not be applied. - - - log checkpoint is done before starting truncate table to ensure - that previous REDO log entries are not applied if current truncate - crashes. Consider following use-case: - - create table .... insert/load table .... truncate table (crash) - - on restart table is restored .... 
truncate table (crash) - - on restart (assuming default log checkpoint is not done) will have - 2 REDO log entries for same table. (Note 2 REDO log entries - for different table is not an issue). - For system-tablespace we can't truncate the tablespace so we need - to initiate a local cleanup that involves dropping of indexes and - re-creating them. If we apply stale entry we might end-up issuing - drop on wrong indexes. - - - Insert buffer: TRUNCATE TABLE is analogous to DROP TABLE, - so we do not have to remove insert buffer records, as the - insert buffer works at a low level. If a freed page is later - reallocated, the allocator will remove the ibuf entries for - it. When we prepare to truncate *.ibd files, we remove all entries - for the table in the insert buffer tree. This is not strictly - necessary, but we can free up some space in the system tablespace. - - - Linear readahead and random readahead: we use the same - method as in 3) to discard ongoing operations. (This is only - relevant for TRUNCATE TABLE by TRUNCATE TABLESPACE.) - Ensure that the table will be dropped by trx_rollback_active() in - case of a crash. - */ - - /*-----------------------------------------------------------------*/ - /* Step-1: Perform intiial sanity check to ensure table can be - truncated. This would include check for tablespace discard status, - ibd file missing, etc .... */ - err = row_truncate_sanity_checks(table); - if (err != DB_SUCCESS) { - return(err); - - } - - /* Step-2: Start transaction (only for non-temp table as temp-table - don't modify any data on disk doesn't need transaction object). */ - if (!dict_table_is_temporary(table)) { - if (table->fts) { - fts_optimize_remove_table(table); - } - - /* Avoid transaction overhead for temporary table DDL. */ - trx_start_for_ddl(trx, TRX_DICT_OP_TABLE); - } - - DEBUG_SYNC_C("row_trunc_before_dict_lock"); - - /* Step-3: Validate ownership of needed locks (Exclusive lock). - Ownership will also ensure there is no active SQL queries, INSERT, - SELECT, .....*/ - trx->op_info = "truncating table"; - ut_a(trx->dict_operation_lock_mode == 0); - row_mysql_lock_data_dictionary(trx); - ut_ad(mutex_own(&dict_sys->mutex)); - ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_X)); - - /* Step-4: Stop all the background process associated with table. */ - dict_stats_wait_bg_to_stop_using_table(table, trx); - - /* Step-5: There are few foreign key related constraint under which - we can't truncate table (due to referential integrity unless it is - turned off). Ensure this condition is satisfied. */ - ulint fsp_flags = ULINT_UNDEFINED; - err = row_truncate_foreign_key_checks(table, trx); - if (err != DB_SUCCESS) { - trx_rollback_to_savepoint(trx, NULL); - return(row_truncate_complete( - table, trx, fsp_flags, logger, err)); - } - - /* Remove all locks except the table-level X lock. */ - lock_remove_all_on_table(table, FALSE); - trx->table_id = table->id; - trx_set_dict_operation(trx, TRX_DICT_OP_TABLE); - - /* Step-6: Truncate operation can be rolled back in case of error - till some point. Associate rollback segment to record undo log. 
*/ - if (!dict_table_is_temporary(table)) { - mutex_enter(&trx->undo_mutex); - - trx_undo_t** pundo = &trx->rsegs.m_redo.update_undo; - err = trx_undo_assign_undo( - trx, trx->rsegs.m_redo.rseg, pundo, TRX_UNDO_UPDATE); - - mutex_exit(&trx->undo_mutex); - - DBUG_EXECUTE_IF("ib_err_trunc_assigning_undo_log", - err = DB_ERROR;); - if (err != DB_SUCCESS) { - trx_rollback_to_savepoint(trx, NULL); - return(row_truncate_complete( - table, trx, fsp_flags, logger, err)); - } - } - - /* Step-7: Generate new table-id. - Why we need new table-id ? - Purge and rollback: we assign a new table id for the - table. Since purge and rollback look for the table based on - the table id, they see the table as 'dropped' and discard - their operations. */ - table_id_t new_id; - dict_hdr_get_new_id(&new_id, NULL, NULL, table, false); - - /* Check if table involves FTS index. */ - bool has_internal_doc_id = - dict_table_has_fts_index(table) - || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID); - - bool no_redo = is_file_per_table && !has_internal_doc_id; - - /* Step-8: Log information about tablespace which includes - table and index information. If there is a crash in the next step - then during recovery we will attempt to fixup the operation. */ - - /* Lock all index trees for this table, as we will truncate - the table/index and possibly change their metadata. All - DML/DDL are blocked by table level X lock, with a few exceptions - such as queries into information schema about the table, - MySQL could try to access index stats for this kind of query, - we need to use index locks to sync up */ - dict_table_x_lock_indexes(table); - - if (!dict_table_is_temporary(table)) { - - if (is_file_per_table) { - - err = row_truncate_prepare(table, &fsp_flags); - - DBUG_EXECUTE_IF("ib_err_trunc_preparing_for_truncate", - err = DB_ERROR;); - - if (err != DB_SUCCESS) { - row_truncate_rollback( - table, trx, new_id, - has_internal_doc_id, - no_redo, false, true); - return(row_truncate_complete( - table, trx, fsp_flags, logger, err)); - } - } else { - fsp_flags = fil_space_get_flags(table->space); - - DBUG_EXECUTE_IF("ib_err_trunc_preparing_for_truncate", - fsp_flags = ULINT_UNDEFINED;); - - if (fsp_flags == ULINT_UNDEFINED) { - row_truncate_rollback( - table, trx, new_id, - has_internal_doc_id, - no_redo, false, true); - return(row_truncate_complete( - table, trx, fsp_flags, - logger, DB_ERROR)); - } - } - - logger = UT_NEW_NOKEY(TruncateLogger( - table, fsp_flags, new_id)); - - err = logger->init(); - if (err != DB_SUCCESS) { - row_truncate_rollback( - table, trx, new_id, has_internal_doc_id, - no_redo, false, true); - return(row_truncate_complete( - table, trx, fsp_flags, logger, DB_ERROR)); - - } - - err = SysIndexIterator().for_each(*logger); - if (err != DB_SUCCESS) { - row_truncate_rollback( - table, trx, new_id, has_internal_doc_id, - no_redo, false, true); - return(row_truncate_complete( - table, trx, fsp_flags, logger, DB_ERROR)); + page_zip.m_end = page_zip.m_nonempty = page_zip.n_blobs = 0; + buf_flush_init_for_writing(NULL, page, &page_zip, 0); - } - - ut_ad(logger->debug()); + err = fil_io(IORequestWrite, true, page_id_t(space_id, 0), + page_size, 0, page_size.physical(), page_zip.data, + NULL); - err = logger->log(); + ut_free(buf); if (err != DB_SUCCESS) { - row_truncate_rollback( - table, trx, new_id, has_internal_doc_id, - no_redo, false, true); - return(row_truncate_complete( - table, trx, fsp_flags, logger, DB_ERROR)); + ib::info() << "Failed to clean header of the" + " table '" << name << "' with 
tablespace " + << space_id; + goto func_exit; } } - DBUG_EXECUTE_IF("ib_trunc_crash_after_redo_log_write_complete", - log_buffer_flush_to_disk(); - os_thread_sleep(3000000); - DBUG_SUICIDE();); - - /* Step-9: Drop all indexes (free index pages associated with these - indexes) */ - if (!dict_table_is_temporary(table)) { - - DropIndex dropIndex(table, trx, no_redo); - - err = SysIndexIterator().for_each(dropIndex); - - if (err != DB_SUCCESS) { - - row_truncate_rollback( - table, trx, new_id, has_internal_doc_id, - no_redo, true, true); - - return(row_truncate_complete( - table, trx, fsp_flags, logger, err)); - } - } else { - /* For temporary tables we don't have entries in SYSTEM TABLES*/ - ut_ad(fsp_is_system_temporary(table->space)); - for (dict_index_t* index = UT_LIST_GET_FIRST(table->indexes); - index != NULL; - index = UT_LIST_GET_NEXT(indexes, index)) { - err = dict_truncate_index_tree_in_mem(index); - - if (err != DB_SUCCESS) { - row_truncate_rollback( - table, trx, new_id, has_internal_doc_id, - no_redo, true, true); - return(row_truncate_complete( - table, trx, fsp_flags, logger, err)); - } - - DBUG_EXECUTE_IF( - "ib_trunc_crash_during_drop_index_temp_table", - log_buffer_flush_to_disk(); - os_thread_sleep(2000000); - DBUG_SUICIDE();); - } - } - - if (is_file_per_table && fsp_flags != ULINT_UNDEFINED) { - /* A single-table tablespace has initially - FIL_IBD_FILE_INITIAL_SIZE number of pages allocated and an - extra page is allocated for each of the indexes present. But in - the case of clust index 2 pages are allocated and as one is - covered in the calculation as part of table->indexes.count we - take care of the other page by adding 1. */ - ulint space_size = table->indexes.count + - FIL_IBD_FILE_INITIAL_SIZE + 1; - - if (has_internal_doc_id) { - /* Since aux tables are created for fts indexes and - they use seperate tablespaces. */ - space_size -= ib_vector_size(table->fts->indexes); - } + mtr_start(&mtr); + /* Don't log the operation while fixing up table truncate operation + as crash at this level can still be sustained with recovery restarting + from last checkpoint. */ + mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO); + + /* Initialize the first extent descriptor page and + the second bitmap page for the new tablespace. */ + fsp_header_init(space, FIL_IBD_FILE_INITIAL_SIZE, &mtr); + mtr_commit(&mtr); - fil_reinit_space_header_for_table(table, space_size, trx); + /* Step-4: Re-Create Indexes to newly re-created tablespace. + This operation will restore tablespace back to what it was + when it was created during CREATE TABLE. */ + err = truncate.create_indexes(name, space, format_flags); + if (err != DB_SUCCESS) { + goto func_exit; } - DBUG_EXECUTE_IF("ib_trunc_crash_with_intermediate_log_checkpoint", - log_buffer_flush_to_disk(); - os_thread_sleep(2000000); - log_checkpoint(TRUE); - os_thread_sleep(1000000); - DBUG_SUICIDE();); + /* Step-5: Write new created pages into ibd file handle and + flush it to disk for the tablespace, in case i/o-handler thread + deletes the bitmap page from buffer. 
*/ + mtr_start(&mtr); - DBUG_EXECUTE_IF("ib_trunc_crash_drop_reinit_done_create_to_start", - log_buffer_flush_to_disk(); - os_thread_sleep(2000000); - DBUG_SUICIDE();); + mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO); -#ifdef BTR_CUR_HASH_ADAPT - dict_table_x_unlock_indexes(table); - for (dict_index_t* index = UT_LIST_GET_FIRST(table->indexes); index; - index = UT_LIST_GET_NEXT(indexes, index)) { - index = index->clone_if_needed(); - } - dict_table_x_lock_indexes(table); -#endif /* BTR_CUR_HASH_ADAPT */ + for (ulint page_no = 0; + page_no < UT_LIST_GET_FIRST(space->chain)->size; ++page_no) { - /* Step-10: Re-create new indexes. */ - if (!dict_table_is_temporary(table)) { + const page_id_t cur_page_id(space_id, page_no); - CreateIndex createIndex(table, no_redo); + buf_block_t* block = buf_page_get(cur_page_id, page_size, + RW_X_LATCH, &mtr); - err = SysIndexIterator().for_each(createIndex); + byte* page = buf_block_get_frame(block); - if (err != DB_SUCCESS) { + if (!FSP_FLAGS_GET_ZIP_SSIZE(flags)) { + ut_ad(!page_size.is_compressed()); - row_truncate_rollback( - table, trx, new_id, has_internal_doc_id, - no_redo, true, true); + buf_flush_init_for_writing( + block, page, NULL, recv_lsn); - return(row_truncate_complete( - table, trx, fsp_flags, logger, err)); - } - } + err = fil_io(IORequestWrite, true, cur_page_id, + page_size, 0, srv_page_size, page, NULL); + } else { + ut_ad(page_size.is_compressed()); - /* Done with index truncation, release index tree locks, - subsequent work relates to table level metadata change */ - dict_table_x_unlock_indexes(table); + /* We don't want to rewrite empty pages. */ - if (has_internal_doc_id) { + if (fil_page_get_type(page) != 0) { + page_zip_des_t* page_zip = + buf_block_get_page_zip(block); - err = row_truncate_fts(table, new_id, trx); + buf_flush_init_for_writing( + block, page, page_zip, recv_lsn); - if (err != DB_SUCCESS) { + err = fil_io(IORequestWrite, true, + cur_page_id, + page_size, 0, + page_size.physical(), + page_zip->data, NULL); + } else { +#ifdef UNIV_DEBUG + const byte* data = block->page.zip.data; - row_truncate_rollback( - table, trx, new_id, has_internal_doc_id, - no_redo, true, false); + /* Make sure that the page is really empty */ + for (ulint i = 0; + i < page_size.physical(); + ++i) { - return(row_truncate_complete( - table, trx, fsp_flags, logger, err)); + ut_a(data[i] == 0); + } +#endif /* UNIV_DEBUG */ + } } - } - - /* Step-11: Update new table-id to in-memory cache (dictionary), - on-disk (INNODB_SYS_TABLES). INNODB_SYS_INDEXES also needs to - be updated to reflect updated root-page-no of new index created - and updated table-id. */ - if (dict_table_is_temporary(table)) { - - dict_table_change_id_in_cache(table, new_id); - err = DB_SUCCESS; - - } else { - - /* If this fails then we are in an inconsistent state and - the results are undefined. */ - ut_ad(old_space == table->space); - - err = row_truncate_update_system_tables( - table, new_id, has_internal_doc_id, no_redo, trx); if (err != DB_SUCCESS) { - return(row_truncate_complete( - table, trx, fsp_flags, logger, err)); + ib::info() << "Cannot write page " << page_no + << " into a .ibd file for table '" + << name << "' with tablespace " << space_id; } } - DBUG_EXECUTE_IF("ib_trunc_crash_on_updating_dict_sys_info", - log_buffer_flush_to_disk(); - os_thread_sleep(2000000); - DBUG_SUICIDE();); - - /* Step-12: Cleanup Stage. Reset auto-inc value to 1. - Release all the locks. - Commit the transaction. Update trx operation state. 
*/ - dict_table_autoinc_lock(table); - dict_table_autoinc_initialize(table, 1); - dict_table_autoinc_unlock(table); - - if (trx_is_started(trx)) { - - trx_commit_for_mysql(trx); - } + mtr_commit(&mtr); - return(row_truncate_complete(table, trx, fsp_flags, logger, err)); + truncate_t::s_fix_up_active = false; +func_exit: + space->release(); + return(err); } /** @@ -2115,9 +859,7 @@ truncate_t::fixup_tables_in_system_tablespace() "residing in the system tablespace."; err = fil_recreate_table( - (*it)->m_space_id, (*it)->m_format_flags, - (*it)->m_tablespace_flags, (*it)->m_tablename, **it); @@ -2176,23 +918,22 @@ truncate_t::fixup_tables_in_non_system_tablespace() "residing in file-per-table tablespace with " "id (" << (*it)->m_space_id << ")"; - if (!fil_space_get((*it)->m_space_id)) { + fil_space_t* space = fil_space_get((*it)->m_space_id); + if (!space) { /* Create the database directory for name, if it does not exist yet */ fil_create_directory_for_tablename( (*it)->m_tablename); - err = fil_ibd_create( - (*it)->m_space_id, - (*it)->m_tablename, - (*it)->m_dir_path, - (*it)->m_tablespace_flags, - FIL_IBD_FILE_INITIAL_SIZE, - (*it)->m_encryption, - (*it)->m_key_id); - - if (err != DB_SUCCESS) { + space = fil_ibd_create((*it)->m_space_id, + (*it)->m_tablename, + (*it)->m_dir_path, + (*it)->m_tablespace_flags, + FIL_IBD_FILE_INITIAL_SIZE, + (*it)->m_encryption, + (*it)->m_key_id, &err); + if (!space) { /* If checkpoint is not yet done and table is dropped and then we might still have REDO entries for this table @@ -2206,8 +947,6 @@ truncate_t::fixup_tables_in_non_system_tablespace() } } - ut_ad(fil_space_get((*it)->m_space_id)); - err = fil_recreate_tablespace( (*it)->m_space_id, (*it)->m_format_flags, @@ -2387,7 +1126,7 @@ truncate_t::update_root_page_no( pars_info_add_ull_literal( info, "index_id", - (mark_index_corrupted ? -1 : it->m_id)); + (mark_index_corrupted ? IB_ID_MAX : it->m_id)); err = que_eval_sql( info, @@ -2683,8 +1422,7 @@ truncate_t::index_t::set( /** Create an index for a table. @param[in] table_name table name, for which to create the index -@param[in] space_id space id where we have to -create the index +@param[in] space tablespace @param[in] page_size page size of the .ibd file @param[in] index_type type of index to truncate @param[in] index_id id of index to truncate @@ -2692,18 +1430,17 @@ create the index @param[in,out] mtr mini-transaction covering the create index @return root page no or FIL_NULL on failure */ -ulint +inline ulint truncate_t::create_index( const char* table_name, - ulint space_id, - const page_size_t& page_size, + fil_space_t* space, ulint index_type, index_id_t index_id, const btr_create_t& btr_redo_create_info, mtr_t* mtr) const { ulint root_page_no = btr_create( - index_type, space_id, page_size, index_id, + index_type, space, index_id, NULL, &btr_redo_create_info, mtr); if (root_page_no == FIL_NULL) { @@ -2712,7 +1449,7 @@ truncate_t::create_index( << srv_force_recovery << ". Continuing crash recovery" " even though we failed to create index " << index_id << " for compressed table '" << table_name << "' with" - " tablespace " << space_id << " during recovery"; + " file " << space->chain.start->name; } return(root_page_no); @@ -2720,30 +1457,27 @@ truncate_t::create_index( /** Check if index has been modified since TRUNCATE log snapshot was recorded. -@param space_id space_id where table/indexes resides. -@param root_page_no root page of index that needs to be verified. 
+@param[in] space tablespace +@param[in] root_page_no index root page number @return true if modified else false */ - +inline bool truncate_t::is_index_modified_since_logged( - ulint space_id, - ulint root_page_no) const + const fil_space_t* space, + ulint root_page_no) const { - mtr_t mtr; - bool found; - const page_size_t& page_size = fil_space_get_page_size(space_id, - &found); - dberr_t err = DB_SUCCESS; - - ut_ad(found); + dberr_t err; + mtr_t mtr; mtr_start(&mtr); /* Root page could be in free state if truncate crashed after drop_index and page was not allocated for any other object. */ buf_block_t* block= buf_page_get_gen( - page_id_t(space_id, root_page_no), page_size, RW_X_LATCH, NULL, + page_id_t(space->id, root_page_no), page_size_t(space->flags), + RW_X_LATCH, NULL, BUF_GET_POSSIBLY_FREED, __FILE__, __LINE__, &mtr, &err); + if (!block) return true; page_t* root = buf_block_get_frame(block); @@ -2767,31 +1501,21 @@ truncate_t::is_index_modified_since_logged( } /** Drop indexes for a table. -@param space_id space_id where table/indexes resides. */ - -void -truncate_t::drop_indexes( - ulint space_id) const +@param[in,out] space tablespace */ +void truncate_t::drop_indexes(fil_space_t* space) const { mtr_t mtr; - ulint root_page_no = FIL_NULL; indexes_t::const_iterator end = m_indexes.end(); + const page_size_t page_size(space->flags); for (indexes_t::const_iterator it = m_indexes.begin(); it != end; ++it) { - root_page_no = it->m_root_page_no; - - bool found; - const page_size_t& page_size - = fil_space_get_page_size(space_id, &found); + ulint root_page_no = it->m_root_page_no; - ut_ad(found); - - if (is_index_modified_since_logged( - space_id, root_page_no)) { + if (is_index_modified_since_logged(space, root_page_no)) { /* Page has been modified since TRUNCATE log snapshot was recorded so not safe to drop the index. */ continue; @@ -2799,14 +1523,14 @@ truncate_t::drop_indexes( mtr_start(&mtr); - if (space_id != TRX_SYS_SPACE) { + if (space->id != TRX_SYS_SPACE) { /* Do not log changes for single-table tablespaces, we are in recovery mode. */ mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO); } if (root_page_no != FIL_NULL) { - const page_id_t root_page_id(space_id, root_page_no); + const page_id_t root_page_id(space->id, root_page_no); btr_free_if_exists( root_page_id, page_size, it->m_id, &mtr); @@ -2822,24 +1546,20 @@ truncate_t::drop_indexes( /** Create the indexes for a table @param[in] table_name table name, for which to create the indexes -@param[in] space_id space id where we have to create the indexes -@param[in] page_size page size of the .ibd file -@param[in] flags tablespace flags +@param[in,out] space tablespace @param[in] format_flags page format flags @return DB_SUCCESS or error code. */ -dberr_t +inline dberr_t truncate_t::create_indexes( const char* table_name, - ulint space_id, - const page_size_t& page_size, - ulint flags, + fil_space_t* space, ulint format_flags) { mtr_t mtr; mtr_start(&mtr); - if (space_id != TRX_SYS_SPACE) { + if (space->id != TRX_SYS_SPACE) { /* Do not log changes for single-table tablespaces, we are in recovery mode. */ mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO); @@ -2856,12 +1576,12 @@ truncate_t::create_indexes( ++it) { btr_create_t btr_redo_create_info( - FSP_FLAGS_GET_ZIP_SSIZE(flags) + FSP_FLAGS_GET_ZIP_SSIZE(space->flags) ? 
&it->m_fields[0] : NULL); btr_redo_create_info.format_flags = format_flags; - if (FSP_FLAGS_GET_ZIP_SSIZE(flags)) { + if (FSP_FLAGS_GET_ZIP_SSIZE(space->flags)) { btr_redo_create_info.n_fields = it->m_n_fields; /* Skip the NUL appended field */ @@ -2871,7 +1591,7 @@ truncate_t::create_indexes( } root_page_no = create_index( - table_name, space_id, page_size, it->m_type, it->m_id, + table_name, space, it->m_type, it->m_id, btr_redo_create_info, &mtr); if (root_page_no == FIL_NULL) { diff --git a/storage/innobase/row/row0uins.cc b/storage/innobase/row/row0uins.cc index 8c944189de6..304ec71a63a 100644 --- a/storage/innobase/row/row0uins.cc +++ b/storage/innobase/row/row0uins.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1997, 2017, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2020, MariaDB Corporation. +Copyright (c) 2017, 2021, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -78,9 +78,11 @@ row_undo_ins_remove_clust_rec( mtr.start(); if (index->table->is_temporary()) { + ut_ad(node->rec_type == TRX_UNDO_INSERT_REC); mtr.set_log_mode(MTR_LOG_NO_REDO); } else { - mtr.set_named_space(index->space); + index->set_modified(mtr); + ut_ad(lock_table_has_locks(index->table)); } /* This is similar to row_undo_mod_clust(). The DDL thread may @@ -94,7 +96,7 @@ row_undo_ins_remove_clust_rec( ut_ad(node->trx->dict_operation_lock_mode != RW_X_LATCH); ut_ad(node->table->id != DICT_INDEXES_ID); - mtr_s_lock(dict_index_get_lock(index), &mtr); + mtr_s_lock_index(index, &mtr); } success = btr_pcur_restore_position( @@ -115,15 +117,17 @@ row_undo_ins_remove_clust_rec( const rec_t* rec = btr_cur_get_rec(btr_cur); mem_heap_t* heap = NULL; const rec_offs* offsets = rec_get_offsets( - rec, index, NULL, true, ULINT_UNDEFINED, &heap); + rec, index, NULL, index->n_core_fields, + ULINT_UNDEFINED, &heap); row_log_table_delete(rec, index, offsets, NULL); mem_heap_free(heap); } - if (node->table->id == DICT_INDEXES_ID) { - + switch (node->table->id) { + case DICT_INDEXES_ID: ut_ad(!online); ut_ad(node->trx->dict_operation_lock_mode == RW_X_LATCH); + ut_ad(node->rec_type == TRX_UNDO_INSERT_REC); dict_drop_index_tree( btr_pcur_get_rec(&node->pcur), &node->pcur, node->trx, @@ -136,6 +140,54 @@ row_undo_ins_remove_clust_rec( success = btr_pcur_restore_position( BTR_MODIFY_LEAF, &node->pcur, &mtr); ut_a(success); + break; + case DICT_COLUMNS_ID: + /* This is rolling back an INSERT into SYS_COLUMNS. + If it was part of an instant ADD COLUMN operation, we + must modify the table definition. At this point, any + corresponding operation to the metadata record will have + been rolled back. 
*/ + ut_ad(!online); + ut_ad(node->trx->dict_operation_lock_mode == RW_X_LATCH); + ut_ad(node->rec_type == TRX_UNDO_INSERT_REC); + const rec_t* rec = btr_pcur_get_rec(&node->pcur); + if (rec_get_n_fields_old(rec) + != DICT_NUM_FIELDS__SYS_COLUMNS) { + break; + } + ulint len; + const byte* data = rec_get_nth_field_old( + rec, DICT_FLD__SYS_COLUMNS__TABLE_ID, &len); + if (len != 8) { + break; + } + const table_id_t table_id = mach_read_from_8(data); + data = rec_get_nth_field_old(rec, DICT_FLD__SYS_COLUMNS__POS, + &len); + if (len != 4) { + break; + } + const unsigned pos = mach_read_from_4(data); + if (pos == 0 || pos >= (1U << 16)) { + break; + } + dict_table_t* table = dict_table_open_on_id( + table_id, true, DICT_TABLE_OP_OPEN_ONLY_IF_CACHED); + if (!table) { + break; + } + + dict_index_t* index = dict_table_get_first_index(table); + + if (index && index->is_instant() + && DATA_N_SYS_COLS + 1 + pos == table->n_cols) { + /* This is the rollback of an instant ADD COLUMN. + Remove the column from the dictionary cache, + but keep the system columns. */ + table->rollback_instant(pos); + } + + dict_table_close(table, true, false); } if (btr_cur_optimistic_delete(btr_cur, 0, &mtr)) { @@ -150,7 +202,7 @@ retry: if (index->table->is_temporary()) { mtr.set_log_mode(MTR_LOG_NO_REDO); } else { - mtr.set_named_space(index->space); + index->set_modified(mtr); } success = btr_pcur_restore_position( @@ -178,6 +230,27 @@ retry: func_exit: btr_pcur_commit_specify_mtr(&node->pcur, &mtr); + if (err == DB_SUCCESS && node->rec_type == TRX_UNDO_INSERT_METADATA) { + /* When rolling back the very first instant ADD COLUMN + operation, reset the root page to the basic state. */ + ut_ad(!index->table->is_temporary()); + mtr.start(); + if (page_t* root = btr_root_get(index, &mtr)) { + byte* page_type = root + FIL_PAGE_TYPE; + ut_ad(mach_read_from_2(page_type) + == FIL_PAGE_TYPE_INSTANT + || mach_read_from_2(page_type) + == FIL_PAGE_INDEX); + index->set_modified(mtr); + mlog_write_ulint(page_type, FIL_PAGE_INDEX, + MLOG_2BYTES, &mtr); + byte* instant = PAGE_INSTANT + PAGE_HEADER + root; + mlog_write_ulint(instant, + page_ptr_get_direction(instant + 1), + MLOG_2BYTES, &mtr); + } + mtr.commit(); + } return(err); } @@ -205,10 +278,10 @@ row_undo_ins_remove_sec_low( if (modify_leaf) { mode = BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED; - mtr_s_lock(dict_index_get_lock(index), &mtr); + mtr_s_lock_index(index, &mtr); } else { ut_ad(mode == (BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE)); - mtr_sx_lock(dict_index_get_lock(index), &mtr); + mtr_sx_lock_index(index, &mtr); } if (row_log_online_op_try(index, entry, 0)) { @@ -344,15 +417,16 @@ row_undo_ins_parse_undo_rec( default: ut_ad(!"wrong undo record type"); goto close_table; + case TRX_UNDO_INSERT_METADATA: case TRX_UNDO_INSERT_REC: break; case TRX_UNDO_RENAME_TABLE: dict_table_t* table = node->table; ut_ad(!table->is_temporary()); ut_ad(dict_table_is_file_per_table(table) - == (table->space != TRX_SYS_SPACE)); + == !is_system_tablespace(table->space_id)); size_t len = mach_read_from_2(node->undo_rec) - + node->undo_rec - ptr - 2; + + size_t(node->undo_rec - ptr) - 2; ptr[len] = 0; const char* name = reinterpret_cast<char*>(ptr); if (strcmp(table->name.m_name, name)) { @@ -380,8 +454,20 @@ close_table: clust_index = dict_table_get_first_index(node->table); if (clust_index != NULL) { - ptr = trx_undo_rec_get_row_ref( - ptr, clust_index, &node->ref, node->heap); + if (node->rec_type == TRX_UNDO_INSERT_REC) { + ptr = trx_undo_rec_get_row_ref( + ptr, clust_index, &node->ref, + 
node->heap); + } else { + node->ref = &trx_undo_metadata; + if (!row_undo_search_clust_to_pcur(node)) { + /* An error probably occurred during + an insert into the clustered index, + after we wrote the undo log record. */ + goto close_table; + } + return; + } if (!row_undo_search_clust_to_pcur(node)) { /* An error probably occurred during @@ -495,18 +581,29 @@ row_undo_ins( node->index = dict_table_get_first_index(node->table); ut_ad(dict_index_is_clust(node->index)); - /* Skip the clustered index (the first index) */ - node->index = dict_table_get_next_index(node->index); - dict_table_skip_corrupt_index(node->index); + switch (node->rec_type) { + default: + ut_ad(!"wrong undo record type"); + /* fall through */ + case TRX_UNDO_INSERT_REC: + /* Skip the clustered index (the first index) */ + node->index = dict_table_get_next_index(node->index); + + dict_table_skip_corrupt_index(node->index); - err = row_undo_ins_remove_sec_rec(node, thr); + err = row_undo_ins_remove_sec_rec(node, thr); - if (err == DB_SUCCESS) { + if (err != DB_SUCCESS) { + break; + } + /* fall through */ + case TRX_UNDO_INSERT_METADATA: log_free_check(); if (node->table->id == DICT_INDEXES_ID) { + ut_ad(node->rec_type == TRX_UNDO_INSERT_REC); if (!dict_locked) { mutex_enter(&dict_sys->mutex); diff --git a/storage/innobase/row/row0umod.cc b/storage/innobase/row/row0umod.cc index d207aa5b9bc..88e87e2f9bc 100644 --- a/storage/innobase/row/row0umod.cc +++ b/storage/innobase/row/row0umod.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1997, 2017, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2020, MariaDB Corporation. +Copyright (c) 2017, 2021, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -30,6 +30,7 @@ Created 2/27/1997 Heikki Tuuri #include "dict0boot.h" #include "trx0undo.h" #include "trx0roll.h" +#include "trx0purge.h" #include "btr0btr.h" #include "mach0data.h" #include "ibuf0ibuf.h" @@ -109,7 +110,8 @@ row_undo_mod_clust_low( ut_ad(success); ut_ad(rec_get_trx_id(btr_cur_get_rec(btr_cur), btr_cur_get_index(btr_cur)) - == thr_get_trx(thr)->id || node->index->table->is_temporary()); + == thr_get_trx(thr)->id + || btr_cur_get_index(btr_cur)->table->is_temporary()); if (mode != BTR_MODIFY_LEAF && dict_index_is_online_ddl(btr_cur_get_index(btr_cur))) { @@ -121,7 +123,8 @@ row_undo_mod_clust_low( } if (mode != BTR_MODIFY_TREE) { - ut_ad((mode & ~BTR_ALREADY_S_LATCHED) == BTR_MODIFY_LEAF); + ut_ad((mode & ulint(~BTR_ALREADY_S_LATCHED)) + == BTR_MODIFY_LEAF); err = btr_cur_optimistic_update( BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG @@ -146,104 +149,58 @@ row_undo_mod_clust_low( return(err); } -/***********************************************************//** -Purges a clustered index record after undo if possible. -This is attempted when the record was inserted by updating a -delete-marked record and there no longer exist transactions -that would see the delete-marked record. 
-@return DB_SUCCESS, DB_FAIL, or error code: we may run out of file space */ -static MY_ATTRIBUTE((nonnull, warn_unused_result)) -dberr_t -row_undo_mod_remove_clust_low( -/*==========================*/ - undo_node_t* node, /*!< in: row undo node */ - mtr_t* mtr, /*!< in/out: mini-transaction */ - ulint mode) /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */ +/** Get the byte offset of the DB_TRX_ID column +@param[in] rec clustered index record +@param[in] index clustered index +@return the byte offset of DB_TRX_ID, from the start of rec */ +static ulint row_trx_id_offset(const rec_t* rec, const dict_index_t* index) { - btr_cur_t* btr_cur; - dberr_t err; - ulint trx_id_offset; - - ut_ad(node->rec_type == TRX_UNDO_UPD_DEL_REC); - - /* Find out if the record has been purged already - or if we can remove it. */ - - if (!btr_pcur_restore_position(mode, &node->pcur, mtr)) { - return DB_SUCCESS; - } - - DEBUG_SYNC_C("rollback_purge_clust"); - - if (row_vers_must_preserve_del_marked(node->new_trx_id, - node->table->name, mtr)) { - return DB_SUCCESS; - } - - btr_cur = btr_pcur_get_btr_cur(&node->pcur); - - trx_id_offset = btr_cur_get_index(btr_cur)->trx_id_offset; - + ut_ad(index->n_uniq <= MAX_REF_PARTS); + ulint trx_id_offset = index->trx_id_offset; if (!trx_id_offset) { - mem_heap_t* heap = NULL; - ulint trx_id_col; - const rec_offs* offsets; - ulint len; - - trx_id_col = dict_index_get_sys_col_pos( - btr_cur_get_index(btr_cur), DATA_TRX_ID); - ut_ad(trx_id_col > 0); - ut_ad(trx_id_col != ULINT_UNDEFINED); - - offsets = rec_get_offsets( - btr_cur_get_rec(btr_cur), btr_cur_get_index(btr_cur), - NULL, true, trx_id_col + 1, &heap); - + /* Reserve enough offsets for the PRIMARY KEY and 2 columns + so that we can access DB_TRX_ID, DB_ROLL_PTR. */ + rec_offs offsets_[REC_OFFS_HEADER_SIZE + MAX_REF_PARTS + 2]; + rec_offs_init(offsets_); + mem_heap_t* heap = NULL; + const ulint trx_id_pos = index->n_uniq ? index->n_uniq : 1; + rec_offs* offsets = rec_get_offsets(rec, index, offsets_, + index->n_core_fields, + trx_id_pos + 1, &heap); + ut_ad(!heap); + ulint len; trx_id_offset = rec_get_nth_field_offs( - offsets, trx_id_col, &len); + offsets, trx_id_pos, &len); ut_ad(len == DATA_TRX_ID_LEN); - mem_heap_free(heap); } - if (trx_read_trx_id(btr_cur_get_rec(btr_cur) + trx_id_offset) - != node->new_trx_id) { - /* The record must have been purged and then replaced - with a different one. */ - return(DB_SUCCESS); - } + return trx_id_offset; +} - /* We are about to remove an old, delete-marked version of the - record that may have been delete-marked by a different transaction - than the rolling-back one. */ - ut_ad(rec_get_deleted_flag(btr_cur_get_rec(btr_cur), - dict_table_is_comp(node->table))); - /* In delete-marked records, DB_TRX_ID must - always refer to an existing update_undo log record. */ - ut_ad(rec_get_trx_id(btr_cur_get_rec(btr_cur), btr_cur->index)); - - if (mode == BTR_MODIFY_LEAF) { - err = btr_cur_optimistic_delete(btr_cur, 0, mtr) - ? DB_SUCCESS - : DB_FAIL; - } else { - ut_ad(mode == (BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE)); +/** Determine if rollback must execute a purge-like operation. +@param[in,out] node row undo +@param[in,out] mtr mini-transaction +@return whether the record should be purged */ +static bool row_undo_mod_must_purge(undo_node_t* node, mtr_t* mtr) +{ + ut_ad(node->rec_type == TRX_UNDO_UPD_DEL_REC); + ut_ad(!node->table->is_temporary()); - /* This operation is analogous to purge, we can free also - inherited externally stored fields. 
- We can also assume that the record was complete - (including BLOBs), because it had been delete-marked - after it had been completely inserted. Therefore, we - are passing rollback=false, just like purge does. */ + btr_cur_t* btr_cur = btr_pcur_get_btr_cur(&node->pcur); + ut_ad(btr_cur->index->is_primary()); + DEBUG_SYNC_C("rollback_purge_clust"); - btr_cur_pessimistic_delete(&err, FALSE, btr_cur, 0, - false, mtr); + mtr->s_lock(&purge_sys.latch, __FILE__, __LINE__); - /* The delete operation may fail if we have little - file space left: TODO: easiest to crash the database - and restart with more file space */ + if (!purge_sys.view.changes_visible(node->new_trx_id, + node->table->name)) { + return false; } - return(err); + const rec_t* rec = btr_cur_get_rec(btr_cur); + + return trx_read_trx_id(rec + row_trx_id_offset(rec, btr_cur->index)) + == node->new_trx_id; } /***********************************************************//** @@ -272,18 +229,20 @@ row_undo_mod_clust( log_free_check(); pcur = &node->pcur; index = btr_cur_get_index(btr_pcur_get_btr_cur(pcur)); + ut_ad(index->is_primary()); mtr.start(); if (index->table->is_temporary()) { mtr.set_log_mode(MTR_LOG_NO_REDO); } else { - mtr.set_named_space(index->space); + index->set_modified(mtr); + ut_ad(lock_table_has_locks(index->table)); } online = dict_index_is_online_ddl(index); if (online) { ut_ad(node->trx->dict_operation_lock_mode != RW_X_LATCH); - mtr_s_lock(dict_index_get_lock(index), &mtr); + mtr_s_lock_index(index, &mtr); } mem_heap_t* heap = mem_heap_create(1024); @@ -311,7 +270,7 @@ row_undo_mod_clust( if (index->table->is_temporary()) { mtr.set_log_mode(MTR_LOG_NO_REDO); } else { - mtr.set_named_space(index->space); + index->set_modified(mtr); } err = row_undo_mod_clust_low( @@ -366,44 +325,123 @@ row_undo_mod_clust( btr_pcur_commit_specify_mtr(pcur, &mtr); DEBUG_SYNC_C("rollback_undo_pk"); - if (err == DB_SUCCESS && node->rec_type == TRX_UNDO_UPD_DEL_REC) { + if (err != DB_SUCCESS) { + goto func_exit; + } + + /* FIXME: Perform the below operations in the above + mini-transaction when possible. */ + + if (node->rec_type == TRX_UNDO_UPD_DEL_REC) { + /* In delete-marked records, DB_TRX_ID must + always refer to an existing update_undo log record. */ + ut_ad(node->new_trx_id); mtr.start(); + if (!btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, &mtr)) { + goto mtr_commit_exit; + } + if (index->table->is_temporary()) { mtr.set_log_mode(MTR_LOG_NO_REDO); } else { - mtr.set_named_space(index->space); + if (!row_undo_mod_must_purge(node, &mtr)) { + goto mtr_commit_exit; + } + index->set_modified(mtr); } - /* It is not necessary to call row_log_table, - because the record is delete-marked and would thus - be omitted from the rebuilt copy of the table. 
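row_undo_mod_must_purge() above reduces to two conditions: no active read view can still see new_trx_id, and the clustered index record still carries exactly that DB_TRX_ID at the offset computed by row_trx_id_offset(). DB_TRX_ID is a 6-byte big-endian value (DATA_TRX_ID_LEN). A minimal standalone model of the second half, with hypothetical names; the purge_sys.view visibility test is not reproduced here:

#include <cstdint>

/* Read the 6-byte big-endian DB_TRX_ID, as trx_read_trx_id() does. */
static uint64_t read_trx_id_6(const unsigned char* p)
{
	uint64_t id = 0;
	for (int i = 0; i < 6; i++) {
		id = id << 8 | p[i];
	}
	return id;
}

/* The record may only be removed the way purge would if it still carries
the transaction id whose change we are rolling back; otherwise purge or a
later update already replaced it. */
static bool rec_still_carries(const unsigned char* rec,
			      unsigned long trx_id_offset,
			      uint64_t new_trx_id)
{
	return read_trx_id_6(rec + trx_id_offset) == new_trx_id;
}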
*/ - err = row_undo_mod_remove_clust_low( - node, &mtr, BTR_MODIFY_LEAF); - if (err != DB_SUCCESS) { - btr_pcur_commit_specify_mtr(pcur, &mtr); + ut_ad(rec_get_deleted_flag(btr_pcur_get_rec(pcur), + dict_table_is_comp(node->table))); + if (btr_cur_optimistic_delete(&pcur->btr_cur, 0, &mtr)) { + goto mtr_commit_exit; + } - /* We may have to modify tree structure: do a - pessimistic descent down the index tree */ + btr_pcur_commit_specify_mtr(pcur, &mtr); - mtr.start(); - if (index->table->is_temporary()) { - mtr.set_log_mode(MTR_LOG_NO_REDO); - } else { - mtr.set_named_space(index->space); + mtr.start(); + if (!btr_pcur_restore_position( + BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE, + pcur, &mtr)) { + goto mtr_commit_exit; + } + + if (index->table->is_temporary()) { + mtr.set_log_mode(MTR_LOG_NO_REDO); + } else { + if (!row_undo_mod_must_purge(node, &mtr)) { + goto mtr_commit_exit; } + index->set_modified(mtr); + } - err = row_undo_mod_remove_clust_low( - node, &mtr, - BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE); + ut_ad(rec_get_deleted_flag(btr_pcur_get_rec(pcur), + dict_table_is_comp(node->table))); + + /* This operation is analogous to purge, we can free + also inherited externally stored fields. We can also + assume that the record was complete (including BLOBs), + because it had been delete-marked after it had been + completely inserted. Therefore, we are passing + rollback=false, just like purge does. */ + btr_cur_pessimistic_delete(&err, FALSE, &pcur->btr_cur, 0, + false, &mtr); + ut_ad(err == DB_SUCCESS + || err == DB_OUT_OF_FILE_SPACE); + } else if (!index->table->is_temporary() && node->new_trx_id) { + /* We rolled back a record so that it still exists. + We must reset the DB_TRX_ID if the history is no + longer accessible by any active read view. */ - ut_ad(err == DB_SUCCESS - || err == DB_OUT_OF_FILE_SPACE); + mtr.start(); + if (!btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, &mtr)) { + goto mtr_commit_exit; + } + rec_t* rec = btr_pcur_get_rec(pcur); + mtr.s_lock(&purge_sys.latch, __FILE__, __LINE__); + if (!purge_sys.view.changes_visible(node->new_trx_id, + node->table->name)) { + goto mtr_commit_exit; } - btr_pcur_commit_specify_mtr(pcur, &mtr); + ulint trx_id_pos = index->n_uniq ? index->n_uniq : 1; + ut_ad(index->n_uniq <= MAX_REF_PARTS); + /* Reserve enough offsets for the PRIMARY KEY and 2 columns + so that we can access DB_TRX_ID, DB_ROLL_PTR. */ + rec_offs offsets_[REC_OFFS_HEADER_SIZE + MAX_REF_PARTS + 2]; + rec_offs_init(offsets_); + offsets = rec_get_offsets( + rec, index, offsets_, index->n_core_fields, + trx_id_pos + 2, &heap); + ulint len; + ulint trx_id_offset = rec_get_nth_field_offs( + offsets, trx_id_pos, &len); + ut_ad(len == DATA_TRX_ID_LEN); + + if (trx_read_trx_id(rec + trx_id_offset) == node->new_trx_id) { + ut_ad(!rec_get_deleted_flag( + rec, dict_table_is_comp(node->table))); + index->set_modified(mtr); + if (page_zip_des_t* page_zip = buf_block_get_page_zip( + btr_pcur_get_block(&node->pcur))) { + page_zip_write_trx_id_and_roll_ptr( + page_zip, rec, offsets, trx_id_pos, + 0, 1ULL << ROLL_PTR_INSERT_FLAG_POS, + &mtr); + } else { + mlog_write_string(rec + trx_id_offset, + reset_trx_id, + sizeof reset_trx_id, &mtr); + } + } + } else { + goto func_exit; } +mtr_commit_exit: + btr_pcur_commit_specify_mtr(pcur, &mtr); + +func_exit: node->state = UNDO_NODE_FETCH_NEXT; if (offsets_heap) { @@ -444,10 +482,10 @@ row_undo_mod_del_mark_or_remove_sec_low( is protected by index->lock. 
*/ if (modify_leaf) { mode = BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED; - mtr_s_lock(dict_index_get_lock(index), &mtr); + mtr_s_lock_index(index, &mtr); } else { ut_ad(mode == (BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE)); - mtr_sx_lock(dict_index_get_lock(index), &mtr); + mtr_sx_lock_index(index, &mtr); } if (row_log_online_op_try(index, entry, 0)) { @@ -506,12 +544,11 @@ row_undo_mod_del_mark_or_remove_sec_low( ut_a(success); /* For temporary table, we can skip to check older version of - clustered index entry. Because the purge won't process - any no-redo rollback segment undo logs. */ - if (dict_table_is_temporary(node->table) + clustered index entry, because there is no MVCC or purge. */ + if (node->table->is_temporary() || row_vers_old_has_index_entry( - false, btr_pcur_get_rec(&(node->pcur)), - &mtr_vers, index, entry, 0, 0)) { + false, btr_pcur_get_rec(&node->pcur), + &mtr_vers, index, entry, 0, 0)) { err = btr_cur_del_mark_set_sec_rec(BTR_NO_LOCKING_FLAG, btr_cur, TRUE, thr, &mtr); ut_ad(err == DB_SUCCESS); @@ -530,18 +567,14 @@ row_undo_mod_del_mark_or_remove_sec_low( } if (modify_leaf) { - success = btr_cur_optimistic_delete(btr_cur, 0, &mtr); - if (success) { - err = DB_SUCCESS; - } else { - err = DB_FAIL; - } + err = btr_cur_optimistic_delete(btr_cur, 0, &mtr) + ? DB_SUCCESS : DB_FAIL; } else { /* Passing rollback=false, because we are deleting a secondary index record: the distinction only matters when deleting a record that contains externally stored columns. */ - ut_ad(!dict_index_is_clust(index)); + ut_ad(!index->is_primary()); btr_cur_pessimistic_delete(&err, FALSE, btr_cur, 0, false, &mtr); @@ -645,10 +678,10 @@ try_again: is protected by index->lock. */ if (mode == BTR_MODIFY_LEAF) { mode = BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED; - mtr_s_lock(dict_index_get_lock(index), &mtr); + mtr_s_lock_index(index, &mtr); } else { ut_ad(mode == BTR_MODIFY_TREE); - mtr_sx_lock(dict_index_get_lock(index), &mtr); + mtr_sx_lock_index(index, &mtr); } if (row_log_online_op_try(index, entry, trx->id)) { @@ -764,7 +797,8 @@ try_again: offsets_heap = NULL; offsets = rec_get_offsets( btr_cur_get_rec(btr_cur), - index, NULL, true, ULINT_UNDEFINED, &offsets_heap); + index, NULL, index->n_core_fields, ULINT_UNDEFINED, + &offsets_heap); update = row_upd_build_sec_rec_difference_binary( btr_cur_get_rec(btr_cur), index, offsets, entry, heap); if (upd_get_n_fields(update) == 0) { @@ -865,8 +899,8 @@ row_undo_mod_upd_del_sec( } /* During online index creation, - HA_ALTER_INPLACE_NO_LOCK_AFTER_PREPARE should - guarantee that any active transaction has not modified + HA_ALTER_INPLACE_COPY_NO_LOCK or HA_ALTER_INPLACE_NOCOPY_NO_LOCk + should guarantee that any active transaction has not modified indexed columns such that col->ord_part was 0 at the time when the undo log record was written. When we get to roll back an undo log entry TRX_UNDO_DEL_MARK_REC, @@ -931,8 +965,8 @@ row_undo_mod_del_mark_sec( } /* During online index creation, - HA_ALTER_INPLACE_NO_LOCK_AFTER_PREPARE should - guarantee that any active transaction has not modified + HA_ALTER_INPLACE_COPY_NO_LOCK or HA_ALTER_INPLACE_NOCOPY_NO_LOCK + should guarantee that any active transaction has not modified indexed columns such that col->ord_part was 0 at the time when the undo log record was written. When we get to roll back an undo log entry TRX_UNDO_DEL_MARK_REC, @@ -1034,8 +1068,7 @@ row_undo_mod_upd_exist_sec( format. REDUNDANT and COMPACT formats store a local 768-byte prefix of each externally stored column. 
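The hunks above keep InnoDB's usual two-phase delete: try the cheap leaf-page btr_cur_optimistic_delete() first and, only if it fails, reposition under a tree latch and run btr_cur_pessimistic_delete(), as the clustered-index hunk further up does explicitly. The shape of that fallback, as a standalone sketch with hypothetical callables:

#include <functional>

enum class toy_err { success, fail, out_of_space };

/* Try the optimistic (single leaf page) path first; if the change does not
fit without reorganizing the tree, redo the positioning and run the
pessimistic path. If the reposition fails, the record is already gone
(for example, removed by purge) and there is nothing left to do. */
static toy_err delete_with_fallback(
	const std::function<bool()>& optimistic_delete,
	const std::function<bool()>& restore_position,
	const std::function<toy_err()>& pessimistic_delete)
{
	if (optimistic_delete()) {
		return toy_err::success;
	}
	if (!restore_position()) {
		return toy_err::success;
	}
	return pessimistic_delete();
}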
*/ - ut_a(dict_table_get_format(index->table) - >= UNIV_FORMAT_B); + ut_a(dict_table_has_atomic_blobs(index->table)); /* This is only legitimate when rolling back an incomplete transaction @@ -1179,6 +1212,20 @@ close_table: node->heap, &(node->update)); node->new_trx_id = trx_id; node->cmpl_info = cmpl_info; + ut_ad(!node->ref->info_bits); + + if (node->update->info_bits & REC_INFO_MIN_REC_FLAG) { + /* This must be an undo log record for a subsequent + instant ALTER TABLE, extending the metadata record. */ + ut_ad(clust_index->is_instant()); + if (node->update->info_bits != REC_INFO_MIN_REC_FLAG) { + ut_ad(!"wrong info_bits in undo log record"); + goto close_table; + } + node->update->info_bits = REC_INFO_METADATA; + const_cast<dtuple_t*>(node->ref)->info_bits + = REC_INFO_METADATA; + } if (!row_undo_search_clust_to_pcur(node)) { /* As long as this rolling-back transaction exists, @@ -1252,6 +1299,12 @@ row_undo_mod( node->index = dict_table_get_first_index(node->table); ut_ad(dict_index_is_clust(node->index)); + + if (node->ref->info_bits) { + ut_ad(node->ref->info_bits == REC_INFO_METADATA); + goto rollback_clust; + } + /* Skip the clustered index (the first index) */ node->index = dict_table_get_next_index(node->index); @@ -1274,6 +1327,7 @@ row_undo_mod( } if (err == DB_SUCCESS) { +rollback_clust: err = row_undo_mod_clust(node, thr); bool update_statistics diff --git a/storage/innobase/row/row0undo.cc b/storage/innobase/row/row0undo.cc index e1978cfb297..b8be086e875 100644 --- a/storage/innobase/row/row0undo.cc +++ b/storage/innobase/row/row0undo.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2020, MariaDB Corporation. +Copyright (c) 2017, 2021, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -40,6 +40,7 @@ Created 1/8/1997 Heikki Tuuri #include "row0upd.h" #include "row0mysql.h" #include "srv0srv.h" +#include "srv0start.h" /* How to undo row operations? (1) For an insert, we have stored a prefix of the clustered index record @@ -186,7 +187,8 @@ row_undo_search_clust_to_pcur( rec = btr_pcur_get_rec(&node->pcur); - offsets = rec_get_offsets(rec, clust_index, offsets, true, + offsets = rec_get_offsets(rec, clust_index, offsets, + clust_index->n_core_fields, ULINT_UNDEFINED, &heap); found = row_get_rec_roll_ptr(rec, clust_index, offsets) @@ -196,11 +198,10 @@ row_undo_search_clust_to_pcur( ut_ad(row_get_rec_trx_id(rec, clust_index, offsets) == node->trx->id || node->table->is_temporary()); - if (dict_table_get_format(node->table) >= UNIV_FORMAT_B) { - /* In DYNAMIC or COMPRESSED format, there is - no prefix of externally stored columns in the - clustered index record. Build a cache of - column prefixes. */ + if (dict_table_has_atomic_blobs(node->table)) { + /* There is no prefix of externally stored + columns in the clustered index record. Build a + cache of column prefixes. 
*/ ext = &node->ext; } else { /* REDUNDANT and COMPACT formats store a local @@ -228,10 +229,14 @@ row_undo_search_clust_to_pcur( } if (node->rec_type == TRX_UNDO_UPD_EXIST_REC) { + ut_ad(node->row->info_bits == REC_INFO_MIN_REC_FLAG + || node->row->info_bits == 0); node->undo_row = dtuple_copy(node->row, node->heap); row_upd_replace(node->undo_row, &node->undo_ext, clust_index, node->update, node->heap); } else { + ut_ad((node->row->info_bits == REC_INFO_MIN_REC_FLAG) + == (node->rec_type == TRX_UNDO_INSERT_METADATA)); node->undo_row = NULL; node->undo_ext = NULL; } @@ -341,11 +346,17 @@ row_undo_step( ut_ad(que_node_get_type(node) == QUE_NODE_UNDO); - if (UNIV_UNLIKELY(trx == trx_roll_crash_recv_trx) - && trx_roll_must_shutdown()) { + if (UNIV_UNLIKELY(trx_get_dict_operation(trx) == TRX_DICT_OP_NONE + && !srv_undo_sources + && srv_shutdown_state != SRV_SHUTDOWN_NONE) + && (srv_fast_shutdown == 3 || trx == trx_roll_crash_recv_trx)) { /* Shutdown has been initiated. */ trx->error_state = DB_INTERRUPTED; - return(NULL); + return NULL; + } + + if (UNIV_UNLIKELY(trx == trx_roll_crash_recv_trx)) { + trx_roll_report_progress(); } err = row_undo(node, thr); diff --git a/storage/innobase/row/row0upd.cc b/storage/innobase/row/row0upd.cc index 429282906df..600034c8ba3 100644 --- a/storage/innobase/row/row0upd.cc +++ b/storage/innobase/row/row0upd.cc @@ -281,52 +281,20 @@ row_upd_check_references_constraints( FALSE, FALSE, DICT_ERR_IGNORE_NONE); } - /* dict_operation_lock is held both here - (UPDATE or DELETE with FOREIGN KEY) and by TRUNCATE - TABLE operations. - If a TRUNCATE TABLE operation is in progress, - there can be 2 possible conditions: - 1) row_truncate_table_for_mysql() is not yet called. - 2) Truncate releases dict_operation_lock - during eviction of pages from buffer pool - for a file-per-table tablespace. - - In case of (1), truncate will wait for FK operation - to complete. - In case of (2), truncate will be rolled forward even - if it is interrupted. So if the foreign table is - undergoing a truncate, ignore the FK check. */ - if (foreign_table) { - mutex_enter(&fil_system->mutex); - const fil_space_t* space = fil_space_get_by_id( - foreign_table->space); - const bool being_truncated = space - && space->is_being_truncated; - mutex_exit(&fil_system->mutex); - if (being_truncated) { - continue; - } + foreign_table->inc_fk_checks(); } /* NOTE that if the thread ends up waiting for a lock we will release dict_operation_lock temporarily! - But the counter on the table protects 'foreign' from + But the inc_fk_checks() protects foreign_table from being dropped while the check is running. 
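The inc_fk_checks()/dec_fk_checks() pair above is a plain reference count: DROP must not proceed while any foreign key check still holds a reference, even if the check releases dict_operation_lock while waiting for a row lock. A small standalone model of the idea (hypothetical names, not the dict_table_t API), using RAII so the decrement cannot be missed on an error path:

#include <atomic>
#include <cstdint>

struct toy_table {
	std::atomic<uint32_t>	fk_checks{0};

	/* A dropper may proceed only while no FK check is in flight. */
	bool can_be_dropped() const { return fk_checks.load() == 0; }
};

/* Scope guard mirroring inc_fk_checks()/dec_fk_checks(). */
class fk_check_guard {
	toy_table&	m_table;
public:
	explicit fk_check_guard(toy_table& t) : m_table(t)
	{ m_table.fk_checks.fetch_add(1); }
	~fk_check_guard()
	{ m_table.fk_checks.fetch_sub(1); }
};

/* While the guard lives, the referenced table cannot be dropped,
even if the lookup below ends up waiting for a lock. */
static bool check_foreign_constraint(toy_table& referenced)
{
	fk_check_guard guard(referenced);
	/* ... perform the actual lookup in the referenced index ... */
	return true;
}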
*/ - if (foreign_table) { - my_atomic_addlint( - &foreign_table->n_foreign_key_checks_running, - 1); - } - err = row_ins_check_foreign_constraint( FALSE, foreign, table, entry, thr); if (foreign_table) { - my_atomic_addlint( - &foreign_table->n_foreign_key_checks_running, - -1); + foreign_table->dec_fk_checks(); } if (ref_table != NULL) { dict_table_close(ref_table, FALSE, FALSE); @@ -516,9 +484,7 @@ row_upd_rec_sys_fields_in_recovery( field = rec_get_nth_field(rec, offsets, pos, &len); ut_ad(len == DATA_TRX_ID_LEN); -#if DATA_TRX_ID + 1 != DATA_ROLL_PTR -# error "DATA_TRX_ID + 1 != DATA_ROLL_PTR" -#endif + compile_time_assert(DATA_TRX_ID + 1 == DATA_ROLL_PTR); trx_write_trx_id(field, trx_id); trx_write_roll_ptr(field + DATA_TRX_ID_LEN, roll_ptr); } @@ -591,7 +557,11 @@ row_upd_changes_field_size_or_external( } new_val = &(upd_field->new_val); + if (dfield_is_ext(new_val)) { + return(TRUE); + } new_len = dfield_get_len(new_val); + ut_ad(new_len != UNIV_SQL_DEFAULT); if (dfield_is_null(new_val) && !rec_offs_comp(offsets)) { /* A bug fixed on Dec 31st, 2004: we looked at the @@ -605,11 +575,14 @@ row_upd_changes_field_size_or_external( 0); } - old_len = rec_offs_nth_size(offsets, upd_field->field_no); + if (rec_offs_nth_default(offsets, upd_field->field_no)) { + /* This is an instantly added column that is + at the initial default value. */ + return(TRUE); + } if (rec_offs_comp(offsets) - && rec_offs_nth_sql_null(offsets, - upd_field->field_no)) { + && rec_offs_nth_sql_null(offsets, upd_field->field_no)) { /* Note that in the compact table format, for a variable length field, an SQL NULL will use zero bytes in the offset array at the start of the physical @@ -618,9 +591,12 @@ row_upd_changes_field_size_or_external( if we update an SQL NULL varchar to an empty string! */ old_len = UNIV_SQL_NULL; + } else { + old_len = rec_offs_nth_size(offsets, + upd_field->field_no); } - if (dfield_is_ext(new_val) || old_len != new_len + if (old_len != new_len || rec_offs_nth_extern(offsets, upd_field->field_no)) { return(TRUE); @@ -695,6 +671,30 @@ row_upd_rec_in_place( ut_ad(!index->table->skip_alter_undo); if (rec_offs_comp(offsets)) { +#ifdef UNIV_DEBUG + switch (rec_get_status(rec)) { + case REC_STATUS_ORDINARY: + break; + case REC_STATUS_COLUMNS_ADDED: + ut_ad(index->is_instant()); + break; + case REC_STATUS_NODE_PTR: + if (index->is_dummy + && fil_page_get_type(page_align(rec)) + == FIL_PAGE_RTREE) { + /* The function rtr_update_mbr_field_in_place() + is generating MLOG_COMP_REC_UPDATE_IN_PLACE + and MLOG_REC_UPDATE_IN_PLACE records for + node pointer pages. */ + break; + } + /* fall through */ + case REC_STATUS_INFIMUM: + case REC_STATUS_SUPREMUM: + ut_ad(!"wrong record status in update"); + } +#endif /* UNIV_DEBUG */ + rec_set_info_bits_new(rec, update->info_bits); } else { rec_set_info_bits_old(rec, update->info_bits); @@ -814,10 +814,7 @@ row_upd_index_write_log( log_ptr += mach_write_compressed(log_ptr, n_fields); for (i = 0; i < n_fields; i++) { - -#if MLOG_BUF_MARGIN <= 30 -# error "MLOG_BUF_MARGIN <= 30" -#endif + compile_time_assert(MLOG_BUF_MARGIN > 30); if (log_ptr + 30 > buf_end) { mlog_close(mtr, log_ptr); @@ -835,8 +832,8 @@ row_upd_index_write_log( /* If this is a virtual column, mark it using special field_no */ ulint field_no = upd_fld_is_virtual_col(upd_field) - ? REC_MAX_N_FIELDS + upd_field->field_no - : upd_field->field_no; + ? 
REC_MAX_N_FIELDS + unsigned(upd_field->field_no) + : unsigned(upd_field->field_no); log_ptr += mach_write_compressed(log_ptr, field_no); log_ptr += mach_write_compressed(log_ptr, len); @@ -976,6 +973,7 @@ row_upd_build_sec_rec_difference_binary( ut_ad(rec_offs_validate(rec, index, offsets)); ut_ad(rec_offs_n_fields(offsets) == dtuple_get_n_fields(entry)); ut_ad(!rec_offs_any_extern(offsets)); + ut_ad(!rec_offs_any_default(offsets)); ut_ad(!index->table->skip_alter_undo); update = upd_create(dtuple_get_n_fields(entry), heap); @@ -1046,22 +1044,21 @@ row_upd_build_difference_binary( TABLE* mysql_table, dberr_t* error) { - upd_field_t* upd_field; ulint len; upd_t* update; ulint n_diff; ulint trx_id_pos; - ulint i; rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; - ulint n_fld = dtuple_get_n_fields(entry); - ulint n_v_fld = dtuple_get_n_v_fields(entry); + const ulint n_v_fld = dtuple_get_n_v_fields(entry); rec_offs_init(offsets_); /* This function is used only for a clustered index */ ut_a(dict_index_is_clust(index)); ut_ad(!index->table->skip_alter_undo); + ut_ad(entry->n_fields <= index->n_fields); + ut_ad(entry->n_fields >= index->n_core_fields); - update = upd_create(n_fld + n_v_fld, heap); + update = upd_create(index->n_fields + n_v_fld, heap); n_diff = 0; @@ -1070,14 +1067,16 @@ row_upd_build_difference_binary( == trx_id_pos + 1); if (!offsets) { - offsets = rec_get_offsets(rec, index, offsets_, true, + offsets = rec_get_offsets(rec, index, offsets_, + index->n_core_fields, ULINT_UNDEFINED, &heap); } else { ut_ad(rec_offs_validate(rec, index, offsets)); } - for (i = 0; i < n_fld; i++) { - const byte* data = rec_get_nth_field(rec, offsets, i, &len); + for (ulint i = 0; i < entry->n_fields; i++) { + const byte* data = rec_get_nth_cfield(rec, index, offsets, i, + &len); const dfield_t* dfield = dtuple_get_nth_field(entry, i); /* NOTE: we compare the fields as binary strings! @@ -1097,17 +1096,22 @@ row_upd_build_difference_binary( if (!dfield_is_ext(dfield) != !rec_offs_nth_extern(offsets, i) || !dfield_data_is_binary_equal(dfield, len, data)) { - - upd_field = upd_get_nth_field(update, n_diff); - - dfield_copy(&(upd_field->new_val), dfield); - - upd_field_set_field_no(upd_field, i, index); - - n_diff++; + upd_field_t* uf = upd_get_nth_field(update, n_diff++); + dfield_copy(&uf->new_val, dfield); + upd_field_set_field_no(uf, i, index); } } + for (ulint i = entry->n_fields; i < index->n_fields; i++) { + upd_field_t* uf = upd_get_nth_field(update, n_diff++); + const dict_col_t* col = dict_index_get_nth_col(index, i); + /* upd_create() zero-initialized uf */ + uf->new_val.data = const_cast<byte*>(col->instant_value(&len)); + uf->new_val.len = static_cast<unsigned>(len); + dict_col_copy_type(col, &uf->new_val.type); + upd_field_set_field_no(uf, i, index); + } + /* Check the virtual columns updates. 
Even if there is no non-virtual column (base columns) change, we will still need to build the indexed virtual column value so that undo log would log them ( @@ -1127,7 +1131,7 @@ row_upd_build_difference_binary( ib_vcol_row vc(NULL); uchar *record = vc.record(thd, index, &mysql_table); - for (i = 0; i < n_v_fld; i++) { + for (ulint i = 0; i < n_v_fld; i++) { const dict_v_col_t* col = dict_table_get_nth_v_col(index->table, i); @@ -1154,24 +1158,16 @@ row_upd_build_difference_binary( entry, i); if (!dfield_data_is_binary_equal( - dfield, vfield->len, - static_cast<byte*>(vfield->data))) { - upd_field = upd_get_nth_field(update, n_diff); - - upd_field->old_v_val = static_cast<dfield_t*>( - mem_heap_alloc( - heap, - sizeof *upd_field->old_v_val)); - - dfield_copy(upd_field->old_v_val, vfield); - - dfield_copy(&(upd_field->new_val), dfield); - - upd_field_set_v_field_no( - upd_field, i, index); - - n_diff++; - + dfield, vfield->len, + static_cast<byte*>(vfield->data))) { + upd_field_t* uf = upd_get_nth_field(update, + n_diff++); + uf->old_v_val = static_cast<dfield_t*>( + mem_heap_alloc(heap, + sizeof *uf->old_v_val)); + dfield_copy(uf->old_v_val, vfield); + dfield_copy(&uf->new_val, dfield); + upd_field_set_v_field_no(uf, i, index); } } } @@ -1301,7 +1297,7 @@ row_upd_index_replace_new_col_val( /* Copy the locally stored prefix. */ memcpy(buf, data, - uf->orig_len - BTR_EXTERN_FIELD_REF_SIZE); + unsigned(uf->orig_len) - BTR_EXTERN_FIELD_REF_SIZE); /* Copy the BLOB pointer. */ memcpy(buf + uf->orig_len - BTR_EXTERN_FIELD_REF_SIZE, @@ -1316,50 +1312,34 @@ row_upd_index_replace_new_col_val( return true; } -/***********************************************************//** -Replaces the new column values stored in the update vector to the index entry -given. */ +/** Apply an update vector to an index entry. +@param[in,out] entry index entry to be updated; the clustered index record + must be covered by a lock or a page latch to prevent + deletion (rollback or purge) +@param[in] index index of the entry +@param[in] update update vector built for the entry +@param[in,out] heap memory heap for copying off-page columns */ void row_upd_index_replace_new_col_vals_index_pos( -/*=========================================*/ - dtuple_t* entry, /*!< in/out: index entry where replaced; - the clustered index record must be - covered by a lock or a page latch to - prevent deletion (rollback or purge) */ - dict_index_t* index, /*!< in: index; NOTE that this may also be a - non-clustered index */ - const upd_t* update, /*!< in: an update vector built for the index so - that the field number in an upd_field is the - index position */ - ibool order_only, - /*!< in: if TRUE, limit the replacement to - ordering fields of index; note that this - does not work for non-clustered indexes. 
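row_upd_build_difference_binary() above now sizes the update vector for all index->n_fields and, after the binary field-by-field comparison, appends the defaults of any instantly added columns that the incoming entry does not yet carry. A simplified standalone model of that shape, with std::string standing in for raw field values and the externally-stored-flag comparison omitted:

#include <cassert>
#include <cstddef>
#include <string>
#include <vector>

struct toy_upd_field {
	size_t		field_no;
	std::string	new_val;
};

/* entry: the new row image (may be shorter than the index when columns
were added instantly); rec: the current field values of the clustered
record; defaults: instant default values, one per index field. */
static std::vector<toy_upd_field>
build_difference(const std::vector<std::string>& entry,
		 const std::vector<std::string>& rec,
		 const std::vector<std::string>& defaults)
{
	assert(rec.size() >= entry.size());

	std::vector<toy_upd_field> update;

	/* Fields present in both: compare as binary strings. */
	for (size_t i = 0; i < entry.size(); i++) {
		if (entry[i] != rec[i]) {
			update.push_back({i, entry[i]});
		}
	}

	/* Instantly added columns beyond the entry: write their defaults. */
	for (size_t i = entry.size(); i < defaults.size(); i++) {
		update.push_back({i, defaults[i]});
	}

	return update;
}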
*/ - mem_heap_t* heap) /*!< in: memory heap for allocating and - copying the new values */ + dtuple_t* entry, + const dict_index_t* index, + const upd_t* update, + mem_heap_t* heap) { ut_ad(!index->table->skip_alter_undo); - ulint i; - ulint n_fields; const page_size_t& page_size = dict_table_page_size(index->table); dtuple_set_info_bits(entry, update->info_bits); - if (order_only) { - n_fields = dict_index_get_n_unique(index); - } else { - n_fields = dict_index_get_n_fields(index); - } - - for (i = 0; i < n_fields; i++) { + for (unsigned i = index->n_fields; i--; ) { const dict_field_t* field; const dict_col_t* col; const upd_field_t* uf; field = dict_index_get_nth_field(index, i); col = dict_field_get_col(field); - if (dict_col_is_virtual(col)) { + if (col->is_virtual()) { const dict_v_col_t* vcol = reinterpret_cast< const dict_v_col_t*>( col); @@ -1707,7 +1687,7 @@ row_upd_changes_ord_field_binary_func( ind_field = dict_index_get_nth_field(index, i); col = dict_field_get_col(ind_field); col_no = dict_col_get_no(col); - is_virtual = dict_col_is_virtual(col); + is_virtual = col->is_virtual(); if (is_virtual) { vcol = reinterpret_cast<const dict_v_col_t*>(col); @@ -1790,10 +1770,33 @@ row_upd_changes_ord_field_binary_func( /* Get the new mbr. */ if (dfield_is_ext(new_field)) { if (flag == ROW_BUILD_FOR_UNDO - && dict_table_get_format(index->table) - >= UNIV_FORMAT_B) { - /* For undo, and the table is Barrcuda, - we need to skip the prefix data. */ + && dict_table_has_atomic_blobs( + index->table)) { + /* For ROW_FORMAT=DYNAMIC + or COMPRESSED, a prefix of + off-page records is stored + in the undo log record + (for any column prefix indexes). + For SPATIAL INDEX, we must + ignore this prefix. The + full column value is stored in + the BLOB. + For non-spatial index, we + would have already fetched a + necessary prefix of the BLOB, + available in the "ext" parameter. + + Here, for SPATIAL INDEX, we are + fetching the full column, which is + potentially wasting a lot of I/O, + memory, and possibly involving a + concurrency problem, similar to ones + that existed before the introduction + of row_ext_t. + + MDEV-11657 FIXME: write the MBR + directly to the undo log record, + and avoid recomputing it here! */ flen = BTR_EXTERN_FIELD_REF_SIZE; ut_ad(dfield_get_len(new_field) >= BTR_EXTERN_FIELD_REF_SIZE); @@ -1848,9 +1851,7 @@ row_upd_changes_ord_field_binary_func( /* Silence a compiler warning without silencing a Valgrind error. */ dfield_len = 0; -#ifdef HAVE_valgrind_or_MSAN MEM_UNDEFINED(&dfield_len, sizeof dfield_len); -#endif /* HAVE_valgrind_or_MSAN */ /* See if the column is stored externally. 
*/ buf = row_ext_lookup(ext, col_no, &dfield_len); @@ -2052,16 +2053,19 @@ row_upd_copy_columns( /*=================*/ rec_t* rec, /*!< in: record in a clustered index */ const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */ + const dict_index_t* index, /*!< in: index of rec */ sym_node_t* column) /*!< in: first column in a column list, or NULL */ { - byte* data; + ut_ad(dict_index_is_clust(index)); + + const byte* data; ulint len; while (column) { - data = rec_get_nth_field(rec, offsets, - column->field_nos[SYM_CLUST_FIELD_NO], - &len); + data = rec_get_nth_cfield( + rec, index, offsets, + column->field_nos[SYM_CLUST_FIELD_NO], &len); eval_node_copy_and_alloc_val(column, data, len); column = UT_LIST_GET_NEXT(col_var_list, column); @@ -2215,13 +2219,14 @@ row_upd_store_row( rec = btr_pcur_get_rec(node->pcur); - offsets = rec_get_offsets(rec, clust_index, offsets_, true, + offsets = rec_get_offsets(rec, clust_index, offsets_, + clust_index->n_core_fields, ULINT_UNDEFINED, &heap); - if (dict_table_get_format(node->table) >= UNIV_FORMAT_B) { - /* In DYNAMIC or COMPRESSED format, there is no prefix - of externally stored columns in the clustered index - record. Build a cache of column prefixes. */ + if (dict_table_has_atomic_blobs(node->table)) { + /* There is no prefix of externally stored columns in + the clustered index record. Build a cache of column + prefixes. */ ext = &node->ext; } else { /* REDUNDANT and COMPACT formats store a local @@ -2243,7 +2248,7 @@ row_upd_store_row( } } - if (node->is_delete) { + if (node->is_delete == PLAIN_DELETE) { node->upd_row = NULL; node->upd_ext = NULL; } else { @@ -2304,16 +2309,16 @@ row_upd_sec_index_entry( mtr.start(); - switch (index->space) { + switch (index->table->space_id) { case SRV_TMP_SPACE_ID: mtr.set_log_mode(MTR_LOG_NO_REDO); flags = BTR_NO_LOCKING_FLAG; break; default: - mtr.set_named_space(index->space); + index->set_modified(mtr); /* fall through */ case IBUF_SPACE_ID: - flags = 0; + flags = index->table->no_rollback() ? BTR_NO_ROLLBACK : 0; break; } @@ -2324,7 +2329,7 @@ row_upd_sec_index_entry( or was being created online, but not committed yet. It is protected by index->lock. */ - mtr_s_lock(dict_index_get_lock(index), &mtr); + mtr_s_lock_index(index, &mtr); switch (dict_index_get_online_status(index)) { case ONLINE_INDEX_COMPLETE: @@ -2354,7 +2359,7 @@ row_upd_sec_index_entry( are no foreign key constraints referring to the index. Change buffering is disabled for temporary tables and spatial index. */ - mode = (referenced || dict_table_is_temporary(index->table) + mode = (referenced || index->table->is_temporary() || dict_index_is_spatial(index)) ? BTR_MODIFY_LEAF_ALREADY_S_LATCHED : BTR_DELETE_MARK_LEAF_ALREADY_S_LATCHED; @@ -2368,7 +2373,7 @@ row_upd_sec_index_entry( are no foreign key constraints referring to the index. Change buffering is disabled for temporary tables and spatial index. */ - mode = (referenced || dict_table_is_temporary(index->table) + mode = (referenced || index->table->is_temporary() || dict_index_is_spatial(index)) ? 
BTR_MODIFY_LEAF : BTR_DELETE_MARK_LEAF; @@ -2448,7 +2453,7 @@ row_upd_sec_index_entry( && !wsrep_thd_is_BF(trx->mysql_thd, FALSE)) { rec_offs* offsets = rec_get_offsets( - rec, index, NULL, true, + rec, index, NULL, index->n_core_fields, ULINT_UNDEFINED, &heap); err = wsrep_row_upd_check_foreign_constraints( @@ -2491,12 +2496,9 @@ row_upd_sec_index_entry( ut_ad(err == DB_SUCCESS); if (referenced) { - - rec_offs* offsets; - - offsets = rec_get_offsets( - rec, index, NULL, true, ULINT_UNDEFINED, - &heap); + rec_offs* offsets = rec_get_offsets( + rec, index, NULL, index->n_core_fields, + ULINT_UNDEFINED, &heap); /* NOTE that the following call loses the position of pcur ! */ @@ -2509,7 +2511,7 @@ row_upd_sec_index_entry( btr_pcur_close(&pcur); mtr_commit(&mtr); - if (node->is_delete || err != DB_SUCCESS) { + if (node->is_delete == PLAIN_DELETE || err != DB_SUCCESS) { goto func_exit; } @@ -2525,7 +2527,7 @@ row_upd_sec_index_entry( /* The index->online_status may change if the index is being rollbacked. It is protected by index->lock. */ - mtr_s_lock(dict_index_get_lock(index), &mtr); + mtr_s_lock_index(index, &mtr); switch (dict_index_get_online_status(index)) { case ONLINE_INDEX_COMPLETE: @@ -2549,7 +2551,7 @@ row_upd_sec_index_entry( } /* Insert new index entry */ - err = row_ins_sec_index_entry(index, entry, thr); + err = row_ins_sec_index_entry(index, entry, thr, !node->is_delete); func_exit: mem_heap_free(heap); @@ -2629,6 +2631,7 @@ row_upd_clust_rec_by_insert_inherit_func( #ifdef UNIV_DEBUG if (UNIV_LIKELY(rec != NULL)) { + ut_ad(!rec_offs_nth_default(offsets, i)); const byte* rec_data = rec_get_nth_field(rec, offsets, i, &len); ut_ad(len == dfield_get_len(dfield)); @@ -2663,8 +2666,7 @@ row_upd_clust_rec_by_insert_inherit_func( data[BTR_EXTERN_LEN] &= ~BTR_EXTERN_OWNER_FLAG; data[BTR_EXTERN_LEN] |= BTR_EXTERN_INHERITED_FLAG; /* The BTR_EXTERN_INHERITED_FLAG only matters in - rollback of a fresh insert (insert_undo log). - Purge (operating on update_undo log) will always free + rollback of a fresh insert. Purge will always free the extern fields of a delete-marked row. */ inherit = true; @@ -2719,6 +2721,7 @@ row_upd_clust_rec_by_insert( entry = row_build_index_entry_low(node->upd_row, node->upd_ext, index, heap, ROW_BUILD_FOR_INSERT); + if (index->is_instant()) entry->trim(*index); ut_ad(dtuple_get_info_bits(entry) == 0); row_upd_index_entry_sys_field(entry, index, DATA_TRX_ID, trx->id); @@ -2737,7 +2740,8 @@ row_upd_clust_rec_by_insert( we update the primary key. Delete-mark the old record in the clustered index and prepare to insert a new entry. 
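The comment just above ("Delete-mark the old record in the clustered index and prepare to insert a new entry") is the general pattern for updates that change ordering fields, both here and in row_upd_sec_index_entry() earlier in this file: the old entry is never rewritten in place, it is delete-marked for purge to reclaim later and a fresh entry is inserted. A toy illustration of that two-step shape, using a std::multimap in place of a B-tree secondary index (not InnoDB structures):

#include <map>
#include <string>

/* Toy secondary index entry: primary key value plus a delete mark. */
struct toy_sec_rec {
	std::string	pk;
	bool		delete_marked;
};

using toy_sec_index = std::multimap<std::string, toy_sec_rec>;

/* An update that changes the indexed column delete-marks the old entry
(older read views may still need it) and inserts a new one. */
static void toy_update_sec(toy_sec_index& index,
			   const std::string& old_key,
			   const std::string& new_key,
			   const std::string& pk)
{
	auto range = index.equal_range(old_key);
	for (auto it = range.first; it != range.second; ++it) {
		if (it->second.pk == pk && !it->second.delete_marked) {
			it->second.delete_marked = true;
			break;
		}
	}
	index.emplace(new_key, toy_sec_rec{pk, false});
}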
*/ rec = btr_cur_get_rec(btr_cur); - offsets = rec_get_offsets(rec, index, offsets, true, + offsets = rec_get_offsets(rec, index, offsets, + index->n_core_fields, ULINT_UNDEFINED, &heap); ut_ad(page_rec_is_user_rec(rec)); @@ -2821,8 +2825,8 @@ check_fk: mtr->start(); node->state = UPD_NODE_INSERT_CLUSTERED; - err = row_ins_clust_index_entry( - index, entry, thr, dtuple_get_n_ext(entry)); + err = row_ins_clust_index_entry(index, entry, thr, + dtuple_get_n_ext(entry)); err_exit: mem_heap_free(heap); return(err); @@ -2908,7 +2912,7 @@ row_upd_clust_rec( flags |= BTR_NO_LOCKING_FLAG; mtr->set_log_mode(MTR_LOG_NO_REDO); } else { - mtr->set_named_space(index->space); + index->set_modified(*mtr); } /* NOTE: this transaction has an s-lock or x-lock on the record and @@ -2988,7 +2992,7 @@ row_upd_del_mark_clust_rec( trx_t* trx = thr_get_trx(thr); ut_ad(dict_index_is_clust(index)); - ut_ad(node->is_delete); + ut_ad(node->is_delete == PLAIN_DELETE); pcur = node->pcur; btr_cur = btr_pcur_get_btr_cur(pcur); @@ -3066,6 +3070,7 @@ row_upd_clust_step( mem_heap_t* heap = NULL; rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; rec_offs* offsets; + ulint flags; trx_t* trx = thr_get_trx(thr); rec_offs_init(offsets_); @@ -3083,12 +3088,17 @@ row_upd_clust_step( mtr.start(); - const ulint flags = index->table->is_temporary() - ? BTR_NO_LOCKING_FLAG : 0; - if (flags) { + if (node->table->is_temporary()) { + /* Disable locking, because temporary tables are + private to the connection (no concurrent access). */ + flags = node->table->no_rollback() + ? BTR_NO_ROLLBACK + : BTR_NO_LOCKING_FLAG; + /* Redo logging only matters for persistent tables. */ mtr.set_log_mode(MTR_LOG_NO_REDO); } else { - mtr.set_named_space(index->space); + flags = node->table->no_rollback() ? BTR_NO_ROLLBACK : 0; + index->set_modified(mtr); } /* If the restoration does not succeed, then the same @@ -3108,7 +3118,7 @@ row_upd_clust_step( if (dict_index_is_online_ddl(index)) { ut_ad(node->table->id != DICT_INDEXES_ID); mode = BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED; - mtr_s_lock(dict_index_get_lock(index), &mtr); + mtr_s_lock_index(index, &mtr); } else { mode = BTR_MODIFY_LEAF; } @@ -3122,7 +3132,8 @@ row_upd_clust_step( then we have to free the file segments of the index tree associated with the index */ - if (node->is_delete && node->table->id == DICT_INDEXES_ID) { + if (node->is_delete == PLAIN_DELETE + && node->table->id == DICT_INDEXES_ID) { ut_ad(!dict_index_is_online_ddl(index)); @@ -3132,7 +3143,7 @@ row_upd_clust_step( mtr.commit(); mtr.start(); - mtr.set_named_space(index->space); + index->set_modified(mtr); if (!btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, &mtr)) { err = DB_ERROR; @@ -3141,7 +3152,7 @@ row_upd_clust_step( } rec = btr_pcur_get_rec(pcur); - offsets = rec_get_offsets(rec, index, offsets_, true, + offsets = rec_get_offsets(rec, index, offsets_, index->n_core_fields, ULINT_UNDEFINED, &heap); if (!flags && !node->has_clust_rec_x_lock) { @@ -3153,11 +3164,13 @@ row_upd_clust_step( } } - ut_ad(lock_trx_has_rec_x_lock(trx, index->table, - btr_pcur_get_block(pcur), - page_rec_get_heap_no(rec))); + ut_ad(index->table->no_rollback() || index->table->is_temporary() + || row_get_rec_trx_id(rec, index, offsets) == trx->id + || lock_trx_has_expl_x_lock(trx, index->table, + btr_pcur_get_block(pcur), + page_rec_get_heap_no(rec))); - if (node->is_delete) { + if (node->is_delete == PLAIN_DELETE) { err = row_upd_del_mark_clust_rec( node, index, offsets, thr, referenced, #ifdef WITH_WSREP @@ -3173,12 +3186,12 @@ row_upd_clust_step( if 
(UNIV_UNLIKELY(!node->in_mysql_interface)) { /* Copy the necessary columns from clust_rec and calculate the new values to set */ - row_upd_copy_columns(rec, offsets, + row_upd_copy_columns(rec, offsets, index, UT_LIST_GET_FIRST(node->columns)); row_upd_eval_new_vals(node->update); } - if (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) { + if (!node->is_delete && node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) { err = row_upd_clust_rec( flags, node, index, offsets, &heap, thr, &mtr); goto exit_func; @@ -3221,8 +3234,10 @@ success: flags, node, index, offsets, &heap, thr, &mtr); if (err == DB_SUCCESS) { - ut_ad(!node->is_delete); - node->state = UPD_NODE_UPDATE_SOME_SEC; + ut_ad(node->is_delete != PLAIN_DELETE); + node->state = node->is_delete + ? UPD_NODE_UPDATE_ALL_SEC + : UPD_NODE_UPDATE_SOME_SEC; goto success; } } @@ -3264,7 +3279,7 @@ row_upd( /* We do not get the cmpl_info value from the MySQL interpreter: we must calculate it on the fly: */ - if (node->is_delete + if (node->is_delete == PLAIN_DELETE || row_upd_changes_some_index_ord_field_binary( node->table, node->update)) { node->cmpl_info = 0; @@ -3355,8 +3370,6 @@ row_upd_step( trx = thr_get_trx(thr); - trx_start_if_not_started_xa(trx, true); - node = static_cast<upd_node_t*>(thr->run_node); sel_node = node->select; @@ -3449,3 +3462,77 @@ error_handling: DBUG_RETURN(thr); } + +/** Write query start time as SQL field data to a buffer. Needed by InnoDB. +@param thd Thread object +@param buf Buffer to hold start time data */ +void thd_get_query_start_data(THD *thd, char *buf); + +/** Appends row_start or row_end field to update vector and sets a +CURRENT_TIMESTAMP/trx->id value to it. +Supposed to be called only by make_versioned_update() and +make_versioned_delete(). +@param[in] trx transaction +@param[in] vers_sys_idx table->row_start or table->row_end */ +void upd_node_t::vers_update_fields(const trx_t *trx, ulint idx) +{ + ut_ad(in_mysql_interface); // otherwise needs to recalculate + // node->cmpl_info + ut_ad(idx == table->vers_start || idx == table->vers_end); + + dict_index_t* clust_index = dict_table_get_first_index(table); + const dict_col_t *col= dict_table_get_nth_col(table, idx); + ulint field_no= dict_col_get_clust_pos(col, clust_index); + upd_field_t *ufield; + + for (ulint i= 0; i < update->n_fields; ++i) + { + if (update->fields[i].field_no == field_no) + { + ufield= &update->fields[i]; + goto skip_append; + } + } + + /* row_create_update_node_for_mysql() pre-allocated this much. + At least one PK column always remains unchanged. */ + ut_ad(update->n_fields < ulint(table->n_cols + table->n_v_cols)); + + update->n_fields++; + ufield= upd_get_nth_field(update, update->n_fields - 1); + upd_field_set_field_no(ufield, field_no, clust_index); + +skip_append: + char *where= reinterpret_cast<char *>(update->vers_sys_value); + if (col->vers_native()) + { + mach_write_to_8(where, trx->id); + } + else + { + thd_get_query_start_data(trx->mysql_thd, where); + } + + dfield_set_data(&ufield->new_val, update->vers_sys_value, col->len); + + for (ulint col_no= 0; col_no < dict_table_get_n_v_cols(table); col_no++) + { + + const dict_v_col_t *v_col= dict_table_get_nth_v_col(table, col_no); + if (!v_col->m_col.ord_part) + continue; + for (ulint i= 0; i < unsigned(v_col->num_base); i++) + { + dict_col_t *base_col= v_col->base_col[i]; + if (base_col->ind == col->ind) + { + /* Virtual column depends on system field value + which we updated above. Remove it from update + vector, so it is recalculated in + row_upd_store_v_row() (see !update branch). 
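upd_node_t::vers_update_fields() above follows a find-or-append pattern: reuse the update-vector slot for row_start/row_end if the statement already assigned that field, otherwise claim a new slot, then overwrite the value with trx->id or the query start timestamp. A standalone sketch of the pattern (hypothetical types; the handling of dependent virtual columns is omitted):

#include <cstdint>
#include <vector>

struct toy_upd_field {
	unsigned	field_no;
	uint64_t	new_val;
};

/* Find the slot updating field_no, or append one; then set the value. */
static void set_versioning_field(std::vector<toy_upd_field>& update,
				 unsigned field_no, uint64_t value)
{
	for (toy_upd_field& f : update) {
		if (f.field_no == field_no) {
			f.new_val = value;	/* statement already touched it */
			return;
		}
	}

	update.push_back({field_no, value});	/* claim a fresh slot */
}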
*/ + update->remove(v_col->v_pos); + break; + } + } + } +} diff --git a/storage/innobase/row/row0vers.cc b/storage/innobase/row/row0vers.cc index 2d8704764d1..cde4e9e7b89 100644 --- a/storage/innobase/row/row0vers.cc +++ b/storage/innobase/row/row0vers.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1997, 2017, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2020, MariaDB Corporation. +Copyright (c) 2017, 2021, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -39,7 +39,6 @@ Created 2/6/1997 Heikki Tuuri #include "row0row.h" #include "row0upd.h" #include "rem0cmp.h" -#include "read0read.h" #include "lock0lock.h" #include "row0mysql.h" @@ -59,7 +58,7 @@ row_vers_non_virtual_fields_equal( for (const dict_field_t* ifield = index->fields; ifield != end; ifield++) { - if (!dict_col_is_virtual(ifield->col) + if (!ifield->col->is_virtual() && cmp_dfield_dfield(a++, b++)) { return false; } @@ -70,6 +69,7 @@ row_vers_non_virtual_fields_equal( /** Determine if an active transaction has inserted or modified a secondary index record. +@param[in,out] caller_trx trx of current thread @param[in] clust_rec clustered index record @param[in] clust_index clustered index @param[in] rec secondary index record @@ -82,6 +82,7 @@ trx_mutex_enter(), and trx->release_reference() must be invoked UNIV_INLINE trx_t* row_vers_impl_x_locked_low( + trx_t* caller_trx, const rec_t* clust_rec, dict_index_t* clust_index, const rec_t* rec, @@ -89,6 +90,7 @@ row_vers_impl_x_locked_low( const rec_offs* offsets, mtr_t* mtr) { + trx_id_t trx_id; rec_t* prev_version = NULL; rec_offs clust_offsets_[REC_OFFS_NORMAL_SIZE]; rec_offs* clust_offsets; @@ -103,29 +105,45 @@ row_vers_impl_x_locked_low( ut_ad(rec_offs_validate(rec, index, offsets)); + if (ulint trx_id_offset = clust_index->trx_id_offset) { + trx_id = mach_read_from_6(clust_rec + trx_id_offset); + if (trx_id == 0) { + /* The transaction history was already purged. */ + DBUG_RETURN(0); + } + } + heap = mem_heap_create(1024); clust_offsets = rec_get_offsets(clust_rec, clust_index, clust_offsets_, - true, ULINT_UNDEFINED, &heap); + clust_index->n_core_fields, + ULINT_UNDEFINED, &heap); - const trx_id_t trx_id = row_get_rec_trx_id( - clust_rec, clust_index, clust_offsets); + trx_id = row_get_rec_trx_id(clust_rec, clust_index, clust_offsets); + if (trx_id == 0) { + /* The transaction history was already purged. 
*/ + mem_heap_free(heap); + DBUG_RETURN(0); + } ut_ad(!clust_index->table->is_temporary()); - bool corrupt = false; - trx_t* trx = trx_rw_is_active(trx_id, &corrupt, true); + trx_t* trx; - if (trx == 0) { - /* The transaction that modified or inserted clust_rec is no - longer active, or it is corrupt: no implicit lock on rec */ - if (UNIV_UNLIKELY(corrupt)) { - lock_report_trx_id_insanity( - trx_id, clust_rec, clust_index, clust_offsets, - trx_sys_get_max_trx_id()); + if (trx_id == caller_trx->id) { + trx = caller_trx; + trx->reference(); + } else { + trx = trx_sys.find(caller_trx, trx_id); + if (trx == 0) { + /* The transaction that modified or inserted + clust_rec is no longer active, or it is + corrupt: no implicit lock on rec */ + lock_check_trx_id_sanity(trx_id, clust_rec, + clust_index, clust_offsets); + mem_heap_free(heap); + DBUG_RETURN(0); } - mem_heap_free(heap); - DBUG_RETURN(0); } const ulint comp = page_rec_is_comp(rec); @@ -222,7 +240,8 @@ not_locked: } clust_offsets = rec_get_offsets( - prev_version, clust_index, clust_offsets_, true, + prev_version, clust_index, clust_offsets_, + clust_index->n_core_fields, ULINT_UNDEFINED, &heap); vers_del = rec_get_deleted_flag(prev_version, comp); @@ -359,6 +378,7 @@ result_check: /** Determine if an active transaction has inserted or modified a secondary index record. +@param[in,out] caller_trx trx of current thread @param[in] rec secondary index record @param[in] index secondary index @param[in] offsets rec_get_offsets(rec, index) @@ -367,6 +387,7 @@ trx_mutex_enter(), and trx->release_reference() must be invoked @retval NULL if the record was committed */ trx_t* row_vers_impl_x_locked( + trx_t* caller_trx, const rec_t* rec, dict_index_t* index, const rec_offs* offsets) @@ -377,7 +398,7 @@ row_vers_impl_x_locked( dict_index_t* clust_index; ut_ad(!lock_mutex_own()); - ut_ad(!trx_sys_mutex_own()); + ut_ad(!mutex_own(&trx_sys.mutex)); mtr_start(&mtr); @@ -407,7 +428,8 @@ row_vers_impl_x_locked( trx = 0; } else { trx = row_vers_impl_x_locked_low( - clust_rec, clust_index, rec, index, offsets, &mtr); + caller_trx, clust_rec, clust_index, rec, index, + offsets, &mtr); ut_ad(trx == 0 || trx->is_referenced()); } @@ -417,29 +439,6 @@ row_vers_impl_x_locked( return(trx); } -/*****************************************************************//** -Finds out if we must preserve a delete marked earlier version of a clustered -index record, because it is >= the purge view. 
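The rewritten row_vers_impl_x_locked_low() above short-circuits in a fixed order: a zero DB_TRX_ID means the history was already purged (no implicit lock), the caller's own id means the caller itself holds the implicit lock, and only otherwise is the id looked up in trx_sys. A compact standalone model of that decision order, with hypothetical names and the lookup stubbed out; the real code also takes a reference on the returned transaction, which is not modeled here:

#include <cstdint>
#include <functional>

enum class impl_lock_result {
	none,		/* history purged, or the writer already committed */
	caller,		/* the caller itself modified the record */
	other_active	/* some other still-active transaction holds it */
};

static impl_lock_result
classify_implicit_lock(uint64_t rec_trx_id, uint64_t caller_trx_id,
		       const std::function<bool(uint64_t)>& is_active_in_trx_sys)
{
	if (rec_trx_id == 0) {
		return impl_lock_result::none;
	}
	if (rec_trx_id == caller_trx_id) {
		return impl_lock_result::caller;
	}
	return is_active_in_trx_sys(rec_trx_id)
		? impl_lock_result::other_active
		: impl_lock_result::none;
}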
-@param[in] trx_id transaction id in the version -@param[in] name table name -@param[in,out] mtr mini transaction holding the latch on the - clustered index record; it will also hold - the latch on purge_view -@return TRUE if earlier version should be preserved */ -ibool -row_vers_must_preserve_del_marked( -/*==============================*/ - trx_id_t trx_id, - const table_name_t& name, - mtr_t* mtr) -{ - ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_S)); - - mtr_s_lock(&purge_sys->latch, mtr); - - return(!purge_sys->view.changes_visible(trx_id, name)); -} - /** build virtual column value from current cluster index record data @param[in,out] row the cluster index row in dtuple form @param[in] clust_index clustered index @@ -481,7 +480,7 @@ row_vers_build_clust_v_col( const dict_field_t* ind_field = dict_index_get_nth_field( index, i); - if (dict_col_is_virtual(ind_field->col)) { + if (ind_field->col->is_virtual()) { const dict_v_col_t* col; col = reinterpret_cast<const dict_v_col_t*>( @@ -572,7 +571,8 @@ row_vers_build_cur_vrow_low( clust_offsets = rec_get_offsets(prev_version, clust_index, NULL, - true, ULINT_UNDEFINED, &heap); + clust_index->n_core_fields, + ULINT_UNDEFINED, &heap); ulint entry_len = dict_index_get_n_fields(index); @@ -583,7 +583,7 @@ row_vers_build_cur_vrow_low( = dict_index_get_nth_field(index, i); const dict_col_t* col = ind_field->col; - if (!dict_col_is_virtual(col)) { + if (!col->is_virtual()) { continue; } @@ -665,7 +665,7 @@ row_vers_vc_matches_cluster( for (const dict_field_t *ifield = index->fields, *const end = &index->fields[index->n_fields]; ifield != end; ifield++, a++, b++) { - if (!dict_col_is_virtual(ifield->col)) { + if (!ifield->col->is_virtual()) { if (cmp_dfield_dfield(a, b)) { return false; } @@ -714,7 +714,8 @@ row_vers_vc_matches_cluster( clust_offsets = rec_get_offsets(prev_version, clust_index, NULL, - true, ULINT_UNDEFINED, &heap); + clust_index->n_core_fields, + ULINT_UNDEFINED, &heap); ulint entry_len = dict_index_get_n_fields(index); @@ -724,7 +725,7 @@ row_vers_vc_matches_cluster( const dict_col_t* col = ind_field->col; field1 = dtuple_get_nth_field(ientry, i); - if (!dict_col_is_virtual(col)) { + if (!col->is_virtual()) { continue; } @@ -789,7 +790,6 @@ func_exit: @param[in] clust_index cluster index @param[in] clust_offsets cluster rec offset @param[in] index secondary index -@param[in] ientry secondary index rec @param[in] roll_ptr roll_ptr for the purge record @param[in] trx_id transaction ID on the purging record @param[in,out] heap heap memory @@ -805,7 +805,6 @@ row_vers_build_cur_vrow( dict_index_t* clust_index, rec_offs** clust_offsets, dict_index_t* index, - const dtuple_t* ientry, roll_ptr_t roll_ptr, trx_id_t trx_id, mem_heap_t* heap, @@ -854,7 +853,8 @@ row_vers_build_cur_vrow( index, roll_ptr, trx_id, v_heap, &cur_vrow, mtr); } - *clust_offsets = rec_get_offsets(rec, clust_index, NULL, true, + *clust_offsets = rec_get_offsets(rec, clust_index, NULL, + clust_index->n_core_fields, ULINT_UNDEFINED, &heap); return(cur_vrow); } @@ -903,7 +903,7 @@ row_vers_old_has_index_entry( ut_ad(mtr_memo_contains_page_flagged(mtr, rec, MTR_MEMO_PAGE_X_FIX | MTR_MEMO_PAGE_S_FIX)); - ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_S)); + ut_ad(!rw_lock_own(&purge_sys.latch, RW_LOCK_S)); ut_ad(also_curr || !vcol_info); clust_index = dict_table_get_first_index(index->table); @@ -911,7 +911,8 @@ row_vers_old_has_index_entry( comp = page_rec_is_comp(rec); ut_ad(!dict_table_is_comp(index->table) == !comp); heap = mem_heap_create(1024); - 
clust_offsets = rec_get_offsets(rec, clust_index, NULL, true, + clust_offsets = rec_get_offsets(rec, clust_index, NULL, + clust_index->n_core_fields, ULINT_UNDEFINED, &heap); if (dict_index_has_virtual(index)) { @@ -927,7 +928,7 @@ row_vers_old_has_index_entry( /* The top of the stack of versions is locked by the mtr holding a latch on the page containing the clustered index record. The bottom of the stack is - locked by the fact that the purge_sys->view must + locked by the fact that the purge_sys.view must 'overtake' any read view of an active transaction. Thus, it is safe to fetch the prefixes for externally stored columns. */ @@ -1000,7 +1001,8 @@ row_vers_old_has_index_entry( } } clust_offsets = rec_get_offsets(rec, clust_index, NULL, - true, + clust_index + ->n_core_fields, ULINT_UNDEFINED, &heap); } else { @@ -1046,8 +1048,7 @@ unsafe_to_purge: cur_vrow = row_vers_build_cur_vrow( also_curr, rec, clust_index, &clust_offsets, - index, ientry, roll_ptr, trx_id, heap, v_heap, mtr, - vcol_info); + index, roll_ptr, trx_id, heap, v_heap, mtr, vcol_info); if (vcol_info && vcol_info->is_first_fetch()) { goto unsafe_to_purge; @@ -1080,7 +1081,8 @@ unsafe_to_purge: } clust_offsets = rec_get_offsets(prev_version, clust_index, - NULL, true, + NULL, + clust_index->n_core_fields, ULINT_UNDEFINED, &heap); if (dict_index_has_virtual(index)) { @@ -1178,7 +1180,7 @@ row_vers_build_for_consistent_read( ut_ad(dict_index_is_clust(index)); ut_ad(mtr_memo_contains_page_flagged(mtr, rec, MTR_MEMO_PAGE_X_FIX | MTR_MEMO_PAGE_S_FIX)); - ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_S)); + ut_ad(!rw_lock_own(&(purge_sys.latch), RW_LOCK_S)); ut_ad(rec_offs_validate(rec, index, *offsets)); @@ -1221,7 +1223,7 @@ row_vers_build_for_consistent_read( *offsets = rec_get_offsets( prev_version, index, *offsets, - true, ULINT_UNDEFINED, offset_heap); + index->n_core_fields, ULINT_UNDEFINED, offset_heap); #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG ut_a(!rec_offs_any_null_extern(prev_version, *offsets)); @@ -1239,7 +1241,7 @@ row_vers_build_for_consistent_read( in_heap, rec_offs_size(*offsets))); *old_vers = rec_copy(buf, prev_version, *offsets); - rec_offs_make_valid(*old_vers, index, *offsets); + rec_offs_make_valid(*old_vers, index, true, *offsets); if (vrow && *vrow) { *vrow = dtuple_copy(*vrow, in_heap); @@ -1262,6 +1264,7 @@ which should be seen by a semi-consistent read. 
*/ void row_vers_build_for_semi_consistent_read( /*====================================*/ + trx_t* caller_trx,/*!<in/out: trx of current thread */ const rec_t* rec, /*!< in: record in a clustered index; the caller must have a latch on the page; this latch locks the top of the stack of versions @@ -1290,7 +1293,7 @@ row_vers_build_for_semi_consistent_read( ut_ad(dict_index_is_clust(index)); ut_ad(mtr_memo_contains_page_flagged(mtr, rec, MTR_MEMO_PAGE_X_FIX | MTR_MEMO_PAGE_S_FIX)); - ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_S)); + ut_ad(!rw_lock_own(&(purge_sys.latch), RW_LOCK_S)); ut_ad(rec_offs_validate(rec, index, *offsets)); @@ -1298,7 +1301,6 @@ row_vers_build_for_semi_consistent_read( ut_ad(!vrow || !(*vrow)); for (;;) { - const trx_t* version_trx; mem_heap_t* heap2; rec_t* prev_version; trx_id_t version_trx_id; @@ -1308,20 +1310,7 @@ row_vers_build_for_semi_consistent_read( rec_trx_id = version_trx_id; } - trx_sys_mutex_enter(); - version_trx = trx_get_rw_trx_by_id(version_trx_id); - /* Because version_trx is a read-write transaction, - its state cannot change from or to NOT_STARTED while - we are holding the trx_sys->mutex. It may change from - ACTIVE to PREPARED or COMMITTED. */ - if (version_trx - && trx_state_eq(version_trx, - TRX_STATE_COMMITTED_IN_MEMORY)) { - version_trx = NULL; - } - trx_sys_mutex_exit(); - - if (!version_trx) { + if (!trx_sys.is_registered(caller_trx, version_trx_id)) { committed_version_trx: /* We found a version that belongs to a committed transaction: return it. */ @@ -1350,11 +1339,10 @@ committed_version_trx: semi-consistent read. */ version = rec; - *offsets = rec_get_offsets(version, - index, *offsets, - true, - ULINT_UNDEFINED, - offset_heap); + *offsets = rec_get_offsets( + version, index, *offsets, + index->n_core_fields, ULINT_UNDEFINED, + offset_heap); } buf = static_cast<byte*>( @@ -1362,7 +1350,7 @@ committed_version_trx: in_heap, rec_offs_size(*offsets))); *old_vers = rec_copy(buf, version, *offsets); - rec_offs_make_valid(*old_vers, index, *offsets); + rec_offs_make_valid(*old_vers, index, true, *offsets); if (vrow && *vrow) { *vrow = dtuple_copy(*vrow, in_heap); dtuple_dup_v_fld(*vrow, in_heap); @@ -1397,7 +1385,8 @@ committed_version_trx: } version = prev_version; - *offsets = rec_get_offsets(version, index, *offsets, true, + *offsets = rec_get_offsets(version, index, *offsets, + index->n_core_fields, ULINT_UNDEFINED, offset_heap); #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG ut_a(!rec_offs_any_null_extern(version, *offsets)); diff --git a/storage/innobase/srv/srv0conc.cc b/storage/innobase/srv/srv0conc.cc index b420ca6d8ee..10caa0193a7 100644 --- a/storage/innobase/srv/srv0conc.cc +++ b/storage/innobase/srv/srv0conc.cc @@ -56,10 +56,8 @@ ulong srv_thread_sleep_delay = 10000; /** We are prepared for a situation that we have this many threads waiting for -a semaphore inside InnoDB. innobase_start_or_create_for_mysql() sets the -value. */ - -ulint srv_max_n_threads = 0; +a semaphore inside InnoDB. srv_start() sets the value. 
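The semi-consistent read path above walks the version chain, newest first, until it reaches a version whose writer is no longer registered in trx_sys (i.e. has committed), and returns a copy of that version. A standalone sketch of the walk, with a predicate standing in for trx_sys.is_registered() and a plain vector standing in for the undo-log reconstruction:

#include <cstdint>
#include <functional>
#include <string>
#include <vector>

struct toy_version {
	uint64_t	trx_id;		/* DB_TRX_ID that wrote this version */
	std::string	payload;	/* record image */
};

/* versions[0] is the newest (the record in the index); older versions
follow, as rebuilt from the undo log. Return the newest version whose
writer is no longer active: that is what a semi-consistent read uses. */
static const toy_version*
semi_consistent_version(const std::vector<toy_version>& versions,
			const std::function<bool(uint64_t)>& is_active)
{
	for (const toy_version& v : versions) {
		if (!is_active(v.trx_id)) {
			return &v;	/* written by a committed transaction */
		}
	}
	return nullptr;	/* no committed version could be reconstructed */
}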
*/ +ulint srv_max_n_threads; /** The following controls how many threads we let inside InnoDB concurrently: threads waiting for locks are not counted into the number because otherwise @@ -133,12 +131,9 @@ srv_conc_enter_innodb_with_atomics( #endif /* WITH_WSREP */ if (srv_thread_concurrency == 0) { - if (notified_mysql) { - - (void) my_atomic_addlint( - &srv_conc.n_waiting, -1); - + my_atomic_addlint(&srv_conc.n_waiting, + ulint(-1)); thd_wait_end(trx->mysql_thd); } @@ -157,10 +152,8 @@ srv_conc_enter_innodb_with_atomics( srv_enter_innodb_with_tickets(trx); if (notified_mysql) { - - (void) my_atomic_addlint( - &srv_conc.n_waiting, -1); - + my_atomic_addlint(&srv_conc.n_waiting, + ulint(-1)); thd_wait_end(trx->mysql_thd); } @@ -182,13 +175,11 @@ srv_conc_enter_innodb_with_atomics( /* Since there were no free seats, we relinquish the overbooked ticket. */ - (void) my_atomic_addlint( - &srv_conc.n_active, -1); + my_atomic_addlint(&srv_conc.n_active, ulint(-1)); } if (!notified_mysql) { - (void) my_atomic_addlint( - &srv_conc.n_waiting, 1); + my_atomic_addlint(&srv_conc.n_waiting, 1); thd_wait_begin(trx->mysql_thd, THD_WAIT_USER_LOCK); @@ -232,7 +223,7 @@ srv_conc_exit_innodb_with_atomics( trx->n_tickets_to_enter_innodb = 0; trx->declared_to_be_inside_innodb = FALSE; - (void) my_atomic_addlint(&srv_conc.n_active, -1); + my_atomic_addlint(&srv_conc.n_active, ulint(-1)); } /*********************************************************************//** diff --git a/storage/innobase/srv/srv0mon.cc b/storage/innobase/srv/srv0mon.cc index d204479c1c0..0404335574a 100644 --- a/storage/innobase/srv/srv0mon.cc +++ b/storage/innobase/srv/srv0mon.cc @@ -2,7 +2,7 @@ Copyright (c) 2010, 2016, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. -Copyright (c) 2013, 2017, MariaDB Corporation. +Copyright (c) 2013, 2018, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -423,7 +423,7 @@ static monitor_info_t innodb_counter_info[] = MONITOR_DEFAULT_START, MONITOR_FLUSH_AVG_TIME}, {"buffer_flush_adaptive_avg_pass", "buffer", - "Numner of adaptive flushes passed during the recent Avg period.", + "Number of adaptive flushes passed during the recent Avg period.", MONITOR_NONE, MONITOR_DEFAULT_START, MONITOR_FLUSH_ADAPTIVE_AVG_PASS}, @@ -1488,8 +1488,8 @@ srv_mon_set_module_control( mon_option_t set_option) /*!< in: Turn on/off reset the counter */ { - ulint ix; - ulint start_id; + lint ix; + lint start_id; ibool set_current_module = FALSE; ut_a(module_id <= NUM_MONITOR); @@ -1599,7 +1599,7 @@ srv_mon_get_rseg_size(void) total rollback segment size and to avoid mutex contention we don't acquire the rseg->mutex" */ for (i = 0; i < TRX_SYS_N_RSEGS; ++i) { - const trx_rseg_t* rseg = trx_sys->rseg_array[i]; + const trx_rseg_t* rseg = trx_sys.rseg_array[i]; if (rseg != NULL) { value += rseg->curr_size; @@ -1841,7 +1841,7 @@ srv_mon_process_existing_counter( /* innodb_page_size */ case MONITOR_OVLD_SRV_PAGE_SIZE: - value = UNIV_PAGE_SIZE; + value = srv_page_size; break; case MONITOR_OVLD_RWLOCK_S_SPIN_WAITS: @@ -1936,7 +1936,7 @@ srv_mon_process_existing_counter( /* innodb_row_lock_time_max */ case MONITOR_OVLD_LOCK_MAX_WAIT_TIME: - value = lock_sys->n_lock_max_wait_time / 1000; + value = lock_sys.n_lock_max_wait_time / 1000; break; /* innodb_row_lock_time_avg */ @@ -1955,7 +1955,7 @@ srv_mon_process_existing_counter( break; case MONITOR_RSEG_HISTORY_LEN: - value = trx_sys->rseg_history_len; + value = trx_sys.history_size(); break; case MONITOR_RSEG_CUR_SIZE: @@ -1963,7 +1963,7 @@ srv_mon_process_existing_counter( break; case MONITOR_OVLD_N_FILE_OPENED: - value = fil_system->n_open; + value = fil_system.n_open; break; case MONITOR_OVLD_IBUF_MERGE_INSERT: @@ -2003,30 +2003,30 @@ srv_mon_process_existing_counter( break; case MONITOR_OVLD_LSN_FLUSHDISK: - value = (mon_type_t) log_sys->flushed_to_disk_lsn; + value = (mon_type_t) log_sys.flushed_to_disk_lsn; break; case MONITOR_OVLD_LSN_CURRENT: - value = (mon_type_t) log_sys->lsn; + value = (mon_type_t) log_sys.lsn; break; case MONITOR_PENDING_LOG_FLUSH: - mutex_enter(&log_sys->mutex); - value = static_cast<mon_type_t>(log_sys->n_pending_flushes); - mutex_exit(&log_sys->mutex); + mutex_enter(&log_sys.mutex); + value = static_cast<mon_type_t>(log_sys.n_pending_flushes); + mutex_exit(&log_sys.mutex); break; case MONITOR_PENDING_CHECKPOINT_WRITE: - mutex_enter(&log_sys->mutex); + mutex_enter(&log_sys.mutex); value = static_cast<mon_type_t>( - log_sys->n_pending_checkpoint_writes); - mutex_exit(&log_sys->mutex); + log_sys.n_pending_checkpoint_writes); + mutex_exit(&log_sys.mutex); break; case MONITOR_LOG_IO: - mutex_enter(&log_sys->mutex); - value = static_cast<mon_type_t>(log_sys->n_log_ios); - mutex_exit(&log_sys->mutex); + mutex_enter(&log_sys.mutex); + value = static_cast<mon_type_t>(log_sys.n_log_ios); + mutex_exit(&log_sys.mutex); break; case MONITOR_OVLD_BUF_OLDEST_LSN: @@ -2034,15 +2034,15 @@ srv_mon_process_existing_counter( break; case MONITOR_OVLD_LSN_CHECKPOINT: - value = (mon_type_t) log_sys->last_checkpoint_lsn; + value = (mon_type_t) log_sys.last_checkpoint_lsn; break; case MONITOR_OVLD_MAX_AGE_ASYNC: - value = log_sys->max_modified_age_async; + value = log_sys.max_modified_age_async; break; case MONITOR_OVLD_MAX_AGE_SYNC: - value = 
log_sys->max_modified_age_sync; + value = log_sys.max_modified_age_sync; break; #ifdef BTR_CUR_HASH_ADAPT @@ -2052,7 +2052,7 @@ srv_mon_process_existing_counter( #endif /* BTR_CUR_HASH_ADAPT */ case MONITOR_OVLD_ADAPTIVE_HASH_SEARCH_BTREE: - value = btr_cur_n_non_sea; + value = my_atomic_loadlint(&btr_cur_n_non_sea); break; case MONITOR_OVLD_PAGE_COMPRESS_SAVED: diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc index 44e0946f067..3da92d48feb 100644 --- a/storage/innobase/srv/srv0srv.cc +++ b/storage/innobase/srv/srv0srv.cc @@ -149,23 +149,10 @@ my_bool srv_read_only_mode; /** store to its own file each table created by an user; data dictionary tables are in the system tablespace 0 */ my_bool srv_file_per_table; -/** whether to use backup-safe TRUNCATE and crash-safe RENAME -instead of the MySQL 5.7 WL#6501 TRUNCATE TABLE implementation */ -my_bool srv_safe_truncate; -/** The file format to use on new *.ibd files. */ -ulint srv_file_format; -/** Whether to check file format during startup. A value of -UNIV_FORMAT_MAX + 1 means no checking ie. FALSE. The default is to -set it to the highest format we support. */ -ulint srv_max_file_format_at_startup = UNIV_FORMAT_MAX; /** Set if InnoDB operates in read-only mode or innodb-force-recovery is greater than SRV_FORCE_NO_TRX_UNDO. */ my_bool high_level_read_only; -#if UNIV_FORMAT_A -# error "UNIV_FORMAT_A must be 0!" -#endif - /** Place locks to records only i.e. do not use next-key locking except on duplicate key checking and foreign key checking */ ibool srv_locks_unsafe_for_binlog; @@ -180,17 +167,10 @@ use simulated aio we build below with threads. Currently we support native aio on windows and linux */ my_bool srv_use_native_aio; my_bool srv_numa_interleave; -/** innodb_use_trim; whether to use fallocate(PUNCH_HOLE) with -page_compression */ -my_bool srv_use_trim; -/** copy of innodb_use_atomic_writes; @see innobase_init() */ +/** copy of innodb_use_atomic_writes; @see innodb_init_params() */ my_bool srv_use_atomic_writes; /** innodb_compression_algorithm; used with page compression */ ulong innodb_compression_algorithm; -/** innodb_mtflush_threads; number of threads used for multi-threaded flush */ -long srv_mtflush_threads; -/** innodb_use_mtflush; whether to use multi threaded flush. */ -my_bool srv_use_mtflush; #ifdef UNIV_DEBUG /** Used by SET GLOBAL innodb_master_thread_disabled_debug = X. */ @@ -206,15 +186,15 @@ ulong srv_n_log_files; /** The InnoDB redo log file size, or 0 when changing the redo log format at startup (while disallowing writes to the redo log). 
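The srv0mon.cc hunks above read the btr_cur_n_non_sea counter through my_atomic_loadlint() instead of a plain load. A self-contained sketch of the same idea in standard C++, sampling a shared statistics counter with explicit (relaxed) atomic operations; nothing here is InnoDB code:

    #include <atomic>
    #include <cstdio>

    static std::atomic<unsigned long> n_non_hash_searches{0};

    void record_search()
    {
        n_non_hash_searches.fetch_add(1, std::memory_order_relaxed);
    }

    void print_stats()
    {
        unsigned long v =
            n_non_hash_searches.load(std::memory_order_relaxed);
        std::printf("%lu non-hash searches\n", v);
    }

    int main() { record_search(); print_stats(); return 0; }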
*/ ulonglong srv_log_file_size; -/** copy of innodb_log_buffer_size, but in database pages */ -ulint srv_log_buffer_size; +/** innodb_log_buffer_size, in bytes */ +ulong srv_log_buffer_size; /** innodb_flush_log_at_trx_commit */ ulong srv_flush_log_at_trx_commit; /** innodb_flush_log_at_timeout */ uint srv_flush_log_at_timeout; /** innodb_page_size */ ulong srv_page_size; -/** log2 of innodb_page_size; @see innobase_init() */ +/** log2 of innodb_page_size; @see innodb_init_params() */ ulong srv_page_size_shift; /** innodb_log_write_ahead_size */ ulong srv_log_write_ahead_size; @@ -269,13 +249,17 @@ ulint srv_buf_pool_base_size; ulint srv_buf_pool_curr_size; /** Dump this % of each buffer pool during BP dump */ ulong srv_buf_pool_dump_pct; +/** Abort load after this amount of pages */ +#ifdef UNIV_DEBUG +ulong srv_buf_pool_load_pages_abort = LONG_MAX; +#endif /** Lock table size in bytes */ ulint srv_lock_table_size = ULINT_MAX; -/** copy of innodb_read_io_threads */ -ulint srv_n_read_io_threads; -/** copy of innodb_write_io_threads */ -ulint srv_n_write_io_threads; +/** innodb_read_io_threads */ +ulong srv_n_read_io_threads; +/** innodb_write_io_threads */ +ulong srv_n_write_io_threads; /** innodb_random_read_ahead */ my_bool srv_random_read_ahead; @@ -288,13 +272,10 @@ ulong srv_read_ahead_threshold; buffer in terms of percentage of the buffer pool. */ uint srv_change_buffer_max_size; -char* srv_file_flush_method_str; - +ulong srv_file_flush_method; -enum srv_flush_t srv_file_flush_method = IF_WIN(SRV_ALL_O_DIRECT_FSYNC,SRV_FSYNC); - -/** copy of innodb_open_files, initialized by innobase_init() */ +/** copy of innodb_open_files; @see innodb_init_params() */ ulint srv_max_n_open_files; /** innodb_io_capacity */ @@ -391,8 +372,7 @@ unsigned long long srv_stats_modified_counter; based on number of configured pages */ my_bool srv_stats_sample_traditional; -/** copy of innodb_doublewrite */ -ibool srv_use_doublewrite_buf; +my_bool srv_use_doublewrite_buf; /** innodb_doublewrite_batch_size (a debug parameter) specifies the number of pages to use in LRU and flush_list batch flushing. @@ -526,11 +506,6 @@ UNIV_INTERN ulong srv_buf_dump_status_frequency; mutex_exit(&srv_sys.mutex); \ } while (0) -#define fetch_lock_wait_timeout(trx) \ - ((trx)->lock.allowed_to_wait \ - ? thd_lock_wait_timeout((trx)->mysql_thd) \ - : 0) - /* IMPLEMENTATION OF THE SERVER MAIN PROGRAM ========================================= @@ -635,6 +610,12 @@ struct srv_sys_t{ static srv_sys_t srv_sys; +/** @return whether the purge coordinator thread is active */ +bool purge_sys_t::running() +{ + return my_atomic_loadlint(&srv_sys.n_threads_active[SRV_PURGE]); +} + /** Event to signal srv_monitor_thread. Not protected by a mutex. Set after setting srv_print_innodb_monitor. */ os_event_t srv_monitor_event; @@ -882,7 +863,8 @@ srv_suspend_thread_low( ut_a(!slot->suspended); slot->suspended = TRUE; - if (my_atomic_addlint(&srv_sys.n_threads_active[type], -1) < 0) { + if (lint(my_atomic_addlint(&srv_sys.n_threads_active[type], ulint(-1))) + < 0) { ut_error; } @@ -1093,8 +1075,6 @@ srv_init() trx_i_s_cache_init(trx_i_s_cache); ut_crc32_init(); - - dict_mem_init(); } /*********************************************************************//** @@ -1134,40 +1114,15 @@ srv_free(void) } /*********************************************************************//** -Normalizes init parameter values to use units we use inside InnoDB. 
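srv_log_buffer_size is now declared as a byte count (ulong) instead of being pre-divided into pages, and the srv_normalize_init_values() conversion removed just below goes away with it. A small stand-alone illustration of keeping the configuration in bytes and deriving page counts from the page-size shift; the variable names mirror the diff, but the values are made up:

    #include <cstdio>

    int main()
    {
        unsigned long srv_page_size_shift = 14;               // 16 KiB pages
        unsigned long srv_page_size = 1UL << srv_page_size_shift;
        unsigned long log_buffer_bytes = 16UL * 1024 * 1024;  // kept in bytes

        unsigned long log_buffer_pages =
            log_buffer_bytes >> srv_page_size_shift;          // derive on demand
        std::printf("%lu bytes = %lu pages of %lu bytes\n",
                    log_buffer_bytes, log_buffer_pages, srv_page_size);
        return 0;
    }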
*/ -static -void -srv_normalize_init_values(void) -/*===========================*/ -{ - srv_sys_space.normalize(); - - srv_tmp_space.normalize(); - - srv_log_buffer_size /= UNIV_PAGE_SIZE; - - srv_lock_table_size = 5 * (srv_buf_pool_size / UNIV_PAGE_SIZE); -} - -/*********************************************************************//** Boots the InnoDB server. */ void srv_boot(void) /*==========*/ { - /* Transform the init parameter values given by MySQL to - use units we use inside InnoDB: */ - - srv_normalize_init_values(); - sync_check_init(); - /* Reset the system variables in the recovery module. */ recv_sys_var_init(); trx_pool_init(); row_mysql_init(); - - /* Initialize this module */ - srv_init(); } @@ -1196,7 +1151,7 @@ srv_refresh_innodb_monitor_stats(void) #ifdef BTR_CUR_HASH_ADAPT btr_cur_n_sea_old = btr_cur_n_sea; #endif /* BTR_CUR_HASH_ADAPT */ - btr_cur_n_non_sea_old = btr_cur_n_non_sea; + btr_cur_n_non_sea_old = my_atomic_loadlint(&btr_cur_n_non_sea); log_refresh_stats(); @@ -1232,7 +1187,6 @@ srv_printf_innodb_monitor( { double time_elapsed; time_t current_time; - ulint n_reserved; ibool ret; mutex_enter(&srv_innodb_monitor_mutex); @@ -1358,16 +1312,16 @@ srv_printf_innodb_monitor( "%.2f hash searches/s, %.2f non-hash searches/s\n", (btr_cur_n_sea - btr_cur_n_sea_old) / time_elapsed, - (btr_cur_n_non_sea - btr_cur_n_non_sea_old) + (my_atomic_loadlint(&btr_cur_n_non_sea) - btr_cur_n_non_sea_old) / time_elapsed); btr_cur_n_sea_old = btr_cur_n_sea; #else /* BTR_CUR_HASH_ADAPT */ fprintf(file, "%.2f non-hash searches/s\n", - (btr_cur_n_non_sea - btr_cur_n_non_sea_old) + (my_atomic_loadlint(&btr_cur_n_non_sea) - btr_cur_n_non_sea_old) / time_elapsed); #endif /* BTR_CUR_HASH_ADAPT */ - btr_cur_n_non_sea_old = btr_cur_n_non_sea; + btr_cur_n_non_sea_old = my_atomic_loadlint(&btr_cur_n_non_sea); fputs("---\n" "LOG\n" @@ -1394,12 +1348,10 @@ srv_printf_innodb_monitor( srv_conc_get_active_threads(), srv_conc_get_waiting_threads()); - /* This is a dirty read, without holding trx_sys->mutex. */ fprintf(file, ULINTPF " read views open inside InnoDB\n", - trx_sys->mvcc->size()); + trx_sys.view_count()); - n_reserved = fil_space_get_n_reserved_extents(0); - if (n_reserved > 0) { + if (ulint n_reserved = fil_system.sys_space->n_reserved_extents) { fprintf(file, ULINTPF " tablespace extents now reserved for" " B-tree split operations\n", @@ -1564,7 +1516,7 @@ srv_export_innodb_status(void) export_vars.innodb_have_atomic_builtins = 0; #endif - export_vars.innodb_page_size = UNIV_PAGE_SIZE; + export_vars.innodb_page_size = srv_page_size; export_vars.innodb_log_waits = srv_stats.log_waits; @@ -1611,7 +1563,7 @@ srv_export_innodb_status(void) } export_vars.innodb_row_lock_time_max = - lock_sys->n_lock_max_wait_time / 1000; + lock_sys.n_lock_max_wait_time / 1000; export_vars.innodb_rows_read = srv_stats.n_rows_read; @@ -1632,7 +1584,7 @@ srv_export_innodb_status(void) export_vars.innodb_system_rows_deleted = srv_stats.n_system_rows_deleted; - export_vars.innodb_num_open_files = fil_system->n_open; + export_vars.innodb_num_open_files = fil_system.n_open; export_vars.innodb_truncated_status_writes = srv_truncated_status_writes; @@ -1754,7 +1706,7 @@ loop: if (srv_print_innodb_monitor) { /* Reset mutex_skipped counter everytime srv_print_innodb_monitor changes. 
This is to - ensure we will not be blocked by lock_sys->mutex + ensure we will not be blocked by lock_sys.mutex for short duration information printing, such as requested by sync_array_print_long_waits() */ if (!last_srv_print_monitor) { @@ -1958,19 +1910,8 @@ srv_get_active_thread_type(void) srv_sys_mutex_exit(); - if (ret == SRV_NONE && srv_shutdown_state > SRV_SHUTDOWN_INITIATED - && purge_sys != NULL) { - /* Check only on shutdown. */ - switch (trx_purge_state()) { - case PURGE_STATE_RUN: - case PURGE_STATE_STOP: - ret = SRV_PURGE; - break; - case PURGE_STATE_INIT: - case PURGE_STATE_DISABLED: - case PURGE_STATE_EXIT: - break; - } + if (ret == SRV_NONE && purge_sys.enabled()) { + ret = SRV_PURGE; } return(ret); @@ -2009,9 +1950,9 @@ srv_wake_purge_thread_if_not_active() { ut_ad(!srv_sys_mutex_own()); - if (purge_sys->state == PURGE_STATE_RUN + if (purge_sys.enabled() && !purge_sys.paused() && !my_atomic_loadlint(&srv_sys.n_threads_active[SRV_PURGE]) - && my_atomic_loadlint(&trx_sys->rseg_history_len)) { + && trx_sys.history_size()) { srv_release_threads(SRV_PURGE, 1); } @@ -2153,16 +2094,10 @@ srv_master_do_disabled_loop(void) /** Disables master thread. It's used by: SET GLOBAL innodb_master_thread_disabled_debug = 1 (0). -@param[in] thd thread handle -@param[in] var pointer to system variable -@param[out] var_ptr where the formal string goes @param[in] save immediate result from check function */ void -srv_master_thread_disabled_debug_update( - THD* thd, - struct st_mysql_sys_var* var, - void* var_ptr, - const void* save) +srv_master_thread_disabled_debug_update(THD*, st_mysql_sys_var*, void*, + const void* save) { /* This method is protected by mutex, as every SET GLOBAL .. */ ut_ad(srv_master_thread_disabled_event != NULL); @@ -2499,80 +2434,62 @@ loop: goto loop; } -/** Check if purge should stop. -@param[in] n_purged pages purged in the last batch -@return whether purge should exit */ -static -bool -srv_purge_should_exit(ulint n_purged) +/** @return whether purge should exit due to shutdown */ +static bool srv_purge_should_exit() { - ut_ad(srv_shutdown_state <= SRV_SHUTDOWN_CLEANUP); - - if (srv_undo_sources) { - return(false); - } - if (srv_fast_shutdown) { - return(true); - } - /* Slow shutdown was requested. */ - if (ulint history_size = n_purged ? trx_sys->rseg_history_len : 0) { - static time_t progress_time; - time_t now = time(NULL); - if (now - progress_time >= 15) { - progress_time = now; + ut_ad(srv_shutdown_state <= SRV_SHUTDOWN_CLEANUP); + + if (srv_undo_sources) + return false; + + if (srv_fast_shutdown) + return true; + + /* Slow shutdown was requested. */ + if (const ulint history_size= trx_sys.history_size()) + { + static time_t progress_time; + time_t now= time(NULL); + if (now - progress_time >= 15) + { + progress_time= now; #if defined HAVE_SYSTEMD && !defined EMBEDDED_LIBRARY - service_manager_extend_timeout( - INNODB_EXTEND_TIMEOUT_INTERVAL, - "InnoDB: to purge " ULINTPF " transactions", - history_size); + service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL, + "InnoDB: to purge %zu transactions", + history_size); + ib::info() << "to purge " << history_size << " transactions"; #endif - ib::info() << "to purge " << history_size - << " transactions"; - } - /* The previous round still did some work. */ - return(false); - } - /* Exit if there are no active transactions to roll back. 
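srv_wake_purge_thread_if_not_active() above now releases a purge thread only when purge is enabled, not paused, currently idle, and the history list is non-empty. A sketch of that wake-up guard with stand-in state; none of these types exist in InnoDB:

    #include <atomic>
    #include <condition_variable>
    #include <mutex>

    struct toy_purge {
        std::atomic<bool>          enabled{true};
        std::atomic<bool>          paused{false};
        std::atomic<unsigned long> n_active_threads{0};
        std::atomic<unsigned long> history_size{0};
        std::mutex                 m;
        std::condition_variable    cv;

        void wake_if_not_active()
        {
            if (enabled.load() && !paused.load()
                && n_active_threads.load() == 0
                && history_size.load() != 0) {
                std::lock_guard<std::mutex> g(m);
                cv.notify_one();            // there is work and nobody on it
            }
        }
    };

    int main()
    {
        toy_purge purge_sys;
        purge_sys.history_size = 1;
        purge_sys.wake_if_not_active();     // would wake an idle worker
        return 0;
    }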
*/ - return(trx_sys_any_active_transactions() == 0); + } + return false; + } + + return !trx_sys.any_active_transactions(); } /*********************************************************************//** Fetch and execute a task from the work queue. @param [in,out] slot purge worker thread slot @return true if a task was executed */ -static -bool -srv_task_execute(ut_d(srv_slot_t *slot)) -/*==================*/ +static bool srv_task_execute(ut_d(srv_slot_t *slot)) { - que_thr_t* thr = NULL; - ut_ad(!srv_read_only_mode); - ut_a(srv_force_recovery < SRV_FORCE_NO_BACKGROUND); + ut_ad(srv_force_recovery < SRV_FORCE_NO_BACKGROUND); mutex_enter(&srv_sys.tasks_mutex); - if (UT_LIST_GET_LEN(srv_sys.tasks) > 0) { - - thr = UT_LIST_GET_FIRST(srv_sys.tasks); - + if (que_thr_t* thr = UT_LIST_GET_FIRST(srv_sys.tasks)) { ut_a(que_node_get_type(thr->child) == QUE_NODE_PURGE); - UT_LIST_REMOVE(srv_sys.tasks, thr); - } - - mutex_exit(&srv_sys.tasks_mutex); - - if (thr != NULL) { + mutex_exit(&srv_sys.tasks_mutex); ut_d(thr->thread_slot = slot); - que_run_threads(thr); - - my_atomic_addlint( - &purge_sys->n_completed, 1); + my_atomic_addlint(&purge_sys.n_completed, 1); + return true; } - return(thr != NULL); + ut_ad(UT_LIST_GET_LEN(srv_sys.tasks) == 0); + mutex_exit(&srv_sys.tasks_mutex); + return false; } /*********************************************************************//** @@ -2620,19 +2537,11 @@ DECLARE_THREAD(srv_worker_thread)( srv_wake_purge_thread_if_not_active(); } - - /* Note: we are checking the state without holding the - purge_sys->latch here. */ - } while (purge_sys->state != PURGE_STATE_EXIT); + } while (purge_sys.enabled()); srv_free_slot(slot); - rw_lock_x_lock(&purge_sys->latch); - - ut_a(!purge_sys->running); - ut_a(purge_sys->state == PURGE_STATE_EXIT); - - rw_lock_x_unlock(&purge_sys->latch); + ut_ad(!purge_sys.enabled()); #ifdef UNIV_DEBUG_THREAD_CREATION ib::info() << "Purge worker thread exiting, id " @@ -2650,15 +2559,12 @@ DECLARE_THREAD(srv_worker_thread)( /** Do the actual purge operation. @param[in,out] n_total_purged total number of purged pages -@param[in,out] slot purge coordinator thread slot @return length of history list before the last purge batch. */ -static -ulint -srv_do_purge(ulint* n_total_purged +static ulint srv_do_purge(ulint* n_total_purged #ifdef UNIV_DEBUG - , srv_slot_t *slot + , srv_slot_t* slot /*!< purge coordinator */ #endif -) + ) { ulint n_pages_purged; @@ -2682,7 +2588,7 @@ srv_do_purge(ulint* n_total_purged } do { - if (trx_sys->rseg_history_len > rseg_history_len + if (trx_sys.history_size() > rseg_history_len || (srv_max_purge_lag > 0 && rseg_history_len > srv_max_purge_lag)) { @@ -2711,32 +2617,34 @@ srv_do_purge(ulint* n_total_purged ut_a(n_use_threads <= n_threads); /* Take a snapshot of the history list before purge. 
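The rewritten srv_task_execute() above detaches one queued task while holding tasks_mutex and runs it only after releasing the mutex. The same shape with standard containers and hypothetical names:

    #include <deque>
    #include <functional>
    #include <mutex>

    static std::mutex                        tasks_mutex;
    static std::deque<std::function<void()>> tasks;

    bool task_execute()
    {
        std::function<void()> task;
        {
            std::lock_guard<std::mutex> g(tasks_mutex);
            if (tasks.empty())
                return false;               // nothing queued
            task = std::move(tasks.front());
            tasks.pop_front();
        }
        task();                             // run without holding the mutex
        return true;
    }

    int main()
    {
        int done = 0;
        tasks.push_back([&done] { ++done; });
        while (task_execute()) {}
        return done == 1 ? 0 : 1;
    }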
*/ - if ((rseg_history_len = trx_sys->rseg_history_len) == 0) { + if (!(rseg_history_len = trx_sys.history_size())) { break; } ulint undo_trunc_freq = - purge_sys->undo_trunc.get_rseg_truncate_frequency(); + purge_sys.undo_trunc.get_rseg_truncate_frequency(); ulint rseg_truncate_frequency = ut_min( static_cast<ulint>(srv_purge_rseg_truncate_frequency), undo_trunc_freq); n_pages_purged = trx_purge( - n_use_threads, srv_purge_batch_size, + n_use_threads, (++count % rseg_truncate_frequency) == 0 #ifdef UNIV_DEBUG , slot #endif - ); + ); *n_total_purged += n_pages_purged; - } while (!srv_purge_should_exit(n_pages_purged) - && n_pages_purged > 0 - && purge_sys->state == PURGE_STATE_RUN); + } while (n_pages_purged > 0 && !purge_sys.paused() + && !srv_purge_should_exit()); return(rseg_history_len); } +#ifndef UNIV_DEBUG +# define srv_do_purge(n_total_purged, slot) srv_do_purge(n_total_purged) +#endif /*********************************************************************//** Suspend the purge coordinator thread. */ @@ -2760,34 +2668,25 @@ srv_purge_coordinator_suspend( int64_t sig_count = srv_suspend_thread(slot); do { - rw_lock_x_lock(&purge_sys->latch); - - purge_sys->running = false; - - rw_lock_x_unlock(&purge_sys->latch); - /* We don't wait right away on the the non-timed wait because we want to signal the thread that wants to suspend purge. */ const bool wait = stop - || rseg_history_len <= trx_sys->rseg_history_len; + || rseg_history_len <= trx_sys.history_size(); const bool timeout = srv_resume_thread( slot, sig_count, wait, stop ? 0 : SRV_PURGE_MAX_TIMEOUT); sig_count = srv_suspend_thread(slot); - rw_lock_x_lock(&purge_sys->latch); + rw_lock_x_lock(&purge_sys.latch); - stop = (srv_shutdown_state <= SRV_SHUTDOWN_INITIATED - && purge_sys->state == PURGE_STATE_STOP); + stop = srv_shutdown_state <= SRV_SHUTDOWN_INITIATED + && purge_sys.paused_latched(); if (!stop) { - ut_a(purge_sys->n_stop == 0); - purge_sys->running = true; - if (timeout - && rseg_history_len == trx_sys->rseg_history_len - && trx_sys->rseg_history_len < 5000) { + && rseg_history_len < 5000 + && rseg_history_len == trx_sys.history_size()) { /* No new records were added since the wait started. Simply wait for new records. The magic number 5000 is an @@ -2798,13 +2697,11 @@ srv_purge_coordinator_suspend( stop = true; } } else { - ut_a(purge_sys->n_stop > 0); - /* Signal that we are suspended. 
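The #ifndef UNIV_DEBUG macro added after srv_do_purge() keeps the debug-only slot argument at every call site while release builds compile the leaner signature. A stand-alone sketch of the idiom; TOY_DEBUG and all other names are hypothetical:

    #include <cassert>
    #include <cstdio>

    struct slot_t { int id; };

    #ifdef TOY_DEBUG
    static unsigned long do_purge(unsigned long* n_total, slot_t* slot)
    {
        assert(slot != nullptr);    // extra bookkeeping in debug builds only
    #else
    static unsigned long do_purge(unsigned long* n_total)
    {
    #endif
        *n_total += 1;              // pretend one batch was purged
        return 0;                   // remaining history length
    }

    #ifndef TOY_DEBUG
    // release builds drop the debug-only argument without touching callers
    # define do_purge(n_total, slot) do_purge(n_total)
    #endif

    int main()
    {
        unsigned long n = 0;
        slot_t slot{7};
        (void)slot;                 // only inspected when TOY_DEBUG is set
        do_purge(&n, &slot);        // identical call site in both builds
        std::printf("batches purged: %lu\n", n);
        return 0;
    }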
*/ - os_event_set(purge_sys->event); + os_event_set(purge_sys.event); } - rw_lock_x_unlock(&purge_sys->latch); + rw_lock_x_unlock(&purge_sys.latch); } while (stop && srv_undo_sources); srv_resume_thread(slot, 0, false); @@ -2827,15 +2724,9 @@ DECLARE_THREAD(srv_purge_coordinator_thread)( ut_ad(!srv_read_only_mode); ut_a(srv_n_purge_threads >= 1); - ut_a(trx_purge_state() == PURGE_STATE_INIT); ut_a(srv_force_recovery < SRV_FORCE_NO_BACKGROUND); - rw_lock_x_lock(&purge_sys->latch); - - purge_sys->running = true; - purge_sys->state = PURGE_STATE_RUN; - - rw_lock_x_unlock(&purge_sys->latch); + purge_sys.coordinator_startup(); #ifdef UNIV_PFS_THREAD pfs_register_thread(srv_purge_thread_key); @@ -2848,7 +2739,7 @@ DECLARE_THREAD(srv_purge_coordinator_thread)( slot = srv_reserve_slot(SRV_PURGE); - ulint rseg_history_len = trx_sys->rseg_history_len; + ulint rseg_history_len = trx_sys.history_size(); do { /* If there are no records to purge or the last @@ -2856,26 +2747,21 @@ DECLARE_THREAD(srv_purge_coordinator_thread)( if (srv_shutdown_state <= SRV_SHUTDOWN_INITIATED && srv_undo_sources - && (purge_sys->state == PURGE_STATE_STOP - || n_total_purged == 0)) { + && (n_total_purged == 0 || purge_sys.paused())) { srv_purge_coordinator_suspend(slot, rseg_history_len); } ut_ad(!slot->suspended); - if (srv_purge_should_exit(n_total_purged)) { + if (srv_purge_should_exit()) { break; } n_total_purged = 0; - rseg_history_len = srv_do_purge(&n_total_purged -#ifdef UNIV_DEBUG - , slot -#endif - ); - } while (!srv_purge_should_exit(n_total_purged)); + rseg_history_len = srv_do_purge(&n_total_purged, slot); + } while (!srv_purge_should_exit()); /* The task queue should always be empty, independent of fast shutdown state. */ @@ -2884,20 +2770,17 @@ DECLARE_THREAD(srv_purge_coordinator_thread)( srv_free_slot(slot); /* Note that we are shutting down. */ - rw_lock_x_lock(&purge_sys->latch); - - purge_sys->state = PURGE_STATE_EXIT; + rw_lock_x_lock(&purge_sys.latch); + purge_sys.coordinator_shutdown(); /* If there are any pending undo-tablespace truncate then clear it off as we plan to shutdown the purge thread. */ - purge_sys->undo_trunc.clear(); - - purge_sys->running = false; + purge_sys.undo_trunc.clear(); - /* Ensure that the wait in trx_purge_stop() will terminate. */ - os_event_set(purge_sys->event); + /* Ensure that the wait in purge_sys_t::stop() will terminate. */ + os_event_set(purge_sys.event); - rw_lock_x_unlock(&purge_sys->latch); + rw_lock_x_unlock(&purge_sys.latch); #ifdef UNIV_DEBUG_THREAD_CREATION ib::info() << "Purge coordinator exiting, id " diff --git a/storage/innobase/srv/srv0start.cc b/storage/innobase/srv/srv0start.cc index 9548730b359..7314fd60cd6 100644 --- a/storage/innobase/srv/srv0start.cc +++ b/storage/innobase/srv/srv0start.cc @@ -3,7 +3,7 @@ Copyright (c) 1996, 2017, Oracle and/or its affiliates. All rights reserved. Copyright (c) 2008, Google Inc. Copyright (c) 2009, Percona Inc. -Copyright (c) 2013, 2020, MariaDB Corporation. +Copyright (c) 2013, 2021, MariaDB Corporation. Portions of this file contain modifications contributed and copyrighted by Google, Inc. 
Those modifications are gratefully acknowledged and are described @@ -79,7 +79,6 @@ Created 2/16/1996 Heikki Tuuri #include "os0proc.h" #include "buf0flu.h" #include "buf0rea.h" -#include "buf0mtflu.h" #include "dict0boot.h" #include "dict0load.h" #include "dict0stats_bg.h" @@ -129,7 +128,7 @@ bool srv_sys_tablespaces_open; bool srv_was_started; /** The original value of srv_log_file_size (innodb_log_file_size) */ static ulonglong srv_log_file_size_requested; -/** TRUE if innobase_start_or_create_for_mysql() has been called */ +/** whether srv_start() has been called */ static bool srv_start_has_been_called; /** Whether any undo log records can be generated */ @@ -180,9 +179,7 @@ static ulint n[SRV_MAX_N_IO_THREADS + 6]; /** io_handler_thread identifiers, 32 is the maximum number of purge threads */ /** 6 is the ? */ #define START_OLD_THREAD_CNT (SRV_MAX_N_IO_THREADS + 6 + 32) -static os_thread_id_t thread_ids[SRV_MAX_N_IO_THREADS + 6 + 32 + MTFLUSH_MAX_WORKER]; -/* Thread contex data for multi-threaded flush */ -void *mtflush_ctx=NULL; +static os_thread_id_t thread_ids[SRV_MAX_N_IO_THREADS + 6 + 32]; /** Thead handles */ static os_thread_t thread_handles[SRV_MAX_N_IO_THREADS + 6 + 32]; @@ -193,9 +190,6 @@ static bool thread_started[SRV_MAX_N_IO_THREADS + 6 + 32] = {false}; /** Name of srv_monitor_file */ static char* srv_monitor_file_name; -/** Minimum expected tablespace size. (10M) */ -static const ulint MIN_EXPECTED_TABLESPACE_SIZE = 5 * 1024 * 1024; - /** */ #define SRV_MAX_N_PENDING_SYNC_IOS 100 @@ -303,7 +297,7 @@ DECLARE_THREAD(io_handler_thread)( #endif /* For read only mode, we don't need ibuf and log I/O thread. - Please see innobase_start_or_create_for_mysql() */ + Please see srv_start() */ ulint start = (srv_read_only_mode) ? 0 : 2; if (segment < start) { @@ -481,7 +475,7 @@ create_log_files( false, false); } - log_init(srv_n_log_files); + log_sys.log.create(srv_n_log_files); if (!log_set_capacity(srv_log_file_size_requested)) { return(DB_ERROR); } @@ -490,30 +484,31 @@ create_log_files( /* Create a log checkpoint. 
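Here, as throughout this diff (log_sys, lock_sys, fil_system, trx_sys, purge_sys), a heap-allocated singleton reached through a pointer becomes a statically allocated object with create()/close() members, so callers write log_sys.lsn instead of log_sys->lsn and shutdown no longer needs null checks. A minimal sketch of the pattern with hypothetical names:

    #include <cassert>
    #include <cstddef>
    #include <vector>

    class toy_log_sys {
    public:
        void create(std::size_t buf_bytes)
        {
            assert(!initialised_);
            buf_.assign(buf_bytes, 0);
            initialised_ = true;
        }
        void close() { buf_.clear(); initialised_ = false; }
        bool is_initialised() const { return initialised_; }
    private:
        std::vector<unsigned char> buf_;
        bool initialised_ = false;
    };

    // one static instance instead of "toy_log_sys* log_sys = new ..."
    static toy_log_sys log_sys;

    int main()
    {
        log_sys.create(16 * 1024);
        assert(log_sys.is_initialised());   // callers use '.', not '->'
        log_sys.close();
        return 0;
    }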
*/ log_mutex_enter(); - if (log_sys->is_encrypted() && !log_crypt_init()) { + if (log_sys.is_encrypted() && !log_crypt_init()) { return DB_ERROR; } ut_d(recv_no_log_write = false); - log_sys->lsn = ut_uint64_align_up(lsn, OS_FILE_LOG_BLOCK_SIZE); + log_sys.lsn = ut_uint64_align_up(lsn, OS_FILE_LOG_BLOCK_SIZE); - log_sys->log.lsn = log_sys->lsn; - log_sys->log.lsn_offset = LOG_FILE_HDR_SIZE; + log_sys.log.set_lsn(log_sys.lsn); + log_sys.log.set_lsn_offset(LOG_FILE_HDR_SIZE); - log_sys->buf_next_to_write = 0; - log_sys->write_lsn = log_sys->lsn; + log_sys.buf_next_to_write = 0; + log_sys.write_lsn = log_sys.lsn; - log_sys->next_checkpoint_no = 0; - log_sys->last_checkpoint_lsn = 0; + log_sys.next_checkpoint_no = 0; + log_sys.last_checkpoint_lsn = 0; - memset(log_sys->buf, 0, log_sys->buf_size); - log_block_init(log_sys->buf, log_sys->lsn); - log_block_set_first_rec_group(log_sys->buf, LOG_BLOCK_HDR_SIZE); + memset(log_sys.buf, 0, srv_log_buffer_size); + log_block_init(log_sys.buf, log_sys.lsn); + log_block_set_first_rec_group(log_sys.buf, LOG_BLOCK_HDR_SIZE); + memset(log_sys.flush_buf, 0, srv_log_buffer_size); - log_sys->buf_free = LOG_BLOCK_HDR_SIZE; - log_sys->lsn += LOG_BLOCK_HDR_SIZE; + log_sys.buf_free = LOG_BLOCK_HDR_SIZE; + log_sys.lsn += LOG_BLOCK_HDR_SIZE; MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE, - (log_sys->lsn - log_sys->last_checkpoint_lsn)); + (log_sys.lsn - log_sys.last_checkpoint_lsn)); log_mutex_exit(); log_make_checkpoint(); @@ -627,13 +622,13 @@ srv_undo_tablespace_create( " be created"; ib::info() << "Setting file " << name << " size to " - << (size >> (20 - UNIV_PAGE_SIZE_SHIFT)) << " MB"; + << (size >> (20 - srv_page_size_shift)) << " MB"; ib::info() << "Database physically writes the file full: " << "wait..."; ret = os_file_set_size( - name, fh, os_offset_t(size) << UNIV_PAGE_SIZE_SHIFT); + name, fh, os_offset_t(size) << srv_page_size_shift); if (!ret) { ib::info() << "Error in creating " << name @@ -691,7 +686,7 @@ static bool srv_undo_tablespace_open(const char* name, ulint space_id, fil_node_t* file = space->add(name, fh, 0, false, true); - mutex_enter(&fil_system->mutex); + mutex_enter(&fil_system.mutex); if (create_new_db) { space->size = file->size = ulint(size >> srv_page_size_shift); @@ -702,12 +697,12 @@ static bool srv_undo_tablespace_open(const char* name, ulint space_id, if (!success) { os_file_close(file->handle); file->handle = OS_FILE_CLOSED; - ut_a(fil_system->n_open > 0); - fil_system->n_open--; + ut_a(fil_system.n_open > 0); + fil_system.n_open--; } } - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); return success; } @@ -721,7 +716,7 @@ dberr_t srv_check_undo_redo_logs_exists() { bool ret; - os_file_t fh; + pfs_os_file_t fh; char name[OS_FILE_MAX_PATH]; /* Check if any undo tablespaces exist */ @@ -1007,24 +1002,19 @@ srv_undo_tablespaces_init(bool create_new_db) if (create_new_db) { mtr_t mtr; - mtr_start(&mtr); - - /* The undo log tablespace */ for (i = 0; i < n_undo_tablespaces; ++i) { - - fsp_header_init( - undo_tablespace_ids[i], - SRV_UNDO_TABLESPACE_SIZE_IN_PAGES, &mtr); + mtr.start(); + fsp_header_init(fil_space_get(undo_tablespace_ids[i]), + SRV_UNDO_TABLESPACE_SIZE_IN_PAGES, + &mtr); + mtr.commit(); } - - mtr_commit(&mtr); } if (!undo::Truncate::s_fix_up_spaces.empty()) { /* Step-1: Initialize the tablespace header and rsegs header. */ mtr_t mtr; - trx_sysf_t* sys_header; mtr_start(&mtr); /* Turn off REDO logging. 
We are in server start mode and fixing @@ -1033,7 +1023,11 @@ srv_undo_tablespaces_init(bool create_new_db) as part of the current recovery process. We surely don't need that as this is fix-up action parallel to REDO logging. */ mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO); - sys_header = trx_sysf_get(&mtr); + buf_block_t* sys_header = trx_sysf_get(&mtr); + if (!sys_header) { + mtr.commit(); + return DB_CORRUPTION; + } for (undo::undo_spaces_t::const_iterator it = undo::Truncate::s_fix_up_spaces.begin(); @@ -1042,19 +1036,17 @@ srv_undo_tablespaces_init(bool create_new_db) undo::Truncate::add_space_to_trunc_list(*it); - fsp_header_init( - *it, SRV_UNDO_TABLESPACE_SIZE_IN_PAGES, &mtr); + fil_space_t* space = fil_space_get(*it); - mtr_x_lock_space(*it, &mtr); + fsp_header_init(space, + SRV_UNDO_TABLESPACE_SIZE_IN_PAGES, + &mtr); for (ulint i = 0; i < TRX_SYS_N_RSEGS; i++) { - - ulint space_id = trx_sysf_rseg_get_space( - sys_header, i, &mtr); - - if (space_id == *it) { + if (trx_sysf_rseg_get_space(sys_header, i) + == *it) { trx_rseg_header_create( - *it, ULINT_MAX, i, &mtr); + space, i, sys_header, &mtr); } } @@ -1067,9 +1059,9 @@ srv_undo_tablespaces_init(bool create_new_db) = undo::Truncate::s_fix_up_spaces.begin(); it != undo::Truncate::s_fix_up_spaces.end(); ++it) { - FlushObserver dummy(TRX_SYS_SPACE, NULL, NULL); + FlushObserver dummy(fil_system.sys_space, NULL, NULL); buf_LRU_flush_or_remove_pages(TRX_SYS_SPACE, &dummy); - FlushObserver dummy2(*it, NULL, NULL); + FlushObserver dummy2(fil_space_get(*it), NULL, NULL); buf_LRU_flush_or_remove_pages(*it, &dummy2); /* Remove the truncate redo log file. */ @@ -1080,41 +1072,6 @@ srv_undo_tablespaces_init(bool create_new_db) return(DB_SUCCESS); } -/******************************************************************** -Wait for the purge thread(s) to start up. */ -static -void -srv_start_wait_for_purge_to_start() -/*===============================*/ -{ - /* Wait for the purge coordinator and master thread to startup. */ - - purge_state_t state = trx_purge_state(); - - ut_a(state != PURGE_STATE_DISABLED); - - while (srv_shutdown_state <= SRV_SHUTDOWN_INITIATED - && srv_force_recovery < SRV_FORCE_NO_BACKGROUND - && state == PURGE_STATE_INIT) { - - switch (state = trx_purge_state()) { - case PURGE_STATE_RUN: - case PURGE_STATE_STOP: - break; - - case PURGE_STATE_INIT: - ib::info() << "Waiting for purge to start"; - - os_thread_sleep(50000); - break; - - case PURGE_STATE_EXIT: - case PURGE_STATE_DISABLED: - ut_error; - } - } -} - /** Create the temporary file tablespace. @param[in] create_new_db whether we are creating a new database @return DB_SUCCESS or error code. 
*/ @@ -1140,47 +1097,30 @@ srv_open_tmp_tablespace(bool create_new_db) &create_new_temp_space, 12 * 1024 * 1024); if (err == DB_FAIL) { - - ib::error() << "The " << srv_tmp_space.name() - << " data file must be writable!"; - + ib::error() << "The innodb_temporary" + " data file must be writable!"; err = DB_ERROR; - } else if (err != DB_SUCCESS) { - ib::error() << "Could not create the shared " - << srv_tmp_space.name() << "."; - + ib::error() << "Could not create the shared innodb_temporary."; } else if ((err = srv_tmp_space.open_or_create( true, create_new_db, &sum_of_new_sizes, NULL)) != DB_SUCCESS) { - - ib::error() << "Unable to create the shared " - << srv_tmp_space.name(); - + ib::error() << "Unable to create the shared innodb_temporary"; + } else if (fil_system.temp_space->open()) { + /* Initialize the header page */ + mtr_t mtr; + mtr.start(); + mtr.set_log_mode(MTR_LOG_NO_REDO); + fsp_header_init(fil_system.temp_space, + srv_tmp_space.get_sum_of_sizes(), + &mtr); + mtr.commit(); } else { - - mtr_t mtr; - ulint size = srv_tmp_space.get_sum_of_sizes(); - - /* Open this shared temp tablespace in the fil_system so that - it stays open until shutdown. */ - if (fil_space_open(srv_tmp_space.name())) { - - /* Initialize the header page */ - mtr_start(&mtr); - mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO); - - fsp_header_init(SRV_TMP_SPACE_ID, size, &mtr); - - mtr_commit(&mtr); - } else { - /* This file was just opened in the code above! */ - ib::error() << "The " << srv_tmp_space.name() - << " data file cannot be re-opened" - " after check_file_spec() succeeded!"; - - err = DB_ERROR; - } + /* This file was just opened in the code above! */ + ib::error() << "The innodb_temporary" + " data file cannot be re-opened" + " after check_file_spec() succeeded!"; + err = DB_ERROR; } return(err); @@ -1195,7 +1135,7 @@ srv_start_state_set( srv_start_state_t state) /*!< in: indicate current state of thread startup */ { - srv_start_state |= state; + srv_start_state |= ulint(state); } /****************************************************************//** @@ -1207,7 +1147,7 @@ srv_start_state_is_set( /*===================*/ srv_start_state_t state) /*!< in: state to check for */ { - return(srv_start_state & state); + return(srv_start_state & ulint(state)); } /** @@ -1228,7 +1168,7 @@ srv_shutdown_all_bg_threads() if (srv_start_state_is_set(SRV_START_STATE_LOCK_SYS)) { /* a. 
Let the lock timeout thread exit */ - os_event_set(lock_sys->timeout_event); + os_event_set(lock_sys.timeout_event); } if (!srv_read_only_mode) { @@ -1267,10 +1207,6 @@ srv_shutdown_all_bg_threads() } os_event_set(buf_flush_event); - - if (srv_use_mtflush) { - buf_mtflu_io_thread_exit(); - } } if (!os_thread_count) { @@ -1341,6 +1277,7 @@ srv_init_abort_low( " with error " << err; } + srv_shutdown_bg_undo_sources(); srv_shutdown_all_bg_threads(); return(err); } @@ -1360,17 +1297,10 @@ srv_prepare_to_delete_redo_log_files( ulint pending_io = 0; ulint count = 0; - if (srv_safe_truncate) { - if ((log_sys->log.format & ~LOG_HEADER_FORMAT_ENCRYPTED) - != LOG_HEADER_FORMAT_10_3 - || log_sys->log.subformat != 1) { - srv_log_file_size = 0; - } - } else { - if ((log_sys->log.format & ~LOG_HEADER_FORMAT_ENCRYPTED) - != LOG_HEADER_FORMAT_10_2) { - srv_log_file_size = 0; - } + if ((log_sys.log.format & ~LOG_HEADER_FORMAT_ENCRYPTED) + != LOG_HEADER_FORMAT_CURRENT + || log_sys.log.subformat != 2) { + srv_log_file_size = 0; } do { @@ -1382,23 +1312,23 @@ srv_prepare_to_delete_redo_log_files( log_mutex_enter(); - fil_names_clear(log_sys->lsn, false); + fil_names_clear(log_sys.lsn, false); - flushed_lsn = log_sys->lsn; + flushed_lsn = log_sys.lsn; { ib::info info; if (srv_log_file_size == 0) { - info << ((log_sys->log.format + info << ((log_sys.log.format & ~LOG_HEADER_FORMAT_ENCRYPTED) - < LOG_HEADER_FORMAT_10_3 + != LOG_HEADER_FORMAT_10_4 ? "Upgrading redo log: " : "Downgrading redo log: "); } else if (n_files != srv_n_log_files || srv_log_file_size != srv_log_file_size_requested) { if (srv_encrypt_log - == log_sys->is_encrypted()) { + == (my_bool)log_sys.is_encrypted()) { info << (srv_encrypt_log ? "Resizing encrypted" : "Resizing"); @@ -1456,14 +1386,11 @@ srv_prepare_to_delete_redo_log_files( DBUG_RETURN(flushed_lsn); } -/******************************************************************** -Starts InnoDB and creates a new database if database files -are not found and the user wants. +/** Start InnoDB. +@param[in] create_new_db whether to create a new database @return DB_SUCCESS or error code */ -dberr_t -innobase_start_or_create_for_mysql() +dberr_t srv_start(bool create_new_db) { - bool create_new_db = false; lsn_t flushed_lsn; dberr_t err = DB_SUCCESS; ulint srv_n_log_files_found = srv_n_log_files; @@ -1476,6 +1403,7 @@ innobase_start_or_create_for_mysql() ut_ad(srv_operation == SRV_OPERATION_NORMAL || is_mariabackup_restore_or_export()); + if (srv_force_recovery == SRV_FORCE_NO_LOG_REDO) { srv_read_only_mode = true; } @@ -1487,15 +1415,6 @@ innobase_start_or_create_for_mysql() /* Reset the start state. */ srv_start_state = SRV_START_STATE_NONE; - if (srv_read_only_mode) { - ib::info() << "Started in read only mode"; - - /* There is no write to InnoDB tablespaces (not even - temporary ones, because also CREATE TEMPORARY TABLE is - refused in read-only mode). */ - srv_use_doublewrite_buf = FALSE; - } - compile_time_assert(sizeof(ulint) == sizeof(void*)); #ifdef UNIV_DEBUG @@ -1547,62 +1466,10 @@ innobase_start_or_create_for_mysql() srv_is_being_started = true; -#ifdef _WIN32 - srv_use_native_aio = TRUE; - -#elif defined(LINUX_NATIVE_AIO) - - if (srv_use_native_aio) { - ib::info() << "Using Linux native AIO"; - } -#else - /* Currently native AIO is supported only on windows and linux - and that also when the support is compiled in. In all other - cases, we ignore the setting of innodb_use_native_aio. 
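srv_prepare_to_delete_redo_log_files() above masks the encryption bit out of the stored redo log format and compares the rest against LOG_HEADER_FORMAT_CURRENT and subformat 2; any mismatch forces the log files to be rebuilt by zeroing srv_log_file_size. The sketch below shows only the masked comparison; the constant values are illustrative, not the real header values:

    #include <cstdint>
    #include <cstdio>

    static const std::uint32_t FMT_ENCRYPTED = 1u << 31;  // flag bit (illustrative)
    static const std::uint32_t FMT_CURRENT   = 103;       // format id (illustrative)

    bool must_rebuild_logs(std::uint32_t format, std::uint32_t subformat)
    {
        // encryption only sets a flag bit, so mask it away before comparing
        return (format & ~FMT_ENCRYPTED) != FMT_CURRENT || subformat != 2;
    }

    int main()
    {
        std::printf("%d\n", must_rebuild_logs(FMT_CURRENT | FMT_ENCRYPTED, 2)); // 0
        std::printf("%d\n", must_rebuild_logs(FMT_CURRENT, 1));                 // 1
        return 0;
    }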
*/ - srv_use_native_aio = FALSE; -#endif /* _WIN32 */ - /* Register performance schema stages before any real work has been started which may need to be instrumented. */ mysql_stage_register("innodb", srv_stages, UT_ARR_SIZE(srv_stages)); - if (srv_file_flush_method_str == NULL) { - /* These are the default options */ - srv_file_flush_method = IF_WIN(SRV_ALL_O_DIRECT_FSYNC,SRV_FSYNC); - } else if (0 == ut_strcmp(srv_file_flush_method_str, "fsync")) { - srv_file_flush_method = SRV_FSYNC; - - } else if (0 == ut_strcmp(srv_file_flush_method_str, "O_DSYNC")) { - srv_file_flush_method = SRV_O_DSYNC; - - } else if (0 == ut_strcmp(srv_file_flush_method_str, "O_DIRECT")) { - srv_file_flush_method = SRV_O_DIRECT; - - } else if (0 == ut_strcmp(srv_file_flush_method_str, "O_DIRECT_NO_FSYNC")) { - srv_file_flush_method = SRV_O_DIRECT_NO_FSYNC; - - } else if (0 == ut_strcmp(srv_file_flush_method_str, "littlesync")) { - srv_file_flush_method = SRV_LITTLESYNC; - - } else if (0 == ut_strcmp(srv_file_flush_method_str, "nosync")) { - srv_file_flush_method = SRV_NOSYNC; -#ifdef _WIN32 - } else if (0 == ut_strcmp(srv_file_flush_method_str, "normal")) { - srv_file_flush_method = SRV_FSYNC; - } else if (0 == ut_strcmp(srv_file_flush_method_str, "unbuffered")) { - } else if (0 == ut_strcmp(srv_file_flush_method_str, - "async_unbuffered")) { -#endif /* _WIN32 */ - } else { - ib::error() << "Unrecognized value " - << srv_file_flush_method_str - << " for innodb_flush_method"; - err = DB_ERROR; - } - - /* Note that the call srv_boot() also changes the values of - some variables to the units used by InnoDB internally */ - /* Set the maximum number of threads which can wait for a semaphore inside InnoDB: this is the 'sync wait array' size, as well as the maximum number of threads that can wait in the 'srv_conc array' for @@ -1619,7 +1486,7 @@ innobase_start_or_create_for_mysql() + 1 /* dict_stats_thread */ + 1 /* fts_optimize_thread */ + 1 /* recv_writer_thread */ - + 1 /* trx_rollback_or_clean_all_recovered */ + + 1 /* trx_rollback_all_recovered */ + 128 /* added as margin, for use of InnoDB Memcached etc. */ + max_connections @@ -1631,65 +1498,6 @@ innobase_start_or_create_for_mysql() + fts_sort_pll_degree * FTS_NUM_AUX_INDEX * max_connections; - if (srv_buf_pool_size >= BUF_POOL_SIZE_THRESHOLD) { - - if (srv_buf_pool_instances == srv_buf_pool_instances_default) { -#if defined(_WIN32) && !defined(_WIN64) - /* Do not allocate too large of a buffer pool on - Windows 32-bit systems, which can have trouble - allocating larger single contiguous memory blocks. */ - srv_buf_pool_size = static_cast<ulint>(ut_uint64_align_up(srv_buf_pool_size, srv_buf_pool_chunk_unit)); - srv_buf_pool_instances = ut_min( - static_cast<ulong>(MAX_BUFFER_POOLS), - static_cast<ulong>(srv_buf_pool_size / srv_buf_pool_chunk_unit)); -#else /* defined(_WIN32) && !defined(_WIN64) */ - /* Default to 8 instances when size > 1GB. */ - srv_buf_pool_instances = 8; -#endif /* defined(_WIN32) && !defined(_WIN64) */ - } - } else { - /* If buffer pool is less than 1 GiB, assume fewer - threads. Also use only one buffer pool instance. */ - if (srv_buf_pool_instances != srv_buf_pool_instances_default - && srv_buf_pool_instances != 1) { - /* We can't distinguish whether the user has explicitly - started mysqld with --innodb-buffer-pool-instances=0, - (srv_buf_pool_instances_default is 0) or has not - specified that option at all. Thus we have the - limitation that if the user started with =0, we - will not emit a warning here, but we should actually - do so. 
*/ - ib::info() - << "Adjusting innodb_buffer_pool_instances" - " from " << srv_buf_pool_instances << " to 1" - " since innodb_buffer_pool_size is less than " - << BUF_POOL_SIZE_THRESHOLD / (1024 * 1024) - << " MiB"; - } - - srv_buf_pool_instances = 1; - } - - if (srv_buf_pool_chunk_unit * srv_buf_pool_instances - > srv_buf_pool_size) { - /* Size unit of buffer pool is larger than srv_buf_pool_size. - adjust srv_buf_pool_chunk_unit for srv_buf_pool_size. */ - srv_buf_pool_chunk_unit - = static_cast<ulong>(srv_buf_pool_size) - / srv_buf_pool_instances; - if (srv_buf_pool_size % srv_buf_pool_instances != 0) { - ++srv_buf_pool_chunk_unit; - } - } - - srv_buf_pool_size = buf_pool_size_align(srv_buf_pool_size); - - if (srv_n_page_cleaners > srv_buf_pool_instances) { - /* limit of page_cleaner parallelizability - is number of buffer pool instances. */ - srv_n_page_cleaners = srv_buf_pool_instances; - } - srv_boot(); ib::info() << ut_crc32_implementation; @@ -1724,7 +1532,7 @@ innobase_start_or_create_for_mysql() } else { srv_monitor_file_name = NULL; - srv_monitor_file = os_file_create_tmpfile(NULL); + srv_monitor_file = os_file_create_tmpfile(); if (!srv_monitor_file && err == DB_SUCCESS) { err = DB_ERROR; @@ -1734,7 +1542,7 @@ innobase_start_or_create_for_mysql() mutex_create(LATCH_ID_SRV_MISC_TMPFILE, &srv_misc_tmpfile_mutex); - srv_misc_tmpfile = os_file_create_tmpfile(NULL); + srv_misc_tmpfile = os_file_create_tmpfile(); if (!srv_misc_tmpfile && err == DB_SUCCESS) { err = DB_ERROR; @@ -1768,7 +1576,7 @@ innobase_start_or_create_for_mysql() return(srv_init_abort(DB_ERROR)); } - fil_init(srv_file_per_table ? 50000 : 5000, srv_max_n_open_files); + fil_system.create(srv_file_per_table ? 50000 : 5000); double size; char unit; @@ -1819,11 +1627,9 @@ innobase_start_or_create_for_mysql() } #endif /* UNIV_DEBUG */ - fsp_init(); - log_sys_init(); - + log_sys.create(); recv_sys_init(); - lock_sys_create(srv_lock_table_size); + lock_sys.create(srv_lock_table_size); /* Create i/o-handler threads: */ @@ -1842,9 +1648,10 @@ innobase_start_or_create_for_mysql() os_thread_create(buf_flush_page_cleaner_coordinator, NULL, NULL); - for (i = 1; i < srv_n_page_cleaners; ++i) { - os_thread_create(buf_flush_page_cleaner_worker, - NULL, NULL); + /* Create page cleaner workers if needed. For example + mariabackup could set srv_n_page_cleaners = 0. */ + if (srv_n_page_cleaners > 1) { + buf_flush_set_page_cleaner_thread_cnt(srv_n_page_cleaners); } #ifdef UNIV_LINUX @@ -1854,25 +1661,6 @@ innobase_start_or_create_for_mysql() srv_start_state_set(SRV_START_STATE_IO); } - if (srv_n_log_files * srv_log_file_size >= log_group_max_size) { - /* Log group size is limited by the size of page number. Remove this - limitation when fil_io() is not used for recovery log io. */ - ib::error() << "Combined size of log files must be < " - << log_group_max_size; - - return(srv_init_abort(DB_ERROR)); - } - - os_normalize_path(srv_data_home); - - /* Check if the data files exist or not. 
*/ - err = srv_sys_space.check_file_spec( - &create_new_db, MIN_EXPECTED_TABLESPACE_SIZE); - - if (err != DB_SUCCESS) { - return(srv_init_abort(DB_ERROR)); - } - srv_startup_is_before_trx_rollback_phase = !create_new_db; /* Check if undo tablespaces and redo log files exist before creating @@ -2074,7 +1862,7 @@ innobase_start_or_create_for_mysql() false, false); } - log_init(srv_n_log_files_found); + log_sys.log.create(srv_n_log_files_found); if (!log_set_capacity(srv_log_file_size_requested)) { return(srv_init_abort(DB_ERROR)); @@ -2087,7 +1875,7 @@ files_checked: shutdown */ fil_open_log_and_system_tablespace_files(); - ut_d(fil_space_get(0)->recv_size = srv_sys_space_size_debug); + ut_d(fil_system.sys_space->recv_size = srv_sys_space_size_debug); err = srv_undo_tablespaces_init(create_new_db); @@ -2106,24 +1894,20 @@ files_checked: dict_stats_thread_init(); } - trx_sys_file_format_init(); - - trx_sys_create(); + trx_sys.create(); if (create_new_db) { ut_a(!srv_read_only_mode); mtr_start(&mtr); - - fsp_header_init(0, sum_of_new_sizes, &mtr); - + ut_ad(fil_system.sys_space->id == 0); compile_time_assert(TRX_SYS_SPACE == 0); compile_time_assert(IBUF_SPACE_ID == 0); + fsp_header_init(fil_system.sys_space, sum_of_new_sizes, &mtr); ulint ibuf_root = btr_create( - DICT_CLUSTERED | DICT_IBUF, - 0, univ_page_size, DICT_IBUF_ID_MIN, - dict_ind_redundant, NULL, &mtr); + DICT_CLUSTERED | DICT_IBUF, fil_system.sys_space, + DICT_IBUF_ID_MIN, dict_ind_redundant, NULL, &mtr); mtr_commit(&mtr); @@ -2138,7 +1922,7 @@ files_checked: All the remaining rollback segments will be created later, after the double write buffer has been created. */ trx_sys_create_sys_pages(); - trx_sys_init_at_db_start(); + trx_lists_init_at_db_start(); err = dict_create(); @@ -2162,26 +1946,6 @@ files_checked: return(srv_init_abort(err)); } } else { - - /* Check if we support the max format that is stamped - on the system tablespace. - Note: We are NOT allowed to make any modifications to - the TRX_SYS_PAGE_NO page before recovery because this - page also contains the max_trx_id etc. important system - variables that are required for recovery. We need to - ensure that we return the system to a state where normal - recovery is guaranteed to work. We do this by - invalidating the buffer cache, this will force the - reread of the page and restoration to its last known - consistent state, this is REQUIRED for the recovery - process to work. */ - err = trx_sys_file_format_max_check( - srv_max_file_format_at_startup); - - if (err != DB_SUCCESS) { - return(srv_init_abort(err)); - } - /* Invalidate the buffer pool to ensure that we reread the page that we read above, during recovery. Note that this is not as heavy weight as it seems. At @@ -2218,19 +1982,15 @@ files_checked: if (err != DB_SUCCESS) { return(srv_init_abort(err)); } + /* fall through */ + case SRV_OPERATION_RESTORE: /* This must precede recv_apply_hashed_log_recs(true). */ - trx_sys_init_at_db_start(); + trx_lists_init_at_db_start(); break; case SRV_OPERATION_RESTORE_DELTA: case SRV_OPERATION_BACKUP: ut_ad(!"wrong mariabackup mode"); - /* fall through */ - case SRV_OPERATION_RESTORE: - /* mariabackup --prepare only deals with - the redo log and the data files, not with - transactions or the data dictionary. 
*/ - break; } if (srv_force_recovery < SRV_FORCE_NO_LOG_REDO) { @@ -2255,15 +2015,29 @@ files_checked: if (!srv_read_only_mode) { const ulint flags = FSP_FLAGS_PAGE_SSIZE(); for (ulint id = 0; id <= srv_undo_tablespaces; id++) { - if (fil_space_get(id)) { - fsp_flags_try_adjust(id, flags); + if (fil_space_t* space = fil_space_get(id)) { + fsp_flags_try_adjust(space, flags); } } if (sum_of_new_sizes > 0) { /* New data file(s) were added */ mtr.start(); - fsp_header_inc_size(0, sum_of_new_sizes, &mtr); + mtr.x_lock_space(fil_system.sys_space, + __FILE__, __LINE__); + buf_block_t* block = buf_page_get( + page_id_t(0, 0), univ_page_size, + RW_SX_LATCH, &mtr); + ulint size = mach_read_from_4( + FSP_HEADER_OFFSET + FSP_SIZE + + block->frame); + ut_ad(size == fil_system.sys_space + ->size_in_header); + size += sum_of_new_sizes; + mlog_write_ulint(FSP_HEADER_OFFSET + FSP_SIZE + + block->frame, size, + MLOG_4BYTES, &mtr); + fil_system.sys_space->size_in_header = size; mtr.commit(); /* Immediately write the log record about increased tablespace size to disk, so that it @@ -2273,8 +2047,20 @@ files_checked: } } +#ifdef UNIV_DEBUG + { + mtr.start(); + buf_block_t* block = buf_page_get(page_id_t(0, 0), + univ_page_size, + RW_S_LATCH, &mtr); + ut_ad(mach_read_from_4(FSP_SIZE + FSP_HEADER_OFFSET + + block->frame) + == fil_system.sys_space->size_in_header); + mtr.commit(); + } +#endif const ulint tablespace_size_in_header - = fsp_header_get_tablespace_size(); + = fil_system.sys_space->size_in_header; const ulint sum_of_data_file_sizes = srv_sys_space.get_sum_of_sizes(); /* Compare the system tablespace file size to what is @@ -2312,7 +2098,7 @@ files_checked: } /* recv_recovery_from_checkpoint_finish needs trx lists which - are initialized in trx_sys_init_at_db_start(). */ + are initialized in trx_lists_init_at_db_start(). */ recv_recovery_from_checkpoint_finish(); @@ -2329,12 +2115,11 @@ files_checked: err = fil_write_flushed_lsn(log_get_lsn()); ut_ad(!buf_pool_check_no_pending_io()); fil_close_log_files(true); - log_group_close_all(); if (err == DB_SUCCESS) { bool trunc = is_mariabackup_restore(); /* Delete subsequent log files. */ delete_log_files(logfilename, dirnamelen, - srv_n_log_files_found, trunc); + (uint)srv_n_log_files_found, trunc); if (trunc) { /* Truncate the first log file. */ strcpy(logfilename + dirnamelen, @@ -2356,17 +2141,12 @@ files_checked: /* Leave the redo log alone. */ } else if (srv_log_file_size_requested == srv_log_file_size && srv_n_log_files_found == srv_n_log_files - && log_sys->log.format - == (srv_safe_truncate - ? (srv_encrypt_log - ? LOG_HEADER_FORMAT_10_3 - | LOG_HEADER_FORMAT_ENCRYPTED - : LOG_HEADER_FORMAT_10_3) - : (srv_encrypt_log - ? LOG_HEADER_FORMAT_10_2 - | LOG_HEADER_FORMAT_ENCRYPTED - : LOG_HEADER_FORMAT_10_2)) - && log_sys->log.subformat == !!srv_safe_truncate) { + && log_sys.log.format + == (srv_encrypt_log + ? LOG_HEADER_FORMAT_CURRENT + | LOG_HEADER_FORMAT_ENCRYPTED + : LOG_HEADER_FORMAT_CURRENT) + && log_sys.log.subformat == 2) { /* No need to add or remove encryption, upgrade, downgrade, or resize. */ } else { @@ -2403,9 +2183,6 @@ files_checked: return(srv_init_abort(DB_ERROR));); DBUG_PRINT("ib_log", ("After innodb_log_abort_5")); - /* Free the old log file space. */ - log_group_close_all(); - ib::info() << "Starting to delete and rewrite log" " files."; @@ -2429,10 +2206,8 @@ files_checked: /* Validate a few system page types that were left uninitialized by older versions of MySQL. 
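Instead of calling a helper, the new code above reads the 4-byte FSP_SIZE field from page 0, adds sum_of_new_sizes and writes the result back under mini-transaction logging. The sketch below covers only the big-endian field handling; the offset is illustrative and no redo logging is modelled:

    #include <cassert>
    #include <cstddef>
    #include <cstdint>

    static std::uint32_t read_be32(const unsigned char* p)
    {
        return (std::uint32_t(p[0]) << 24) | (std::uint32_t(p[1]) << 16)
             | (std::uint32_t(p[2]) << 8)  |  std::uint32_t(p[3]);
    }

    static void write_be32(unsigned char* p, std::uint32_t v)
    {
        p[0] = (unsigned char)(v >> 24); p[1] = (unsigned char)(v >> 16);
        p[2] = (unsigned char)(v >> 8);  p[3] = (unsigned char)(v);
    }

    int main()
    {
        unsigned char page[64] = {};
        const std::size_t size_field = 8;       // illustrative offset only

        write_be32(page + size_field, 6400);    // current size, in pages
        std::uint32_t size = read_be32(page + size_field);
        size += 128;                            // sum_of_new_sizes
        write_be32(page + size_field, size);

        assert(read_be32(page + size_field) == 6528);
        return 0;
    }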
*/ if (!high_level_read_only) { - mtr_t mtr; buf_block_t* block; mtr.start(); - mtr.set_sys_modified(); /* Bitmap page types will be reset in buf_dblwr_check_block() without redo logging. */ block = buf_page_get( @@ -2465,7 +2240,7 @@ files_checked: The data dictionary latch should guarantee that there is at most one data dictionary transaction active at a time. */ if (srv_force_recovery < SRV_FORCE_NO_TRX_UNDO) { - trx_rollback_or_clean_recovered(FALSE); + trx_rollback_recovered(false); } /* Fix-up truncate of tables in the system tablespace @@ -2512,13 +2287,6 @@ files_checked: recv_recovery_rollback_active(); srv_startup_is_before_trx_rollback_phase = FALSE; - - /* It is possible that file_format tag has never - been set. In this case we initialize it to minimum - value. Important to note that we can do it ONLY after - we have finished the recovery process so that the - image of TRX_SYS_PAGE_NO is not stale. */ - trx_sys_file_format_tag_init(); } ut_ad(err == DB_SUCCESS); @@ -2559,8 +2327,9 @@ files_checked: lock_wait_timeout_thread, NULL, thread_ids + 2 + SRV_MAX_N_IO_THREADS); thread_started[2 + SRV_MAX_N_IO_THREADS] = true; - lock_sys->timeout_thread_active = true; + lock_sys.timeout_thread_active = true; + DBUG_EXECUTE_IF("innodb_skip_monitors", goto skip_monitors;); /* Create the thread which warns of long semaphore waits */ srv_error_monitor_active = true; thread_handles[3 + SRV_MAX_N_IO_THREADS] = os_thread_create( @@ -2576,6 +2345,24 @@ files_checked: thread_started[4 + SRV_MAX_N_IO_THREADS] = true; srv_start_state |= SRV_START_STATE_LOCK_SYS | SRV_START_STATE_MONITOR; + +#ifndef DBUG_OFF +skip_monitors: +#endif + ut_ad(srv_force_recovery >= SRV_FORCE_NO_UNDO_LOG_SCAN + || !purge_sys.enabled()); + + if (srv_force_recovery < SRV_FORCE_NO_BACKGROUND) { + srv_undo_sources = true; + /* Create the dict stats gathering thread */ + srv_dict_stats_thread_active = true; + dict_stats_thread_handle = os_thread_create( + dict_stats_thread, NULL, NULL); + + /* Create the thread that will optimize the + FULLTEXT search index subsystem. */ + fts_optimize_init(); + } } /* Create the SYS_FOREIGN and SYS_FOREIGN_COLS system tables */ @@ -2609,34 +2396,21 @@ files_checked: } trx_temp_rseg_create(); - } - - ut_a(trx_purge_state() == PURGE_STATE_INIT); - - /* Create the master thread which does purge and other utility - operations */ - if (!srv_read_only_mode - && srv_force_recovery < SRV_FORCE_NO_BACKGROUND) { - thread_handles[1 + SRV_MAX_N_IO_THREADS] = os_thread_create( - srv_master_thread, - NULL, thread_ids + (1 + SRV_MAX_N_IO_THREADS)); - thread_started[1 + SRV_MAX_N_IO_THREADS] = true; - srv_start_state_set(SRV_START_STATE_MASTER); + if (srv_force_recovery < SRV_FORCE_NO_BACKGROUND) { + thread_handles[1 + SRV_MAX_N_IO_THREADS] + = os_thread_create(srv_master_thread, NULL, + (1 + SRV_MAX_N_IO_THREADS) + + thread_ids); + thread_started[1 + SRV_MAX_N_IO_THREADS] = true; + srv_start_state_set(SRV_START_STATE_MASTER); + } } if (!srv_read_only_mode && (srv_operation == SRV_OPERATION_NORMAL || srv_operation == SRV_OPERATION_RESTORE_ROLLBACK_XA) && srv_force_recovery < SRV_FORCE_NO_BACKGROUND) { - srv_undo_sources = true; - /* Create the dict stats gathering thread */ - srv_dict_stats_thread_active = true; - dict_stats_thread_handle = os_thread_create( - dict_stats_thread, NULL, NULL); - - /* Create the thread that will optimize the FTS sub-system. 
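The DBUG_EXECUTE_IF("innodb_skip_monitors", ...) hook above jumps over the monitor-thread startup, and the skip_monitors: label is compiled only when the debug facility is available, so release builds carry no unused label. A stand-alone sketch of that debug-only-skip idiom; TOY_DBUG_* and keyword_set() are stand-ins for the real DBUG machinery:

    #include <cstdio>
    #include <cstring>

    #ifndef TOY_DBUG_OFF
    static bool keyword_set(const char* kw)
    { return std::strcmp(kw, "skip_monitors") == 0; }
    # define TOY_DBUG_EXECUTE_IF(kw, action) \
        do { if (keyword_set(kw)) { action } } while (0)
    #else
    # define TOY_DBUG_EXECUTE_IF(kw, action) do {} while (0)
    #endif

    static void start_monitor_threads() { std::puts("monitors started"); }

    int main()
    {
        TOY_DBUG_EXECUTE_IF("skip_monitors", goto skip_monitors;);

        start_monitor_threads();

    #ifndef TOY_DBUG_OFF
    skip_monitors:
    #endif
        std::puts("startup continues");
        return 0;
    }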
*/ - fts_optimize_init(); thread_handles[5 + SRV_MAX_N_IO_THREADS] = os_thread_create( srv_purge_coordinator_thread, @@ -2655,11 +2429,14 @@ files_checked: thread_started[5 + i + SRV_MAX_N_IO_THREADS] = true; } - srv_start_wait_for_purge_to_start(); + while (srv_shutdown_state <= SRV_SHUTDOWN_INITIATED + && srv_force_recovery < SRV_FORCE_NO_BACKGROUND + && !purge_sys.enabled()) { + ib::info() << "Waiting for purge to start"; + os_thread_sleep(50000); + } srv_start_state_set(SRV_START_STATE_PURGE); - } else { - purge_sys->state = PURGE_STATE_DISABLED; } srv_is_being_started = false; @@ -2667,25 +2444,13 @@ files_checked: if (!srv_read_only_mode) { /* wake main loop of page cleaner up */ os_event_set(buf_flush_event); - - if (srv_use_mtflush) { - /* Start multi-threaded flush threads */ - mtflush_ctx = buf_mtflu_handler_init( - srv_mtflush_threads, - srv_buf_pool_instances); - - /* Set up the thread ids */ - buf_mtflu_set_thread_ids( - srv_mtflush_threads, - mtflush_ctx, - (thread_ids + 6 + 32)); - } } if (srv_print_verbose_log) { ib::info() << INNODB_VERSION_STR - << " started; log sequence number " - << srv_start_lsn; + << " started; log sequence number " + << srv_start_lsn + << "; transaction id " << trx_sys.get_max_trx_id(); } if (srv_force_recovery > 0) { @@ -2753,8 +2518,7 @@ files_checked: } /** Shut down background threads that can generate undo log. */ -void -srv_shutdown_bg_undo_sources() +void srv_shutdown_bg_undo_sources() { if (srv_undo_sources) { ut_ad(!srv_read_only_mode); @@ -2770,8 +2534,7 @@ srv_shutdown_bg_undo_sources() } /** Shut down InnoDB. */ -void -innodb_shutdown() +void innodb_shutdown() { ut_ad(!my_atomic_loadptr_explicit(reinterpret_cast<void**> (&srv_running), @@ -2780,8 +2543,8 @@ innodb_shutdown() switch (srv_operation) { case SRV_OPERATION_RESTORE_ROLLBACK_XA: - if (dberr_t err = fil_write_flushed_lsn(log_sys->lsn)) - ib::error() << "Writing flushed lsn " << log_sys->lsn + if (dberr_t err = fil_write_flushed_lsn(log_sys.lsn)) + ib::error() << "Writing flushed lsn " << log_sys.lsn << " failed; error=" << err; /* fall through */ case SRV_OPERATION_BACKUP: @@ -2820,15 +2583,15 @@ innodb_shutdown() ut_ad(dict_stats_event || !srv_was_started || srv_read_only_mode); ut_ad(dict_sys || !srv_was_started); - ut_ad(trx_sys || !srv_was_started); + ut_ad(trx_sys.is_initialised() || !srv_was_started); ut_ad(buf_dblwr || !srv_was_started || srv_read_only_mode || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO); - ut_ad(lock_sys || !srv_was_started); + ut_ad(lock_sys.is_initialised() || !srv_was_started); + ut_ad(log_sys.is_initialised() || !srv_was_started); #ifdef BTR_CUR_HASH_ADAPT ut_ad(btr_search_sys || !srv_was_started); #endif /* BTR_CUR_HASH_ADAPT */ ut_ad(ibuf || !srv_was_started); - ut_ad(log_sys || !srv_was_started); if (dict_stats_event) { dict_stats_thread_deinit(); @@ -2855,47 +2618,29 @@ innodb_shutdown() if (ibuf) { ibuf_close(); } - if (log_sys) { - log_shutdown(); - } - if (trx_sys) { - trx_sys_file_format_close(); - trx_sys_close(); - } - UT_DELETE(purge_sys); - purge_sys = NULL; + log_sys.close(); + purge_sys.close(); + trx_sys.close(); if (buf_dblwr) { buf_dblwr_free(); } - if (lock_sys) { - lock_sys_close(); - } - + lock_sys.close(); trx_pool_close(); - /* We don't create these mutexes in RO mode because we don't create - the temp files that the cover. 
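The purge-startup change above drops srv_start_wait_for_purge_to_start() in favour of an inline poll loop: sleep 50000 microseconds at a time until purge_sys.enabled(), unless shutdown or the force-recovery setting makes waiting pointless. A minimal stand-alone version of that poll-and-sleep pattern; purge_enabled and shutting_down are illustrative stand-ins, not the InnoDB globals.

#include <atomic>
#include <chrono>
#include <cstdio>
#include <thread>

std::atomic<bool> purge_enabled{false};
std::atomic<bool> shutting_down{false};

void wait_for_purge_to_start()
{
    while (!shutting_down.load(std::memory_order_relaxed)
           && !purge_enabled.load(std::memory_order_relaxed)) {
        std::puts("Waiting for purge to start");
        /* matches os_thread_sleep(50000), i.e. 50000 microseconds */
        std::this_thread::sleep_for(std::chrono::microseconds(50000));
    }
}

int main()
{
    std::thread coordinator([] {
        std::this_thread::sleep_for(std::chrono::milliseconds(120));
        purge_enabled.store(true, std::memory_order_relaxed);
    });
    wait_for_purge_to_start();
    coordinator.join();
    return 0;
}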
*/ if (!srv_read_only_mode) { mutex_free(&srv_monitor_file_mutex); mutex_free(&srv_misc_tmpfile_mutex); } - if (dict_sys) { - dict_close(); - } - -#ifdef BTR_CUR_HASH_ADAPT - if (btr_search_sys) { - btr_search_sys_free(); - } -#endif /* BTR_CUR_HASH_ADAPT */ + dict_close(); + btr_search_sys_free(); /* 3. Free all InnoDB's own mutexes and the os_fast_mutexes inside them */ os_aio_free(); row_mysql_close(); srv_free(); - fil_close(); + fil_system.close(); /* 4. Free all allocated memory */ @@ -2909,13 +2654,11 @@ innodb_shutdown() sync_check_close(); - if (dict_foreign_err_file) { - fclose(dict_foreign_err_file); - } - srv_sys_space.shutdown(); if (srv_tmp_space.get_sanity_check_status()) { - fil_space_close(srv_tmp_space.name()); + if (fil_system.temp_space) { + fil_system.temp_space->close(); + } srv_tmp_space.delete_files(); } srv_tmp_space.shutdown(); @@ -2926,7 +2669,8 @@ innodb_shutdown() if (srv_was_started && srv_print_verbose_log) { ib::info() << "Shutdown completed; log sequence number " - << srv_shutdown_lsn; + << srv_shutdown_lsn + << "; transaction id " << trx_sys.get_max_trx_id(); } srv_start_state = SRV_START_STATE_NONE; diff --git a/storage/innobase/sync/sync0arr.cc b/storage/innobase/sync/sync0arr.cc index 27650cb1639..2f9f0a49986 100644 --- a/storage/innobase/sync/sync0arr.cc +++ b/storage/innobase/sync/sync0arr.cc @@ -584,9 +584,8 @@ sync_array_cell_print( fprintf(file, "number of readers " ULINTPF - ", waiters flag %u, " - "lock_word: " ULINTPFx "\n" - "Last time read locked in file %s line %u\n" + ", waiters flag %d, " + "lock_word: %x\n" "Last time write locked in file %s line %u" #if 0 /* JAN: TODO: FIX LATER */ "\nHolder thread " ULINTPF @@ -594,10 +593,8 @@ sync_array_cell_print( #endif "\n", rw_lock_get_reader_count(rwlock), - rwlock->waiters, - rwlock->lock_word, - innobase_basename(rwlock->last_s_file_name), - rwlock->last_s_line, + my_atomic_load32_explicit(&rwlock->waiters, MY_MEMORY_ORDER_RELAXED), + my_atomic_load32_explicit(&rwlock->lock_word, MY_MEMORY_ORDER_RELAXED), innobase_basename(rwlock->last_x_file_name), rwlock->last_x_line #if 0 /* JAN: TODO: FIX LATER */ @@ -979,7 +976,7 @@ sync_array_print_long_waits_low( return(false); } -#ifdef HAVE_valgrind +#if defined HAVE_valgrind && !__has_feature(memory_sanitizer) /* Increase the timeouts if running under valgrind because it executes extremely slowly. HAVE_valgrind does not necessary mean that we are running under valgrind but we have no better way to tell. @@ -1079,13 +1076,12 @@ sync_array_print_long_waits( } if (noticed && srv_monitor_event) { - ibool old_val; fprintf(stderr, "InnoDB: ###### Starts InnoDB Monitor" " for 30 secs to print diagnostic info:\n"); - old_val = srv_print_innodb_monitor; + my_bool old_val = srv_print_innodb_monitor; /* If some crucial semaphore is reserved, then also the InnoDB Monitor can hang, and we do not get diagnostics. 
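In the sync_array_cell_print() hunk above, the waiters flag and lock_word are now sampled with my_atomic_load32_explicit(..., MY_MEMORY_ORDER_RELAXED) instead of plain field reads. A relaxed load is sufficient because the values are only printed for diagnostics; the sketch below shows the equivalent with std::atomic (the struct and field names are illustrative, not rw_lock_t).

#include <atomic>
#include <cstdint>
#include <cstdio>

struct rw_lock_like {
    std::atomic<int32_t> lock_word{0x20000000};
    std::atomic<int32_t> waiters{0};
};

void print_diagnostics(std::FILE* f, const rw_lock_like& lock)
{
    /* Relaxed loads: the values are only reported, no ordering with
    other memory operations is needed. */
    std::fprintf(f, "waiters flag %d, lock_word: %x\n",
                 lock.waiters.load(std::memory_order_relaxed),
                 (unsigned) lock.lock_word.load(std::memory_order_relaxed));
}

int main()
{
    rw_lock_like lock;
    print_diagnostics(stdout, lock);
    return 0;
}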
Since in @@ -1158,23 +1154,18 @@ sync_array_print_info( sync_array_exit(arr); } -/**********************************************************************//** -Create the primary system wait array(s), they are protected by an OS mutex */ -void -sync_array_init( -/*============*/ - ulint n_threads) /*!< in: Number of slots to - create in all arrays */ +/** Create the primary system wait arrays */ +void sync_array_init() { ut_a(sync_wait_array == NULL); ut_a(srv_sync_array_size > 0); - ut_a(n_threads > 0); + ut_a(srv_max_n_threads > 0); sync_array_size = srv_sync_array_size; sync_wait_array = UT_NEW_ARRAY_NOKEY(sync_array_t*, sync_array_size); - ulint n_slots = 1 + (n_threads - 1) / sync_array_size; + ulint n_slots = 1 + (srv_max_n_threads - 1) / sync_array_size; for (ulint i = 0; i < sync_array_size; ++i) { @@ -1182,11 +1173,8 @@ sync_array_init( } } -/**********************************************************************//** -Close sync array wait sub-system. */ -void -sync_array_close(void) -/*==================*/ +/** Destroy the sync array wait sub-system. */ +void sync_array_close() { for (ulint i = 0; i < sync_array_size; ++i) { sync_array_free(sync_wait_array[i]); @@ -1306,7 +1294,7 @@ sync_arr_fill_sys_semphore_waits_table( ulint n_items; DBUG_ENTER("i_s_sys_semaphore_waits_fill_table"); - RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name); + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); /* deny access to user without PROCESS_ACL privilege */ if (check_global_access(thd, PROCESS_ACL)) { @@ -1393,11 +1381,10 @@ sync_arr_fill_sys_semphore_waits_table( //OK(fields[SYS_SEMAPHORE_WAITS_HOLDER_LINE]->store(rwlock->line, true)); //fields[SYS_SEMAPHORE_WAITS_HOLDER_LINE]->set_notnull(); OK(field_store_ulint(fields[SYS_SEMAPHORE_WAITS_READERS], rw_lock_get_reader_count(rwlock))); - OK(field_store_ulint(fields[SYS_SEMAPHORE_WAITS_WAITERS_FLAG], (longlong)rwlock->waiters)); - OK(field_store_ulint(fields[SYS_SEMAPHORE_WAITS_LOCK_WORD], (longlong)rwlock->lock_word)); - OK(field_store_string(fields[SYS_SEMAPHORE_WAITS_LAST_READER_FILE], innobase_basename(rwlock->last_s_file_name))); - OK(fields[SYS_SEMAPHORE_WAITS_LAST_READER_LINE]->store(rwlock->last_s_line, true)); - fields[SYS_SEMAPHORE_WAITS_LAST_READER_LINE]->set_notnull(); + OK(field_store_ulint(fields[SYS_SEMAPHORE_WAITS_WAITERS_FLAG], + my_atomic_load32_explicit(&rwlock->waiters, MY_MEMORY_ORDER_RELAXED))); + OK(field_store_ulint(fields[SYS_SEMAPHORE_WAITS_LOCK_WORD], + my_atomic_load32_explicit(&rwlock->lock_word, MY_MEMORY_ORDER_RELAXED))); OK(field_store_string(fields[SYS_SEMAPHORE_WAITS_LAST_WRITER_FILE], innobase_basename(rwlock->last_x_file_name))); OK(fields[SYS_SEMAPHORE_WAITS_LAST_WRITER_LINE]->store(rwlock->last_x_line, true)); fields[SYS_SEMAPHORE_WAITS_LAST_WRITER_LINE]->set_notnull(); diff --git a/storage/innobase/sync/sync0debug.cc b/storage/innobase/sync/sync0debug.cc index 89db512da2a..f576ec6acca 100644 --- a/storage/innobase/sync/sync0debug.cc +++ b/storage/innobase/sync/sync0debug.cc @@ -33,6 +33,7 @@ Created 2012-08-21 Sunny Bains #include "sync0sync.h" #include "sync0debug.h" #include "srv0start.h" +#include "fil0fil.h" #include <vector> #include <string> @@ -188,10 +189,10 @@ struct LatchDebug { latch that the thread is trying to acquire @return true if passes, else crash with error message. 
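sync_array_init() above now derives its sizing from srv_max_n_threads directly: each of the srv_sync_array_size wait arrays gets 1 + (srv_max_n_threads - 1) / srv_sync_array_size slots, which is a ceiling division, so the combined capacity always covers the configured thread count. A stand-alone check of that arithmetic; the constants below are illustrative values, not the server defaults.

#include <cassert>
#include <cstdio>

int main()
{
    const unsigned long n_arrays  = 1;     /* stand-in for srv_sync_array_size */
    const unsigned long n_threads = 4096;  /* stand-in for srv_max_n_threads */

    const unsigned long n_slots = 1 + (n_threads - 1) / n_arrays;

    /* ceil(n_threads / n_arrays): total slots never fall short */
    assert(n_slots * n_arrays >= n_threads);
    std::printf("%lu arrays x %lu slots >= %lu threads\n",
                n_arrays, n_slots, n_threads);
    return 0;
}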
*/ - bool basic_check( + inline bool basic_check( const Latches* latches, latch_level_t requested_level, - ulint level) const + lint level) const UNIV_NOTHROW; /** Adds a latch and its level in the thread level array. Allocates @@ -474,6 +475,7 @@ LatchDebug::LatchDebug() LEVEL_MAP_INSERT(SYNC_TRX_SYS_HEADER); LEVEL_MAP_INSERT(SYNC_THREADS); LEVEL_MAP_INSERT(SYNC_TRX); + LEVEL_MAP_INSERT(SYNC_RW_TRX_HASH_ELEMENT); LEVEL_MAP_INSERT(SYNC_TRX_SYS); LEVEL_MAP_INSERT(SYNC_LOCK_SYS); LEVEL_MAP_INSERT(SYNC_LOCK_WAIT_SYS); @@ -492,7 +494,6 @@ LatchDebug::LatchDebug() LEVEL_MAP_INSERT(SYNC_RSEG_HEADER_NEW); LEVEL_MAP_INSERT(SYNC_NOREDO_RSEG); LEVEL_MAP_INSERT(SYNC_REDO_RSEG); - LEVEL_MAP_INSERT(SYNC_TRX_UNDO); LEVEL_MAP_INSERT(SYNC_PURGE_LATCH); LEVEL_MAP_INSERT(SYNC_TREE_NODE); LEVEL_MAP_INSERT(SYNC_TREE_NODE_FROM_HASH); @@ -506,7 +507,6 @@ LatchDebug::LatchDebug() LEVEL_MAP_INSERT(SYNC_DICT); LEVEL_MAP_INSERT(SYNC_FTS_CACHE); LEVEL_MAP_INSERT(SYNC_DICT_OPERATION); - LEVEL_MAP_INSERT(SYNC_FILE_FORMAT_TAG); LEVEL_MAP_INSERT(SYNC_TRX_I_S_LAST_READ); LEVEL_MAP_INSERT(SYNC_TRX_I_S_RWLOCK); LEVEL_MAP_INSERT(SYNC_RECV_WRITER); @@ -601,11 +601,11 @@ LatchDebug::less( The level of the latch that the thread is trying to acquire @return true if passes, else crash with error message. */ -bool +inline bool LatchDebug::basic_check( const Latches* latches, latch_level_t requested_level, - ulint in_level) const + lint in_level) const UNIV_NOTHROW { latch_level_t level = latch_level_t(in_level); @@ -733,7 +733,7 @@ LatchDebug::check_order( if (srv_is_being_started) { /* This is violated during trx_sys_create_rsegs() when creating additional rollback segments when - upgrading in innobase_start_or_create_for_mysql(). */ + upgrading in srv_start(). */ break; } @@ -750,17 +750,16 @@ LatchDebug::check_order( case SYNC_LOG: case SYNC_LOG_WRITE: case SYNC_LOG_FLUSH_ORDER: - case SYNC_FILE_FORMAT_TAG: case SYNC_DOUBLEWRITE: case SYNC_SEARCH_SYS: case SYNC_THREADS: case SYNC_LOCK_SYS: case SYNC_LOCK_WAIT_SYS: + case SYNC_RW_TRX_HASH_ELEMENT: case SYNC_TRX_SYS: case SYNC_IBUF_BITMAP_MUTEX: case SYNC_REDO_RSEG: case SYNC_NOREDO_RSEG: - case SYNC_TRX_UNDO: case SYNC_PURGE_LATCH: case SYNC_PURGE_QUEUE: case SYNC_DICT_AUTOINC_MUTEX: @@ -804,7 +803,7 @@ LatchDebug::check_order( case SYNC_TRX: - /* Either the thread must own the lock_sys->mutex, or + /* Either the thread must own the lock_sys.mutex, or it is allowed to own only ONE trx_t::mutex. */ if (less(latches, level) != NULL) { @@ -878,8 +877,7 @@ LatchDebug::check_order( The purge thread can read the UNDO pages without any covering mutex. 
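The LatchDebug changes above keep enforcing the latching-order rule: apart from the listed exceptions, a thread may only request a latch whose level is lower than every level it already holds, and basic_check()/check_order() crash with an error message otherwise. Below is a deliberately simplified, thread-local validator of that rule; the levels and helper names are invented for illustration and do not match the real sync0types.h values.

#include <cassert>
#include <iterator>
#include <vector>

enum latch_level { LEVEL_INDEX_TREE = 300, LEVEL_TREE_NODE = 290 };

thread_local std::vector<int> held_levels;

void latch_acquire(int level)
{
    /* Requested level must be below everything already held. */
    for (int held : held_levels)
        assert(level < held && "latching order violation");
    held_levels.push_back(level);
}

void latch_release(int level)
{
    for (auto it = held_levels.rbegin(); it != held_levels.rend(); ++it) {
        if (*it == level) {
            held_levels.erase(std::next(it).base());
            return;
        }
    }
}

int main()
{
    latch_acquire(LEVEL_INDEX_TREE);
    latch_acquire(LEVEL_TREE_NODE);   /* lower level: allowed */
    latch_release(LEVEL_TREE_NODE);
    latch_release(LEVEL_INDEX_TREE);
    return 0;
}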
*/ - ut_a(find(latches, SYNC_TRX_UNDO) != 0 - || find(latches, SYNC_REDO_RSEG) != 0 + ut_a(find(latches, SYNC_REDO_RSEG) != 0 || find(latches, SYNC_NOREDO_RSEG) != 0 || basic_check(latches, level, level - 1)); break; @@ -897,19 +895,10 @@ LatchDebug::check_order( case SYNC_TREE_NODE: - { - const latch_t* fsp_latch; - - fsp_latch = find(latches, SYNC_FSP); - - ut_a((fsp_latch != NULL - && fsp_latch->is_temp_fsp()) - || find(latches, SYNC_INDEX_TREE) != 0 - || find(latches, SYNC_DICT_OPERATION) - || basic_check(latches, - level, SYNC_TREE_NODE - 1)); - } - + ut_a(find(latches, SYNC_FSP) == &fil_system.temp_space->latch + || find(latches, SYNC_INDEX_TREE) + || find(latches, SYNC_DICT_OPERATION) + || basic_check(latches, level, SYNC_TREE_NODE - 1)); break; case SYNC_TREE_NODE_NEW: @@ -1296,9 +1285,6 @@ sync_latch_meta_init() LATCH_ADD_MUTEX(DICT_SYS, SYNC_DICT, dict_sys_mutex_key); - LATCH_ADD_MUTEX(FILE_FORMAT_MAX, SYNC_FILE_FORMAT_TAG, - file_format_max_mutex_key); - LATCH_ADD_MUTEX(FIL_SYSTEM, SYNC_ANY_LATCH, fil_system_mutex_key); LATCH_ADD_MUTEX(FLUSH_LIST, SYNC_BUF_FLUSH_LIST, flush_list_mutex_key); @@ -1379,8 +1365,6 @@ sync_latch_meta_init() LATCH_ADD_MUTEX(BUF_DBLWR, SYNC_DOUBLEWRITE, buf_dblwr_mutex_key); - LATCH_ADD_MUTEX(TRX_UNDO, SYNC_TRX_UNDO, trx_undo_mutex_key); - LATCH_ADD_MUTEX(TRX_POOL, SYNC_POOL, trx_pool_mutex_key); LATCH_ADD_MUTEX(TRX_POOL_MANAGER, SYNC_POOL_MANAGER, @@ -1486,16 +1470,14 @@ sync_latch_meta_init() PFS_NOT_INSTRUMENTED); LATCH_ADD_MUTEX(BTR_DEFRAGMENT_MUTEX, SYNC_NO_ORDER_CHECK, PFS_NOT_INSTRUMENTED); - LATCH_ADD_MUTEX(MTFLUSH_THREAD_MUTEX, SYNC_NO_ORDER_CHECK, - PFS_NOT_INSTRUMENTED); - LATCH_ADD_MUTEX(MTFLUSH_MUTEX, SYNC_NO_ORDER_CHECK, - PFS_NOT_INSTRUMENTED); LATCH_ADD_MUTEX(FIL_CRYPT_STAT_MUTEX, SYNC_NO_ORDER_CHECK, PFS_NOT_INSTRUMENTED); LATCH_ADD_MUTEX(FIL_CRYPT_DATA_MUTEX, SYNC_NO_ORDER_CHECK, PFS_NOT_INSTRUMENTED); LATCH_ADD_MUTEX(FIL_CRYPT_THREADS_MUTEX, SYNC_NO_ORDER_CHECK, PFS_NOT_INSTRUMENTED); + LATCH_ADD_MUTEX(RW_TRX_HASH_ELEMENT, SYNC_RW_TRX_HASH_ELEMENT, + rw_trx_hash_element_mutex_key); latch_id_t id = LATCH_ID_NONE; @@ -1720,7 +1702,7 @@ sync_check_init() ut_d(LatchDebug::init()); - sync_array_init(OS_THREAD_MAX_N); + sync_array_init(); } /** Free the InnoDB synchronization data structures. */ diff --git a/storage/innobase/sync/sync0rw.cc b/storage/innobase/sync/sync0rw.cc index ded4f44c2a9..8ebfd27e7aa 100644 --- a/storage/innobase/sync/sync0rw.cc +++ b/storage/innobase/sync/sync0rw.cc @@ -2,7 +2,7 @@ Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, Google Inc. -Copyright (c) 2017, 2019, MariaDB Corporation. +Copyright (c) 2017, 2020, MariaDB Corporation. Portions of this file contain modifications contributed and copyrighted by Google, Inc. 
Those modifications are gratefully acknowledged and are described @@ -230,9 +230,7 @@ rw_lock_create_func( ut_ad(cline <= 8192); lock->cline = cline; lock->count_os_wait = 0; - lock->last_s_file_name = "not yet reserved"; lock->last_x_file_name = "not yet reserved"; - lock->last_s_line = 0; lock->last_x_line = 0; lock->event = os_event_create(0); lock->wait_ex_event = os_event_create(0); @@ -256,7 +254,8 @@ rw_lock_free_func( rw_lock_t* lock) /*!< in/out: rw-lock */ { ut_ad(rw_lock_validate(lock)); - ut_a(lock->lock_word == X_LOCK_DECR); + ut_a(my_atomic_load32_explicit(&lock->lock_word, + MY_MEMORY_ORDER_RELAXED) == X_LOCK_DECR); mutex_enter(&rw_lock_list_mutex); @@ -285,8 +284,8 @@ rw_lock_s_lock_spin( { ulint i = 0; /* spin round count */ sync_array_t* sync_arr; - ulint spin_count = 0; - uint64_t count_os_wait = 0; + lint spin_count = 0; + int64_t count_os_wait = 0; /* We reuse the thread id to index into the counter, cache it here for efficiency. */ @@ -300,7 +299,9 @@ lock_loop: /* Spin waiting for the writer field to become free */ HMT_low(); ulint j = i; - while (i < srv_n_spin_wait_rounds && lock->lock_word <= 0) { + while (i < srv_n_spin_wait_rounds && + my_atomic_load32_explicit(&lock->lock_word, + MY_MEMORY_ORDER_RELAXED) <= 0) { ut_delay(srv_spin_wait_delay); i++; } @@ -340,7 +341,7 @@ lock_loop: /* Set waiters before checking lock_word to ensure wake-up signal is sent. This may lead to some unnecessary signals. */ - my_atomic_fas32((int32*) &lock->waiters, 1); + my_atomic_fas32_explicit(&lock->waiters, 1, MY_MEMORY_ORDER_ACQUIRE); if (rw_lock_s_lock_low(lock, pass, file_name, line)) { @@ -414,21 +415,20 @@ rw_lock_x_lock_wait_func( unsigned line) /*!< in: line where requested */ { ulint i = 0; - ulint n_spins = 0; + lint n_spins = 0; sync_array_t* sync_arr; - uint64_t count_os_wait = 0; + int64_t count_os_wait = 0; - ut_ad(lock->lock_word <= threshold); + ut_ad(my_atomic_load32_explicit(&lock->lock_word, MY_MEMORY_ORDER_RELAXED) <= threshold); HMT_low(); - while (lock->lock_word < threshold) { + while (my_atomic_load32_explicit(&lock->lock_word, MY_MEMORY_ORDER_RELAXED) < threshold) { ut_delay(srv_spin_wait_delay); if (i < srv_n_spin_wait_rounds) { i++; continue; } - HMT_medium(); /* If there is still a reader, then go to sleep.*/ n_spins += i; @@ -441,7 +441,7 @@ rw_lock_x_lock_wait_func( i = 0; /* Check lock_word to ensure wake-up isn't missed.*/ - if (lock->lock_word < threshold) { + if (my_atomic_load32_explicit(&lock->lock_word, MY_MEMORY_ORDER_RELAXED) < threshold) { ++count_os_wait; @@ -464,7 +464,6 @@ rw_lock_x_lock_wait_func( sync_array_free_cell(sync_arr, cell); break; } - HMT_low(); } HMT_medium(); rw_lock_stats.rw_x_spin_round_count.add(n_spins); @@ -532,14 +531,18 @@ rw_lock_x_lock_low( file_name, line); } else { + int32_t lock_word = my_atomic_load32_explicit(&lock->lock_word, + MY_MEMORY_ORDER_RELAXED); /* At least one X lock by this thread already exists. Add another. */ - if (lock->lock_word == 0 - || lock->lock_word == -X_LOCK_HALF_DECR) { - lock->lock_word -= X_LOCK_DECR; + if (lock_word == 0 + || lock_word == -X_LOCK_HALF_DECR) { + my_atomic_add32_explicit(&lock->lock_word, -X_LOCK_DECR, + MY_MEMORY_ORDER_RELAXED); } else { - ut_ad(lock->lock_word <= -X_LOCK_DECR); - --lock->lock_word; + ut_ad(lock_word <= -X_LOCK_DECR); + my_atomic_add32_explicit(&lock->lock_word, -1, + MY_MEMORY_ORDER_RELAXED); } } @@ -610,12 +613,17 @@ rw_lock_sx_lock_low( thread working on this lock and it is safe to read and write to the lock_word. 
*/ - ut_ad((lock->lock_word == 0) - || ((lock->lock_word <= -X_LOCK_DECR) - && (lock->lock_word +#ifdef UNIV_DEBUG + int32_t lock_word = +#endif + my_atomic_add32_explicit(&lock->lock_word, -X_LOCK_HALF_DECR, + MY_MEMORY_ORDER_RELAXED); + + ut_ad((lock_word == 0) + || ((lock_word <= -X_LOCK_DECR) + && (lock_word > -(X_LOCK_DECR + X_LOCK_HALF_DECR)))); - lock->lock_word -= X_LOCK_HALF_DECR; } } else { /* Another thread locked before us */ @@ -651,8 +659,8 @@ rw_lock_x_lock_func( { ulint i = 0; sync_array_t* sync_arr; - ulint spin_count = 0; - uint64_t count_os_wait = 0; + lint spin_count = 0; + int64_t count_os_wait = 0; ut_ad(rw_lock_validate(lock)); ut_ad(!rw_lock_own(lock, RW_LOCK_S)); @@ -684,8 +692,7 @@ lock_loop: HMT_low(); ulint j = i; while (i < srv_n_spin_wait_rounds - && lock->lock_word <= X_LOCK_HALF_DECR) { - + && my_atomic_load32_explicit(&lock->lock_word, MY_MEMORY_ORDER_RELAXED) <= X_LOCK_HALF_DECR) { ut_delay(srv_spin_wait_delay); i++; } @@ -710,7 +717,7 @@ lock_loop: /* Waiters must be set before checking lock_word, to ensure signal is sent. This could lead to a few unnecessary wake-up signals. */ - my_atomic_fas32((int32*) &lock->waiters, 1); + my_atomic_fas32_explicit(&lock->waiters, 1, MY_MEMORY_ORDER_ACQUIRE); if (rw_lock_x_lock_low(lock, pass, file_name, line)) { sync_array_free_cell(sync_arr, cell); @@ -757,8 +764,8 @@ rw_lock_sx_lock_func( { ulint i = 0; sync_array_t* sync_arr; - ulint spin_count = 0; - uint64_t count_os_wait = 0; + lint spin_count = 0; + int64_t count_os_wait = 0; ut_ad(rw_lock_validate(lock)); ut_ad(!rw_lock_own(lock, RW_LOCK_S)); @@ -790,8 +797,7 @@ lock_loop: /* Spin waiting for the lock_word to become free */ ulint j = i; while (i < srv_n_spin_wait_rounds - && lock->lock_word <= X_LOCK_HALF_DECR) { - + && my_atomic_load32_explicit(&lock->lock_word, MY_MEMORY_ORDER_RELAXED) <= X_LOCK_HALF_DECR) { ut_delay(srv_spin_wait_delay); i++; } @@ -815,7 +821,7 @@ lock_loop: /* Waiters must be set before checking lock_word, to ensure signal is sent. This could lead to a few unnecessary wake-up signals. */ - my_atomic_fas32((int32*) &lock->waiters, 1); + my_atomic_fas32_explicit(&lock->waiters, 1, MY_MEMORY_ORDER_ACQUIRE); if (rw_lock_sx_lock_low(lock, pass, file_name, line)) { @@ -853,13 +859,15 @@ rw_lock_validate( /*=============*/ const rw_lock_t* lock) /*!< in: rw-lock */ { - lint lock_word; + int32_t lock_word; ut_ad(lock); - lock_word = lock->lock_word; + lock_word = my_atomic_load32_explicit(const_cast<int32_t*>(&lock->lock_word), + MY_MEMORY_ORDER_RELAXED); - ut_ad(lock->waiters < 2); + ut_ad(my_atomic_load32_explicit(const_cast<int32_t*>(&lock->waiters), + MY_MEMORY_ORDER_RELAXED) < 2); ut_ad(lock_word > -(2 * X_LOCK_DECR)); ut_ad(lock_word <= X_LOCK_DECR); @@ -922,15 +930,17 @@ rw_lock_add_debug_info( rw_lock_debug_mutex_exit(); if (pass == 0 && lock_type != RW_LOCK_X_WAIT) { + int32_t lock_word = my_atomic_load32_explicit(&lock->lock_word, + MY_MEMORY_ORDER_RELAXED); /* Recursive x while holding SX (lock_type == RW_LOCK_X && lock_word == -X_LOCK_HALF_DECR) is treated as not-relock (new lock). */ if ((lock_type == RW_LOCK_X - && lock->lock_word < -X_LOCK_HALF_DECR) + && lock_word < -X_LOCK_HALF_DECR) || (lock_type == RW_LOCK_SX - && (lock->lock_word < 0 || lock->sx_recursive == 1))) { + && (lock_word < 0 || lock->sx_recursive == 1))) { sync_check_lock_validate(lock); sync_check_lock_granted(lock); @@ -987,7 +997,7 @@ rw_lock_remove_debug_info( Checks if the thread has locked the rw-lock in the specified mode, with the pass value == 0. 
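The rw-lock hunks above all rewrite the same acquisition pattern with explicit atomics: spin a bounded number of rounds on a relaxed load of lock_word, then publish the waiters flag with an atomic exchange before the final check and the block on a sync-array cell, so the releasing thread cannot miss sending the wake-up. A stand-alone sketch of that spin-then-block shape; it uses a condition variable with a timeout where InnoDB uses sync-array events, and all names are illustrative.

#include <atomic>
#include <chrono>
#include <condition_variable>
#include <cstdint>
#include <mutex>
#include <thread>

struct waitable_word {
    std::atomic<int32_t> lock_word{0};   /* <= 0 means "held" in this sketch */
    std::atomic<int32_t> waiters{0};
    std::mutex m;
    std::condition_variable cv;
};

void wait_until_free(waitable_word& w, unsigned spin_rounds)
{
    for (;;) {
        for (unsigned i = 0; i < spin_rounds; i++)
            if (w.lock_word.load(std::memory_order_relaxed) > 0)
                return;
        /* Publish the waiters flag before the final check, mirroring
        my_atomic_fas32_explicit(&lock->waiters, 1, MY_MEMORY_ORDER_ACQUIRE):
        a releaser that clears waiters afterwards must then wake us. */
        w.waiters.exchange(1, std::memory_order_acquire);
        if (w.lock_word.load(std::memory_order_relaxed) > 0)
            return;
        std::unique_lock<std::mutex> g(w.m);
        w.cv.wait_for(g, std::chrono::milliseconds(1));
    }
}

int main()
{
    waitable_word w;
    std::thread releaser([&] {
        std::this_thread::sleep_for(std::chrono::milliseconds(5));
        w.lock_word.store(1, std::memory_order_relaxed);
        if (w.waiters.exchange(0, std::memory_order_release))
            w.cv.notify_all();
    });
    wait_until_free(w, 30);
    releaser.join();
    return 0;
}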
@return TRUE if locked */ -ibool +bool rw_lock_own( /*========*/ const rw_lock_t*lock, /*!< in: rw-lock */ @@ -1019,12 +1029,12 @@ rw_lock_own( rw_lock_debug_mutex_exit(); /* Found! */ - return(TRUE); + return(true); } } rw_lock_debug_mutex_exit(); - return(FALSE); + return(false); } /** Checks if the thread has locked the rw-lock in the specified mode, with @@ -1107,12 +1117,12 @@ rw_lock_list_print_info( count++; - if (lock->lock_word != X_LOCK_DECR) { + if (my_atomic_load32_explicit(const_cast<int32_t*>(&lock->lock_word), MY_MEMORY_ORDER_RELAXED) != X_LOCK_DECR) { fprintf(file, "RW-LOCK: %p ", (void*) lock); - if (lock->waiters) { - fputs(" Waiters for the lock exist\n", file); + if (int32_t waiters= my_atomic_load32_explicit(const_cast<int32_t*>(&lock->waiters), MY_MEMORY_ORDER_RELAXED)) { + fprintf(file, " (%d waiters)\n", waiters); } else { putc('\n', file); } diff --git a/storage/innobase/sync/sync0sync.cc b/storage/innobase/sync/sync0sync.cc index 81bce39df33..97641d42826 100644 --- a/storage/innobase/sync/sync0sync.cc +++ b/storage/innobase/sync/sync0sync.cc @@ -43,7 +43,6 @@ mysql_pfs_key_t buf_pool_zip_mutex_key; mysql_pfs_key_t cache_last_read_mutex_key; mysql_pfs_key_t dict_foreign_err_mutex_key; mysql_pfs_key_t dict_sys_mutex_key; -mysql_pfs_key_t file_format_max_mutex_key; mysql_pfs_key_t fil_system_mutex_key; mysql_pfs_key_t flush_list_mutex_key; mysql_pfs_key_t fts_delete_mutex_key; @@ -78,7 +77,6 @@ mysql_pfs_key_t srv_innodb_monitor_mutex_key; mysql_pfs_key_t srv_misc_tmpfile_mutex_key; mysql_pfs_key_t srv_monitor_file_mutex_key; mysql_pfs_key_t buf_dblwr_mutex_key; -mysql_pfs_key_t trx_undo_mutex_key; mysql_pfs_key_t trx_mutex_key; mysql_pfs_key_t trx_pool_mutex_key; mysql_pfs_key_t trx_pool_manager_mutex_key; @@ -93,6 +91,7 @@ mysql_pfs_key_t sync_array_mutex_key; mysql_pfs_key_t thread_mutex_key; mysql_pfs_key_t zip_pad_mutex_key; mysql_pfs_key_t row_drop_list_mutex_key; +mysql_pfs_key_t rw_trx_hash_element_mutex_key; #endif /* UNIV_PFS_MUTEX */ #ifdef UNIV_PFS_RWLOCK mysql_pfs_key_t btr_search_latch_key; @@ -144,13 +143,13 @@ sync_print_wait_info(FILE* file) " %.2f RW-excl, %.2f RW-sx\n", (double) rw_lock_stats.rw_s_spin_round_count / (rw_lock_stats.rw_s_spin_wait_count - ? rw_lock_stats.rw_s_spin_wait_count : 1), + ? rw_lock_stats.rw_s_spin_wait_count : 1LL), (double) rw_lock_stats.rw_x_spin_round_count / (rw_lock_stats.rw_x_spin_wait_count - ? rw_lock_stats.rw_x_spin_wait_count : 1), + ? rw_lock_stats.rw_x_spin_wait_count : 1LL), (double) rw_lock_stats.rw_sx_spin_round_count / (rw_lock_stats.rw_sx_spin_wait_count - ? rw_lock_stats.rw_sx_spin_wait_count : 1)); + ? rw_lock_stats.rw_sx_spin_wait_count : 1LL)); } /** diff --git a/storage/innobase/trx/trx0i_s.cc b/storage/innobase/trx/trx0i_s.cc index 2b9d6c96acd..59e0bed8006 100644 --- a/storage/innobase/trx/trx0i_s.cc +++ b/storage/innobase/trx/trx0i_s.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 2007, 2015, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2019, MariaDB Corporation. +Copyright (c) 2017, 2021, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -44,8 +44,9 @@ Created July 17, 2007 Vasil Dimov #include "sync0rw.h" #include "sync0sync.h" #include "trx0sys.h" - -#include <sql_class.h> +#include "que0que.h" +#include "trx0purge.h" +#include "sql_class.h" /** Initial number of rows in the table cache */ #define TABLE_CACHE_INITIAL_ROWSNUM 1024 @@ -161,10 +162,10 @@ struct trx_i_s_cache_t { ha_storage_t* storage; /*!< storage for external volatile data that may become unavailable when we release - lock_sys->mutex or trx_sys->mutex */ + lock_sys.mutex or trx_sys.mutex */ ulint mem_allocd; /*!< the amount of memory allocated with mem_alloc*() */ - ibool is_truncated; /*!< this is TRUE if the memory + bool is_truncated; /*!< this is true if the memory limit was hit and thus the data in the cache is truncated */ }; @@ -523,9 +524,9 @@ thd_done: row->trx_tables_locked = lock_number_of_tables_locked(&trx->lock); - /* These are protected by both trx->mutex or lock_sys->mutex, - or just lock_sys->mutex. For reading, it suffices to hold - lock_sys->mutex. */ + /* These are protected by both trx->mutex or lock_sys.mutex, + or just lock_sys.mutex. For reading, it suffices to hold + lock_sys.mutex. */ row->trx_lock_structs = UT_LIST_GET_LEN(trx->lock.trx_locks); @@ -712,7 +713,8 @@ fill_lock_data( ut_a(n_fields > 0); heap = NULL; - offsets = rec_get_offsets(rec, index, offsets, true, n_fields, &heap); + offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields, + n_fields, &heap); /* format and store the data */ @@ -1219,102 +1221,67 @@ trx_i_s_cache_clear( ha_storage_empty(&cache->storage); } -/*******************************************************************//** -Fetches the data needed to fill the 3 INFORMATION SCHEMA tables into the -table cache buffer. Cache must be locked for write. */ -static -void -fetch_data_into_cache_low( -/*======================*/ - trx_i_s_cache_t* cache, /*!< in/out: cache */ - bool read_write, /*!< in: only read-write - transactions */ - trx_ut_list_t* trx_list) /*!< in: trx list */ -{ - const trx_t* trx; - bool rw_trx_list = trx_list == &trx_sys->rw_trx_list; - - ut_ad(rw_trx_list || trx_list == &trx_sys->mysql_trx_list); - - /* Iterate over the transaction list and add each one - to innodb_trx's cache. We also add all locks that are relevant - to each transaction into innodb_locks' and innodb_lock_waits' - caches. */ - - for (trx = UT_LIST_GET_FIRST(*trx_list); - trx != NULL; - trx = - (rw_trx_list - ? UT_LIST_GET_NEXT(trx_list, trx) - : UT_LIST_GET_NEXT(mysql_trx_list, trx))) { - - i_s_trx_row_t* trx_row; - i_s_locks_row_t* requested_lock_row; - /* Note: Read only transactions that modify temporary - tables an have a transaction ID */ - if (!trx_is_started(trx) - || (!rw_trx_list && trx->id != 0 && !trx->read_only)) { +/** + Add transactions to innodb_trx's cache. - continue; - } - - assert_trx_nonlocking_or_in_list(trx); - - ut_ad(trx->in_rw_trx_list == rw_trx_list); - - if (!add_trx_relevant_locks_to_cache(cache, trx, - &requested_lock_row)) { - - cache->is_truncated = TRUE; - return; - } - - trx_row = reinterpret_cast<i_s_trx_row_t*>( - table_cache_create_empty_row( - &cache->innodb_trx, cache)); - - /* memory could not be allocated */ - if (trx_row == NULL) { - - cache->is_truncated = TRUE; - return; - } + We also add all locks that are relevant to each transaction into + innodb_locks' and innodb_lock_waits' caches. 
+*/ - if (!fill_trx_row(trx_row, trx, requested_lock_row, cache)) { - - /* memory could not be allocated */ - --cache->innodb_trx.rows_used; - cache->is_truncated = TRUE; - return; - } - } -} - -/*******************************************************************//** -Fetches the data needed to fill the 3 INFORMATION SCHEMA tables into the -table cache buffer. Cache must be locked for write. */ -static -void -fetch_data_into_cache( -/*==================*/ - trx_i_s_cache_t* cache) /*!< in/out: cache */ +static void fetch_data_into_cache_low(trx_i_s_cache_t *cache, const trx_t *trx) { - ut_ad(lock_mutex_own()); - ut_ad(trx_sys_mutex_own()); - - trx_i_s_cache_clear(cache); + i_s_locks_row_t *requested_lock_row; + + assert_trx_nonlocking_or_in_list(trx); + + if (add_trx_relevant_locks_to_cache(cache, trx, &requested_lock_row)) + { + if (i_s_trx_row_t *trx_row= reinterpret_cast<i_s_trx_row_t*>( + table_cache_create_empty_row(&cache->innodb_trx, cache))) + { + if (fill_trx_row(trx_row, trx, requested_lock_row, cache)) + return; + --cache->innodb_trx.rows_used; + } + } + + /* memory could not be allocated */ + cache->is_truncated= true; +} - /* Capture the state of the read-write transactions. This includes - internal transactions too. They are not on mysql_trx_list */ - fetch_data_into_cache_low(cache, true, &trx_sys->rw_trx_list); - /* Capture the state of the read-only active transactions */ - fetch_data_into_cache_low(cache, false, &trx_sys->mysql_trx_list); +/** + Fetches the data needed to fill the 3 INFORMATION SCHEMA tables into the + table cache buffer. Cache must be locked for write. +*/ - cache->is_truncated = FALSE; +static void fetch_data_into_cache(trx_i_s_cache_t *cache) +{ + ut_ad(lock_mutex_own()); + trx_i_s_cache_clear(cache); + + /* Capture the state of transactions */ + mutex_enter(&trx_sys.mutex); + for (trx_t *trx= UT_LIST_GET_FIRST(trx_sys.trx_list); + trx != NULL; + trx= UT_LIST_GET_NEXT(trx_list, trx)) + { + if (trx->state != TRX_STATE_NOT_STARTED && trx != purge_sys.query->trx) + { + mutex_enter(&trx->mutex); + if (trx->state != TRX_STATE_NOT_STARTED) + fetch_data_into_cache_low(cache, trx); + mutex_exit(&trx->mutex); + if (cache->is_truncated) + break; + } + } + mutex_exit(&trx_sys.mutex); + cache->is_truncated= false; } + /*******************************************************************//** Update the transactions cache if it has not been read for some time. Called from handler/i_s.cc. @@ -1332,13 +1299,7 @@ trx_i_s_possibly_fetch_data_into_cache( /* We need to read trx_sys and record/table lock queues */ lock_mutex_enter(); - - trx_sys_mutex_enter(); - fetch_data_into_cache(cache); - - trx_sys_mutex_exit(); - lock_mutex_exit(); /* update cache last read time */ @@ -1351,7 +1312,7 @@ trx_i_s_possibly_fetch_data_into_cache( Returns TRUE if the data in the cache is truncated due to the memory limit posed by TRX_I_S_MEM_LIMIT. 
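The rewritten fetch_data_into_cache() above walks trx_sys.trx_list under trx_sys.mutex, skips the purge system's own transaction, and re-checks trx->state after acquiring trx->mutex so that a transaction which just finished is not reported. A minimal stand-alone version of that double-checked snapshot pattern; the txn and txn_registry types are illustrative, not InnoDB's.

#include <list>
#include <mutex>
#include <vector>

struct txn {
    std::mutex m;
    bool started = false;
    int id = 0;
};

struct txn_registry {
    std::mutex m;
    std::list<txn*> all;
};

std::vector<int> snapshot_active(txn_registry& reg)
{
    std::vector<int> out;
    std::lock_guard<std::mutex> g(reg.m);
    for (txn* t : reg.all) {
        if (!t->started)          /* cheap pre-filter without t->m */
            continue;
        std::lock_guard<std::mutex> h(t->m);
        if (t->started)           /* re-check under the per-transaction mutex */
            out.push_back(t->id);
    }
    return out;
}

int main()
{
    txn a, b;
    a.started = true;  a.id = 1;
    b.started = false; b.id = 2;
    txn_registry reg;
    reg.all = {&a, &b};
    return snapshot_active(reg).size() == 1 ? 0 : 1;
}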
@return TRUE if truncated */ -ibool +bool trx_i_s_cache_is_truncated( /*=======================*/ trx_i_s_cache_t* cache) /*!< in: cache */ @@ -1395,7 +1356,7 @@ trx_i_s_cache_init( cache->mem_allocd = 0; - cache->is_truncated = FALSE; + cache->is_truncated = false; } /*******************************************************************//** diff --git a/storage/innobase/trx/trx0purge.cc b/storage/innobase/trx/trx0purge.cc index 2025ac70beb..02a524d6850 100644 --- a/storage/innobase/trx/trx0purge.cc +++ b/storage/innobase/trx/trx0purge.cc @@ -31,7 +31,6 @@ Created 3/26/1996 Heikki Tuuri #include "mtr0log.h" #include "os0thread.h" #include "que0que.h" -#include "read0read.h" #include "row0purge.h" #include "row0upd.h" #include "srv0mon.h" @@ -42,6 +41,7 @@ Created 3/26/1996 Heikki Tuuri #include "trx0roll.h" #include "trx0rseg.h" #include "trx0trx.h" +#include <mysql/service_wsrep.h> /** Maximum allowable purge history length. <=0 means 'infinite'. */ ulong srv_max_purge_lag = 0; @@ -50,7 +50,7 @@ ulong srv_max_purge_lag = 0; ulong srv_max_purge_lag_delay = 0; /** The global data structure coordinating a purge */ -purge_sys_t* purge_sys; +purge_sys_t purge_sys; /** A dummy undo record used as a return value when we have a whole undo log which needs no purge */ @@ -61,101 +61,67 @@ my_bool srv_purge_view_update_only_debug; #endif /* UNIV_DEBUG */ /** Sentinel value */ -const TrxUndoRsegs TrxUndoRsegsIterator::NullElement(UINT64_UNDEFINED); +static const TrxUndoRsegs NullElement; -/** Constructor */ +/** Default constructor */ TrxUndoRsegsIterator::TrxUndoRsegsIterator() - : - m_trx_undo_rsegs(NullElement), - m_iter(m_trx_undo_rsegs.end()) + : m_rsegs(NullElement), m_iter(m_rsegs.begin()) { } /** Sets the next rseg to purge in purge_sys. +Executed in the purge coordinator thread. @return whether anything is to be purged */ -inline -bool -TrxUndoRsegsIterator::set_next() +inline bool TrxUndoRsegsIterator::set_next() { - mutex_enter(&purge_sys->pq_mutex); + mutex_enter(&purge_sys.pq_mutex); /* Only purge consumes events from the priority queue, user threads only produce the events. */ /* Check if there are more rsegs to process in the current element. */ - if (m_iter != m_trx_undo_rsegs.end()) { - + if (m_iter != m_rsegs.end()) { /* We are still processing rollback segment from the same transaction and so expected transaction - number shouldn't increase. Undo increment of - expected trx_no done by caller assuming rollback + number shouldn't increase. Undo the increment of + expected commit done by caller assuming rollback segments from given transaction are done. */ - purge_sys->iter.trx_no = (*m_iter)->last_trx_no; - - } else if (!purge_sys->purge_queue.empty()) { - - /* Read the next element from the queue. - Combine elements if they have same transaction number. - This can happen if a transaction shares redo rollback segment - with another transaction that has already added it to purge - queue and former transaction also needs to schedule non-redo - rollback segment for purge. 
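In the simplified TrxUndoRsegsIterator::set_next() above, the coordinator just pops purge_sys.purge_queue, which keeps rollback-segment entries ordered by the commit number of the last transaction that used them, so undo logs are handed out oldest-first. A stand-alone min-heap with the same shape; the entry type and field names are illustrative.

#include <cstdint>
#include <cstdio>
#include <queue>
#include <vector>

struct rsegs_entry {
    uint64_t trx_no;     /* commit number of the last transaction */
    int      rseg_id;
};

struct later_first {
    bool operator()(const rsegs_entry& a, const rsegs_entry& b) const
    { return a.trx_no > b.trx_no; }   /* smallest trx_no at top() */
};

int main()
{
    std::priority_queue<rsegs_entry, std::vector<rsegs_entry>, later_first> pq;
    pq.push({42, 3});
    pq.push({17, 1});
    pq.push({29, 2});
    while (!pq.empty()) {
        std::printf("purge rseg %d (trx_no %llu)\n", pq.top().rseg_id,
                    (unsigned long long) pq.top().trx_no);
        pq.pop();
    }
    return 0;
}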
*/ - m_trx_undo_rsegs = NullElement; - - purge_pq_t& purge_queue = purge_sys->purge_queue; - - while (!purge_queue.empty()) { - - if (m_trx_undo_rsegs.get_trx_no() == UINT64_UNDEFINED) { - m_trx_undo_rsegs = purge_queue.top(); - } else if (purge_queue.top().get_trx_no() == - m_trx_undo_rsegs.get_trx_no()) { - m_trx_undo_rsegs.append( - purge_queue.top()); - } else { - break; - } - - purge_queue.pop(); - } - - m_iter = m_trx_undo_rsegs.begin(); - + purge_sys.tail.commit = (*m_iter)->last_commit; + } else if (!purge_sys.purge_queue.empty()) { + m_rsegs = purge_sys.purge_queue.top(); + purge_sys.purge_queue.pop(); + ut_ad(purge_sys.purge_queue.empty() + || purge_sys.purge_queue.top() != m_rsegs); + m_iter = m_rsegs.begin(); } else { /* Queue is empty, reset iterator. */ - m_trx_undo_rsegs = NullElement; - m_iter = m_trx_undo_rsegs.end(); - - mutex_exit(&purge_sys->pq_mutex); - - purge_sys->rseg = NULL; + purge_sys.rseg = NULL; + mutex_exit(&purge_sys.pq_mutex); + m_rsegs = NullElement; + m_iter = m_rsegs.begin(); return false; } - purge_sys->rseg = *m_iter++; - - mutex_exit(&purge_sys->pq_mutex); - - ut_a(purge_sys->rseg != NULL); + purge_sys.rseg = *m_iter++; + mutex_exit(&purge_sys.pq_mutex); + mutex_enter(&purge_sys.rseg->mutex); - mutex_enter(&purge_sys->rseg->mutex); - - ut_a(purge_sys->rseg->last_page_no != FIL_NULL); - ut_ad(purge_sys->rseg->last_trx_no == m_trx_undo_rsegs.get_trx_no()); + ut_a(purge_sys.rseg->last_page_no != FIL_NULL); + ut_ad(purge_sys.rseg->last_trx_no() == m_rsegs.trx_no()); /* We assume in purge of externally stored fields that space id is in the range of UNDO tablespace space ids */ - ut_a(purge_sys->rseg->space == TRX_SYS_SPACE - || srv_is_undo_tablespace(purge_sys->rseg->space)); + ut_ad(purge_sys.rseg->space->id == TRX_SYS_SPACE + || srv_is_undo_tablespace(purge_sys.rseg->space->id)); - ut_a(purge_sys->iter.trx_no <= purge_sys->rseg->last_trx_no); + ut_a(purge_sys.tail.commit <= purge_sys.rseg->last_commit); - purge_sys->iter.trx_no = purge_sys->rseg->last_trx_no; - purge_sys->hdr_offset = purge_sys->rseg->last_offset; - purge_sys->hdr_page_no = purge_sys->rseg->last_page_no; + purge_sys.tail.commit = purge_sys.rseg->last_commit; + purge_sys.hdr_offset = purge_sys.rseg->last_offset; + purge_sys.hdr_page_no = purge_sys.rseg->last_page_no; - mutex_exit(&purge_sys->rseg->mutex); + mutex_exit(&purge_sys.rseg->mutex); return(true); } @@ -169,7 +135,7 @@ purge_graph_build() { ut_a(srv_n_purge_threads > 0); - trx_t* trx = trx_allocate_for_background(); + trx_t* trx = trx_create(); ut_ad(!trx->id); trx->start_time = time(NULL); trx->start_time_micro = microsecond_interval_timer(); @@ -190,92 +156,103 @@ purge_graph_build() return(fork); } -/** Construct the purge system. */ -purge_sys_t::purge_sys_t() - : latch(), event(os_event_create(0)), - n_stop(0), running(false), state(PURGE_STATE_INIT), - query(purge_graph_build()), - view(), n_submitted(0), n_completed(0), - iter(), limit(), -#ifdef UNIV_DEBUG - done(), -#endif /* UNIV_DEBUG */ - next_stored(false), rseg(NULL), - page_no(0), offset(0), hdr_page_no(0), hdr_offset(0), - rseg_iter(), purge_queue(), pq_mutex(), undo_trunc() +/** Initialise the purge system. 
*/ +void purge_sys_t::create() { - ut_ad(!purge_sys); - rw_lock_create(trx_purge_latch_key, &latch, SYNC_PURGE_LATCH); - mutex_create(LATCH_ID_PURGE_SYS_PQ, &pq_mutex); + ut_ad(this == &purge_sys); + ut_ad(!enabled()); + ut_ad(!event); + event= os_event_create(0); + ut_ad(event); + m_paused= 0; + query= purge_graph_build(); + n_submitted= 0; + n_completed= 0; + next_stored= false; + rseg= NULL; + page_no= 0; + offset= 0; + hdr_page_no= 0; + hdr_offset= 0; + rw_lock_create(trx_purge_latch_key, &latch, SYNC_PURGE_LATCH); + mutex_create(LATCH_ID_PURGE_SYS_PQ, &pq_mutex); + undo_trunc.create(); } -/** Destruct the purge system. */ -purge_sys_t::~purge_sys_t() +/** Close the purge subsystem on shutdown. */ +void purge_sys_t::close() { - ut_ad(this == purge_sys); - - trx_t* trx = query->trx; - que_graph_free(query); - ut_ad(!trx->id); - ut_ad(trx->state == TRX_STATE_ACTIVE); - trx->state = TRX_STATE_NOT_STARTED; - trx_free_for_background(trx); - view.close(); - rw_lock_free(&latch); - mutex_free(&pq_mutex); - os_event_destroy(event); + ut_ad(this == &purge_sys); + if (!event) return; + + m_enabled= false; + trx_t* trx = query->trx; + que_graph_free(query); + ut_ad(!trx->id); + ut_ad(trx->state == TRX_STATE_ACTIVE); + trx->state= TRX_STATE_NOT_STARTED; + trx->free(); + rw_lock_free(&latch); + mutex_free(&pq_mutex); + os_event_destroy(event); } /*================ UNDO LOG HISTORY LIST =============================*/ -/********************************************************************//** -Adds the update undo log as the first log in the history list. Removes the -update undo log segment from the rseg slot if it is too big for reuse. */ +/** Prepend the history list with an undo log. +Remove the undo log segment from the rseg slot if it is too big for reuse. +@param[in] trx transaction +@param[in,out] undo undo log +@param[in,out] mtr mini-transaction */ void -trx_purge_add_update_undo_to_history( -/*=================================*/ - trx_t* trx, /*!< in: transaction */ - page_t* undo_page, /*!< in: update undo log header page, - x-latched */ - mtr_t* mtr) /*!< in: mtr */ +trx_purge_add_undo_to_history(const trx_t* trx, trx_undo_t*& undo, mtr_t* mtr) { - trx_undo_t* undo = trx->rsegs.m_redo.update_undo; - trx_rseg_t* rseg = undo->rseg; + DBUG_PRINT("trx", ("commit(" TRX_ID_FMT "," TRX_ID_FMT ")", + trx->id, trx->no)); + ut_ad(undo == trx->rsegs.m_redo.undo + || undo == trx->rsegs.m_redo.old_insert); + trx_rseg_t* rseg = trx->rsegs.m_redo.rseg; + ut_ad(undo->rseg == rseg); trx_rsegf_t* rseg_header = trx_rsegf_get( rseg->space, rseg->page_no, mtr); + page_t* undo_page = trx_undo_set_state_at_finish( + undo, mtr); trx_ulogf_t* undo_header = undo_page + undo->hdr_offset; - if (undo->state != TRX_UNDO_CACHED) { - ulint hist_size; -#ifdef UNIV_DEBUG - trx_usegf_t* seg_header = undo_page + TRX_UNDO_SEG_HDR; -#endif /* UNIV_DEBUG */ + ut_ad(mach_read_from_2(undo_header + TRX_UNDO_NEEDS_PURGE) <= 1); - /* The undo log segment will not be reused */ - - if (UNIV_UNLIKELY(undo->id >= TRX_RSEG_N_SLOTS)) { - ib::fatal() << "undo->id is " << undo->id; - } + if (UNIV_UNLIKELY(mach_read_from_4(TRX_RSEG_FORMAT + rseg_header))) { + /* This database must have been upgraded from + before MariaDB 10.3.5. 
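purge_sys_t::create()/close() above follow the pattern used throughout this change: purge_sys, trx_sys, lock_sys, log_sys and fil_system become statically allocated objects with explicit create()/close() calls (and is_initialised()-style checks) instead of pointers managed with new/delete. A minimal sketch of that lifecycle shape; the class below is illustrative, not the real purge_sys_t.

#include <cassert>

class subsystem_t {
public:
    bool is_initialised() const { return m_initialised; }
    void create()
    {
        assert(!m_initialised);
        m_initialised = true;        /* acquire latches, events, queues here */
    }
    void close()
    {
        if (!m_initialised) return;  /* safe to call on all shutdown paths */
        m_initialised = false;       /* release resources here */
    }
private:
    bool m_initialised = false;
};

subsystem_t purge_like;              /* static duration, trivially constructible */

int main()
{
    purge_like.create();
    assert(purge_like.is_initialised());
    purge_like.close();
    purge_like.close();              /* second call is a no-op, like purge_sys_t::close() */
    return 0;
}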
*/ + trx_rseg_format_upgrade(rseg_header, mtr); + } + if (undo->state != TRX_UNDO_CACHED) { + /* The undo log segment will not be reused */ + ut_a(undo->id < TRX_RSEG_N_SLOTS); trx_rsegf_set_nth_undo(rseg_header, undo->id, FIL_NULL, mtr); MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_USED); - hist_size = mtr_read_ulint( - rseg_header + TRX_RSEG_HISTORY_SIZE, MLOG_4BYTES, mtr); + uint32_t hist_size = mach_read_from_4(TRX_RSEG_HISTORY_SIZE + + rseg_header); - ut_ad(undo->size == flst_get_len( - seg_header + TRX_UNDO_PAGE_LIST)); + ut_ad(undo->size == flst_get_len(TRX_UNDO_SEG_HDR + + TRX_UNDO_PAGE_LIST + + undo_page)); mlog_write_ulint( rseg_header + TRX_RSEG_HISTORY_SIZE, hist_size + undo->size, MLOG_4BYTES, mtr); + + mlog_write_ull(rseg_header + TRX_RSEG_MAX_TRX_ID, + trx_sys.get_max_trx_id(), mtr); } /* After the purge thread has been given permission to exit, we may roll back transactions (trx->undo_no==0) in THD::cleanup() invoked from unlink_thd() in fast shutdown, - or in trx_rollback_resurrected() in slow shutdown. + or in trx_rollback_recovered() in slow shutdown. Before any transaction-generating background threads or the purge have been started, recv_recovery_rollback_active() can @@ -291,36 +268,57 @@ trx_purge_add_update_undo_to_history( user transactions. */ ut_ad(srv_undo_sources || trx->undo_no == 0 - || ((srv_is_being_started - || trx_rollback_or_clean_is_active) - && purge_sys->state == PURGE_STATE_INIT) - || (srv_force_recovery >= SRV_FORCE_NO_BACKGROUND - && purge_sys->state == PURGE_STATE_DISABLED) - || ((trx->in_mysql_trx_list || trx->internal) + || (!purge_sys.enabled() + && (srv_is_being_started + || trx_rollback_is_active + || srv_force_recovery >= SRV_FORCE_NO_BACKGROUND)) + || ((trx->mysql_thd || trx->internal) && srv_fast_shutdown)); +#ifdef WITH_WSREP + if (wsrep_is_wsrep_xid(trx->xid)) { + trx_rseg_update_wsrep_checkpoint(rseg_header, trx->xid, mtr); + } +#endif + + if (trx->mysql_log_file_name && *trx->mysql_log_file_name) { + /* Update the latest MySQL binlog name and offset info + in rollback segment header if MySQL binlogging is on + or the database server is a MySQL replication save. */ + trx_rseg_update_binlog_offset(rseg_header, trx, mtr); + } + /* Add the log as the first in the history list */ flst_add_first(rseg_header + TRX_RSEG_HISTORY, undo_header + TRX_UNDO_HISTORY_NODE, mtr); - my_atomic_addlint(&trx_sys->rseg_history_len, 1); - - /* Write the trx number to the undo log header */ mlog_write_ull(undo_header + TRX_UNDO_TRX_NO, trx->no, mtr); - - /* Write information about delete markings to the undo log header */ - - if (!undo->del_marks) { - mlog_write_ulint(undo_header + TRX_UNDO_DEL_MARKS, FALSE, + /* This is needed for upgrading old undo log pages from + before MariaDB 10.3.1. */ + if (UNIV_UNLIKELY(!mach_read_from_2(undo_header + + TRX_UNDO_NEEDS_PURGE))) { + mlog_write_ulint(undo_header + TRX_UNDO_NEEDS_PURGE, 1, MLOG_2BYTES, mtr); } if (rseg->last_page_no == FIL_NULL) { rseg->last_page_no = undo->hdr_page_no; rseg->last_offset = undo->hdr_offset; - rseg->last_trx_no = trx->no; - rseg->last_del_marks = undo->del_marks; + rseg->set_last_trx_no(trx->no, undo == trx->rsegs.m_redo.undo); + rseg->needs_purge = true; + } + + trx_sys.history_insert(); + + if (undo->state == TRX_UNDO_CACHED) { + UT_LIST_ADD_FIRST(rseg->undo_cached, undo); + MONITOR_INC(MONITOR_NUM_UNDO_SLOT_CACHED); + } else { + ut_ad(undo->state == TRX_UNDO_TO_PURGE); + ut_free(undo); } + + undo = NULL; } /** Remove undo log header from the history list. 
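Above, the direct my_atomic_addlint() updates of trx_sys->rseg_history_len are replaced by trx_sys.history_insert() and trx_sys.history_remove() at the points where an undo log header is linked into or unlinked from a history list. A counter wrapper of roughly that shape could look as follows; this illustrates the bookkeeping only and is not the actual trx_sys_t implementation.

#include <atomic>
#include <cstdint>

class history_counter {
public:
    void insert() { m_len.fetch_add(1, std::memory_order_relaxed); }
    void remove() { m_len.fetch_sub(1, std::memory_order_relaxed); }
    uint64_t length() const { return m_len.load(std::memory_order_relaxed); }
private:
    std::atomic<uint64_t> m_len{0};
};

int main()
{
    history_counter h;
    h.insert();        /* an undo log is appended to a history list */
    h.remove();        /* its header is purged away again */
    return (int) h.length();
}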
@@ -336,7 +334,7 @@ trx_purge_remove_log_hdr( { flst_remove(rseg_hdr + TRX_RSEG_HISTORY, log_hdr + TRX_UNDO_HISTORY_NODE, mtr); - my_atomic_addlint(&trx_sys->rseg_history_len, -1); + trx_sys.history_remove(); } /** Free an undo log segment, and remove the header from the history list. @@ -348,63 +346,52 @@ trx_purge_free_segment(trx_rseg_t* rseg, fil_addr_t hdr_addr) { mtr_t mtr; trx_rsegf_t* rseg_hdr; - trx_ulogf_t* log_hdr; - trx_usegf_t* seg_hdr; - ulint seg_size; - ulint hist_size; - bool marked = false; + page_t* undo_page; - for (;;) { - page_t* undo_page; + mtr.start(); + mutex_enter(&rseg->mutex); - mtr_start(&mtr); + rseg_hdr = trx_rsegf_get(rseg->space, rseg->page_no, &mtr); + undo_page = trx_undo_page_get( + page_id_t(rseg->space->id, hdr_addr.page), &mtr); + + /* Mark the last undo log totally purged, so that if the + system crashes, the tail of the undo log will not get accessed + again. The list of pages in the undo log tail gets + inconsistent during the freeing of the segment, and therefore + purge should not try to access them again. */ + mlog_write_ulint(undo_page + hdr_addr.boffset + TRX_UNDO_NEEDS_PURGE, + 0, MLOG_2BYTES, &mtr); + + while (!fseg_free_step_not_header( + TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER + + undo_page, &mtr)) { + mutex_exit(&rseg->mutex); + + mtr.commit(); + mtr.start(); mutex_enter(&rseg->mutex); rseg_hdr = trx_rsegf_get(rseg->space, rseg->page_no, &mtr); undo_page = trx_undo_page_get( - page_id_t(rseg->space, hdr_addr.page), &mtr); - - seg_hdr = undo_page + TRX_UNDO_SEG_HDR; - log_hdr = undo_page + hdr_addr.boffset; - - /* Mark the last undo log totally purged, so that if the - system crashes, the tail of the undo log will not get accessed - again. The list of pages in the undo log tail gets inconsistent - during the freeing of the segment, and therefore purge should - not try to access them again. */ - - if (!marked) { - marked = true; - mlog_write_ulint( - log_hdr + TRX_UNDO_DEL_MARKS, FALSE, - MLOG_2BYTES, &mtr); - } - - if (fseg_free_step_not_header( - seg_hdr + TRX_UNDO_FSEG_HEADER, &mtr)) { - - break; - } - - mutex_exit(&rseg->mutex); - - mtr_commit(&mtr); + page_id_t(rseg->space->id, hdr_addr.page), &mtr); } /* The page list may now be inconsistent, but the length field stored in the list base node tells us how big it was before we started the freeing. */ - seg_size = flst_get_len(seg_hdr + TRX_UNDO_PAGE_LIST); + const ulint seg_size = flst_get_len( + TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST + undo_page); /* We may free the undo log segment header page; it must be freed within the same mtr as the undo log header is removed from the history list: otherwise, in case of a database crash, the segment could become inaccessible garbage in the file space. */ - trx_purge_remove_log_hdr(rseg_hdr, log_hdr, &mtr); + trx_purge_remove_log_hdr(rseg_hdr, undo_page + hdr_addr.boffset, &mtr); do { @@ -413,10 +400,11 @@ trx_purge_free_segment(trx_rseg_t* rseg, fil_addr_t hdr_addr) is not flooded with bufferfixed pages: see the note in fsp0fsp.cc. 
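trx_purge_free_segment() above keeps the incremental structure of the old code: it frees the undo segment one fseg_free_step_not_header() call at a time and commits and restarts the mini-transaction between steps, precisely so the buffer pool is not flooded with buffer-fixed pages. The generic shape of such a bounded-work-per-mtr loop, with mini_txn and free_one_step() as illustrative stand-ins:

#include <cstdio>

struct mini_txn {
    bool active = false;
    void start()  { active = true; }
    void commit() { active = false; }
};

/* Pretend each call releases one page of a 5-page segment. */
static bool free_one_step(int& pages_left)
{
    if (pages_left > 0) --pages_left;
    return pages_left == 0;          /* true once the segment is fully freed */
}

int main()
{
    int pages_left = 5;
    mini_txn mtr;
    mtr.start();
    while (!free_one_step(pages_left)) {
        /* Release latches and buffer fixes before continuing. */
        mtr.commit();
        mtr.start();
        std::printf("restarted mtr, %d pages left\n", pages_left);
    }
    mtr.commit();
    return 0;
}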
*/ - } while (!fseg_free_step(seg_hdr + TRX_UNDO_FSEG_HEADER, &mtr)); + } while (!fseg_free_step(TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER + + undo_page, &mtr)); - hist_size = mtr_read_ulint(rseg_hdr + TRX_RSEG_HISTORY_SIZE, - MLOG_4BYTES, &mtr); + const ulint hist_size = mach_read_from_4(rseg_hdr + + TRX_RSEG_HISTORY_SIZE); ut_ad(hist_size >= seg_size); mlog_write_ulint(rseg_hdr + TRX_RSEG_HISTORY_SIZE, @@ -433,10 +421,12 @@ trx_purge_free_segment(trx_rseg_t* rseg, fil_addr_t hdr_addr) /** Remove unnecessary history data from a rollback segment. @param[in,out] rseg rollback segment -@param[in] limit truncate offset */ +@param[in] limit truncate anything before this */ static void -trx_purge_truncate_rseg_history(trx_rseg_t* rseg, const purge_iter_t* limit) +trx_purge_truncate_rseg_history( + trx_rseg_t& rseg, + const purge_sys_t::iterator& limit) { fil_addr_t hdr_addr; fil_addr_t prev_hdr_addr; @@ -447,48 +437,37 @@ trx_purge_truncate_rseg_history(trx_rseg_t* rseg, const purge_iter_t* limit) mtr_t mtr; trx_id_t undo_trx_no; - mtr_start(&mtr); - ut_ad(rseg->is_persistent()); - mutex_enter(&(rseg->mutex)); + mtr.start(); + ut_ad(rseg.is_persistent()); + mutex_enter(&rseg.mutex); - rseg_hdr = trx_rsegf_get(rseg->space, rseg->page_no, &mtr); + rseg_hdr = trx_rsegf_get(rseg.space, rseg.page_no, &mtr); hdr_addr = trx_purge_get_log_from_hist( flst_get_last(rseg_hdr + TRX_RSEG_HISTORY, &mtr)); loop: if (hdr_addr.page == FIL_NULL) { - - mutex_exit(&(rseg->mutex)); - - mtr_commit(&mtr); - +func_exit: + mutex_exit(&rseg.mutex); + mtr.commit(); return; } - undo_page = trx_undo_page_get(page_id_t(rseg->space, hdr_addr.page), + undo_page = trx_undo_page_get(page_id_t(rseg.space->id, hdr_addr.page), &mtr); log_hdr = undo_page + hdr_addr.boffset; undo_trx_no = mach_read_from_8(log_hdr + TRX_UNDO_TRX_NO); - if (undo_trx_no >= limit->trx_no) { - - /* limit space_id should match the rollback segment - space id to avoid freeing of the page belongs to - different rollback segment for the same trx_no. */ - if (undo_trx_no == limit->trx_no - && rseg->space == limit->undo_rseg_space) { - + if (undo_trx_no >= limit.trx_no()) { + if (undo_trx_no == limit.trx_no()) { trx_undo_truncate_start( - rseg, hdr_addr.page, - hdr_addr.boffset, limit->undo_no); + &rseg, hdr_addr.page, + hdr_addr.boffset, limit.undo_no); } - mutex_exit(&(rseg->mutex)); - mtr_commit(&mtr); - - return; + goto func_exit; } prev_hdr_addr = trx_purge_get_log_from_hist( @@ -501,24 +480,24 @@ loop: /* We can free the whole log segment */ - mutex_exit(&(rseg->mutex)); - mtr_commit(&mtr); + mutex_exit(&rseg.mutex); + mtr.commit(); /* calls the trx_purge_remove_log_hdr() inside trx_purge_free_segment(). */ - trx_purge_free_segment(rseg, hdr_addr); + trx_purge_free_segment(&rseg, hdr_addr); } else { /* Remove the log hdr from the rseg history. 
*/ trx_purge_remove_log_hdr(rseg_hdr, log_hdr, &mtr); - mutex_exit(&(rseg->mutex)); - mtr_commit(&mtr); + mutex_exit(&rseg.mutex); + mtr.commit(); } - mtr_start(&mtr); - mutex_enter(&(rseg->mutex)); + mtr.start(); + mutex_enter(&rseg.mutex); - rseg_hdr = trx_rsegf_get(rseg->space, rseg->page_no, &mtr); + rseg_hdr = trx_rsegf_get(rseg.space, rseg.page_no, &mtr); hdr_addr = prev_hdr_addr; @@ -609,8 +588,8 @@ namespace undo { return; } - ulint sz = UNIV_PAGE_SIZE; - void* buf = ut_zalloc_nokey(sz + UNIV_PAGE_SIZE); + ulint sz = srv_page_size; + void* buf = ut_zalloc_nokey(sz + srv_page_size); if (buf == NULL) { os_file_close(handle); os_file_delete(innodb_log_file_key, log_file_name); @@ -619,7 +598,7 @@ namespace undo { } byte* log_buf = static_cast<byte*>( - ut_align(buf, UNIV_PAGE_SIZE)); + ut_align(buf, srv_page_size)); mach_write_to_4(log_buf, undo::s_magic); @@ -677,8 +656,8 @@ namespace undo { return(false); } - ulint sz = UNIV_PAGE_SIZE; - void* buf = ut_zalloc_nokey(sz + UNIV_PAGE_SIZE); + ulint sz = srv_page_size; + void* buf = ut_zalloc_nokey(sz + srv_page_size); if (buf == NULL) { os_file_close(handle); os_file_delete(innodb_log_file_key, @@ -688,7 +667,7 @@ namespace undo { } byte* log_buf = static_cast<byte*>( - ut_align(buf, UNIV_PAGE_SIZE)); + ut_align(buf, srv_page_size)); IORequest request(IORequest::READ); @@ -765,7 +744,7 @@ trx_purge_mark_undo_for_truncate( for (ulint i = 1; i <= srv_undo_tablespaces_active; i++) { if (fil_space_get_size(space_id) - > (srv_max_undo_log_size / srv_page_size)) { + > (srv_max_undo_log_size >> srv_page_size_shift)) { /* Tablespace qualifies for truncate. */ undo_trunc->mark(space_id); undo::Truncate::add_space_to_trunc_list(space_id); @@ -791,9 +770,10 @@ trx_purge_mark_undo_for_truncate( /* Step-3: Iterate over all the rsegs of selected UNDO tablespace and mark them temporarily unavailable for allocation.*/ for (ulint i = 0; i < TRX_SYS_N_RSEGS; ++i) { - if (trx_rseg_t* rseg = trx_sys->rseg_array[i]) { + if (trx_rseg_t* rseg = trx_sys.rseg_array[i]) { ut_ad(rseg->is_persistent()); - if (rseg->space == undo_trunc->get_marked_space_id()) { + if (rseg->space->id + == undo_trunc->get_marked_space_id()) { /* Once set this rseg will not be allocated to new booting transaction but we will wait @@ -815,17 +795,17 @@ void trx_purge_cleanse_purge_queue( undo::Truncate* undo_trunc) { - mutex_enter(&purge_sys->pq_mutex); + mutex_enter(&purge_sys.pq_mutex); typedef std::vector<TrxUndoRsegs> purge_elem_list_t; purge_elem_list_t purge_elem_list; /* Remove rseg instances that are in the purge queue before we start truncate of corresponding UNDO truncate. */ - while (!purge_sys->purge_queue.empty()) { - purge_elem_list.push_back(purge_sys->purge_queue.top()); - purge_sys->purge_queue.pop(); + while (!purge_sys.purge_queue.empty()) { + purge_elem_list.push_back(purge_sys.purge_queue.top()); + purge_sys.purge_queue.pop(); } - ut_ad(purge_sys->purge_queue.empty()); + ut_ad(purge_sys.purge_queue.empty()); for (purge_elem_list_t::iterator it = purge_elem_list.begin(); it != purge_elem_list.end(); @@ -835,21 +815,18 @@ trx_purge_cleanse_purge_queue( it2 != it->end(); ++it2) { - if ((*it2)->space + if ((*it2)->space->id == undo_trunc->get_marked_space_id()) { it->erase(it2); break; } } - if (it->size()) { - /* size != 0 suggest that there exist other rsegs that - needs processing so add this element to purge queue. - Note: Other rseg could be non-redo rsegs. 
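trx_purge_cleanse_purge_queue() above drains purge_sys.purge_queue into a vector, erases the rollback segments that belong to the tablespace about to be truncated, and pushes the surviving, non-empty elements back. The same drain/filter/refill shape on standard containers; a plain std::queue stands in for the priority queue and all names are illustrative.

#include <algorithm>
#include <queue>
#include <vector>

struct rseg_ref { unsigned space_id; };

using element = std::vector<rseg_ref>;   /* one queue entry may carry several rsegs */

int main()
{
    std::queue<element> purge_queue;
    purge_queue.push({{1}, {3}});
    purge_queue.push({{3}});

    const unsigned truncated_space = 3;

    std::vector<element> drained;
    while (!purge_queue.empty()) {
        drained.push_back(purge_queue.front());
        purge_queue.pop();
    }
    for (element& e : drained) {
        e.erase(std::remove_if(e.begin(), e.end(),
                               [&](const rseg_ref& r)
                               { return r.space_id == truncated_space; }),
                e.end());
        if (!e.empty())
            purge_queue.push(e);         /* keep entries that still have work */
    }
    return (int) purge_queue.size();     /* one element survives in this example */
}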
*/ - purge_sys->purge_queue.push(*it); + if (!it->empty()) { + purge_sys.purge_queue.push(*it); } } - mutex_exit(&purge_sys->pq_mutex); + mutex_exit(&purge_sys.pq_mutex); } /** Iterate over selected UNDO tablespace and check if all the rsegs @@ -859,7 +836,7 @@ that resides in the tablespace are free. static void trx_purge_initiate_truncate( - purge_iter_t* limit, + const purge_sys_t::iterator& limit, undo::Truncate* undo_trunc) { /* Step-1: Early check to findout if any of the the UNDO tablespace @@ -903,23 +880,11 @@ trx_purge_initiate_truncate( ulint cached_undo_size = 0; for (trx_undo_t* undo = - UT_LIST_GET_FIRST(rseg->update_undo_cached); + UT_LIST_GET_FIRST(rseg->undo_cached); undo != NULL && all_free; undo = UT_LIST_GET_NEXT(undo_list, undo)) { - if (limit->trx_no < undo->trx_id) { - all_free = false; - } else { - cached_undo_size += undo->size; - } - } - - for (trx_undo_t* undo = - UT_LIST_GET_FIRST(rseg->insert_undo_cached); - undo != NULL && all_free; - undo = UT_LIST_GET_NEXT(undo_list, undo)) { - - if (limit->trx_no < undo->trx_id) { + if (limit.trx_no() < undo->trx_id) { all_free = false; } else { cached_undo_size += undo->size; @@ -957,6 +922,14 @@ trx_purge_initiate_truncate( ut_a(srv_is_undo_tablespace(space_id)); + fil_space_t* space = fil_space_get(space_id); + + if (!space) { +not_found: + ib::error() << "Failed to find UNDO tablespace " << space_id; + return; + } + /* Flush all to-be-discarded pages of the tablespace. During truncation, we do not want any writes to the @@ -969,8 +942,8 @@ trx_purge_initiate_truncate( break crash recovery. So, we cannot avoid the write. */ { FlushObserver observer( - space_id, - UT_LIST_GET_FIRST(purge_sys->query->thrs)->graph->trx, + space, + UT_LIST_GET_FIRST(purge_sys.query->thrs)->graph->trx, NULL); buf_LRU_flush_or_remove_pages(space_id, &observer); } @@ -978,11 +951,10 @@ trx_purge_initiate_truncate( log_free_check(); /* Adjust the tablespace metadata. */ - fil_space_t* space = fil_truncate_prepare(space_id); + space = fil_truncate_prepare(space_id); if (!space) { - ib::error() << "Failed to find UNDO tablespace " << space_id; - return; + goto not_found; } /* Undo tablespace always are a single file. */ @@ -995,71 +967,56 @@ trx_purge_initiate_truncate( mtr_t mtr; const ulint size = SRV_UNDO_TABLESPACE_SIZE_IN_PAGES; mtr.start(); - mtr.x_lock_space(space, __FILE__, __LINE__); + mtr_x_lock_space(space, &mtr); fil_truncate_log(space, size, &mtr); - fsp_header_init(space_id, size, &mtr); - mutex_enter(&fil_system->mutex); + fsp_header_init(space, size, &mtr); + mutex_enter(&fil_system.mutex); space->size = file->size = size; - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system.mutex); + + buf_block_t* sys_header = trx_sysf_get(&mtr); for (ulint i = 0; i < undo_trunc->rsegs_size(); ++i) { trx_rseg_t* rseg = undo_trunc->get_ith_rseg(i); - buf_block_t* rblock = trx_rseg_header_create( - space_id, ULINT_MAX, rseg->id, &mtr); + space, rseg->id, sys_header, &mtr); ut_ad(rblock); rseg->page_no = rblock ? rblock->page.id.page_no() : FIL_NULL; /* Before re-initialization ensure that we free the existing structure. There can't be any active transactions. 
*/ - ut_a(UT_LIST_GET_LEN(rseg->update_undo_list) == 0); - ut_a(UT_LIST_GET_LEN(rseg->insert_undo_list) == 0); + ut_a(UT_LIST_GET_LEN(rseg->undo_list) == 0); + ut_a(UT_LIST_GET_LEN(rseg->old_insert_list) == 0); trx_undo_t* next_undo; - for (trx_undo_t* undo = - UT_LIST_GET_FIRST(rseg->update_undo_cached); + for (trx_undo_t* undo = UT_LIST_GET_FIRST(rseg->undo_cached); undo != NULL; undo = next_undo) { next_undo = UT_LIST_GET_NEXT(undo_list, undo); - UT_LIST_REMOVE(rseg->update_undo_cached, undo); + UT_LIST_REMOVE(rseg->undo_cached, undo); MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_CACHED); - trx_undo_mem_free(undo); + ut_free(undo); } - for (trx_undo_t* undo = - UT_LIST_GET_FIRST(rseg->insert_undo_cached); - undo != NULL; - undo = next_undo) { - - next_undo = UT_LIST_GET_NEXT(undo_list, undo); - UT_LIST_REMOVE(rseg->insert_undo_cached, undo); - MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_CACHED); - trx_undo_mem_free(undo); - } - - UT_LIST_INIT(rseg->update_undo_list, &trx_undo_t::undo_list); - UT_LIST_INIT(rseg->update_undo_cached, &trx_undo_t::undo_list); - UT_LIST_INIT(rseg->insert_undo_list, &trx_undo_t::undo_list); - UT_LIST_INIT(rseg->insert_undo_cached, &trx_undo_t::undo_list); + UT_LIST_INIT(rseg->undo_list, &trx_undo_t::undo_list); + UT_LIST_INIT(rseg->undo_cached, &trx_undo_t::undo_list); + UT_LIST_INIT(rseg->old_insert_list, &trx_undo_t::undo_list); /* These were written by trx_rseg_header_create(). */ - ut_ad(mach_read_from_4(TRX_RSEG + TRX_RSEG_MAX_SIZE - + rblock->frame) - == uint32_t(rseg->max_size)); + ut_ad(!mach_read_from_4(TRX_RSEG + TRX_RSEG_FORMAT + + rblock->frame)); ut_ad(!mach_read_from_4(TRX_RSEG + TRX_RSEG_HISTORY_SIZE + rblock->frame)); - rseg->max_size = ULINT_MAX; - /* Initialize the undo log lists according to the rseg header */ rseg->curr_size = 1; rseg->trx_ref_count = 0; rseg->last_page_no = FIL_NULL; rseg->last_offset = 0; - rseg->last_trx_no = 0; - rseg->last_del_marks = FALSE; + rseg->last_commit = 0; + rseg->needs_purge = false; } mtr.commit(); @@ -1073,26 +1030,26 @@ trx_purge_initiate_truncate( /* This is only executed by the srv_purge_coordinator_thread. */ export_vars.innodb_undo_truncations++; - /* TODO: PUNCH_HOLE the garbage (with write-ahead logging) */ + /* In MDEV-8319 (10.5) we will PUNCH_HOLE the garbage + (with write-ahead logging). */ - mutex_enter(&fil_system->mutex); - ut_ad(space->stop_new_ops); + mutex_enter(&fil_system.mutex); ut_ad(space->is_being_truncated); - space->stop_new_ops = false; space->is_being_truncated = false; - mutex_exit(&fil_system->mutex); + space->set_stopping(false); + mutex_exit(&fil_system.mutex); - if (purge_sys->rseg != NULL - && purge_sys->rseg->last_page_no == FIL_NULL) { - /* If purge_sys->rseg is pointing to rseg that was recently + if (purge_sys.rseg != NULL + && purge_sys.rseg->last_page_no == FIL_NULL) { + /* If purge_sys.rseg is pointing to rseg that was recently truncated then move to next rseg element. - Note: Ideally purge_sys->rseg should be NULL because purge + Note: Ideally purge_sys.rseg should be NULL because purge should complete processing of all the records but there is purge_batch_size that can force the purge loop to exit before - all the records are purged and in this case purge_sys->rseg + all the records are purged and in this case purge_sys.rseg could point to a valid rseg waiting for next purge cycle. 
*/ - purge_sys->next_stored = false; - purge_sys->rseg = NULL; + purge_sys.next_stored = false; + purge_sys.rseg = NULL; } DBUG_EXECUTE_IF("ib_undo_trunc", @@ -1112,35 +1069,26 @@ trx_purge_initiate_truncate( undo::Truncate::clear_trunc_list(); } -/********************************************************************//** +/** Removes unnecessary history data from rollback segments. NOTE that when this -function is called, the caller must not have any latches on undo log pages! */ -static -void -trx_purge_truncate_history( -/*========================*/ - purge_iter_t* limit, /*!< in: truncate limit */ - const ReadView* view) /*!< in: purge view */ +function is called, the caller must not have any latches on undo log pages! +*/ +static void trx_purge_truncate_history() { - ut_ad(trx_purge_check_limit()); - - /* We play safe and set the truncate limit at most to the purge view - low_limit number, though this is not necessary */ - - if (limit->trx_no >= view->low_limit_no()) { - limit->trx_no = view->low_limit_no(); - limit->undo_no = 0; - limit->undo_rseg_space = ULINT_UNDEFINED; + ut_ad(purge_sys.head <= purge_sys.tail); + purge_sys_t::iterator& head = purge_sys.head.commit + ? purge_sys.head : purge_sys.tail; + + if (head.trx_no() >= purge_sys.view.low_limit_no()) { + /* This is sometimes necessary. TODO: find out why. */ + head.reset_trx_no(purge_sys.view.low_limit_no()); + head.undo_no = 0; } - ut_ad(limit->trx_no <= purge_sys->view.low_limit_no()); - for (ulint i = 0; i < TRX_SYS_N_RSEGS; ++i) { - trx_rseg_t* rseg = trx_sys->rseg_array[i]; - - if (rseg != NULL) { - ut_a(rseg->id == i); - trx_purge_truncate_rseg_history(rseg, limit); + if (trx_rseg_t* rseg = trx_sys.rseg_array[i]) { + ut_ad(rseg->id == i); + trx_purge_truncate_rseg_history(*rseg, head); } } @@ -1148,19 +1096,15 @@ trx_purge_truncate_history( can (greedy approach). This will ensure when the server is idle we try and truncate all the UNDO tablespaces. */ for (ulint i = srv_undo_tablespaces_active; i--; ) { - trx_purge_mark_undo_for_truncate(&purge_sys->undo_trunc); - trx_purge_initiate_truncate(limit, &purge_sys->undo_trunc); + trx_purge_mark_undo_for_truncate(&purge_sys.undo_trunc); + trx_purge_initiate_truncate(head, &purge_sys.undo_trunc); } } /***********************************************************************//** Updates the last not yet purged history log info in rseg when we have purged -a whole undo log. Advances also purge_sys->purge_trx_no past the purged log. */ -static -void -trx_purge_rseg_get_next_history_log( -/*================================*/ - trx_rseg_t* rseg, /*!< in: rollback segment */ +a whole undo log. Advances also purge_sys.purge_trx_no past the purged log. 
*/ +static void trx_purge_rseg_get_next_history_log( ulint* n_pages_handled)/*!< in/out: number of UNDO pages handled */ { @@ -1168,24 +1112,23 @@ trx_purge_rseg_get_next_history_log( trx_ulogf_t* log_hdr; fil_addr_t prev_log_addr; trx_id_t trx_no; - ibool del_marks; mtr_t mtr; - mutex_enter(&(rseg->mutex)); + mutex_enter(&purge_sys.rseg->mutex); - ut_a(rseg->last_page_no != FIL_NULL); + ut_a(purge_sys.rseg->last_page_no != FIL_NULL); - purge_sys->iter.trx_no = rseg->last_trx_no + 1; - purge_sys->iter.undo_no = 0; - purge_sys->iter.undo_rseg_space = ULINT_UNDEFINED; - purge_sys->next_stored = false; + purge_sys.tail.commit = purge_sys.rseg->last_commit + 1; + purge_sys.tail.undo_no = 0; + purge_sys.next_stored = false; - mtr_start(&mtr); + mtr.start(); undo_page = trx_undo_page_get_s_latched( - page_id_t(rseg->space, rseg->last_page_no), &mtr); + page_id_t(purge_sys.rseg->space->id, + purge_sys.rseg->last_page_no), &mtr); - log_hdr = undo_page + rseg->last_offset; + log_hdr = undo_page + purge_sys.rseg->last_offset; /* Increase the purge page count by one for every handled log */ @@ -1194,56 +1137,53 @@ trx_purge_rseg_get_next_history_log( prev_log_addr = trx_purge_get_log_from_hist( flst_get_prev_addr(log_hdr + TRX_UNDO_HISTORY_NODE, &mtr)); - if (prev_log_addr.page == FIL_NULL) { + const bool empty = prev_log_addr.page == FIL_NULL; + + if (empty) { /* No logs left in the history list */ + purge_sys.rseg->last_page_no = FIL_NULL; + } - rseg->last_page_no = FIL_NULL; + mutex_exit(&purge_sys.rseg->mutex); + mtr.commit(); - mutex_exit(&(rseg->mutex)); - mtr_commit(&mtr); + if (empty) { return; } - mutex_exit(&rseg->mutex); - - mtr_commit(&mtr); - - /* Read the trx number and del marks from the previous log header */ - mtr_start(&mtr); + /* Read the previous log header. */ + mtr.start(); - log_hdr = trx_undo_page_get_s_latched(page_id_t(rseg->space, - prev_log_addr.page), - &mtr) + log_hdr = trx_undo_page_get_s_latched( + page_id_t(purge_sys.rseg->space->id, prev_log_addr.page), + &mtr) + prev_log_addr.boffset; trx_no = mach_read_from_8(log_hdr + TRX_UNDO_TRX_NO); - - del_marks = mach_read_from_2(log_hdr + TRX_UNDO_DEL_MARKS); + unsigned purge = mach_read_from_2(log_hdr + TRX_UNDO_NEEDS_PURGE); + ut_ad(purge <= 1); mtr_commit(&mtr); - mutex_enter(&(rseg->mutex)); - - rseg->last_page_no = prev_log_addr.page; - rseg->last_offset = prev_log_addr.boffset; - rseg->last_trx_no = trx_no; - rseg->last_del_marks = del_marks; + mutex_enter(&purge_sys.rseg->mutex); - TrxUndoRsegs elem(rseg->last_trx_no); - elem.push_back(rseg); + purge_sys.rseg->last_page_no = prev_log_addr.page; + purge_sys.rseg->last_offset = prev_log_addr.boffset; + purge_sys.rseg->set_last_trx_no(trx_no, purge != 0); + purge_sys.rseg->needs_purge = purge != 0; /* Purge can also produce events, however these are already ordered in the rollback segment and any user generated event will be greater than the events that Purge produces. ie. Purge can never produce events from an empty rollback segment. */ - mutex_enter(&purge_sys->pq_mutex); + mutex_enter(&purge_sys.pq_mutex); - purge_sys->purge_queue.push(elem); + purge_sys.purge_queue.push(*purge_sys.rseg); - mutex_exit(&purge_sys->pq_mutex); + mutex_exit(&purge_sys.pq_mutex); - mutex_exit(&rseg->mutex); + mutex_exit(&purge_sys.rseg->mutex); } /** Position the purge sys "iterator" on the undo record to use for purging. 
*/ @@ -1254,46 +1194,36 @@ trx_purge_read_undo_rec() ulint offset; ulint page_no; ib_uint64_t undo_no; - ulint undo_rseg_space; - purge_sys->hdr_offset = purge_sys->rseg->last_offset; - page_no = purge_sys->hdr_page_no = purge_sys->rseg->last_page_no; + purge_sys.hdr_offset = purge_sys.rseg->last_offset; + page_no = purge_sys.hdr_page_no = purge_sys.rseg->last_page_no; - if (purge_sys->rseg->last_del_marks) { + if (purge_sys.rseg->needs_purge) { mtr_t mtr; - trx_undo_rec_t* undo_rec = NULL; + mtr.start(); + if (trx_undo_rec_t* undo_rec = trx_undo_get_first_rec( + purge_sys.rseg->space, purge_sys.hdr_page_no, + purge_sys.hdr_offset, RW_S_LATCH, &mtr)) { - mtr_start(&mtr); - - undo_rec = trx_undo_get_first_rec( - purge_sys->rseg->space, - purge_sys->hdr_page_no, - purge_sys->hdr_offset, RW_S_LATCH, &mtr); - - if (undo_rec != NULL) { offset = page_offset(undo_rec); undo_no = trx_undo_rec_get_undo_no(undo_rec); - undo_rseg_space = purge_sys->rseg->space; page_no = page_get_page_no(page_align(undo_rec)); } else { offset = 0; undo_no = 0; - undo_rseg_space = ULINT_UNDEFINED; } - mtr_commit(&mtr); + mtr.commit(); } else { offset = 0; undo_no = 0; - undo_rseg_space = ULINT_UNDEFINED; } - purge_sys->offset = offset; - purge_sys->page_no = page_no; - purge_sys->iter.undo_no = undo_no; - purge_sys->iter.undo_rseg_space = undo_rseg_space; + purge_sys.offset = offset; + purge_sys.page_no = page_no; + purge_sys.tail.undo_no = undo_no; - purge_sys->next_stored = true; + purge_sys.next_stored = true; } /***********************************************************************//** @@ -1306,9 +1236,9 @@ void trx_purge_choose_next_log(void) /*===========================*/ { - ut_ad(!purge_sys->next_stored); + ut_ad(!purge_sys.next_stored); - if (purge_sys->rseg_iter.set_next()) { + if (purge_sys.rseg_iter.set_next()) { trx_purge_read_undo_rec(); } else { /* There is nothing to do yet. 
*/ @@ -1337,19 +1267,18 @@ trx_purge_get_next_rec( ulint space; mtr_t mtr; - ut_ad(purge_sys->next_stored); - ut_ad(purge_sys->iter.trx_no < purge_sys->view.low_limit_no()); + ut_ad(purge_sys.next_stored); + ut_ad(purge_sys.tail.trx_no() < purge_sys.view.low_limit_no()); - space = purge_sys->rseg->space; - page_no = purge_sys->page_no; - offset = purge_sys->offset; + space = purge_sys.rseg->space->id; + page_no = purge_sys.page_no; + offset = purge_sys.offset; if (offset == 0) { /* It is the dummy undo log record, which means that there is no need to purge this undo log */ - trx_purge_rseg_get_next_history_log( - purge_sys->rseg, n_pages_handled); + trx_purge_rseg_get_next_history_log(n_pages_handled); /* Look for the next undo log and record to purge */ @@ -1365,52 +1294,18 @@ trx_purge_get_next_rec( rec = undo_page + offset; - rec2 = rec; - - for (;;) { - ulint type; - trx_undo_rec_t* next_rec; - ulint cmpl_info; - - /* Try first to find the next record which requires a purge - operation from the same page of the same undo log */ - - next_rec = trx_undo_page_get_next_rec( - rec2, purge_sys->hdr_page_no, purge_sys->hdr_offset); - - if (next_rec == NULL) { - rec2 = trx_undo_get_next_rec( - rec2, purge_sys->hdr_page_no, - purge_sys->hdr_offset, &mtr); - break; - } - - rec2 = next_rec; - - type = trx_undo_rec_get_type(rec2); - - if (type == TRX_UNDO_DEL_MARK_REC) { - - break; - } - - cmpl_info = trx_undo_rec_get_cmpl_info(rec2); - - if (trx_undo_rec_get_extern_storage(rec2)) { - break; - } + rec2 = trx_undo_page_get_next_rec(rec, purge_sys.hdr_page_no, + purge_sys.hdr_offset); - if ((type == TRX_UNDO_UPD_EXIST_REC) - && !(cmpl_info & UPD_NODE_NO_ORD_CHANGE)) { - break; - } + if (rec2 == NULL) { + rec2 = trx_undo_get_next_rec(rec, purge_sys.hdr_page_no, + purge_sys.hdr_offset, &mtr); } if (rec2 == NULL) { mtr_commit(&mtr); - trx_purge_rseg_get_next_history_log( - purge_sys->rseg, n_pages_handled); + trx_purge_rseg_get_next_history_log(n_pages_handled); /* Look for the next undo log and record to purge */ @@ -1425,10 +1320,9 @@ trx_purge_get_next_rec( } else { page = page_align(rec2); - purge_sys->offset = rec2 - page; - purge_sys->page_no = page_get_page_no(page); - purge_sys->iter.undo_no = trx_undo_rec_get_undo_no(rec2); - purge_sys->iter.undo_rseg_space = space; + purge_sys.offset = ulint(rec2 - page); + purge_sys.page_no = page_get_page_no(page); + purge_sys.tail.undo_no = trx_undo_rec_get_undo_no(rec2); if (undo_page != page) { /* We advance to a new page of the undo log: */ @@ -1457,17 +1351,17 @@ trx_purge_fetch_next_rec( handled */ mem_heap_t* heap) /*!< in: memory heap where copied */ { - if (!purge_sys->next_stored) { + if (!purge_sys.next_stored) { trx_purge_choose_next_log(); - if (!purge_sys->next_stored) { + if (!purge_sys.next_stored) { DBUG_PRINT("ib_purge", ("no logs left in the history list")); return(NULL); } } - if (purge_sys->iter.trx_no >= purge_sys->view.low_limit_no()) { + if (purge_sys.tail.trx_no() >= purge_sys.view.low_limit_no()) { return(NULL); } @@ -1476,8 +1370,11 @@ trx_purge_fetch_next_rec( os_thread_get_curr_id(), iter->trx_no, iter->undo_no); */ *roll_ptr = trx_undo_build_roll_ptr( - FALSE, purge_sys->rseg->id, - purge_sys->page_no, purge_sys->offset); + /* row_purge_record_func() will later set + ROLL_PTR_INSERT_FLAG for TRX_UNDO_INSERT_REC */ + false, + purge_sys.rseg->id, + purge_sys.page_no, purge_sys.offset); /* The following call will advance the stored values of the purge iterator. 
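For orientation, trx_undo_build_roll_ptr() above packs the insert flag, the
rollback segment id, the undo page number and the byte offset into the 7-byte
DATA_ROLL_PTR value.  Its body is not part of this hunk; the sketch below
assumes the customary InnoDB bit layout (1 + 7 + 32 + 16 bits) and is only an
illustration, not the patch's code:

	#include <cstdint>

	typedef uint64_t roll_ptr_t;

	/* Assumed layout: bit 55 = insert flag, bits 48..54 = rseg id,
	bits 16..47 = undo page number, bits 0..15 = offset in page. */
	inline roll_ptr_t build_roll_ptr(bool is_insert, unsigned rseg_id,
					 uint32_t page_no, uint16_t offset)
	{
		return roll_ptr_t(is_insert) << 55
			| roll_ptr_t(rseg_id & 0x7f) << 48
			| roll_ptr_t(page_no) << 16
			| offset;
	}

	/* Example: build_roll_ptr(false, 3, 42, 120) identifies the undo
	record at byte offset 120 of undo page 42 in rollback segment 3. */

The diff resumes below.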
*/ @@ -1485,30 +1382,26 @@ trx_purge_fetch_next_rec( return(trx_purge_get_next_rec(n_pages_handled, heap)); } -/*******************************************************************//** -This function runs a purge batch. +/** Run a purge batch. +@param n_purge_threads number of purge threads @return number of undo log pages handled in the batch */ static ulint -trx_purge_attach_undo_recs( -/*=======================*/ - ulint n_purge_threads,/*!< in: number of purge threads */ - purge_sys_t* purge_sys, /*!< in/out: purge instance */ - ulint batch_size) /*!< in: no. of pages to purge */ +trx_purge_attach_undo_recs(ulint n_purge_threads) { que_thr_t* thr; ulint i; ulint n_pages_handled = 0; - ulint n_thrs = UT_LIST_GET_LEN(purge_sys->query->thrs); + ulint n_thrs = UT_LIST_GET_LEN(purge_sys.query->thrs); ut_a(n_purge_threads > 0); - purge_sys->limit = purge_sys->iter; + purge_sys.head = purge_sys.tail; #ifdef UNIV_DEBUG i = 0; /* Debug code to validate some pre-requisites and reset done flag. */ - for (thr = UT_LIST_GET_FIRST(purge_sys->query->thrs); + for (thr = UT_LIST_GET_FIRST(purge_sys.query->thrs); thr != NULL && i < n_purge_threads; thr = UT_LIST_GET_NEXT(thrs, thr), ++i) { @@ -1530,13 +1423,15 @@ trx_purge_attach_undo_recs( /* Fetch and parse the UNDO records. The UNDO records are added to a per purge node vector. */ - thr = UT_LIST_GET_FIRST(purge_sys->query->thrs); + thr = UT_LIST_GET_FIRST(purge_sys.query->thrs); ut_a(n_thrs > 0 && thr != NULL); - ut_ad(trx_purge_check_limit()); + ut_ad(purge_sys.head <= purge_sys.tail); i = 0; + const ulint batch_size = srv_purge_batch_size; + while (UNIV_LIKELY(srv_undo_sources) || !srv_fast_shutdown) { purge_node_t* node; trx_purge_rec_t* purge_rec; @@ -1553,11 +1448,11 @@ trx_purge_attach_undo_recs( /* Track the max {trx_id, undo_no} for truncating the UNDO logs once we have purged the records. */ - if (trx_purge_check_limit()) { - purge_sys->limit = purge_sys->iter; + if (purge_sys.head <= purge_sys.tail) { + purge_sys.head = purge_sys.tail; } - /* Fetch the next record, and advance the purge_sys->iter. */ + /* Fetch the next record, and advance the purge_sys.tail. */ purge_rec->undo_rec = trx_purge_fetch_next_rec( &purge_rec->roll_ptr, &n_pages_handled, node->heap); @@ -1585,13 +1480,13 @@ trx_purge_attach_undo_recs( thr = UT_LIST_GET_NEXT(thrs, thr); if (!(++i % n_purge_threads)) { - thr = UT_LIST_GET_FIRST(purge_sys->query->thrs); + thr = UT_LIST_GET_FIRST(purge_sys.query->thrs); } ut_a(thr != NULL); } - ut_ad(trx_purge_check_limit()); + ut_ad(purge_sys.head <= purge_sys.tail); return(n_pages_handled); } @@ -1611,12 +1506,12 @@ trx_purge_dml_delay(void) /* If purge lag is set (ie. > 0) then calculate the new DML delay. Note: we do a dirty read of the trx_sys_t data structure here, - without holding trx_sys->mutex. */ + without holding trx_sys.mutex. */ if (srv_max_purge_lag > 0) { float ratio; - ratio = float(trx_sys->rseg_history_len) / srv_max_purge_lag; + ratio = float(trx_sys.history_size()) / srv_max_purge_lag; if (ratio > 1.0) { /* If the history list length exceeds the @@ -1636,18 +1531,14 @@ trx_purge_dml_delay(void) return(delay); } -/*******************************************************************//** -Wait for pending purge jobs to complete. */ +/** Wait for pending purge jobs to complete. 
*/ static void -trx_purge_wait_for_workers_to_complete( -/*===================================*/ - purge_sys_t* purge_sys) /*!< in: purge instance */ +trx_purge_wait_for_workers_to_complete() { - ulint n_submitted = purge_sys->n_submitted; - /* Ensure that the work queue empties out. */ - while ((ulint) my_atomic_loadlint(&purge_sys->n_completed) != n_submitted) { + while (my_atomic_loadlint(&purge_sys.n_completed) + != purge_sys.n_submitted) { if (srv_get_task_queue_length() > 0) { srv_release_threads(SRV_WORKER, 1); @@ -1656,9 +1547,6 @@ trx_purge_wait_for_workers_to_complete( os_thread_yield(); } - /* None of the worker threads should be doing any work. */ - ut_a(purge_sys->n_submitted == purge_sys->n_completed); - /* There should be no outstanding tasks as long as the worker threads are active. */ ut_a(srv_get_task_queue_length() == 0); @@ -1672,8 +1560,6 @@ trx_purge( /*======*/ ulint n_purge_threads, /*!< in: number of purge tasks to submit to the work queue */ - ulint batch_size, /*!< in: the maximum number of records - to purge in one batch */ bool truncate /*!< in: truncate history if true */ #ifdef UNIV_DEBUG , srv_slot_t *slot /*!< in/out: purge coordinator @@ -1689,11 +1575,12 @@ trx_purge( srv_dml_needed_delay = trx_purge_dml_delay(); /* The number of tasks submitted should be completed. */ - ut_a(purge_sys->n_submitted == purge_sys->n_completed); + ut_a(purge_sys.n_submitted + == my_atomic_loadlint(&purge_sys.n_completed)); - rw_lock_x_lock(&purge_sys->latch); - trx_sys->mvcc->clone_oldest_view(&purge_sys->view); - rw_lock_x_unlock(&purge_sys->latch); + rw_lock_x_lock(&purge_sys.latch); + trx_sys.clone_oldest_view(); + rw_lock_x_unlock(&purge_sys.latch); #ifdef UNIV_DEBUG if (srv_purge_view_update_only_debug) { @@ -1702,67 +1589,32 @@ trx_purge( #endif /* UNIV_DEBUG */ /* Fetch the UNDO recs that need to be purged. */ - n_pages_handled = trx_purge_attach_undo_recs( - n_purge_threads, purge_sys, batch_size); - - /* Do we do an asynchronous purge or not ? */ - if (n_purge_threads > 1) { - ulint i = 0; - - /* Submit the tasks to the work queue. */ - for (i = 0; i < n_purge_threads - 1; ++i) { - thr = que_fork_scheduler_round_robin( - purge_sys->query, thr); - - ut_a(thr != NULL); - - srv_que_task_enqueue_low(thr); - } - - thr = que_fork_scheduler_round_robin(purge_sys->query, thr); - ut_a(thr != NULL); - - purge_sys->n_submitted += n_purge_threads - 1; - - goto run_synchronously; - - /* Do it synchronously. */ - } else { - thr = que_fork_scheduler_round_robin(purge_sys->query, NULL); - ut_ad(thr); + n_pages_handled = trx_purge_attach_undo_recs(n_purge_threads); + purge_sys.n_submitted += n_purge_threads; + + /* Submit tasks to workers queue if using multi-threaded purge. 
*/ + for (ulint i = n_purge_threads; --i; ) { + thr = que_fork_scheduler_round_robin(purge_sys.query, thr); + ut_a(thr); + srv_que_task_enqueue_low(thr); + } -run_synchronously: - ++purge_sys->n_submitted; + thr = que_fork_scheduler_round_robin(purge_sys.query, thr); - ut_d(thr->thread_slot = slot); - que_run_threads(thr); + ut_d(thr->thread_slot = slot); + que_run_threads(thr); - my_atomic_addlint( - &purge_sys->n_completed, 1); + my_atomic_addlint(&purge_sys.n_completed, 1); - if (n_purge_threads > 1) { - trx_purge_wait_for_workers_to_complete(purge_sys); - } + if (n_purge_threads > 1) { + trx_purge_wait_for_workers_to_complete(); } - ut_a(purge_sys->n_submitted == purge_sys->n_completed); - -#ifdef UNIV_DEBUG - rw_lock_x_lock(&purge_sys->latch); - if (purge_sys->limit.trx_no == 0) { - purge_sys->done = purge_sys->iter; - } else { - purge_sys->done = purge_sys->limit; - } - rw_lock_x_unlock(&purge_sys->latch); -#endif /* UNIV_DEBUG */ + ut_a(purge_sys.n_submitted + == my_atomic_loadlint(&purge_sys.n_completed)); if (truncate) { - trx_purge_truncate_history( - purge_sys->limit.trx_no - ? &purge_sys->limit - : &purge_sys->iter, - &purge_sys->view); + trx_purge_truncate_history(); } MONITOR_INC_VALUE(MONITOR_PURGE_INVOKED, 1); @@ -1771,111 +1623,63 @@ run_synchronously: return(n_pages_handled); } -/*******************************************************************//** -Get the purge state. -@return purge state. */ -purge_state_t -trx_purge_state(void) -/*=================*/ +/** Stop purge during FLUSH TABLES FOR EXPORT */ +void purge_sys_t::stop() { - purge_state_t state; - - rw_lock_x_lock(&purge_sys->latch); - - state = purge_sys->state; - - rw_lock_x_unlock(&purge_sys->latch); - - return(state); + rw_lock_x_lock(&latch); + + if (!enabled_latched()) + { + /* Shutdown must have been initiated during FLUSH TABLES FOR EXPORT. */ + ut_ad(!srv_undo_sources); + rw_lock_x_unlock(&latch); + return; + } + + ut_ad(srv_n_purge_threads > 0); + + if (0 == my_atomic_add32_explicit(&m_paused, 1, MY_MEMORY_ORDER_RELAXED)) + { + /* We need to wakeup the purge thread in case it is suspended, so + that it can acknowledge the state change. */ + const int64_t sig_count = os_event_reset(event); + rw_lock_x_unlock(&latch); + ib::info() << "Stopping purge"; + srv_purge_wakeup(); + /* Wait for purge coordinator to signal that it is suspended. */ + os_event_wait_low(event, sig_count); + MONITOR_ATOMIC_INC(MONITOR_PURGE_STOP_COUNT); + return; + } + + rw_lock_x_unlock(&latch); + + if (running()) + { + ib::info() << "Waiting for purge to stop"; + while (running()) + os_thread_sleep(10000); + } } -/*******************************************************************//** -Stop purge and wait for it to stop, move to PURGE_STATE_STOP. */ -void -trx_purge_stop(void) -/*================*/ +/** Resume purge at UNLOCK TABLES after FLUSH TABLES FOR EXPORT */ +void purge_sys_t::resume() { - rw_lock_x_lock(&purge_sys->latch); - - switch (purge_sys->state) { - case PURGE_STATE_INIT: - case PURGE_STATE_DISABLED: - ut_error; - case PURGE_STATE_EXIT: - /* Shutdown must have been initiated during - FLUSH TABLES FOR EXPORT. 
*/ - ut_ad(!srv_undo_sources); -unlock: - rw_lock_x_unlock(&purge_sys->latch); - break; - case PURGE_STATE_STOP: - ut_ad(srv_n_purge_threads > 0); - ++purge_sys->n_stop; - purge_sys->state = PURGE_STATE_STOP; - if (!purge_sys->running) { - goto unlock; - } - ib::info() << "Waiting for purge to stop"; - do { - rw_lock_x_unlock(&purge_sys->latch); - os_thread_sleep(10000); - rw_lock_x_lock(&purge_sys->latch); - } while (purge_sys->running); - goto unlock; - case PURGE_STATE_RUN: - ut_ad(srv_n_purge_threads > 0); - ++purge_sys->n_stop; - ib::info() << "Stopping purge"; - - /* We need to wakeup the purge thread in case it is suspended, - so that it can acknowledge the state change. */ - - const int64_t sig_count = os_event_reset(purge_sys->event); - purge_sys->state = PURGE_STATE_STOP; - rw_lock_x_unlock(&purge_sys->latch); - srv_purge_wakeup(); - /* Wait for purge coordinator to signal that it - is suspended. */ - os_event_wait_low(purge_sys->event, sig_count); - } - - MONITOR_INC_VALUE(MONITOR_PURGE_STOP_COUNT, 1); -} - -/*******************************************************************//** -Resume purge, move to PURGE_STATE_RUN. */ -void -trx_purge_run(void) -/*===============*/ -{ - rw_lock_x_lock(&purge_sys->latch); - - switch (purge_sys->state) { - case PURGE_STATE_EXIT: - /* Shutdown must have been initiated during - FLUSH TABLES FOR EXPORT. */ - ut_ad(!srv_undo_sources); - break; - case PURGE_STATE_INIT: - case PURGE_STATE_DISABLED: - ut_error; - - case PURGE_STATE_RUN: - ut_a(!purge_sys->n_stop); - break; - case PURGE_STATE_STOP: - ut_a(purge_sys->n_stop); - if (--purge_sys->n_stop == 0) { - - ib::info() << "Resuming purge"; - - purge_sys->state = PURGE_STATE_RUN; - } - - MONITOR_INC_VALUE(MONITOR_PURGE_RESUME_COUNT, 1); - } - - rw_lock_x_unlock(&purge_sys->latch); - - srv_purge_wakeup(); + if (!enabled()) + { + /* Shutdown must have been initiated during FLUSH TABLES FOR EXPORT. */ + ut_ad(!srv_undo_sources); + return; + } + + int32_t paused= my_atomic_add32_explicit(&m_paused, -1, + MY_MEMORY_ORDER_RELAXED); + ut_a(paused); + + if (paused == 1) + { + ib::info() << "Resuming purge"; + srv_purge_wakeup(); + MONITOR_ATOMIC_INC(MONITOR_PURGE_RESUME_COUNT); + } } diff --git a/storage/innobase/trx/trx0rec.cc b/storage/innobase/trx/trx0rec.cc index e3e1c33b305..9c7106facaf 100644 --- a/storage/innobase/trx/trx0rec.cc +++ b/storage/innobase/trx/trx0rec.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2019, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2020, MariaDB Corporation. +Copyright (c) 2017, 2021, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -31,7 +31,6 @@ Created 3/26/1996 Heikki Tuuri #include "mtr0log.h" #include "dict0dict.h" #include "ut0mem.h" -#include "read0read.h" #include "row0ext.h" #include "row0upd.h" #include "que0que.h" @@ -40,61 +39,77 @@ Created 3/26/1996 Heikki Tuuri #include "row0row.h" #include "row0mysql.h" +/** The search tuple corresponding to TRX_UNDO_INSERT_METADATA */ +const dtuple_t trx_undo_metadata = { + REC_INFO_METADATA, 0, 0, + NULL, 0, NULL +#ifdef UNIV_DEBUG + , DATA_TUPLE_MAGIC_N +#endif /* UNIV_DEBUG */ +}; + /*=========== UNDO LOG RECORD CREATION AND DECODING ====================*/ -/**********************************************************************//** -Writes the mtr log entry of the inserted undo log record on the undo log -page. */ -UNIV_INLINE -void -trx_undof_page_add_undo_rec_log( -/*============================*/ - page_t* undo_page, /*!< in: undo log page */ - ulint old_free, /*!< in: start offset of the inserted entry */ - ulint new_free, /*!< in: end offset of the entry */ - mtr_t* mtr) /*!< in: mtr */ +/** Write redo log of writing an undo log record. +@param[in] undo_block undo log page +@param[in] old_free start offset of the undo log record +@param[in] new_free end offset of the undo log record +@param[in,out] mtr mini-transaction */ +static void trx_undof_page_add_undo_rec_log(const buf_block_t* undo_block, + ulint old_free, ulint new_free, + mtr_t* mtr) { - byte* log_ptr; - const byte* log_end; - ulint len; - - log_ptr = mlog_open(mtr, 11 + 13 + MLOG_BUF_MARGIN); - - if (log_ptr == NULL) { - + ut_ad(old_free >= TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE); + ut_ad(new_free >= old_free); + ut_ad(new_free < srv_page_size); + ut_ad(mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE + + undo_block->frame) + == new_free); + mtr->set_modified(); + switch (mtr->get_log_mode()) { + case MTR_LOG_NONE: + case MTR_LOG_NO_REDO: return; + case MTR_LOG_SHORT_INSERTS: + ut_ad(0); + /* fall through */ + case MTR_LOG_ALL: + break; } - log_end = &log_ptr[11 + 13 + MLOG_BUF_MARGIN]; - log_ptr = mlog_write_initial_log_record_fast( - undo_page, MLOG_UNDO_INSERT, log_ptr, mtr); - len = new_free - old_free - 4; - + const uint32_t + len = uint32_t(new_free - old_free - 4), + reserved = std::min<uint32_t>(11 + 13 + len, + mtr->get_log()->MAX_DATA_SIZE); + byte* log_ptr = mtr->get_log()->open(reserved); + const byte* log_end = log_ptr + reserved; + log_ptr = mlog_write_initial_log_record_low( + MLOG_UNDO_INSERT, + undo_block->page.id.space(), undo_block->page.id.page_no(), + log_ptr, mtr); mach_write_to_2(log_ptr, len); - log_ptr += 2; - - if (log_ptr + len <= log_end) { - memcpy(log_ptr, undo_page + old_free + 2, len); - mlog_close(mtr, log_ptr + len); + if (log_ptr + 2 + len <= log_end) { + memcpy(log_ptr + 2, undo_block->frame + old_free + 2, len); + mlog_close(mtr, log_ptr + 2 + len); } else { - mlog_close(mtr, log_ptr); - mlog_catenate_string(mtr, undo_page + old_free + 2, len); + mlog_close(mtr, log_ptr + 2); + mtr->get_log()->push(undo_block->frame + old_free + 2, len); } } -/***********************************************************//** -Parses a redo log record of adding an undo log record. -@return end of log record or NULL */ +/** Parse MLOG_UNDO_INSERT. 
+@param[in] ptr log record +@param[in] end_ptr end of log record buffer +@param[in,out] page page or NULL +@return end of log record +@retval NULL if the log record is incomplete */ byte* trx_undo_parse_add_undo_rec( -/*========================*/ - byte* ptr, /*!< in: buffer */ - byte* end_ptr,/*!< in: buffer end */ - page_t* page) /*!< in: page or NULL */ + const byte* ptr, + const byte* end_ptr, + page_t* page) { ulint len; - byte* rec; - ulint first_free; if (end_ptr < ptr + 2) { @@ -109,35 +124,33 @@ trx_undo_parse_add_undo_rec( return(NULL); } - if (page == NULL) { - - return(ptr + len); - } - - first_free = mach_read_from_2(page + TRX_UNDO_PAGE_HDR - + TRX_UNDO_PAGE_FREE); - rec = page + first_free; + if (page) { + ulint first_free = mach_read_from_2(page + TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_FREE); + byte* rec = page + first_free; - mach_write_to_2(rec, first_free + 4 + len); - mach_write_to_2(rec + 2 + len, first_free); + mach_write_to_2(rec, first_free + 4 + len); + mach_write_to_2(rec + 2 + len, first_free); - mach_write_to_2(page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE, - first_free + 4 + len); - ut_memcpy(rec + 2, ptr, len); + mach_write_to_2(page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE, + first_free + 4 + len); + memcpy(rec + 2, ptr, len); + } - return(ptr + len); + return(const_cast<byte*>(ptr + len)); } /** Calculate the free space left for extending an undo log record. -@param page undo log page +@param block undo log page @param ptr current end of the undo page @return bytes left */ -static ulint trx_undo_left(const page_t *page, const byte *ptr) +static ulint trx_undo_left(const buf_block_t *undo_block, const byte *ptr) { - ut_ad(ptr >= &page[TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE]); + ut_ad(ptr >= &undo_block->frame[TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE]); /* The 10 is supposed to be an extra safety margin (and needed for compatibility with older versions) */ - lint left= srv_page_size - (ptr - page) - (10 + FIL_PAGE_DATA_END); + lint left= srv_page_size - (ptr - undo_block->frame) - + (10 + FIL_PAGE_DATA_END); ut_ad(left >= 0); return left < 0 ? 0 : static_cast<ulint>(left); } @@ -151,7 +164,7 @@ static ulint trx_undo_page_set_next_prev_and_add( /*================================*/ - page_t* undo_page, /*!< in/out: undo log page */ + buf_block_t* undo_block, /*!< in/out: undo log page */ byte* ptr, /*!< in: ptr up to where data has been written on this undo page. 
*/ mtr_t* mtr) /*!< in: mtr */ @@ -163,15 +176,15 @@ trx_undo_page_set_next_prev_and_add( that points to the next free offset value within undo_page.*/ - ut_ad(ptr > undo_page); - ut_ad(ptr < undo_page + UNIV_PAGE_SIZE); - - if (UNIV_UNLIKELY(trx_undo_left(undo_page, ptr) < 2)) { + ut_ad(ptr > undo_block->frame); + ut_ad(ptr < undo_block->frame + srv_page_size); + if (UNIV_UNLIKELY(trx_undo_left(undo_block, ptr) < 2)) { return(0); } - ptr_to_first_free = undo_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE; + ptr_to_first_free = TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE + + undo_block->frame; first_free = mach_read_from_2(ptr_to_first_free); @@ -179,16 +192,16 @@ trx_undo_page_set_next_prev_and_add( mach_write_to_2(ptr, first_free); ptr += 2; - end_of_rec = ptr - undo_page; + end_of_rec = ulint(ptr - undo_block->frame); /* Write offset of the next undo log record */ - mach_write_to_2(undo_page + first_free, end_of_rec); + mach_write_to_2(undo_block->frame + first_free, end_of_rec); /* Update the offset to first free undo record */ mach_write_to_2(ptr_to_first_free, end_of_rec); /* Write this log entry to the UNDO log */ - trx_undof_page_add_undo_rec_log(undo_page, first_free, + trx_undof_page_add_undo_rec_log(undo_block, first_free, end_of_rec, mtr); return(first_free); @@ -200,7 +213,7 @@ static const ulint VIRTUAL_COL_UNDO_FORMAT_1 = 0xF1; /** Write virtual column index info (index id and column position in index) to the undo log -@param[in,out] undo_page undo log page +@param[in,out] undo_block undo log page @param[in] table the table @param[in] pos the virtual column position @param[in] ptr undo log record being written @@ -210,7 +223,7 @@ to the undo log static byte* trx_undo_log_v_idx( - page_t* undo_page, + buf_block_t* undo_block, const dict_table_t* table, ulint pos, byte* ptr, @@ -229,7 +242,7 @@ trx_undo_log_v_idx( 1 byte for undo log record format version marker */ ulint size = n_idx * (5 + 5) + 5 + 2 + (first_v_col ? 1 : 0); - if (trx_undo_left(undo_page, ptr) < size) { + if (trx_undo_left(undo_block, ptr) < size) { return(NULL); } @@ -258,7 +271,7 @@ trx_undo_log_v_idx( ptr += mach_write_compressed(ptr, v_index.nth_field); } - mach_write_to_2(old_ptr, ptr - old_ptr); + mach_write_to_2(old_ptr, ulint(ptr - old_ptr)); return(ptr); } @@ -302,7 +315,7 @@ trx_undo_read_v_idx_low( if (index->id == id) { const dict_col_t* col = dict_index_get_nth_col( index, pos); - ut_ad(dict_col_is_virtual(col)); + ut_ad(col->is_virtual()); const dict_v_col_t* vcol = reinterpret_cast< const dict_v_col_t*>(col); *col_pos = vcol->v_pos; @@ -359,7 +372,7 @@ trx_undo_read_v_idx( } /** Reports in the undo log of an insert of virtual columns. 
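To make the record framing handled by trx_undo_parse_add_undo_rec() and
trx_undo_page_set_next_prev_and_add() above easier to follow: each undo record
on the page is a 2-byte offset of the next record, the payload, and a 2-byte
back pointer to the record's start, with TRX_UNDO_PAGE_FREE advanced past the
whole thing.  A minimal, self-contained sketch of that append step follows; it
is an illustration only, not code from the patch, and it omits the
trx_undo_left() space check and the MLOG_UNDO_INSERT redo logging that the real
functions perform:

	#include <cstdint>
	#include <cstring>

	/* Big-endian 16-bit store, as mach_write_to_2() does. */
	static void write2(uint8_t* b, uint16_t v)
	{
		b[0] = uint8_t(v >> 8);
		b[1] = uint8_t(v);
	}

	/* Append one record to a page image whose current TRX_UNDO_PAGE_FREE
	value is first_free; returns the new first_free, which the caller
	would store back into the page header. */
	static uint16_t append_undo_rec(uint8_t* page, uint16_t first_free,
					const uint8_t* payload, uint16_t len)
	{
		uint8_t*	rec = page + first_free;
		const uint16_t	next = uint16_t(first_free + 4 + len);

		write2(rec, next);			/* offset of next record */
		memcpy(rec + 2, payload, len);		/* record body */
		write2(rec + 2 + len, first_free);	/* back pointer to start */
		return next;
	}

The diff resumes below.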
-@param[in] undo_page undo log page +@param[in] undo_block undo log page @param[in] table the table @param[in] row dtuple contains the virtual columns @param[in,out] ptr log ptr @@ -367,7 +380,7 @@ trx_undo_read_v_idx( static bool trx_undo_report_insert_virtual( - page_t* undo_page, + buf_block_t* undo_block, dict_table_t* table, const dtuple_t* row, byte** ptr) @@ -375,7 +388,7 @@ trx_undo_report_insert_virtual( byte* start = *ptr; bool first_v_col = true; - if (trx_undo_left(undo_page, *ptr) < 2) { + if (trx_undo_left(undo_block, *ptr) < 2) { return(false); } @@ -392,7 +405,7 @@ trx_undo_report_insert_virtual( if (col->m_col.ord_part) { /* make sure enought space to write the length */ - if (trx_undo_left(undo_page, *ptr) < 5) { + if (trx_undo_left(undo_block, *ptr) < 5) { return(false); } @@ -400,7 +413,7 @@ trx_undo_report_insert_virtual( pos += REC_MAX_N_FIELDS; *ptr += mach_write_compressed(*ptr, pos); - *ptr = trx_undo_log_v_idx(undo_page, table, + *ptr = trx_undo_log_v_idx(undo_block, table, col_no, *ptr, first_v_col); first_v_col = false; @@ -412,7 +425,7 @@ trx_undo_report_insert_virtual( row, col->v_pos); switch (ulint flen = vfield->len) { case 0: case UNIV_SQL_NULL: - if (trx_undo_left(undo_page, *ptr) < 5) { + if (trx_undo_left(undo_block, *ptr) < 5) { return(false); } @@ -427,8 +440,8 @@ trx_undo_report_insert_virtual( flen = max_len; } - if (trx_undo_left(undo_page, *ptr) < flen + 5) { - + if (trx_undo_left(undo_block, *ptr) + < flen + 5) { return(false); } *ptr += mach_write_compressed(*ptr, flen); @@ -440,7 +453,7 @@ trx_undo_report_insert_virtual( } /* Always mark the end of the log with 2 bytes length field */ - mach_write_to_2(start, *ptr - start); + mach_write_to_2(start, ulint(*ptr - start)); return(true); } @@ -452,7 +465,7 @@ static ulint trx_undo_page_report_insert( /*========================*/ - page_t* undo_page, /*!< in: undo log page */ + buf_block_t* undo_block, /*!< in: undo log page */ trx_t* trx, /*!< in: transaction */ dict_index_t* index, /*!< in: clustered index */ const dtuple_t* clust_entry, /*!< in: index entry which will be @@ -464,19 +477,21 @@ trx_undo_page_report_insert( ulint i; ut_ad(dict_index_is_clust(index)); - ut_ad(mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR - + TRX_UNDO_PAGE_TYPE) == TRX_UNDO_INSERT); - - first_free = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR - + TRX_UNDO_PAGE_FREE); - ptr = undo_page + first_free; + /* MariaDB 10.3.1+ in trx_undo_page_init() always initializes + TRX_UNDO_PAGE_TYPE as 0, but previous versions wrote + TRX_UNDO_INSERT == 1 into insert_undo pages, + or TRX_UNDO_UPDATE == 2 into update_undo pages. 
*/ + ut_ad(mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE + + undo_block->frame) <= 2); - ut_ad(first_free <= UNIV_PAGE_SIZE); + first_free = mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE + + undo_block->frame); + ptr = undo_block->frame + first_free; - if (trx_undo_left(undo_page, ptr) < 2 + 1 + 11 + 11) { + ut_ad(first_free <= srv_page_size); + if (trx_undo_left(undo_block, ptr) < 2 + 1 + 11 + 11) { /* Not enough space for writing the general parameters */ - return(0); } @@ -490,13 +505,21 @@ trx_undo_page_report_insert( /*----------------------------------------*/ /* Store then the fields required to uniquely determine the record to be inserted in the clustered index */ + if (UNIV_UNLIKELY(clust_entry->info_bits != 0)) { + ut_ad(clust_entry->info_bits == REC_INFO_METADATA); + ut_ad(index->is_instant()); + ut_ad(undo_block->frame[first_free + 2] + == TRX_UNDO_INSERT_REC); + undo_block->frame[first_free + 2] = TRX_UNDO_INSERT_METADATA; + goto done; + } for (i = 0; i < dict_index_get_n_unique(index); i++) { const dfield_t* field = dtuple_get_nth_field(clust_entry, i); ulint flen = dfield_get_len(field); - if (trx_undo_left(undo_page, ptr) < 5) { + if (trx_undo_left(undo_block, ptr) < 5) { return(0); } @@ -507,7 +530,7 @@ trx_undo_page_report_insert( case 0: case UNIV_SQL_NULL: break; default: - if (trx_undo_left(undo_page, ptr) < flen) { + if (trx_undo_left(undo_block, ptr) < flen) { return(0); } @@ -519,12 +542,13 @@ trx_undo_page_report_insert( if (index->table->n_v_cols) { if (!trx_undo_report_insert_virtual( - undo_page, index->table, clust_entry, &ptr)) { + undo_block, index->table, clust_entry, &ptr)) { return(0); } } - return(trx_undo_page_set_next_prev_and_add(undo_page, ptr, mtr)); +done: + return(trx_undo_page_set_next_prev_and_add(undo_block, ptr, mtr)); } /**********************************************************************//** @@ -596,7 +620,7 @@ trx_undo_rec_get_col_val( ut_ad(*len >= BTR_EXTERN_FIELD_REF_SIZE); /* we do not have access to index->table here - ut_ad(dict_table_get_format(index->table) >= UNIV_FORMAT_B + ut_ad(dict_table_has_atomic_blobs(index->table) || *len >= col->max_prefix + BTR_EXTERN_FIELD_REF_SIZE); */ @@ -629,7 +653,7 @@ trx_undo_rec_get_row_ref( used, as we do NOT copy the data in the record! */ dict_index_t* index, /*!< in: clustered index */ - dtuple_t** ref, /*!< out, own: row reference */ + const dtuple_t**ref, /*!< out, own: row reference */ mem_heap_t* heap) /*!< in: memory heap from which the memory needed is allocated */ { @@ -641,17 +665,17 @@ trx_undo_rec_get_row_ref( ref_len = dict_index_get_n_unique(index); - *ref = dtuple_create(heap, ref_len); + dtuple_t* tuple = dtuple_create(heap, ref_len); + *ref = tuple; - dict_index_copy_types(*ref, index, ref_len); + dict_index_copy_types(tuple, index, ref_len); for (i = 0; i < ref_len; i++) { - dfield_t* dfield; const byte* field; ulint len; ulint orig_len; - dfield = dtuple_get_nth_field(*ref, i); + dfield_t* dfield = dtuple_get_nth_field(tuple, i); ptr = trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len); @@ -761,7 +785,7 @@ trx_undo_page_report_modify_ext( } /* Encode spatial status into length. 
*/ - spatial_len |= spatial_status << SPATIAL_STATUS_SHIFT; + spatial_len |= ulint(spatial_status) << SPATIAL_STATUS_SHIFT; if (spatial_status == SPATIAL_ONLY) { /* If the column is only used by gis index, log its @@ -840,7 +864,7 @@ static ulint trx_undo_page_report_modify( /*========================*/ - page_t* undo_page, /*!< in: undo log page */ + buf_block_t* undo_block, /*!< in: undo log page */ trx_t* trx, /*!< in: transaction */ dict_index_t* index, /*!< in: clustered index where update or delete marking is done */ @@ -856,48 +880,46 @@ trx_undo_page_report_modify( virtual column info */ mtr_t* mtr) /*!< in: mtr */ { - dict_table_t* table = index->table; ulint first_free; byte* ptr; - const byte* field; - ulint flen; - ulint col_no; - ulint type_cmpl; - byte* type_cmpl_ptr; - ulint i; - trx_id_t trx_id; - ibool ignore_prefix = FALSE; - byte ext_buf[REC_VERSION_56_MAX_INDEX_COL_LEN - + BTR_EXTERN_FIELD_REF_SIZE]; - bool first_v_col = true; - ut_a(dict_index_is_clust(index)); + ut_ad(index->is_primary()); ut_ad(rec_offs_validate(rec, index, offsets)); + /* MariaDB 10.3.1+ in trx_undo_page_init() always initializes + TRX_UNDO_PAGE_TYPE as 0, but previous versions wrote + TRX_UNDO_INSERT == 1 into insert_undo pages, + or TRX_UNDO_UPDATE == 2 into update_undo pages. */ ut_ad(mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE - + undo_page) == TRX_UNDO_UPDATE - || (dict_table_is_temporary(table) - && mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE - + undo_page) == TRX_UNDO_INSERT)); - trx_undo_t* update_undo = dict_table_is_temporary(table) - ? NULL : trx->rsegs.m_redo.update_undo; - - first_free = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR - + TRX_UNDO_PAGE_FREE); - ptr = undo_page + first_free; + + undo_block->frame) <= 2); - ut_ad(first_free <= UNIV_PAGE_SIZE); + first_free = mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE + + undo_block->frame); + ptr = undo_block->frame + first_free; - if (trx_undo_left(undo_page, ptr) < 50) { + ut_ad(first_free <= srv_page_size); + if (trx_undo_left(undo_block, ptr) < 50) { /* NOTE: the value 50 must be big enough so that the general fields written below fit on the undo log page */ - - return(0); + return 0; } /* Reserve 2 bytes for the pointer to the next undo log record */ ptr += 2; + dict_table_t* table = index->table; + const byte* field; + ulint flen; + ulint col_no; + ulint type_cmpl; + byte* type_cmpl_ptr; + ulint i; + trx_id_t trx_id; + ibool ignore_prefix = FALSE; + byte ext_buf[REC_VERSION_56_MAX_INDEX_COL_LEN + + BTR_EXTERN_FIELD_REF_SIZE]; + bool first_v_col = true; + /* Store first some general parameters to the undo log */ if (!update) { @@ -943,8 +965,8 @@ trx_undo_page_report_modify( allowed to ignore blob prefixes if the delete marking was done by some other trx as it must have committed by now for us to allow an over-write. */ - if (ignore_prefix) { - ignore_prefix = (trx_id != trx->id); + if (trx_id == trx->id) { + ignore_prefix = false; } ptr += mach_u64_write_compressed(ptr, trx_id); @@ -962,22 +984,22 @@ trx_undo_page_report_modify( for (i = 0; i < dict_index_get_n_unique(index); i++) { + /* The ordering columns must not be instant added columns. */ + ut_ad(!rec_offs_nth_default(offsets, i)); field = rec_get_nth_field(rec, offsets, i, &flen); /* The ordering columns must not be stored externally. 
*/ ut_ad(!rec_offs_nth_extern(offsets, i)); ut_ad(dict_index_get_nth_col(index, i)->ord_part); - if (trx_undo_left(undo_page, ptr) < 5) { - + if (trx_undo_left(undo_block, ptr) < 5) { return(0); } ptr += mach_write_compressed(ptr, flen); if (flen != UNIV_SQL_NULL) { - if (trx_undo_left(undo_page, ptr) < flen) { - + if (trx_undo_left(undo_block, ptr) < flen) { return(0); } @@ -990,8 +1012,7 @@ trx_undo_page_report_modify( /* Save to the undo log the old values of the columns to be updated. */ if (update) { - if (trx_undo_left(undo_page, ptr) < 5) { - + if (trx_undo_left(undo_block, ptr) < 5) { return(0); } @@ -1029,8 +1050,7 @@ trx_undo_page_report_modify( ulint pos = fld->field_no; /* Write field number to undo log */ - if (trx_undo_left(undo_page, ptr) < 5) { - + if (trx_undo_left(undo_block, ptr) < 5) { return(0); } @@ -1054,7 +1074,7 @@ trx_undo_page_report_modify( if (is_virtual) { ut_ad(fld->field_no < table->n_v_def); - ptr = trx_undo_log_v_idx(undo_page, table, + ptr = trx_undo_log_v_idx(undo_block, table, fld->field_no, ptr, first_v_col); if (ptr == NULL) { @@ -1077,12 +1097,11 @@ trx_undo_page_report_modify( flen, max_v_log_len); } } else { - field = rec_get_nth_field(rec, offsets, - pos, &flen); + field = rec_get_nth_cfield( + rec, index, offsets, pos, &flen); } - if (trx_undo_left(undo_page, ptr) < 15) { - + if (trx_undo_left(undo_block, ptr) < 15) { return(0); } @@ -1105,21 +1124,13 @@ trx_undo_page_report_modify( dict_table_page_size(table), &field, &flen, SPATIAL_UNKNOWN); - /* Notify purge that it eventually has to - free the old externally stored field */ - - if (update_undo) { - update_undo->del_marks = TRUE; - } - *type_cmpl_ptr |= TRX_UNDO_UPD_EXTERN; } else { ptr += mach_write_compressed(ptr, flen); } if (flen != UNIV_SQL_NULL) { - if (trx_undo_left(undo_page, ptr) < flen) { - + if (trx_undo_left(undo_block, ptr) < flen) { return(0); } @@ -1136,16 +1147,15 @@ trx_undo_page_report_modify( flen, max_v_log_len); } - if (trx_undo_left(undo_page, ptr) < 15) { - + if (trx_undo_left(undo_block, ptr) < 15) { return(0); } ptr += mach_write_compressed(ptr, flen); if (flen != UNIV_SQL_NULL) { - if (trx_undo_left(undo_page, ptr) < flen) { - + if (trx_undo_left(undo_block, ptr) + < flen) { return(0); } @@ -1179,12 +1189,7 @@ trx_undo_page_report_modify( double mbr[SPDIMS * 2]; mem_heap_t* row_heap = NULL; - if (update_undo) { - update_undo->del_marks = TRUE; - } - - if (trx_undo_left(undo_page, ptr) < 5) { - + if (trx_undo_left(undo_block, ptr) < 5) { return(0); } @@ -1250,16 +1255,15 @@ trx_undo_page_report_modify( if (true) { /* Write field number to undo log */ - if (trx_undo_left(undo_page, ptr) < 5 + 15) { - + if (trx_undo_left(undo_block, ptr) < 5 + 15) { return(0); } ptr += mach_write_compressed(ptr, pos); /* Save the old value of field */ - field = rec_get_nth_field(rec, offsets, pos, - &flen); + field = rec_get_nth_cfield( + rec, index, offsets, pos, &flen); if (is_ext) { const dict_col_t* col = @@ -1299,9 +1303,8 @@ trx_undo_page_report_modify( if (flen != UNIV_SQL_NULL && spatial_status != SPATIAL_ONLY) { - if (trx_undo_left(undo_page, ptr) + if (trx_undo_left(undo_block, ptr) < flen) { - return(0); } @@ -1310,7 +1313,7 @@ trx_undo_page_report_modify( } if (spatial_status != SPATIAL_NONE) { - if (trx_undo_left(undo_page, ptr) + if (trx_undo_left(undo_block, ptr) < DATA_MBR_LEN) { return(0); } @@ -1341,8 +1344,7 @@ already_logged: /* Write field number to undo log. 
Make sure there is enought space in log */ - if (trx_undo_left(undo_page, ptr) < 5) { - + if (trx_undo_left(undo_block, ptr) < 5) { return(0); } @@ -1350,7 +1352,7 @@ already_logged: ptr += mach_write_compressed(ptr, pos); ut_ad(col_no < table->n_v_def); - ptr = trx_undo_log_v_idx(undo_page, table, + ptr = trx_undo_log_v_idx(undo_block, table, col_no, ptr, first_v_col); first_v_col = false; @@ -1395,9 +1397,8 @@ already_logged: case 0: case UNIV_SQL_NULL: break; default: - if (trx_undo_left(undo_page, ptr) + if (trx_undo_left(undo_block, ptr) < flen) { - return(0); } @@ -1407,7 +1408,7 @@ already_logged: } } - mach_write_to_2(old_ptr, ptr - old_ptr); + mach_write_to_2(old_ptr, ulint(ptr - old_ptr)); if (row_heap) { mem_heap_free(row_heap); @@ -1416,22 +1417,20 @@ already_logged: /*----------------------------------------*/ /* Write pointers to the previous and the next undo log records */ - if (trx_undo_left(undo_page, ptr) < 2) { - + if (trx_undo_left(undo_block, ptr) < 2) { return(0); } mach_write_to_2(ptr, first_free); ptr += 2; - mach_write_to_2(undo_page + first_free, ptr - undo_page); + const ulint new_free = ulint(ptr - undo_block->frame); + mach_write_to_2(undo_block->frame + first_free, new_free); - mach_write_to_2(undo_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE, - ptr - undo_page); + mach_write_to_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE + + undo_block->frame, new_free); /* Write to the REDO log about this change in the UNDO log */ - - trx_undof_page_add_undo_rec_log(undo_page, first_free, - ptr - undo_page, mtr); + trx_undof_page_add_undo_rec_log(undo_block, first_free, new_free, mtr); return(first_free); } @@ -1514,7 +1513,7 @@ trx_undo_update_rec_get_update( buf = static_cast<byte*>(mem_heap_alloc(heap, DATA_TRX_ID_LEN)); - trx_write_trx_id(buf, trx_id); + mach_write_to_6(buf, trx_id); upd_field_set_field_no(upd_field, dict_index_get_sys_col_pos(index, DATA_TRX_ID), @@ -1542,6 +1541,7 @@ trx_undo_update_rec_get_update( ulint orig_len; bool is_virtual; + upd_field = upd_get_nth_field(update, i); field_no = mach_read_next_compressed(&ptr); is_virtual = (field_no >= REC_MAX_N_FIELDS); @@ -1553,25 +1553,6 @@ trx_undo_update_rec_get_update( index->table, ptr, first_v_col, &is_undo_log, &field_no); first_v_col = false; - } else if (field_no >= dict_index_get_n_fields(index)) { - ib::error() << "Trying to access update undo rec" - " field " << field_no - << " in index " << index->name - << " of table " << index->table->name - << " but index has only " - << dict_index_get_n_fields(index) - << " fields " << BUG_REPORT_MSG - << ". Run also CHECK TABLE " - << index->table->name << "." - " n_fields = " << n_fields << ", i = " << i; - ut_ad(0); - *upd = NULL; - return(NULL); - } - - upd_field = upd_get_nth_field(update, i); - - if (is_virtual) { /* This column could be dropped or no longer indexed */ if (field_no == ULINT_UNDEFINED) { /* Mark this is no longer needed */ @@ -1585,10 +1566,31 @@ trx_undo_update_rec_get_update( continue; } - upd_field_set_v_field_no( - upd_field, field_no, index); - } else { + upd_field_set_v_field_no(upd_field, field_no, index); + } else if (field_no < index->n_fields) { upd_field_set_field_no(upd_field, field_no, index); + } else if (update->info_bits == REC_INFO_MIN_REC_FLAG + && index->is_instant()) { + /* This must be a rollback of a subsequent + instant ADD COLUMN operation. This will be + detected and handled by btr_cur_trim(). 
*/ + upd_field->field_no = field_no; + upd_field->orig_len = 0; + } else { + ib::error() << "Trying to access update undo rec" + " field " << field_no + << " in index " << index->name + << " of table " << index->table->name + << " but index has only " + << dict_index_get_n_fields(index) + << " fields " << BUG_REPORT_MSG + << ". Run also CHECK TABLE " + << index->table->name << "." + " n_fields = " << n_fields << ", i = " << i; + + ut_ad(0); + *upd = NULL; + return(NULL); } ptr = trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len); @@ -1681,7 +1683,7 @@ trx_undo_rec_get_partial_row( bool first_v_col = true; bool is_undo_log = true; - ut_ad(dict_index_is_clust(index)); + ut_ad(index->is_primary()); *row = dtuple_create_with_vcol( heap, dict_table_get_n_cols(index->table), @@ -1818,8 +1820,7 @@ trx_undo_rec_get_partial_row( && spatial_status != SPATIAL_ONLY) { ut_a(dfield_get_len(dfield) >= BTR_EXTERN_FIELD_REF_SIZE); - ut_a(dict_table_get_format(index->table) - >= UNIV_FORMAT_B + ut_a(dict_table_has_atomic_blobs(index->table) || dfield_get_len(dfield) >= REC_ANTELOPE_MAX_INDEX_COL_LEN + BTR_EXTERN_FIELD_REF_SIZE); @@ -1830,51 +1831,22 @@ trx_undo_rec_get_partial_row( return(const_cast<byte*>(ptr)); } -/***********************************************************************//** -Erases the unused undo log page end. -@return TRUE if the page contained something, FALSE if it was empty */ -static MY_ATTRIBUTE((nonnull)) -ibool -trx_undo_erase_page_end( -/*====================*/ - page_t* undo_page, /*!< in/out: undo page whose end to erase */ - mtr_t* mtr) /*!< in/out: mini-transaction */ +/** Erase the unused undo log page end. +@param[in,out] undo_page undo log page +@return whether the page contained something */ +bool +trx_undo_erase_page_end(page_t* undo_page) { ulint first_free; first_free = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE); - memset(undo_page + first_free, 0xff, - (UNIV_PAGE_SIZE - FIL_PAGE_DATA_END) - first_free); + memset(undo_page + first_free, 0, + (srv_page_size - FIL_PAGE_DATA_END) - first_free); - mlog_write_initial_log_record(undo_page, MLOG_UNDO_ERASE_END, mtr); return(first_free != TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE); } -/***********************************************************//** -Parses a redo log record of erasing of an undo page end. -@return end of log record or NULL */ -byte* -trx_undo_parse_erase_page_end( -/*==========================*/ - byte* ptr, /*!< in: buffer */ - byte* end_ptr MY_ATTRIBUTE((unused)), /*!< in: buffer end */ - page_t* page, /*!< in: page or NULL */ - mtr_t* mtr) /*!< in: mtr or NULL */ -{ - ut_ad(ptr != NULL); - ut_ad(end_ptr != NULL); - - if (page == NULL) { - - return(ptr); - } - - trx_undo_erase_page_end(page, mtr); - - return(ptr); -} - /** Report a RENAME TABLE operation. 
@param[in,out] trx transaction @param[in] table table that is being renamed @@ -1891,7 +1863,7 @@ trx_undo_page_report_rename(trx_t* trx, const dict_table_t* table, + block->frame; ulint first_free = mach_read_from_2(ptr_first_free); ut_ad(first_free >= TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE); - ut_ad(first_free <= UNIV_PAGE_SIZE); + ut_ad(first_free <= srv_page_size); byte* start = block->frame + first_free; size_t len = strlen(table->name.m_name); const size_t fixed = 2 + 1 + 11 + 11 + 2; @@ -1901,7 +1873,7 @@ trx_undo_page_report_rename(trx_t* trx, const dict_table_t* table, + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE < UNIV_PAGE_SIZE_MIN - 10 - FIL_PAGE_DATA_END); - if (trx_undo_left(block->frame, start) < fixed + len) { + if (trx_undo_left(block, start) < fixed + len) { ut_ad(first_free > TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE); return 0; @@ -1919,7 +1891,7 @@ trx_undo_page_report_rename(trx_t* trx, const dict_table_t* table, mach_write_to_2(start, offset); mach_write_to_2(ptr_first_free, offset); - trx_undof_page_add_undo_rec_log(block->frame, first_free, offset, mtr); + trx_undof_page_add_undo_rec_log(block, first_free, offset, mtr); return first_free; } @@ -1932,57 +1904,41 @@ dberr_t trx_undo_report_rename(trx_t* trx, const dict_table_t* table) ut_ad(!trx->read_only); ut_ad(trx->id); ut_ad(!table->is_temporary()); - ut_ad(srv_safe_truncate); - - trx_rseg_t* rseg = trx->rsegs.m_redo.rseg; - trx_undo_t** pundo = &trx->rsegs.m_redo.insert_undo; - mutex_enter(&trx->undo_mutex); - dberr_t err = *pundo - ? DB_SUCCESS - : trx_undo_assign_undo(trx, rseg, pundo, TRX_UNDO_INSERT); - ut_ad((err == DB_SUCCESS) == (*pundo != NULL)); - if (trx_undo_t* undo = *pundo) { - mtr_t mtr; - mtr.start(); - buf_block_t* block = buf_page_get_gen( - page_id_t(undo->space, undo->last_page_no), - univ_page_size, RW_X_LATCH, - undo->guess_block, - BUF_GET, __FILE__, __LINE__, &mtr, &err); - ut_ad((err == DB_SUCCESS) == !!block); - - for (ut_d(int loop_count = 0); block;) { + mtr_t mtr; + dberr_t err; + mtr.start(); + if (buf_block_t* block = trx_undo_assign(trx, &err, &mtr)) { + trx_undo_t* undo = trx->rsegs.m_redo.undo; + ut_ad(err == DB_SUCCESS); + ut_ad(undo); + for (ut_d(int loop_count = 0);;) { ut_ad(loop_count++ < 2); - buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE); ut_ad(undo->last_page_no == block->page.id.page_no()); if (ulint offset = trx_undo_page_report_rename( trx, table, block, &mtr)) { - undo->empty = FALSE; undo->top_page_no = undo->last_page_no; undo->top_offset = offset; undo->top_undo_no = trx->undo_no++; undo->guess_block = block; + ut_ad(!undo->empty()); - trx->undo_rseg_space = rseg->space; err = DB_SUCCESS; break; } else { mtr.commit(); mtr.start(); - block = trx_undo_add_page(trx, undo, &mtr); + block = trx_undo_add_page(undo, &mtr); if (!block) { err = DB_OUT_OF_FILE_SPACE; break; } } } - - mtr.commit(); } - mutex_exit(&trx->undo_mutex); + mtr.commit(); return err; } @@ -2016,8 +1972,6 @@ trx_undo_report_row_operation( undo log record */ { trx_t* trx; - ulint page_no; - buf_block_t* undo_block; mtr_t mtr; #ifdef UNIV_DEBUG int loop_count = 0; @@ -2037,7 +1991,7 @@ trx_undo_report_row_operation( mtr.start(); trx_undo_t** pundo; trx_rseg_t* rseg; - const bool is_temp = dict_table_is_temporary(index->table); + const bool is_temp = index->table->is_temporary(); if (is_temp) { mtr.set_log_mode(MTR_LOG_NO_REDO); @@ -2047,62 +2001,32 @@ trx_undo_report_row_operation( } else { ut_ad(!trx->read_only); ut_ad(trx->id); - /* Keep INFORMATION_SCHEMA.TABLES.UPDATE_TIME - up-to-date 
for persistent tables. Temporary tables are - not listed there. */ - trx->mod_tables.insert(index->table); - - pundo = !rec - ? &trx->rsegs.m_redo.insert_undo - : &trx->rsegs.m_redo.update_undo; + pundo = &trx->rsegs.m_redo.undo; rseg = trx->rsegs.m_redo.rseg; } - mutex_enter(&trx->undo_mutex); - dberr_t err; - - if (*pundo) { - err = DB_SUCCESS; - } else if (!rec || is_temp) { - err = trx_undo_assign_undo(trx, rseg, pundo, TRX_UNDO_INSERT); - } else { - err = trx_undo_assign_undo(trx, rseg, pundo, TRX_UNDO_UPDATE); - } - - trx_undo_t* undo = *pundo; + dberr_t err; + buf_block_t* undo_block = trx_undo_assign_low(trx, rseg, pundo, + &err, &mtr); + trx_undo_t* undo = *pundo; - ut_ad((err == DB_SUCCESS) == (undo != NULL)); - if (undo == NULL) { + ut_ad((err == DB_SUCCESS) == (undo_block != NULL)); + if (UNIV_UNLIKELY(undo_block == NULL)) { goto err_exit; } - page_no = undo->last_page_no; - - undo_block = buf_page_get_gen( - page_id_t(undo->space, page_no), univ_page_size, RW_X_LATCH, - undo->guess_block, BUF_GET, __FILE__, __LINE__, - &mtr, &err); - - buf_block_dbg_add_level(undo_block, SYNC_TRX_UNDO_PAGE); + ut_ad(undo != NULL); do { - ut_ad(page_no == undo_block->page.id.page_no()); - page_t* undo_page = buf_block_get_frame(undo_block); ulint offset = !rec ? trx_undo_page_report_insert( - undo_page, trx, index, clust_entry, &mtr) + undo_block, trx, index, clust_entry, &mtr) : trx_undo_page_report_modify( - undo_page, trx, index, rec, offsets, update, + undo_block, trx, index, rec, offsets, update, cmpl_info, clust_entry, &mtr); if (UNIV_UNLIKELY(offset == 0)) { - /* The record did not fit on the page. We erase the - end segment of the undo log page and write a log - record of it: this is to ensure that in the debug - version the replicate page constructed using the log - records stays identical to the original page */ - - if (!trx_undo_erase_page_end(undo_page, &mtr)) { + if (!trx_undo_erase_page_end(undo_block->frame)) { /* The record did not fit on an empty undo page. Discard the freshly allocated page and return an error. */ @@ -2116,7 +2040,7 @@ trx_undo_report_row_operation( first, because it may be holding lower-level latches, such as SYNC_FSP and SYNC_FSP_PAGE. */ - mtr_commit(&mtr); + mtr.commit(); mtr.start(); if (is_temp) { mtr.set_log_mode(MTR_LOG_NO_REDO); @@ -2133,24 +2057,41 @@ trx_undo_report_row_operation( mtr_commit(&mtr); } else { /* Success */ - undo->guess_block = undo_block; mtr_commit(&mtr); - undo->empty = FALSE; - undo->top_page_no = page_no; + undo->top_page_no = undo_block->page.id.page_no(); undo->top_offset = offset; undo->top_undo_no = trx->undo_no++; - - trx->undo_rseg_space = rseg->space; - - mutex_exit(&trx->undo_mutex); + undo->guess_block = undo_block; + ut_ad(!undo->empty()); + + if (!is_temp) { + const undo_no_t limit = undo->top_undo_no; + /* Determine if this is the first time + when this transaction modifies a + system-versioned column in this table. 
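/* Illustrative sketch, not part of the patch: the mod_tables bookkeeping
   above records, per modified table, the undo number of the transaction's
   first change, and later notes the point at which a system-versioned
   column was first touched.  The insert() idiom works because std::map
   keeps the existing entry when the key is already present.  A simplified
   stand-in using only standard containers (the names here are invented for
   the example): */
#include <cstdint>
#include <map>
#include <string>

struct mod_time {                       /* simplified trx_mod_table_time_t */
    uint64_t first;                     /* undo number of the first change */
    uint64_t first_versioned;           /* UINT64_MAX = none yet */
    explicit mod_time(uint64_t no) : first(no), first_versioned(UINT64_MAX) {}
    bool is_versioned() const { return first_versioned != UINT64_MAX; }
    void set_versioned(uint64_t no) { first_versioned = no; }
};

typedef std::map<std::string, mod_time> mod_tables_t;

/* Record a change made by undo record number 'limit'; insert() keeps any
   existing entry, so 'first' always remains the earliest change. */
static void note_modification(mod_tables_t& tables, const std::string& name,
                              uint64_t limit, bool touches_versioned)
{
    mod_time& t = tables.insert(
        mod_tables_t::value_type(name, mod_time(limit))).first->second;
    if (!t.is_versioned() && touches_versioned)
        t.set_versioned(limit);
}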
*/ + trx_mod_table_time_t& time + = trx->mod_tables.insert( + trx_mod_tables_t::value_type( + index->table, limit)) + .first->second; + ut_ad(time.valid(limit)); + + if (!time.is_versioned() + && index->table->versioned_by_id() + && (!rec /* INSERT */ + || (update + && update->affects_versioned()))) { + time.set_versioned(limit); + } + } *roll_ptr = trx_undo_build_roll_ptr( - !rec, rseg->id, page_no, offset); + !rec, rseg->id, undo->top_page_no, offset); return(DB_SUCCESS); } - ut_ad(page_no == undo->last_page_no); + ut_ad(undo_block->page.id.page_no() == undo->last_page_no); /* We have to extend the undo log by one page */ @@ -2161,12 +2102,11 @@ trx_undo_report_row_operation( mtr.set_log_mode(MTR_LOG_NO_REDO); } - undo_block = trx_undo_add_page(trx, undo, &mtr); - page_no = undo->last_page_no; + undo_block = trx_undo_add_page(undo, &mtr); DBUG_EXECUTE_IF("ib_err_ins_undo_page_add_failure", undo_block = NULL;); - } while (undo_block != NULL); + } while (UNIV_LIKELY(undo_block != NULL)); ib_errf(trx->mysql_thd, IB_LOG_LEVEL_ERROR, DB_OUT_OF_FILE_SPACE, @@ -2175,14 +2115,13 @@ trx_undo_report_row_operation( " log pages. Please add new data file to the tablespace or" " check if filesystem is full or enable auto-extension for" " the tablespace", - undo->space == TRX_SYS_SPACE + undo->rseg->space == fil_system.sys_space ? "system" : is_temp ? "temporary" : "undo"); /* Did not succeed: out of space */ err = DB_OUT_OF_FILE_SPACE; err_exit: - mutex_exit(&trx->undo_mutex); mtr_commit(&mtr); return(err); } @@ -2211,12 +2150,13 @@ trx_undo_get_undo_rec_low( &offset); ut_ad(page_no > FSP_FIRST_INODE_PAGE_NO); ut_ad(offset >= TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE); - rseg = trx_sys->rseg_array[rseg_id]; + rseg = trx_sys.rseg_array[rseg_id]; + ut_ad(rseg->is_persistent()); mtr_start(&mtr); undo_page = trx_undo_page_get_s_latched( - page_id_t(rseg->space, page_no), &mtr); + page_id_t(rseg->space->id, page_no), &mtr); undo_rec = trx_undo_rec_copy(undo_page + offset, heap); @@ -2248,14 +2188,14 @@ trx_undo_get_undo_rec( { bool missing_history; - rw_lock_s_lock(&purge_sys->latch); + rw_lock_s_lock(&purge_sys.latch); - missing_history = purge_sys->view.changes_visible(trx_id, name); + missing_history = purge_sys.view.changes_visible(trx_id, name); if (!missing_history) { *undo_rec = trx_undo_get_undo_rec_low(roll_ptr, heap); } - rw_lock_s_unlock(&purge_sys->latch); + rw_lock_s_unlock(&purge_sys.latch); return(missing_history); } @@ -2317,12 +2257,13 @@ trx_undo_prev_version_build( bool dummy_extern; byte* buf; - ut_ad(!rw_lock_own(&purge_sys->latch, RW_LOCK_S)); + ut_ad(!index->table->is_temporary()); + ut_ad(!rw_lock_own(&purge_sys.latch, RW_LOCK_S)); ut_ad(mtr_memo_contains_page_flagged(index_mtr, index_rec, MTR_MEMO_PAGE_S_FIX | MTR_MEMO_PAGE_X_FIX)); ut_ad(rec_offs_validate(rec, index, offsets)); - ut_a(dict_index_is_clust(index)); + ut_a(index->is_primary()); roll_ptr = row_get_rec_roll_ptr(rec, index, offsets); @@ -2333,8 +2274,6 @@ trx_undo_prev_version_build( return(true); } - ut_ad(!dict_table_is_temporary(index->table)); - rec_trx_id = row_get_rec_trx_id(rec, index, offsets); ut_ad(!index->table->skip_alter_undo); @@ -2344,8 +2283,7 @@ trx_undo_prev_version_build( &undo_rec)) { if (v_status & TRX_UNDO_PREV_IN_PURGE) { /* We are fetching the record being purged */ - undo_rec = trx_undo_get_undo_rec_low( - roll_ptr, heap); + undo_rec = trx_undo_get_undo_rec_low(roll_ptr, heap); } else { /* The undo record may already have been purged, during purge or semi-consistent read. 
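/* Illustrative sketch, not part of the patch: trx_undo_build_roll_ptr()
   used above packs an insert flag, the rollback segment id, the undo page
   number and the byte offset into a single roll pointer, and the reader
   side unpacks the same fields to locate the undo record again.  The field
   widths below are chosen for illustration and are not claimed to be the
   authoritative on-disk layout. */
#include <cstdint>

static uint64_t build_roll_ptr(bool is_insert, uint32_t rseg_id,
                               uint32_t page_no, uint16_t offset)
{
    return (uint64_t(is_insert) << 55)
         | (uint64_t(rseg_id & 0x7F) << 48)
         | (uint64_t(page_no) << 16)
         | offset;
}

static void decode_roll_ptr(uint64_t roll_ptr, bool& is_insert,
                            uint32_t& rseg_id, uint32_t& page_no,
                            uint16_t& offset)
{
    is_insert = ((roll_ptr >> 55) & 1) != 0;
    rseg_id   = uint32_t((roll_ptr >> 48) & 0x7F);
    page_no   = uint32_t((roll_ptr >> 16) & 0xFFFFFFFF);
    offset    = uint16_t(roll_ptr & 0xFFFF);
}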
*/ @@ -2367,12 +2305,12 @@ trx_undo_prev_version_build( &info_bits); /* (a) If a clustered index record version is such that the - trx id stamp in it is bigger than purge_sys->view, then the + trx id stamp in it is bigger than purge_sys.view, then the BLOBs in that version are known to exist (the purge has not progressed that far); (b) if the version is the first version such that trx id in it - is less than purge_sys->view, and it is not delete-marked, + is less than purge_sys.view, and it is not delete-marked, then the BLOBs in that version are known to exist (the purge cannot have purged the BLOBs referenced by that version yet). @@ -2409,19 +2347,19 @@ trx_undo_prev_version_build( the BLOB. */ /* the row_upd_changes_disowned_external(update) call could be - omitted, but the synchronization on purge_sys->latch is likely + omitted, but the synchronization on purge_sys.latch is likely more expensive. */ if ((update->info_bits & REC_INFO_DELETED_FLAG) && row_upd_changes_disowned_external(update)) { bool missing_extern; - rw_lock_s_lock(&purge_sys->latch); + rw_lock_s_lock(&purge_sys.latch); - missing_extern = purge_sys->view.changes_visible( + missing_extern = purge_sys.view.changes_visible( trx_id, index->table->name); - rw_lock_s_unlock(&purge_sys->latch); + rw_lock_s_unlock(&purge_sys.latch); if (missing_extern) { /* treat as a fresh insert, not to @@ -2459,7 +2397,7 @@ trx_undo_prev_version_build( heap, rec_offs_size(offsets))); *old_vers = rec_copy(buf, rec, offsets); - rec_offs_make_valid(*old_vers, index, offsets); + rec_offs_make_valid(*old_vers, index, true, offsets); row_upd_rec_in_place(*old_vers, index, offsets, update, NULL); } @@ -2474,7 +2412,8 @@ trx_undo_prev_version_build( rec_offs offsets_dbg[REC_OFFS_NORMAL_SIZE]; rec_offs_init(offsets_dbg); ut_a(!rec_offs_any_null_extern( - *old_vers, rec_get_offsets(*old_vers, index, offsets_dbg, true, + *old_vers, rec_get_offsets(*old_vers, index, offsets_dbg, + index->n_core_fields, ULINT_UNDEFINED, &heap))); #endif // defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG diff --git a/storage/innobase/trx/trx0roll.cc b/storage/innobase/trx/trx0roll.cc index c5f70452bf2..d519265dc8a 100644 --- a/storage/innobase/trx/trx0roll.cc +++ b/storage/innobase/trx/trx0roll.cc @@ -34,7 +34,6 @@ Created 3/26/1996 Heikki Tuuri #include "mach0data.h" #include "pars0pars.h" #include "que0que.h" -#include "read0read.h" #include "row0mysql.h" #include "row0undo.h" #include "srv0mon.h" @@ -49,19 +48,53 @@ Created 3/26/1996 Heikki Tuuri rollback */ static const ulint TRX_ROLL_TRUNC_THRESHOLD = 1; -/** true if trx_rollback_or_clean_all_recovered() thread is active */ -bool trx_rollback_or_clean_is_active; +/** true if trx_rollback_all_recovered() thread is active */ +bool trx_rollback_is_active; /** In crash recovery, the current trx to be rolled back; NULL otherwise */ const trx_t* trx_roll_crash_recv_trx; -/****************************************************************//** -Finishes a transaction rollback. */ -static -void -trx_rollback_finish( -/*================*/ - trx_t* trx); /*!< in: transaction */ +/** Finish transaction rollback. 
+@param[in,out] trx transaction +@return whether the rollback was completed normally +@retval false if the rollback was aborted by shutdown */ +static bool trx_rollback_finish(trx_t* trx) +{ + trx->mod_tables.clear(); + bool finished = trx->error_state == DB_SUCCESS; + if (UNIV_LIKELY(finished)) { + trx_commit(trx); + } else { + ut_a(trx->error_state == DB_INTERRUPTED); + ut_ad(srv_shutdown_state != SRV_SHUTDOWN_NONE); + ut_a(!srv_undo_sources); + ut_ad(srv_fast_shutdown); + ut_d(trx->in_rollback = false); + if (trx_undo_t*& undo = trx->rsegs.m_redo.old_insert) { + UT_LIST_REMOVE(trx->rsegs.m_redo.rseg->old_insert_list, + undo); + ut_free(undo); + undo = NULL; + } + if (trx_undo_t*& undo = trx->rsegs.m_redo.undo) { + UT_LIST_REMOVE(trx->rsegs.m_redo.rseg->undo_list, + undo); + ut_free(undo); + undo = NULL; + } + if (trx_undo_t*& undo = trx->rsegs.m_noredo.undo) { + UT_LIST_REMOVE(trx->rsegs.m_noredo.rseg->undo_list, + undo); + ut_free(undo); + undo = NULL; + } + trx_commit_low(trx, NULL); + } + + trx->lock.que_state = TRX_QUE_RUNNING; + + return finished; +} /*******************************************************************//** Rollback a transaction used in MySQL. */ @@ -91,7 +124,7 @@ trx_rollback_to_savepoint_low( trx->error_state = DB_SUCCESS; - if (trx->has_logged()) { + if (trx->has_logged_or_recovered()) { ut_ad(trx->rsegs.m_redo.rseg != 0 || trx->rsegs.m_noredo.rseg != 0); @@ -115,13 +148,20 @@ trx_rollback_to_savepoint_low( trx_rollback_finish(trx); MONITOR_INC(MONITOR_TRX_ROLLBACK); } else { + ut_a(trx->error_state == DB_SUCCESS); + const undo_no_t limit = savept->least_undo_no; + for (trx_mod_tables_t::iterator i = trx->mod_tables.begin(); + i != trx->mod_tables.end(); ) { + trx_mod_tables_t::iterator j = i++; + ut_ad(j->second.valid()); + if (j->second.rollback(limit)) { + trx->mod_tables.erase(j); + } + } trx->lock.que_state = TRX_QUE_RUNNING; MONITOR_INC(MONITOR_TRX_ROLLBACK_SAVEPOINT); } - ut_a(trx->error_state == DB_SUCCESS); - ut_a(trx->lock.que_state == TRX_QUE_RUNNING); - mem_heap_free(heap); /* There might be work for utility threads.*/ @@ -170,8 +210,6 @@ trx_rollback_for_mysql_low( trx->op_info = ""; - ut_a(trx->error_state == DB_SUCCESS); - return(trx->error_state); } @@ -180,7 +218,7 @@ trx_rollback_for_mysql_low( @return error code or DB_SUCCESS */ dberr_t trx_rollback_for_mysql(trx_t* trx) { - /* We are reading trx->state without holding trx_sys->mutex + /* We are reading trx->state without holding trx_sys.mutex here, because the rollback should be invoked for a running active MySQL transaction (or recovered prepared transaction) that is associated with the current thread. */ @@ -188,21 +226,21 @@ dberr_t trx_rollback_for_mysql(trx_t* trx) switch (trx->state) { case TRX_STATE_NOT_STARTED: trx->will_lock = 0; - ut_ad(trx->in_mysql_trx_list); + ut_ad(trx->mysql_thd); #ifdef WITH_WSREP trx->wsrep = false; #endif return(DB_SUCCESS); case TRX_STATE_ACTIVE: - ut_ad(trx->in_mysql_trx_list); + ut_ad(trx->mysql_thd); assert_trx_nonlocking_or_in_list(trx); return(trx_rollback_for_mysql_low(trx)); case TRX_STATE_PREPARED: case TRX_STATE_PREPARED_RECOVERED: ut_ad(!trx_is_autocommit_non_locking(trx)); - if (trx->has_logged_persistent()) { + if (trx->rsegs.m_redo.undo || trx->rsegs.m_redo.old_insert) { /* The XA ROLLBACK of a XA PREPARE transaction will consist of multiple mini-transactions. 
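/* Illustrative sketch, not part of the patch: on ROLLBACK TO SAVEPOINT the
   loop above walks trx->mod_tables and drops every entry whose recorded
   first modification lies at or after the savepoint's least undo number,
   so only tables still touched by the surviving part of the transaction
   stay registered.  The erase-while-iterating idiom, reduced to a plain
   map from table name to first undo number (the >= test mirrors what
   trx_mod_table_time_t::rollback() presumably checks): */
#include <cstdint>
#include <map>
#include <string>

typedef std::map<std::string, uint64_t> first_change_map;

static void rollback_mod_tables(first_change_map& mod_tables, uint64_t limit)
{
    for (first_change_map::iterator i = mod_tables.begin();
         i != mod_tables.end(); ) {
        first_change_map::iterator j = i++;   /* advance before a possible erase */
        if (j->second >= limit)
            mod_tables.erase(j);              /* change would be undone entirely */
    }
}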
@@ -218,19 +256,22 @@ dberr_t trx_rollback_for_mysql(trx_t* trx) killed, and finally, the transaction would be recovered in XA PREPARE state, with some of the actions already having been rolled back. */ - trx_undo_ptr_t* undo_ptr = &trx->rsegs.m_redo; + ut_ad(!trx->rsegs.m_redo.undo + || trx->rsegs.m_redo.undo->rseg + == trx->rsegs.m_redo.rseg); + ut_ad(!trx->rsegs.m_redo.old_insert + || trx->rsegs.m_redo.old_insert->rseg + == trx->rsegs.m_redo.rseg); mtr_t mtr; mtr.start(); mutex_enter(&trx->rsegs.m_redo.rseg->mutex); - if (undo_ptr->insert_undo != NULL) { - trx_undo_set_state_at_prepare( - trx, undo_ptr->insert_undo, - true, &mtr); + if (trx_undo_t* undo = trx->rsegs.m_redo.undo) { + trx_undo_set_state_at_prepare(trx, undo, true, + &mtr); } - if (undo_ptr->update_undo != NULL) { - trx_undo_set_state_at_prepare( - trx, undo_ptr->update_undo, - true, &mtr); + if (trx_undo_t* undo = trx->rsegs.m_redo.old_insert) { + trx_undo_set_state_at_prepare(trx, undo, true, + &mtr); } mutex_exit(&trx->rsegs.m_redo.rseg->mutex); /* Write the redo log for the XA ROLLBACK @@ -263,11 +304,11 @@ trx_rollback_last_sql_stat_for_mysql( { dberr_t err; - /* We are reading trx->state without holding trx_sys->mutex + /* We are reading trx->state without holding trx_sys.mutex here, because the statement rollback should be invoked for a running active MySQL transaction that is associated with the current thread. */ - ut_ad(trx->in_mysql_trx_list); + ut_ad(trx->mysql_thd); switch (trx->state) { case TRX_STATE_NOT_STARTED: @@ -390,7 +431,7 @@ trx_rollback_to_savepoint_for_mysql_low( dberr_t err; ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE)); - ut_ad(trx->in_mysql_trx_list); + ut_ad(trx->mysql_thd); /* Free all savepoints strictly later than savep. */ @@ -442,11 +483,11 @@ trx_rollback_to_savepoint_for_mysql( { trx_named_savept_t* savep; - /* We are reading trx->state without holding trx_sys->mutex + /* We are reading trx->state without holding trx_sys.mutex here, because the savepoint rollback should be invoked for a running active MySQL transaction that is associated with the current thread. 
*/ - ut_ad(trx->in_mysql_trx_list); + ut_ad(trx->mysql_thd); savep = trx_savepoint_find(trx, savepoint_name); @@ -540,7 +581,7 @@ trx_release_savepoint_for_mysql( ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE, true) || trx_state_eq(trx, TRX_STATE_PREPARED, true)); - ut_ad(trx->in_mysql_trx_list); + ut_ad(trx->mysql_thd); savep = trx_savepoint_find(trx, savepoint_name); @@ -578,8 +619,6 @@ trx_rollback_active( que_fork_t* fork; que_thr_t* thr; roll_node_t* roll_node; - dict_table_t* table; - ibool dictionary_locked = FALSE; const trx_id_t trx_id = trx->id; ut_ad(trx_id); @@ -602,9 +641,11 @@ trx_rollback_active( trx_roll_crash_recv_trx = trx; - if (trx_get_dict_operation(trx) != TRX_DICT_OP_NONE) { + const bool dictionary_locked = trx_get_dict_operation(trx) + != TRX_DICT_OP_NONE; + + if (dictionary_locked) { row_mysql_lock_data_dictionary(trx); - dictionary_locked = TRUE; } que_run_threads(thr); @@ -612,46 +653,26 @@ trx_rollback_active( que_run_threads(roll_node->undo_thr); - if (trx->error_state != DB_SUCCESS) { - ut_ad(trx->error_state == DB_INTERRUPTED); - ut_ad(srv_shutdown_state != SRV_SHUTDOWN_NONE); - ut_ad(!srv_undo_sources); - ut_ad(srv_fast_shutdown); + que_graph_free( + static_cast<que_t*>(roll_node->undo_thr->common.parent)); + + if (UNIV_UNLIKELY(!trx_rollback_finish(trx))) { ut_ad(!dictionary_locked); - que_graph_free(static_cast<que_t*>( - roll_node->undo_thr->common.parent)); goto func_exit; } - trx_rollback_finish(thr_get_trx(roll_node->undo_thr)); - - /* Free the memory reserved by the undo graph */ - que_graph_free(static_cast<que_t*>( - roll_node->undo_thr->common.parent)); - ut_a(trx->lock.que_state == TRX_QUE_RUNNING); - if (trx_get_dict_operation(trx) != TRX_DICT_OP_NONE - && trx->table_id != 0) { - - ut_ad(dictionary_locked); + if (!dictionary_locked || !trx->table_id) { + } else if (dict_table_t* table = dict_table_open_on_id( + trx->table_id, TRUE, DICT_TABLE_OP_NORMAL)) { + ib::info() << "Dropping table " << table->name + << ", with id " << trx->table_id + << " in recovery"; - /* If the transaction was for a dictionary operation, - we drop the relevant table only if it is not flagged - as DISCARDED. If it still exists. */ + dict_table_close_and_drop(trx, table); - table = dict_table_open_on_id( - trx->table_id, TRUE, DICT_TABLE_OP_NORMAL); - - if (table && !dict_table_is_discarded(table)) { - ib::warn() << "Dropping table '" << table->name - << "', with id " << trx->table_id - << " in recovery"; - - dict_table_close_and_drop(trx, table); - - trx_commit_for_mysql(trx); - } + trx_commit_for_mysql(trx); } ib::info() << "Rolled back recovered transaction " << trx_id; @@ -666,193 +687,160 @@ func_exit: trx_roll_crash_recv_trx = NULL; } -/*******************************************************************//** -Rollback or clean up any resurrected incomplete transactions. It assumes -that the caller holds the trx_sys_t::mutex and it will release the -lock if it does a clean up or rollback. -@return TRUE if the transaction was cleaned up or rolled back -and trx_sys->mutex was released. */ -static -ibool -trx_rollback_resurrected( -/*=====================*/ - trx_t* trx, /*!< in: transaction to rollback or clean */ - ibool* all) /*!< in/out: FALSE=roll back dictionary transactions; - TRUE=roll back all non-PREPARED transactions */ -{ - ut_ad(trx_sys_mutex_own()); - - /* The trx->is_recovered flag and trx->state are set - atomically under the protection of the trx->mutex in - trx_t::commit_state(). We do not want to accidentally clean up - a non-recovered transaction here. 
*/ - - trx_mutex_enter(trx); - if (!trx->is_recovered) { -func_exit: - trx_mutex_exit(trx); - return(FALSE); - } - - switch (trx->state) { - case TRX_STATE_COMMITTED_IN_MEMORY: - trx_mutex_exit(trx); - trx_sys_mutex_exit(); - ib::info() << "Cleaning up trx with id " << ib::hex(trx->id); - trx_cleanup_at_db_startup(trx); - trx_free_resurrected(trx); - return(TRUE); - case TRX_STATE_ACTIVE: - if (srv_shutdown_state != SRV_SHUTDOWN_NONE - && !srv_undo_sources && srv_fast_shutdown) { -fake_prepared: - trx->state = TRX_STATE_PREPARED; - *all = FALSE; - goto func_exit; - } - trx_mutex_exit(trx); +struct trx_roll_count_callback_arg +{ + uint32_t n_trx; + uint64_t n_rows; + trx_roll_count_callback_arg(): n_trx(0), n_rows(0) {} +}; - if (*all || trx_get_dict_operation(trx) != TRX_DICT_OP_NONE) { - trx_sys_mutex_exit(); - trx_rollback_active(trx); - if (trx->error_state != DB_SUCCESS) { - ut_ad(trx->error_state == DB_INTERRUPTED); - trx->error_state = DB_SUCCESS; - ut_ad(!srv_undo_sources); - ut_ad(srv_fast_shutdown); - mutex_enter(&trx_sys->mutex); - trx_mutex_enter(trx); - goto fake_prepared; - } - trx_free_for_background(trx); - return(TRUE); - } - return(FALSE); - case TRX_STATE_PREPARED: - case TRX_STATE_PREPARED_RECOVERED: - goto func_exit; - case TRX_STATE_NOT_STARTED: - break; - } - ut_error; - goto func_exit; +static my_bool trx_roll_count_callback(rw_trx_hash_element_t *element, + trx_roll_count_callback_arg *arg) +{ + mutex_enter(&element->mutex); + if (trx_t *trx= element->trx) + { + if (trx->is_recovered && trx_state_eq(trx, TRX_STATE_ACTIVE)) + { + arg->n_trx++; + arg->n_rows+= trx->undo_no; + } + } + mutex_exit(&element->mutex); + return 0; } -/** Report progress when rolling back a row of a recovered transaction. -@return whether the rollback should be aborted due to pending shutdown */ -bool -trx_roll_must_shutdown() +/** Report progress when rolling back a row of a recovered transaction. */ +void trx_roll_report_progress() { - const trx_t* trx = trx_roll_crash_recv_trx; - ut_ad(trx); - ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE)); - - if (trx_get_dict_operation(trx) == TRX_DICT_OP_NONE - && srv_shutdown_state != SRV_SHUTDOWN_NONE - && !srv_undo_sources && srv_fast_shutdown) { - return true; - } - time_t now = time(NULL); - mutex_enter(&trx_sys->mutex); mutex_enter(&recv_sys->mutex); - - if (recv_sys->report(now)) { - ulint n_trx = 0; - ulonglong n_rows = 0; - for (const trx_t* t = UT_LIST_GET_FIRST(trx_sys->rw_trx_list); - t != NULL; - t = UT_LIST_GET_NEXT(trx_list, t)) { - - assert_trx_in_rw_list(t); - if (t->is_recovered - && trx_state_eq(t, TRX_STATE_ACTIVE)) { - n_trx++; - n_rows += t->undo_no; - } - } - if (n_rows > 0) { - service_manager_extend_timeout( - INNODB_EXTEND_TIMEOUT_INTERVAL, - "To roll back: " ULINTPF " transactions, " - "%llu rows", n_trx, n_rows); - } - - ib::info() << "To roll back: " << n_trx << " transactions, " - << n_rows << " rows"; - } - + bool report = recv_sys->report(now); mutex_exit(&recv_sys->mutex); - mutex_exit(&trx_sys->mutex); - return false; -} -/*******************************************************************//** -Rollback or clean up any incomplete transactions which were -encountered in crash recovery. If the transaction already was -committed, then we clean up a possible insert undo log. If the -transaction was not yet committed, then we roll it back. 
*/ -void -trx_rollback_or_clean_recovered( -/*============================*/ - ibool all) /*!< in: FALSE=roll back dictionary transactions; - TRUE=roll back all non-PREPARED transactions */ -{ - trx_t* trx; + if (report) { + trx_roll_count_callback_arg arg; - ut_a(srv_force_recovery < SRV_FORCE_NO_TRX_UNDO); + /* Get number of recovered active transactions and number of + rows they modified. Numbers must be accurate, because only this + thread is allowed to touch recovered transactions. */ + trx_sys.rw_trx_hash.iterate_no_dups( + reinterpret_cast<my_hash_walk_action> + (trx_roll_count_callback), &arg); - if (trx_sys_get_n_rw_trx() == 0) { + if (arg.n_rows > 0) { + service_manager_extend_timeout( + INNODB_EXTEND_TIMEOUT_INTERVAL, + "To roll back: " UINT32PF " transactions, " + UINT64PF " rows", arg.n_trx, arg.n_rows); + } - return; - } + ib::info() << "To roll back: " << arg.n_trx + << " transactions, " << arg.n_rows << " rows"; - if (all) { - ib::info() << "Starting in background the rollback" - " of recovered transactions"; } +} - /* Note: For XA recovered transactions, we rely on MySQL to - do rollback. They will be in TRX_STATE_PREPARED state. If the server - is shutdown and they are still lingering in trx_sys_t::trx_list - then the shutdown will hang. */ - - /* Loop over the transaction list as long as there are - recovered transactions to clean up or recover. */ - - do { - trx_sys_mutex_enter(); - - for (trx = UT_LIST_GET_FIRST(trx_sys->rw_trx_list); - trx != NULL; - trx = UT_LIST_GET_NEXT(trx_list, trx)) { - - assert_trx_in_rw_list(trx); - /* If this function does a cleanup or rollback - then it will release the trx_sys->mutex, therefore - we need to reacquire it before retrying the loop. */ +static my_bool trx_rollback_recovered_callback(rw_trx_hash_element_t *element, + std::vector<trx_t*> *trx_list) +{ + mutex_enter(&element->mutex); + if (trx_t *trx= element->trx) + { + mutex_enter(&trx->mutex); + if (trx_state_eq(trx, TRX_STATE_ACTIVE) && trx->is_recovered) + trx_list->push_back(trx); + mutex_exit(&trx->mutex); + } + mutex_exit(&element->mutex); + return 0; +} - if (trx_rollback_resurrected(trx, &all)) { - trx_sys_mutex_enter(); +/** + Rollback any incomplete transactions which were encountered in crash recovery. - break; - } - } + If the transaction already was committed, then we clean up a possible insert + undo log. If the transaction was not yet committed, then we roll it back. - trx_sys_mutex_exit(); + Note: For XA recovered transactions, we rely on MySQL to + do rollback. They will be in TRX_STATE_PREPARED state. If the server + is shutdown and they are still lingering in trx_sys_t::trx_list + then the shutdown will hang. - } while (trx != NULL); + @param[in] all true=roll back all recovered active transactions; + false=roll back any incomplete dictionary transaction +*/ - if (all) { - ib::info() << "Rollback of non-prepared transactions" - " completed"; - } +void trx_rollback_recovered(bool all) +{ + std::vector<trx_t*> trx_list; + + ut_a(srv_force_recovery < SRV_FORCE_NO_TRX_UNDO); + + /* + Collect list of recovered ACTIVE transaction ids first. Once collected, no + other thread is allowed to modify or remove these transactions from + rw_trx_hash. 
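/* Illustrative sketch, not part of the patch: trx_rollback_recovered()
   first snapshots the recovered ACTIVE transactions into a vector via the
   hash-iteration callback above and only then rolls them back one by one,
   so no rollback work happens while the hash is being walked.  The
   collect-then-process shape, reduced to standard containers: */
#include <cstdint>
#include <unordered_map>
#include <vector>

struct txn { uint64_t id; bool recovered; bool active; };

static void rollback_recovered_sketch(std::unordered_map<uint64_t, txn*>& trx_hash)
{
    std::vector<txn*> to_roll_back;
    for (std::unordered_map<uint64_t, txn*>::iterator it = trx_hash.begin();
         it != trx_hash.end(); ++it) {
        txn* t = it->second;
        if (t->recovered && t->active)
            to_roll_back.push_back(t);   /* phase 1: collect only */
    }
    while (!to_roll_back.empty()) {      /* phase 2: do the work */
        txn* t = to_roll_back.back();
        to_roll_back.pop_back();
        t->active = false;               /* stands in for the real rollback */
    }
}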
+ */ + trx_sys.rw_trx_hash.iterate_no_dups(reinterpret_cast<my_hash_walk_action> + (trx_rollback_recovered_callback), + &trx_list); + + while (!trx_list.empty()) + { + trx_t *trx= trx_list.back(); + trx_list.pop_back(); + + ut_ad(trx); + ut_d(trx_mutex_enter(trx)); + ut_ad(trx->is_recovered); + ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE)); + ut_d(trx_mutex_exit(trx)); + + if (srv_shutdown_state != SRV_SHUTDOWN_NONE && !srv_undo_sources && + srv_fast_shutdown) + goto discard; + + if (all || trx_get_dict_operation(trx) != TRX_DICT_OP_NONE) + { + trx_rollback_active(trx); + if (trx->error_state != DB_SUCCESS) + { + ut_ad(trx->error_state == DB_INTERRUPTED); + trx->error_state= DB_SUCCESS; + ut_ad(!srv_undo_sources); + ut_ad(srv_fast_shutdown); +discard: + /* Note: before kill_server() invoked innobase_end() via + unireg_end(), it invoked close_connections(), which should initiate + the rollback of any user transactions via THD::cleanup() in the + connection threads, and wait for all THD::cleanup() to complete. + So, no active user transactions should exist at this point. + + srv_undo_sources=false was cleared early in innobase_end(). + + Generally, the server guarantees that all connections using + InnoDB must be disconnected by the time we are reaching this code, + be it during shutdown or UNINSTALL PLUGIN. + + Because there is no possible race condition with any + concurrent user transaction, we do not have to invoke + trx->commit_state() or wait for !trx->is_referenced() + before trx_sys.deregister_rw(trx). */ + trx_sys.deregister_rw(trx); + trx_free_at_shutdown(trx); + } + else + trx->free(); + } + } } + /*******************************************************************//** Rollback or clean up any incomplete transactions which were encountered in crash recovery. If the transaction already was @@ -862,11 +850,7 @@ Note: this is done in a background thread. @return a dummy parameter */ extern "C" os_thread_ret_t -DECLARE_THREAD(trx_rollback_or_clean_all_recovered)( -/*================================================*/ - void* arg MY_ATTRIBUTE((unused))) - /*!< in: a dummy parameter required by - os_thread_create */ +DECLARE_THREAD(trx_rollback_all_recovered)(void*) { my_thread_init(); ut_ad(!srv_read_only_mode); @@ -875,9 +859,15 @@ DECLARE_THREAD(trx_rollback_or_clean_all_recovered)( pfs_register_thread(trx_rollback_clean_thread_key); #endif /* UNIV_PFS_THREAD */ - trx_rollback_or_clean_recovered(TRUE); + if (trx_sys.rw_trx_hash.size()) { + ib::info() << "Starting in background the rollback of" + " recovered transactions"; + trx_rollback_recovered(true); + ib::info() << "Rollback of non-prepared transactions" + " completed"; + } - trx_rollback_or_clean_is_active = false; + trx_rollback_is_active = false; my_thread_end(); /* We count the number of threads in os_thread_exit(). 
A created @@ -894,25 +884,15 @@ static void trx_roll_try_truncate(trx_t* trx) { - ut_ad(mutex_own(&trx->undo_mutex)); - trx->pages_undone = 0; undo_no_t undo_no = trx->undo_no; - trx_undo_t* insert_undo = trx->rsegs.m_redo.insert_undo; - trx_undo_t* update_undo = trx->rsegs.m_redo.update_undo; - - if (insert_undo || update_undo) { - mutex_enter(&trx->rsegs.m_redo.rseg->mutex); - if (insert_undo) { - ut_ad(insert_undo->rseg == trx->rsegs.m_redo.rseg); - trx_undo_truncate_end(insert_undo, undo_no, false); - } - if (update_undo) { - ut_ad(update_undo->rseg == trx->rsegs.m_redo.rseg); - trx_undo_truncate_end(update_undo, undo_no, false); - } - mutex_exit(&trx->rsegs.m_redo.rseg->mutex); + + if (trx_undo_t* undo = trx->rsegs.m_redo.undo) { + ut_ad(undo->rseg == trx->rsegs.m_redo.rseg); + mutex_enter(&undo->rseg->mutex); + trx_undo_truncate_end(undo, undo_no, false); + mutex_exit(&undo->rseg->mutex); } if (trx_undo_t* undo = trx->rsegs.m_noredo.undo) { @@ -935,10 +915,8 @@ trx_roll_pop_top_rec( trx_undo_t* undo, /*!< in: undo log */ mtr_t* mtr) /*!< in: mtr */ { - ut_ad(mutex_own(&trx->undo_mutex)); - page_t* undo_page = trx_undo_page_get_s_latched( - page_id_t(undo->space, undo->top_page_no), mtr); + page_id_t(undo->rseg->space->id, undo->top_page_no), mtr); ulint offset = undo->top_offset; @@ -947,8 +925,8 @@ trx_roll_pop_top_rec( true, mtr); if (prev_rec == NULL) { - - undo->empty = TRUE; + undo->top_undo_no = IB_ID_MAX; + ut_ad(undo->empty()); } else { page_t* prev_rec_page = page_align(prev_rec); @@ -958,8 +936,9 @@ trx_roll_pop_top_rec( } undo->top_page_no = page_get_page_no(prev_rec_page); - undo->top_offset = prev_rec - prev_rec_page; + undo->top_offset = ulint(prev_rec - prev_rec_page); undo->top_undo_no = trx_undo_rec_get_undo_no(prev_rec); + ut_ad(!undo->empty()); } return(undo_page + offset); @@ -974,30 +953,29 @@ trx_roll_pop_top_rec( trx_undo_rec_t* trx_roll_pop_top_rec_of_trx(trx_t* trx, roll_ptr_t* roll_ptr, mem_heap_t* heap) { - mutex_enter(&trx->undo_mutex); - if (trx->pages_undone >= TRX_ROLL_TRUNC_THRESHOLD) { trx_roll_try_truncate(trx); } - trx_undo_t* undo = NULL; - trx_undo_t* insert = trx->rsegs.m_redo.insert_undo; - trx_undo_t* update = trx->rsegs.m_redo.update_undo; + trx_undo_t* undo = NULL; + trx_undo_t* insert = trx->rsegs.m_redo.old_insert; + trx_undo_t* update = trx->rsegs.m_redo.undo; trx_undo_t* temp = trx->rsegs.m_noredo.undo; const undo_no_t limit = trx->roll_limit; - ut_ad(!insert || !update || insert->empty || update->empty + ut_ad(!insert || !update || insert->empty() || update->empty() || insert->top_undo_no != update->top_undo_no); - ut_ad(!insert || !temp || insert->empty || temp->empty + ut_ad(!insert || !temp || insert->empty() || temp->empty() || insert->top_undo_no != temp->top_undo_no); - ut_ad(!update || !temp || update->empty || temp->empty + ut_ad(!update || !temp || update->empty() || temp->empty() || update->top_undo_no != temp->top_undo_no); - if (insert && !insert->empty && limit <= insert->top_undo_no) { + if (UNIV_LIKELY_NULL(insert) + && !insert->empty() && limit <= insert->top_undo_no) { undo = insert; } - if (update && !update->empty && update->top_undo_no >= limit) { + if (update && !update->empty() && update->top_undo_no >= limit) { if (!undo) { undo = update; } else if (undo->top_undo_no < update->top_undo_no) { @@ -1005,7 +983,7 @@ trx_roll_pop_top_rec_of_trx(trx_t* trx, roll_ptr_t* roll_ptr, mem_heap_t* heap) } } - if (temp && !temp->empty && temp->top_undo_no >= limit) { + if (temp && !temp->empty() && temp->top_undo_no >= limit) { 
if (!undo) { undo = temp; } else if (undo->top_undo_no < temp->top_undo_no) { @@ -1019,12 +997,11 @@ trx_roll_pop_top_rec_of_trx(trx_t* trx, roll_ptr_t* roll_ptr, mem_heap_t* heap) if the transaction object is committed and reused later, we will default to a full ROLLBACK. */ trx->roll_limit = 0; - ut_d(trx->in_rollback = false); - mutex_exit(&trx->undo_mutex); + trx->in_rollback = false; return(NULL); } - ut_ad(!undo->empty); + ut_ad(!undo->empty()); ut_ad(limit <= undo->top_undo_no); *roll_ptr = trx_undo_build_roll_ptr( @@ -1036,11 +1013,19 @@ trx_roll_pop_top_rec_of_trx(trx_t* trx, roll_ptr_t* roll_ptr, mem_heap_t* heap) trx_undo_rec_t* undo_rec = trx_roll_pop_top_rec(trx, undo, &mtr); const undo_no_t undo_no = trx_undo_rec_get_undo_no(undo_rec); switch (trx_undo_rec_get_type(undo_rec)) { + case TRX_UNDO_INSERT_METADATA: + /* This record type was introduced in MDEV-11369 + instant ADD COLUMN, which was implemented after + MDEV-12288 removed the insert_undo log. There is no + instant ADD COLUMN for temporary tables. Therefore, + this record can only be present in the main undo log. */ + ut_ad(undo == update); + /* fall through */ case TRX_UNDO_RENAME_TABLE: - ut_ad(undo == insert); + ut_ad(undo == insert || undo == update); /* fall through */ case TRX_UNDO_INSERT_REC: - ut_ad(undo == insert || undo == temp); + ut_ad(undo == insert || undo == update || undo == temp); *roll_ptr |= 1ULL << ROLL_PTR_INSERT_FLAG_POS; break; default: @@ -1048,12 +1033,7 @@ trx_roll_pop_top_rec_of_trx(trx_t* trx, roll_ptr_t* roll_ptr, mem_heap_t* heap) break; } - ut_ad(trx_roll_check_undo_rec_ordering( - undo_no, undo->rseg->space, trx)); - trx->undo_no = undo_no; - trx->undo_rseg_space = undo->rseg->space; - mutex_exit(&trx->undo_mutex); trx_undo_rec_t* undo_rec_copy = trx_undo_rec_copy(undo_rec, heap); mtr.commit(); @@ -1111,7 +1091,7 @@ trx_rollback_start( ut_ad(!trx->in_rollback); trx->roll_limit = roll_limit; - ut_d(trx->in_rollback = true); + trx->in_rollback = true; ut_a(trx->roll_limit <= trx->undo_no); @@ -1128,21 +1108,6 @@ trx_rollback_start( return(que_fork_start_command(roll_graph)); } -/****************************************************************//** -Finishes a transaction rollback. */ -static -void -trx_rollback_finish( -/*================*/ - trx_t* trx) /*!< in: transaction */ -{ - trx_commit(trx); - - trx->mod_tables.clear(); - - trx->lock.que_state = TRX_QUE_RUNNING; -} - /*********************************************************************//** Creates a rollback command node struct. @return own: rollback node struct */ diff --git a/storage/innobase/trx/trx0rseg.cc b/storage/innobase/trx/trx0rseg.cc index ac946adc8b5..d6720979716 100644 --- a/storage/innobase/trx/trx0rseg.cc +++ b/storage/innobase/trx/trx0rseg.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2020, MariaDB Corporation. +Copyright (c) 2017, 2021, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -33,28 +33,274 @@ Created 3/26/1996 Heikki Tuuri #include <algorithm> -/** Creates a rollback segment header. -This function is called only when a new rollback segment is created in -the database. 
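/* Illustrative sketch, not part of the patch: trx_roll_pop_top_rec_of_trx()
   above must undo records in descending undo-number order even though they
   may sit in up to three logs (the legacy old_insert log, the main undo
   log, and the temporary-tablespace log).  The selection step is a
   three-way maximum over the logs whose top record is still at or above
   the rollback limit: */
#include <cstddef>
#include <cstdint>

struct undo_log_sketch {
    bool     empty;
    uint64_t top_undo_no;   /* undo number of the most recent record */
};

/* Return the non-empty log whose top record has the largest undo number
   still >= limit, or NULL when nothing is left to roll back. */
static undo_log_sketch* pick_log(undo_log_sketch* logs[], std::size_t n,
                                 uint64_t limit)
{
    undo_log_sketch* best = NULL;
    for (std::size_t i = 0; i < n; i++) {
        undo_log_sketch* u = logs[i];
        if (u && !u->empty && u->top_undo_no >= limit
            && (!best || best->top_undo_no < u->top_undo_no))
            best = u;
    }
    return best;
}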
-@param[in] space space id -@param[in] max_size max size in pages -@param[in] rseg_slot_no rseg id == slot number in trx sys +#ifdef WITH_WSREP +#include <mysql/service_wsrep.h> + +#ifdef UNIV_DEBUG +/** The latest known WSREP XID sequence number */ +static long long wsrep_seqno = -1; +#endif /* UNIV_DEBUG */ +/** The latest known WSREP XID UUID */ +static unsigned char wsrep_uuid[16]; + +/** Write the WSREP XID information into rollback segment header. +@param[in,out] rseg_header rollback segment header +@param[in] xid WSREP XID +@param[in,out] mtr mini transaction */ +static void +trx_rseg_write_wsrep_checkpoint( + trx_rsegf_t* rseg_header, + const XID* xid, + mtr_t* mtr) +{ + mlog_write_ulint(TRX_RSEG_WSREP_XID_FORMAT + rseg_header, + uint32_t(xid->formatID), + MLOG_4BYTES, mtr); + + mlog_write_ulint(TRX_RSEG_WSREP_XID_GTRID_LEN + rseg_header, + uint32_t(xid->gtrid_length), + MLOG_4BYTES, mtr); + + mlog_write_ulint(TRX_RSEG_WSREP_XID_BQUAL_LEN + rseg_header, + uint32_t(xid->bqual_length), + MLOG_4BYTES, mtr); + + mlog_write_string(TRX_RSEG_WSREP_XID_DATA + rseg_header, + reinterpret_cast<const byte*>(xid->data), + XIDDATASIZE, mtr); +} + +/** Update the WSREP XID information in rollback segment header. +@param[in,out] rseg_header rollback segment header +@param[in] xid WSREP XID +@param[in,out] mtr mini-transaction */ +void +trx_rseg_update_wsrep_checkpoint( + trx_rsegf_t* rseg_header, + const XID* xid, + mtr_t* mtr) +{ + ut_ad(wsrep_is_wsrep_xid(xid)); + +#ifdef UNIV_DEBUG + /* Check that seqno is monotonically increasing */ + long long xid_seqno = wsrep_xid_seqno(xid); + const byte* xid_uuid = wsrep_xid_uuid(xid); + + if (xid_seqno != -1 + && !memcmp(xid_uuid, wsrep_uuid, sizeof wsrep_uuid)) { + ut_ad(xid_seqno > wsrep_seqno); + } else { + memcpy(wsrep_uuid, xid_uuid, sizeof wsrep_uuid); + } + wsrep_seqno = xid_seqno; +#endif /* UNIV_DEBUG */ + trx_rseg_write_wsrep_checkpoint(rseg_header, xid, mtr); +} + +/** Clear the WSREP XID information from rollback segment header. +@param[in,out] rseg_header Rollback segment header +@param[in,out] mtr mini-transaction */ +static void +trx_rseg_clear_wsrep_checkpoint( + trx_rsegf_t* rseg_header, + mtr_t* mtr) +{ + mlog_write_ulint(TRX_RSEG_WSREP_XID_FORMAT + rseg_header, + 0, MLOG_4BYTES, mtr); +} + +static void +trx_rseg_update_wsrep_checkpoint(const XID* xid, mtr_t* mtr) +{ + const byte* xid_uuid = wsrep_xid_uuid(xid); + /* We must make check against wsrep_uuid here, the + trx_rseg_update_wsrep_checkpoint() writes over wsrep_uuid with + xid contents in debug mode and the memcmp() will never give nonzero + result. */ + const bool must_clear_rsegs = memcmp(wsrep_uuid, xid_uuid, + sizeof wsrep_uuid); + const trx_rseg_t* rseg = trx_sys.rseg_array[0]; + + trx_rsegf_t* rseg_header = trx_rsegf_get(rseg->space, rseg->page_no, + mtr); + if (UNIV_UNLIKELY(mach_read_from_4(rseg_header + TRX_RSEG_FORMAT))) { + trx_rseg_format_upgrade(rseg_header, mtr); + } + + trx_rseg_update_wsrep_checkpoint(rseg_header, xid, mtr); + + if (must_clear_rsegs) { + /* Because the UUID part of the WSREP XID differed + from current_xid_uuid, the WSREP group UUID was + changed, and we must reset the XID in all rollback + segment headers. 
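/* Illustrative sketch, not part of the patch: the wsrep_xid_seqno() and
   wsrep_xid_uuid() service calls used above extract the replication state
   that Galera embeds in XID::data; the helpers removed from trx0sys.cc
   later in this diff show that layout, with the 16-byte group UUID at byte
   offset 8 and the 8-byte sequence number at offset 24.  A standalone
   model (xid_like is an invented stand-in for the real XID type): */
#include <cstdint>
#include <cstring>

struct xid_like { char data[128]; };

static long long xid_seqno_of(const xid_like& xid)
{
    long long seqno;
    std::memcpy(&seqno, xid.data + 24, sizeof seqno);
    return seqno;
}

static void xid_uuid_of(const xid_like& xid, unsigned char* buf /* 16 bytes */)
{
    std::memcpy(buf, xid.data + 8, 16);
}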
*/ + for (ulint rseg_id = 1; rseg_id < TRX_SYS_N_RSEGS; ++rseg_id) { + if (const trx_rseg_t* rseg = + trx_sys.rseg_array[rseg_id]) { + trx_rseg_clear_wsrep_checkpoint( + trx_rsegf_get(rseg->space, + rseg->page_no, mtr), + mtr); + } + } + } +} + +/** Update WSREP checkpoint XID in first rollback segment header +as part of wsrep_set_SE_checkpoint() when it is guaranteed that there +are no wsrep transactions committing. +If the UUID part of the WSREP XID does not match to the UUIDs of XIDs already +stored into rollback segments, the WSREP XID in all the remaining rollback +segments will be reset. +@param[in] xid WSREP XID */ +void trx_rseg_update_wsrep_checkpoint(const XID* xid) +{ + mtr_t mtr; + mtr.start(); + trx_rseg_update_wsrep_checkpoint(xid, &mtr); + mtr.commit(); +} + +/** Read the WSREP XID information in rollback segment header. +@param[in] rseg_header Rollback segment header +@param[out] xid Transaction XID +@return whether the WSREP XID was present */ +static +bool trx_rseg_read_wsrep_checkpoint(const trx_rsegf_t* rseg_header, XID& xid) +{ + int formatID = static_cast<int>( + mach_read_from_4( + TRX_RSEG_WSREP_XID_FORMAT + rseg_header)); + if (formatID == 0) { + return false; + } + + xid.formatID = formatID; + xid.gtrid_length = static_cast<int>( + mach_read_from_4( + TRX_RSEG_WSREP_XID_GTRID_LEN + rseg_header)); + + xid.bqual_length = static_cast<int>( + mach_read_from_4( + TRX_RSEG_WSREP_XID_BQUAL_LEN + rseg_header)); + + memcpy(xid.data, TRX_RSEG_WSREP_XID_DATA + rseg_header, XIDDATASIZE); + + return true; +} + +/** Read the WSREP XID from the TRX_SYS page (in case of upgrade). +@param[in] page TRX_SYS page +@param[out] xid WSREP XID (if present) +@return whether the WSREP XID is present */ +static bool trx_rseg_init_wsrep_xid(const page_t* page, XID& xid) +{ + if (mach_read_from_4(TRX_SYS + TRX_SYS_WSREP_XID_INFO + + TRX_SYS_WSREP_XID_MAGIC_N_FLD + + page) + != TRX_SYS_WSREP_XID_MAGIC_N) { + return false; + } + + xid.formatID = static_cast<int>( + mach_read_from_4( + TRX_SYS + TRX_SYS_WSREP_XID_INFO + + TRX_SYS_WSREP_XID_FORMAT + page)); + xid.gtrid_length = static_cast<int>( + mach_read_from_4( + TRX_SYS + TRX_SYS_WSREP_XID_INFO + + TRX_SYS_WSREP_XID_GTRID_LEN + page)); + xid.bqual_length = static_cast<int>( + mach_read_from_4( + TRX_SYS + TRX_SYS_WSREP_XID_INFO + + TRX_SYS_WSREP_XID_BQUAL_LEN + page)); + memcpy(xid.data, + TRX_SYS + TRX_SYS_WSREP_XID_INFO + + TRX_SYS_WSREP_XID_DATA + page, XIDDATASIZE); + return true; +} + +/** Recover the latest WSREP checkpoint XID. 
+@param[out] xid WSREP XID +@return whether the WSREP XID was found */ +bool trx_rseg_read_wsrep_checkpoint(XID& xid) +{ + mtr_t mtr; + long long max_xid_seqno = -1; + bool found = false; + + for (ulint rseg_id = 0; rseg_id < TRX_SYS_N_RSEGS; + rseg_id++, mtr.commit()) { + mtr.start(); + const buf_block_t* sys = trx_sysf_get(&mtr, false); + const uint32_t page_no = trx_sysf_rseg_get_page_no( + sys, rseg_id); + + if (page_no == FIL_NULL) { + continue; + } + + const trx_rsegf_t* rseg_header = trx_rsegf_get_new( + trx_sysf_rseg_get_space(sys, rseg_id), page_no, &mtr); + + if (mach_read_from_4(rseg_header + TRX_RSEG_FORMAT)) { + continue; + } + + XID tmp_xid; + long long tmp_seqno = 0; + if (trx_rseg_read_wsrep_checkpoint(rseg_header, tmp_xid) + && (tmp_seqno = wsrep_xid_seqno(&tmp_xid)) + > max_xid_seqno) { + found = true; + max_xid_seqno = tmp_seqno; + xid = tmp_xid; + memcpy(wsrep_uuid, wsrep_xid_uuid(&tmp_xid), + sizeof wsrep_uuid); + } + } + + return found; +} +#endif /* WITH_WSREP */ + +/** Upgrade a rollback segment header page to MariaDB 10.3 format. +@param[in,out] rseg_header rollback segment header page +@param[in,out] mtr mini-transaction */ +void trx_rseg_format_upgrade(trx_rsegf_t* rseg_header, mtr_t* mtr) +{ + ut_ad(page_offset(rseg_header) == TRX_RSEG); + byte* rseg_format = TRX_RSEG_FORMAT + rseg_header; + mlog_write_ulint(rseg_format, 0, MLOG_4BYTES, mtr); + /* Clear also possible garbage at the end of the page. Old + InnoDB versions did not initialize unused parts of pages. */ + byte* b = rseg_header + TRX_RSEG_MAX_TRX_ID + 8; + ulint len = srv_page_size + - (FIL_PAGE_DATA_END + + TRX_RSEG + TRX_RSEG_MAX_TRX_ID + 8); + memset(b, 0, len); + mlog_log_string(b, len, mtr); +} + +/** Create a rollback segment header. +@param[in,out] space system, undo, or temporary tablespace +@param[in] rseg_id rollback segment identifier +@param[in,out] sys_header the TRX_SYS page (NULL for temporary rseg) @param[in,out] mtr mini-transaction @return the created rollback segment @retval NULL on failure */ buf_block_t* trx_rseg_header_create( - ulint space, - ulint max_size, - ulint rseg_slot_no, - mtr_t* mtr) + fil_space_t* space, + ulint rseg_id, + buf_block_t* sys_header, + mtr_t* mtr) { - trx_sysf_t* sys_header; buf_block_t* block; - ut_ad(mtr); - ut_ad(mtr_memo_contains(mtr, fil_space_get(space), - MTR_MEMO_SPACE_X_LOCK)); + ut_ad(mtr_memo_contains(mtr, space, MTR_MEMO_SPACE_X_LOCK)); + ut_ad(!sys_header == (space == fil_system.temp_space)); /* Allocate a new file segment for the rollback segment */ block = fseg_create(space, TRX_RSEG + TRX_RSEG_FSEG_HEADER, mtr); @@ -66,9 +312,8 @@ trx_rseg_header_create( buf_block_dbg_add_level(block, SYNC_RSEG_HEADER_NEW); - /* Initialize max size field */ - mlog_write_ulint(TRX_RSEG + TRX_RSEG_MAX_SIZE + block->frame, - max_size, MLOG_4BYTES, mtr); + mlog_write_ulint(TRX_RSEG + TRX_RSEG_FORMAT + block->frame, 0, + MLOG_4BYTES, mtr); /* Initialize the history list */ @@ -84,17 +329,20 @@ trx_rseg_header_create( trx_rsegf_set_nth_undo(rsegf, i, FIL_NULL, mtr); } - if (space != SRV_TMP_SPACE_ID) { + if (sys_header) { /* Add the rollback segment info to the free slot in the trx system header */ - sys_header = trx_sysf_get(mtr); - - trx_sysf_rseg_set_space(sys_header, rseg_slot_no, space, mtr); - - trx_sysf_rseg_set_page_no( - sys_header, rseg_slot_no, - block->page.id.page_no(), mtr); + mlog_write_ulint(TRX_SYS + TRX_SYS_RSEGS + + TRX_SYS_RSEG_SPACE + + rseg_id * TRX_SYS_RSEG_SLOT_SIZE + + sys_header->frame, + space->id, MLOG_4BYTES, mtr); + 
mlog_write_ulint(TRX_SYS + TRX_SYS_RSEGS + + TRX_SYS_RSEG_PAGE_NO + + rseg_id * TRX_SYS_RSEG_SLOT_SIZE + + sys_header->frame, + block->page.id.page_no(), MLOG_4BYTES, mtr); } return block; @@ -110,33 +358,20 @@ trx_rseg_mem_free(trx_rseg_t* rseg) mutex_free(&rseg->mutex); /* There can't be any active transactions. */ - ut_a(UT_LIST_GET_LEN(rseg->update_undo_list) == 0); - ut_a(UT_LIST_GET_LEN(rseg->insert_undo_list) == 0); + ut_a(UT_LIST_GET_LEN(rseg->undo_list) == 0); + ut_a(UT_LIST_GET_LEN(rseg->old_insert_list) == 0); - for (undo = UT_LIST_GET_FIRST(rseg->update_undo_cached); + for (undo = UT_LIST_GET_FIRST(rseg->undo_cached); undo != NULL; undo = next_undo) { next_undo = UT_LIST_GET_NEXT(undo_list, undo); - UT_LIST_REMOVE(rseg->update_undo_cached, undo); + UT_LIST_REMOVE(rseg->undo_cached, undo); MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_CACHED); - trx_undo_mem_free(undo); - } - - for (undo = UT_LIST_GET_FIRST(rseg->insert_undo_cached); - undo != NULL; - undo = next_undo) { - - next_undo = UT_LIST_GET_NEXT(undo_list, undo); - - UT_LIST_REMOVE(rseg->insert_undo_cached, undo); - - MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_CACHED); - - trx_undo_mem_free(undo); + ut_free(undo); } ut_free(rseg); @@ -148,7 +383,7 @@ trx_rseg_mem_free(trx_rseg_t* rseg) @param[in] page_no page number of the segment header */ static trx_rseg_t* -trx_rseg_mem_create(ulint id, ulint space, ulint page_no) +trx_rseg_mem_create(ulint id, fil_space_t* space, ulint page_no) { trx_rseg_t* rseg = static_cast<trx_rseg_t*>( ut_zalloc_nokey(sizeof *rseg)); @@ -157,104 +392,252 @@ trx_rseg_mem_create(ulint id, ulint space, ulint page_no) rseg->space = space; rseg->page_no = page_no; rseg->last_page_no = FIL_NULL; + rseg->curr_size = 1; mutex_create(rseg->is_persistent() ? LATCH_ID_REDO_RSEG : LATCH_ID_NOREDO_RSEG, &rseg->mutex); - UT_LIST_INIT(rseg->update_undo_list, &trx_undo_t::undo_list); - UT_LIST_INIT(rseg->update_undo_cached, &trx_undo_t::undo_list); - UT_LIST_INIT(rseg->insert_undo_list, &trx_undo_t::undo_list); - UT_LIST_INIT(rseg->insert_undo_cached, &trx_undo_t::undo_list); + UT_LIST_INIT(rseg->undo_list, &trx_undo_t::undo_list); + UT_LIST_INIT(rseg->old_insert_list, &trx_undo_t::undo_list); + UT_LIST_INIT(rseg->undo_cached, &trx_undo_t::undo_list); return(rseg); } +/** Read the undo log lists. +@param[in,out] rseg rollback segment +@param[in,out] max_trx_id maximum observed transaction identifier +@param[in] rseg_header rollback segment header +@return the combined size of undo log segments in pages */ +static +ulint +trx_undo_lists_init(trx_rseg_t* rseg, trx_id_t& max_trx_id, + const trx_rsegf_t* rseg_header) +{ + ut_ad(srv_force_recovery < SRV_FORCE_NO_UNDO_LOG_SCAN); + + ulint size = 0; + + for (ulint i = 0; i < TRX_RSEG_N_SLOTS; i++) { + ulint page_no = trx_rsegf_get_nth_undo(rseg_header, i); + if (page_no != FIL_NULL) { + size += trx_undo_mem_create_at_db_start( + rseg, i, page_no, max_trx_id); + MONITOR_INC(MONITOR_NUM_UNDO_SLOT_USED); + } + } + + return(size); +} + /** Restore the state of a persistent rollback segment. 
-@param[in,out] rseg persistent rollback segment -@param[in,out] mtr mini-transaction */ +@param[in,out] rseg persistent rollback segment +@param[in,out] max_trx_id maximum observed transaction identifier +@param[in,out] mtr mini-transaction */ static void -trx_rseg_mem_restore(trx_rseg_t* rseg, mtr_t* mtr) +trx_rseg_mem_restore(trx_rseg_t* rseg, trx_id_t& max_trx_id, mtr_t* mtr) { - ulint len; - fil_addr_t node_addr; - trx_rsegf_t* rseg_header; - trx_ulogf_t* undo_log_hdr; - ulint sum_of_undo_sizes; + /* This is based on trx_rsegf_get_new(). + We need to access buf_block_t. */ + buf_block_t *block = buf_page_get( + page_id_t(rseg->space->id, rseg->page_no), + univ_page_size, RW_S_LATCH, mtr); + buf_block_dbg_add_level(block, SYNC_RSEG_HEADER_NEW); - rseg_header = trx_rsegf_get_new(rseg->space, rseg->page_no, mtr); + const trx_rsegf_t* rseg_header = TRX_RSEG + block->frame; - rseg->max_size = mtr_read_ulint( - rseg_header + TRX_RSEG_MAX_SIZE, MLOG_4BYTES, mtr); + if (mach_read_from_4(rseg_header + TRX_RSEG_FORMAT) == 0) { + trx_id_t id = mach_read_from_8(rseg_header + + TRX_RSEG_MAX_TRX_ID); - /* Initialize the undo log lists according to the rseg header */ + if (id > max_trx_id) { + max_trx_id = id; + } + + if (rseg_header[TRX_RSEG_BINLOG_NAME]) { + lsn_t lsn = std::max(block->page.newest_modification, + mach_read_from_8(FIL_PAGE_LSN + + block->frame)); + compile_time_assert(TRX_RSEG_BINLOG_NAME_LEN == sizeof + trx_sys.recovered_binlog_filename); + if (lsn > trx_sys.recovered_binlog_lsn) { + trx_sys.recovered_binlog_lsn = lsn; + trx_sys.recovered_binlog_offset + = mach_read_from_8( + rseg_header + + TRX_RSEG_BINLOG_OFFSET); + memcpy(trx_sys.recovered_binlog_filename, + rseg_header + TRX_RSEG_BINLOG_NAME, + TRX_RSEG_BINLOG_NAME_LEN); + } + +#ifdef WITH_WSREP + trx_rseg_read_wsrep_checkpoint( + rseg_header, trx_sys.recovered_wsrep_xid); +#endif + } + } - sum_of_undo_sizes = trx_undo_lists_init(rseg); + if (srv_operation == SRV_OPERATION_RESTORE) { + /* mariabackup --prepare only deals with + the redo log and the data files, not with + transactions or the data dictionary. 
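/* Illustrative sketch, not part of the patch: because every rollback
   segment header carries its own copy of the binlog coordinates, the
   restore code above keeps whichever copy was read from the page with the
   highest LSN, i.e. the most recently written one.  The keep-newest fold,
   reduced to plain values: */
#include <cstdint>
#include <string>

struct binlog_coord {
    uint64_t    lsn;      /* LSN of the page the coordinates came from */
    std::string file;
    uint64_t    offset;
};

/* Fold one more header's copy into the best recovered coordinates so far. */
static void fold_binlog_coord(binlog_coord& best, const binlog_coord& candidate)
{
    if (candidate.lsn > best.lsn)
        best = candidate;   /* the newer page wins */
}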
*/ + return; + } - rseg->curr_size = mtr_read_ulint( - rseg_header + TRX_RSEG_HISTORY_SIZE, MLOG_4BYTES, mtr) - + 1 + sum_of_undo_sizes; + /* Initialize the undo log lists according to the rseg header */ - len = flst_get_len(rseg_header + TRX_RSEG_HISTORY); + rseg->curr_size = mach_read_from_4(rseg_header + TRX_RSEG_HISTORY_SIZE) + + 1 + trx_undo_lists_init(rseg, max_trx_id, rseg_header); - if (len > 0) { - my_atomic_addlint(&trx_sys->rseg_history_len, len); + if (ulint len = flst_get_len(rseg_header + TRX_RSEG_HISTORY)) { + trx_sys.history_add(int32(len)); - node_addr = trx_purge_get_log_from_hist( + fil_addr_t node_addr = trx_purge_get_log_from_hist( flst_get_last(rseg_header + TRX_RSEG_HISTORY, mtr)); rseg->last_page_no = node_addr.page; rseg->last_offset = node_addr.boffset; - undo_log_hdr = trx_undo_page_get( - page_id_t(rseg->space, node_addr.page), mtr) + const trx_ulogf_t* undo_log_hdr = trx_undo_page_get( + page_id_t(rseg->space->id, node_addr.page), mtr) + node_addr.boffset; - rseg->last_trx_no = mach_read_from_8( - undo_log_hdr + TRX_UNDO_TRX_NO); - - rseg->last_del_marks = mtr_read_ulint( - undo_log_hdr + TRX_UNDO_DEL_MARKS, MLOG_2BYTES, mtr); - - TrxUndoRsegs elem(rseg->last_trx_no); - elem.push_back(rseg); + trx_id_t id = mach_read_from_8(undo_log_hdr + TRX_UNDO_TRX_ID); + if (id > max_trx_id) { + max_trx_id = id; + } + id = mach_read_from_8(undo_log_hdr + TRX_UNDO_TRX_NO); + if (id > max_trx_id) { + max_trx_id = id; + } + unsigned purge = mach_read_from_2( + undo_log_hdr + TRX_UNDO_NEEDS_PURGE); + ut_ad(purge <= 1); + rseg->set_last_trx_no(id, purge != 0); + rseg->needs_purge = purge != 0; if (rseg->last_page_no != FIL_NULL) { /* There is no need to cover this operation by the purge mutex because we are still bootstrapping. */ - - purge_sys->purge_queue.push(elem); + purge_sys.purge_queue.push(*rseg); } } } +/** Read binlog metadata from the TRX_SYS page, in case we are upgrading +from MySQL or a MariaDB version older than 10.3.5. */ +static void trx_rseg_init_binlog_info(const page_t* page) +{ + if (mach_read_from_4(TRX_SYS + TRX_SYS_MYSQL_LOG_INFO + + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD + + page) + == TRX_SYS_MYSQL_LOG_MAGIC_N) { + memcpy(trx_sys.recovered_binlog_filename, + TRX_SYS_MYSQL_LOG_INFO + TRX_SYS_MYSQL_LOG_NAME + + TRX_SYS + page, TRX_SYS_MYSQL_LOG_NAME_LEN); + trx_sys.recovered_binlog_offset = mach_read_from_8( + TRX_SYS_MYSQL_LOG_INFO + TRX_SYS_MYSQL_LOG_OFFSET + + TRX_SYS + page); + } + +#ifdef WITH_WSREP + trx_rseg_init_wsrep_xid(page, trx_sys.recovered_wsrep_xid); +#endif +} + /** Initialize the rollback segments in memory at database startup. */ void trx_rseg_array_init() { - mtr_t mtr; + trx_id_t max_trx_id = 0; + + *trx_sys.recovered_binlog_filename = '\0'; + trx_sys.recovered_binlog_offset = 0; +#ifdef WITH_WSREP + trx_sys.recovered_wsrep_xid.null(); + XID wsrep_sys_xid; + wsrep_sys_xid.null(); + bool wsrep_xid_in_rseg_found = false; +#endif + + for (ulint rseg_id = 0; rseg_id < TRX_SYS_N_RSEGS; rseg_id++) { + mtr_t mtr; + mtr.start(); + if (const buf_block_t* sys = trx_sysf_get(&mtr, false)) { + if (rseg_id == 0) { + /* In case this is an upgrade from + before MariaDB 10.3.5, fetch the base + information from the TRX_SYS page. 
*/ + max_trx_id = mach_read_from_8( + TRX_SYS + TRX_SYS_TRX_ID_STORE + + sys->frame); + trx_rseg_init_binlog_info(sys->frame); +#ifdef WITH_WSREP + wsrep_sys_xid.set(&trx_sys.recovered_wsrep_xid); +#endif + } + + const uint32_t page_no = trx_sysf_rseg_get_page_no( + sys, rseg_id); + if (page_no != FIL_NULL) { + trx_rseg_t* rseg = trx_rseg_mem_create( + rseg_id, + fil_space_get(trx_sysf_rseg_get_space( + sys, rseg_id)), + page_no); + ut_ad(rseg->is_persistent()); + ut_ad(rseg->id == rseg_id); + ut_ad(!trx_sys.rseg_array[rseg_id]); + trx_sys.rseg_array[rseg_id] = rseg; + trx_rseg_mem_restore(rseg, max_trx_id, &mtr); +#ifdef WITH_WSREP + if (!wsrep_sys_xid.is_null() && + !wsrep_sys_xid.eq(&trx_sys.recovered_wsrep_xid)) { + wsrep_xid_in_rseg_found = true; + ut_ad(memcmp(wsrep_xid_uuid(&wsrep_sys_xid), + wsrep_xid_uuid(&trx_sys.recovered_wsrep_xid), + sizeof wsrep_uuid) + || wsrep_xid_seqno( + &wsrep_sys_xid) + <= wsrep_xid_seqno( + &trx_sys.recovered_wsrep_xid)); + } +#endif + } + } - for (ulint i = 0; i < TRX_SYS_N_RSEGS; i++) { + mtr.commit(); + } + +#ifdef WITH_WSREP + if (!wsrep_sys_xid.is_null()) { + /* Upgrade from a version prior to 10.3.5, + where WSREP XID was stored in TRX_SYS page. + If no rollback segment has a WSREP XID set, + we must copy the XID found in TRX_SYS page + to rollback segments. */ + mtr_t mtr; mtr.start(); - trx_sysf_t* sys_header = trx_sysf_get(&mtr); - ulint page_no = trx_sysf_rseg_get_page_no( - sys_header, i, &mtr); - if (page_no != FIL_NULL) { - trx_rseg_t* rseg = trx_rseg_mem_create( - i, - trx_sysf_rseg_get_space(sys_header, i, &mtr), - page_no); - ut_ad(rseg->is_persistent()); - ut_ad(!trx_sys->rseg_array[rseg->id]); - trx_sys->rseg_array[rseg->id] = rseg; - trx_rseg_mem_restore(rseg, &mtr); + if (!wsrep_xid_in_rseg_found) { + trx_rseg_update_wsrep_checkpoint(&wsrep_sys_xid, &mtr); } + /* Finally, clear WSREP XID in TRX_SYS page. */ + const buf_block_t* sys = trx_sysf_get(&mtr); + mlog_write_ulint(TRX_SYS + TRX_SYS_WSREP_XID_INFO + + + TRX_SYS_WSREP_XID_MAGIC_N_FLD + sys->frame, + 0, MLOG_4BYTES, &mtr); + mtr.commit(); } +#endif + + trx_sys.init_max_trx_id(max_trx_id + 1); } /** Create a persistent rollback segment. @@ -270,29 +653,25 @@ trx_rseg_create(ulint space_id) mtr.start(); /* To obey the latching order, acquire the file space - x-latch before the trx_sys->mutex. */ -#ifdef UNIV_DEBUG - const fil_space_t* space = -#endif /* UNIV_DEBUG */ - mtr_x_lock_space(space_id, &mtr); + x-latch before the trx_sys.mutex. */ + fil_space_t* space = mtr_x_lock_space(space_id, &mtr); ut_ad(space->purpose == FIL_TYPE_TABLESPACE); - ulint slot_no = trx_sysf_rseg_find_free(&mtr); - if (buf_block_t* block = slot_no == ULINT_UNDEFINED - ? NULL - : trx_rseg_header_create(space_id, ULINT_MAX, slot_no, &mtr)) { - trx_sysf_t* sys_header = trx_sysf_get(&mtr); - - ulint id = trx_sysf_rseg_get_space( - sys_header, slot_no, &mtr); - ut_a(id == space_id); - - rseg = trx_rseg_mem_create(slot_no, space_id, - block->page.id.page_no()); - ut_ad(rseg->is_persistent()); - ut_ad(!trx_sys->rseg_array[rseg->id]); - trx_sys->rseg_array[rseg->id] = rseg; - trx_rseg_mem_restore(rseg, &mtr); + if (buf_block_t* sys_header = trx_sysf_get(&mtr)) { + ulint rseg_id = trx_sys_rseg_find_free(sys_header); + if (buf_block_t* rblock = rseg_id == ULINT_UNDEFINED + ? 
NULL + : trx_rseg_header_create(space, rseg_id, sys_header, + &mtr)) { + ut_ad(trx_sysf_rseg_get_space(sys_header, rseg_id) + == space_id); + rseg = trx_rseg_mem_create(rseg_id, space, + rblock->page.id.page_no()); + ut_ad(rseg->id == rseg_id); + ut_ad(rseg->is_persistent()); + ut_ad(!trx_sys.rseg_array[rseg->id]); + trx_sys.rseg_array[rseg->id] = rseg; + } } mtr.commit(); @@ -309,20 +688,15 @@ trx_temp_rseg_create() for (ulong i = 0; i < TRX_SYS_N_RSEGS; i++) { mtr.start(); mtr.set_log_mode(MTR_LOG_NO_REDO); -#ifdef UNIV_DEBUG - const fil_space_t* space = -#endif /* UNIV_DEBUG */ - mtr_x_lock_space(SRV_TMP_SPACE_ID, &mtr); - ut_ad(space->purpose == FIL_TYPE_TEMPORARY); + mtr_x_lock_space(fil_system.temp_space, &mtr); - buf_block_t* block = trx_rseg_header_create( - SRV_TMP_SPACE_ID, ULINT_MAX, i, &mtr); + buf_block_t* rblock = trx_rseg_header_create( + fil_system.temp_space, i, NULL, &mtr); trx_rseg_t* rseg = trx_rseg_mem_create( - i, SRV_TMP_SPACE_ID, block->page.id.page_no()); + i, fil_system.temp_space, rblock->page.id.page_no()); ut_ad(!rseg->is_persistent()); - ut_ad(!trx_sys->temp_rsegs[i]); - trx_sys->temp_rsegs[i] = rseg; - trx_rseg_mem_restore(rseg, &mtr); + ut_ad(!trx_sys.temp_rsegs[i]); + trx_sys.temp_rsegs[i] = rseg; mtr.commit(); } } @@ -339,54 +713,70 @@ trx_rseg_get_n_undo_tablespaces( ulint* space_ids) /*!< out: array of space ids of UNDO tablespaces */ { - ulint i; - mtr_t mtr; - trx_sysf_t* sys_header; - ulint n_undo_tablespaces = 0; - - mtr_start(&mtr); + mtr_t mtr; + mtr.start(); - sys_header = trx_sysf_get(&mtr); + buf_block_t* sys_header = trx_sysf_get(&mtr, false); + if (!sys_header) { + mtr.commit(); + return 0; + } - for (i = 0; i < TRX_SYS_N_RSEGS; i++) { - ulint page_no; - ulint space; + ulint* end = space_ids; - page_no = trx_sysf_rseg_get_page_no(sys_header, i, &mtr); + for (ulint rseg_id = 0; rseg_id < TRX_SYS_N_RSEGS; rseg_id++) { + uint32_t page_no = trx_sysf_rseg_get_page_no(sys_header, + rseg_id); if (page_no == FIL_NULL) { continue; } - space = trx_sysf_rseg_get_space(sys_header, i, &mtr); - - if (space != 0) { - ulint j; - ibool found = FALSE; - - for (j = 0; j < n_undo_tablespaces; ++j) { - if (space_ids[j] == space) { - found = TRUE; - break; - } - } - - if (!found) { - ut_a(n_undo_tablespaces <= i); - space_ids[n_undo_tablespaces++] = space; + if (ulint space = trx_sysf_rseg_get_space(sys_header, + rseg_id)) { + if (std::find(space_ids, end, space) == end) { + *end++ = space; } } } - mtr_commit(&mtr); + mtr.commit(); - ut_a(n_undo_tablespaces <= TRX_SYS_N_RSEGS); + ut_a(end - space_ids <= TRX_SYS_N_RSEGS); + *end = ULINT_UNDEFINED; - space_ids[n_undo_tablespaces] = ULINT_UNDEFINED; + std::sort(space_ids, end); - if (n_undo_tablespaces > 0) { - std::sort(space_ids, space_ids + n_undo_tablespaces); + return ulint(end - space_ids); +} + +/** Update the offset information about the end of the binlog entry +which corresponds to the transaction just being committed. +In a replication slave, this updates the master binlog position +up to which replication has proceeded. 
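/* Illustrative sketch, not part of the patch: the rewritten
   trx_rseg_get_n_undo_tablespaces() above collects each distinct non-zero
   tablespace id with a linear std::find over the output array, terminates
   the list, sorts it and returns the count.  The same pattern over a plain
   array (UNDEFINED stands in for ULINT_UNDEFINED): */
#include <algorithm>
#include <cstddef>

static const unsigned long UNDEFINED = ~0UL;

/* Copy the distinct non-zero ids from 'in' into 'out' (capacity n + 1),
   terminate with UNDEFINED, sort, and return the number of distinct ids. */
static std::size_t collect_distinct_ids(const unsigned long* in, std::size_t n,
                                        unsigned long* out)
{
    unsigned long* end = out;
    for (std::size_t i = 0; i < n; i++) {
        if (in[i] != 0 && std::find(out, end, in[i]) == end)
            *end++ = in[i];
    }
    *end = UNDEFINED;
    std::sort(out, end);
    return std::size_t(end - out);
}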
+@param[in,out] rseg_header rollback segment header +@param[in] trx committing transaction +@param[in,out] mtr mini-transaction */ +void +trx_rseg_update_binlog_offset(byte* rseg_header, const trx_t* trx, mtr_t* mtr) +{ + DBUG_LOG("trx", "trx_mysql_binlog_offset: " << trx->mysql_log_offset); + + const size_t len = strlen(trx->mysql_log_file_name) + 1; + + ut_ad(len > 1); + + if (UNIV_UNLIKELY(len > TRX_RSEG_BINLOG_NAME_LEN)) { + return; } - return(n_undo_tablespaces); + mlog_write_ull(rseg_header + TRX_RSEG_BINLOG_OFFSET, + trx->mysql_log_offset, mtr); + byte* p = rseg_header + TRX_RSEG_BINLOG_NAME; + const byte* binlog_name = reinterpret_cast<const byte*> + (trx->mysql_log_file_name); + + if (memcmp(binlog_name, p, len)) { + mlog_write_string(p, binlog_name, len, mtr); + } } diff --git a/storage/innobase/trx/trx0sys.cc b/storage/innobase/trx/trx0sys.cc index 9138e9475bf..87814fa6c69 100644 --- a/storage/innobase/trx/trx0sys.cc +++ b/storage/innobase/trx/trx0sys.cc @@ -24,8 +24,8 @@ Transaction system Created 3/26/1996 Heikki Tuuri *******************************************************/ -#include "mysqld.h" #include "trx0sys.h" +#include "mysqld.h" #include "sql_error.h" #include "fsp0fsp.h" @@ -40,55 +40,9 @@ Created 3/26/1996 Heikki Tuuri #include "log0log.h" #include "log0recv.h" #include "os0file.h" -#include "read0read.h" - -#include <mysql/service_wsrep.h> - -/** The file format tag structure with id and name. */ -struct file_format_t { - ulint id; /*!< id of the file format */ - const char* name; /*!< text representation of the - file format */ - ib_mutex_t mutex; /*!< covers changes to the above - fields */ -}; /** The transaction system */ -trx_sys_t* trx_sys; - -/** List of animal names representing file format. */ -static const char* file_format_name_map[] = { - "Antelope", - "Barracuda", - "Cheetah", - "Dragon", - "Elk", - "Fox", - "Gazelle", - "Hornet", - "Impala", - "Jaguar", - "Kangaroo", - "Leopard", - "Moose", - "Nautilus", - "Ocelot", - "Porpoise", - "Quail", - "Rabbit", - "Shark", - "Tiger", - "Urchin", - "Viper", - "Whale", - "Xenops", - "Yak", - "Zebra" -}; - -/** The number of elements in the file format name array. */ -static const ulint FILE_FORMAT_NAME_N - = sizeof(file_format_name_map) / sizeof(file_format_name_map[0]); +trx_sys_t trx_sys; /** Check whether transaction id is valid. @param[in] id transaction id to check @@ -98,7 +52,7 @@ ReadView::check_trx_id_sanity( trx_id_t id, const table_name_t& name) { - if (id >= trx_sys->max_trx_id) { + if (id >= trx_sys.get_max_trx_id()) { ib::warn() << "A transaction id" << " in a record of table " @@ -129,249 +83,32 @@ ReadView::check_trx_id_sanity( uint trx_rseg_n_slots_debug = 0; #endif -/** This is used to track the maximum file format id known to InnoDB. It's -updated via SET GLOBAL innodb_file_format_max = 'x' or when we open -or create a table. */ -static file_format_t file_format_max; - -/*****************************************************************//** -Writes the value of max_trx_id to the file based trx system header. 
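
The new trx_rseg_update_binlog_offset() above persists the binlog position in the rollback segment header rather than the TRX_SYS page. A minimal sketch of just its guard logic, with plain memory standing in for the mlog_write_ull()/mlog_write_string() calls and mini-transaction, and the name field assumed here to be 512 bytes:

/* Simplified model; not InnoDB code. */
#include <cstring>
#include <cstdint>
#include <cstdio>

static const size_t BINLOG_NAME_LEN = 512;    /* assumed stand-in for TRX_RSEG_BINLOG_NAME_LEN */

struct rseg_binlog_info {
    uint64_t offset;
    char     name[BINLOG_NAME_LEN];
};

/* Mirrors the guard logic: skip names that do not fit, always update the
   offset, and avoid rewriting the name when it is unchanged. */
static bool update_binlog_offset(rseg_binlog_info& hdr,
                                 const char* file, uint64_t offset)
{
    const size_t len = strlen(file) + 1;      /* include the NUL terminator */
    if (len > BINLOG_NAME_LEN) {
        return false;                         /* name does not fit; leave the header alone */
    }
    hdr.offset = offset;
    if (memcmp(file, hdr.name, len) != 0) {
        memcpy(hdr.name, file, len);          /* only rewrite the name if it changed */
    }
    return true;
}

int main()
{
    rseg_binlog_info hdr = {};
    update_binlog_offset(hdr, "master-bin.000042", 1234);
    printf("%s %llu\n", hdr.name, (unsigned long long) hdr.offset);
}
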
*/ -void -trx_sys_flush_max_trx_id(void) -/*==========================*/ -{ - mtr_t mtr; - trx_sysf_t* sys_header; - - /* wsrep_fake_trx_id violates this assert - Copied from trx_sys_get_new_trx_id - */ - ut_ad(trx_sys_mutex_own()); - - if (!srv_read_only_mode) { - mtr_start(&mtr); - - sys_header = trx_sysf_get(&mtr); - - mlog_write_ull( - sys_header + TRX_SYS_TRX_ID_STORE, - trx_sys->max_trx_id, &mtr); - - mtr_commit(&mtr); - } -} - -/*****************************************************************//** -Updates the offset information about the end of the MySQL binlog entry -which corresponds to the transaction just being committed. In a MySQL -replication slave updates the latest master binlog position up to which -replication has proceeded. */ -void -trx_sys_update_mysql_binlog_offset( -/*===============================*/ - const char* file_name,/*!< in: MySQL log file name */ - int64_t offset, /*!< in: position in that log file */ - trx_sysf_t* sys_header, /*!< in: trx sys header */ - mtr_t* mtr) /*!< in: mtr */ -{ - DBUG_PRINT("InnoDB",("trx_mysql_binlog_offset: %lld", (longlong) offset)); - - const size_t len = strlen(file_name) + 1; - - if (len > TRX_SYS_MYSQL_LOG_NAME_LEN) { - - /* We cannot fit the name to the 512 bytes we have reserved */ - - return; - } - - if (mach_read_from_4(TRX_SYS_MYSQL_LOG_MAGIC_N_FLD - + TRX_SYS_MYSQL_LOG_INFO + sys_header) - != TRX_SYS_MYSQL_LOG_MAGIC_N) { - - mlog_write_ulint(TRX_SYS_MYSQL_LOG_MAGIC_N_FLD - + TRX_SYS_MYSQL_LOG_INFO + sys_header, - TRX_SYS_MYSQL_LOG_MAGIC_N, - MLOG_4BYTES, mtr); - } - - if (memcmp(file_name, TRX_SYS_MYSQL_LOG_NAME + TRX_SYS_MYSQL_LOG_INFO - + sys_header, len)) { - mlog_write_string(TRX_SYS_MYSQL_LOG_NAME - + TRX_SYS_MYSQL_LOG_INFO - + sys_header, - reinterpret_cast<const byte*>(file_name), - len, mtr); - } - - mlog_write_ull(TRX_SYS_MYSQL_LOG_INFO + TRX_SYS_MYSQL_LOG_OFFSET - + sys_header, offset, mtr); -} - /** Display the MySQL binlog offset info if it is present in the trx system header. */ void trx_sys_print_mysql_binlog_offset() { - mtr_t mtr; - - mtr.start(); - - const trx_sysf_t* sys_header = trx_sysf_get(&mtr); - - if (mach_read_from_4(TRX_SYS_MYSQL_LOG_INFO - + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD + sys_header) - == TRX_SYS_MYSQL_LOG_MAGIC_N) { - ib::info() << "Last binlog file '" - << TRX_SYS_MYSQL_LOG_INFO + TRX_SYS_MYSQL_LOG_NAME - + sys_header - << "', position " - << mach_read_from_8(TRX_SYS_MYSQL_LOG_INFO - + TRX_SYS_MYSQL_LOG_OFFSET - + sys_header); - } - - mtr.commit(); -} - -#ifdef WITH_WSREP - -#ifdef UNIV_DEBUG -static long long trx_sys_cur_xid_seqno = -1; -static unsigned char trx_sys_cur_xid_uuid[16]; - -/** Read WSREP XID seqno */ -static inline long long read_wsrep_xid_seqno(const XID* xid) -{ - long long seqno; - memcpy(&seqno, xid->data + 24, sizeof(long long)); - return seqno; -} - -/** Read WSREP XID UUID */ -static inline void read_wsrep_xid_uuid(const XID* xid, unsigned char* buf) -{ - memcpy(buf, xid->data + 8, 16); -} - -#endif /* UNIV_DEBUG */ - -/** Update WSREP XID info in sys_header of TRX_SYS_PAGE_NO = 5. 
-@param[in] xid Transaction XID -@param[in,out] sys_header sys_header -@param[in] mtr minitransaction */ -UNIV_INTERN -void -trx_sys_update_wsrep_checkpoint( - const XID* xid, - trx_sysf_t* sys_header, - mtr_t* mtr) -{ - ut_ad(xid->formatID == 1); - ut_ad(wsrep_is_wsrep_xid(xid)); - - if (mach_read_from_4(sys_header + TRX_SYS_WSREP_XID_INFO - + TRX_SYS_WSREP_XID_MAGIC_N_FLD) - != TRX_SYS_WSREP_XID_MAGIC_N) { - mlog_write_ulint(sys_header + TRX_SYS_WSREP_XID_INFO - + TRX_SYS_WSREP_XID_MAGIC_N_FLD, - TRX_SYS_WSREP_XID_MAGIC_N, - MLOG_4BYTES, mtr); -#ifdef UNIV_DEBUG - } else { - /* Check that seqno is monotonically increasing */ - unsigned char xid_uuid[16]; - long long xid_seqno = read_wsrep_xid_seqno(xid); - read_wsrep_xid_uuid(xid, xid_uuid); - - if (!memcmp(xid_uuid, trx_sys_cur_xid_uuid, 8)) { - ut_ad(xid_seqno > trx_sys_cur_xid_seqno); - trx_sys_cur_xid_seqno = xid_seqno; - } else { - memcpy(trx_sys_cur_xid_uuid, xid_uuid, 16); - } - - trx_sys_cur_xid_seqno = xid_seqno; -#endif /* UNIV_DEBUG */ - } - - mlog_write_ulint(sys_header + TRX_SYS_WSREP_XID_INFO - + TRX_SYS_WSREP_XID_FORMAT, - (int)xid->formatID, - MLOG_4BYTES, mtr); - mlog_write_ulint(sys_header + TRX_SYS_WSREP_XID_INFO - + TRX_SYS_WSREP_XID_GTRID_LEN, - (int)xid->gtrid_length, - MLOG_4BYTES, mtr); - mlog_write_ulint(sys_header + TRX_SYS_WSREP_XID_INFO - + TRX_SYS_WSREP_XID_BQUAL_LEN, - (int)xid->bqual_length, - MLOG_4BYTES, mtr); - mlog_write_string(sys_header + TRX_SYS_WSREP_XID_INFO - + TRX_SYS_WSREP_XID_DATA, - (const unsigned char*) xid->data, - XIDDATASIZE, mtr); -} - -/** Read WSREP checkpoint XID from sys header. -@param[out] xid WSREP XID -@return whether the checkpoint was present */ -UNIV_INTERN -bool -trx_sys_read_wsrep_checkpoint(XID* xid) -{ - trx_sysf_t* sys_header; - mtr_t mtr; - ulint magic; - - ut_ad(xid); - - mtr_start(&mtr); - - sys_header = trx_sysf_get(&mtr); - - if ((magic = mach_read_from_4(sys_header + TRX_SYS_WSREP_XID_INFO - + TRX_SYS_WSREP_XID_MAGIC_N_FLD)) - != TRX_SYS_WSREP_XID_MAGIC_N) { - mtr.commit(); - xid->null(); - xid->gtrid_length = 0; - xid->bqual_length = 0; - memset(xid->data, 0, sizeof xid->data); - memset(xid->data + 24, 0xff, 8); - return false; + if (!*trx_sys.recovered_binlog_filename) { + return; } - xid->formatID = (int)mach_read_from_4( - sys_header - + TRX_SYS_WSREP_XID_INFO + TRX_SYS_WSREP_XID_FORMAT); - xid->gtrid_length = (int)mach_read_from_4( - sys_header - + TRX_SYS_WSREP_XID_INFO + TRX_SYS_WSREP_XID_GTRID_LEN); - xid->bqual_length = (int)mach_read_from_4( - sys_header - + TRX_SYS_WSREP_XID_INFO + TRX_SYS_WSREP_XID_BQUAL_LEN); - ut_memcpy(xid->data, - sys_header + TRX_SYS_WSREP_XID_INFO + TRX_SYS_WSREP_XID_DATA, - XIDDATASIZE); - - mtr_commit(&mtr); - return true; + ib::info() << "Last binlog file '" + << trx_sys.recovered_binlog_filename + << "', position " + << trx_sys.recovered_binlog_offset; } -#endif /* WITH_WSREP */ - -/** @return an unallocated rollback segment slot in the TRX_SYS header +/** Find an available rollback segment. 
+@param[in] sys_header +@return an unallocated rollback segment slot in the TRX_SYS header @retval ULINT_UNDEFINED if not found */ ulint -trx_sysf_rseg_find_free(mtr_t* mtr) +trx_sys_rseg_find_free(const buf_block_t* sys_header) { - trx_sysf_t* sys_header = trx_sysf_get(mtr); - - for (ulint i = 0; i < TRX_SYS_N_RSEGS; i++) { - if (trx_sysf_rseg_get_page_no(sys_header, i, mtr) + for (ulint rseg_id = 0; rseg_id < TRX_SYS_N_RSEGS; rseg_id++) { + if (trx_sysf_rseg_get_page_no(sys_header, rseg_id) == FIL_NULL) { - return(i); + return rseg_id; } } @@ -386,13 +123,14 @@ trx_sysf_get_n_rseg_slots() mtr_t mtr; mtr.start(); - trx_sysf_t* sys_header = trx_sysf_get(&mtr); srv_available_undo_logs = 0; - - for (ulint i = 0; i < TRX_SYS_N_RSEGS; i++) { - srv_available_undo_logs - += trx_sysf_rseg_get_page_no(sys_header, i, &mtr) - != FIL_NULL; + if (const buf_block_t* sys_header = trx_sysf_get(&mtr, false)) { + for (ulint rseg_id = 0; rseg_id < TRX_SYS_N_RSEGS; rseg_id++) { + srv_available_undo_logs + += trx_sysf_rseg_get_page_no(sys_header, + rseg_id) + != FIL_NULL; + } } mtr.commit(); @@ -407,7 +145,6 @@ trx_sysf_create( /*============*/ mtr_t* mtr) /*!< in: mtr */ { - trx_sysf_t* sys_header; ulint slot_no; buf_block_t* block; page_t* page; @@ -419,10 +156,12 @@ trx_sysf_create( then enter the kernel: we must do it in this order to conform to the latching order rules. */ - mtr_x_lock_space(TRX_SYS_SPACE, mtr); + mtr_x_lock_space(fil_system.sys_space, mtr); + compile_time_assert(TRX_SYS_SPACE == 0); /* Create the trx sys file block in a new allocated file segment */ - block = fseg_create(TRX_SYS_SPACE, TRX_SYS + TRX_SYS_FSEG_HEADER, + block = fseg_create(fil_system.sys_space, + TRX_SYS + TRX_SYS_FSEG_HEADER, mtr); buf_block_dbg_add_level(block, SYNC_TRX_SYS_HEADER); @@ -440,126 +179,42 @@ trx_sysf_create( mlog_write_ulint(page + TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_MAGIC, 0, MLOG_4BYTES, mtr); - sys_header = trx_sysf_get(mtr); - - /* Start counting transaction ids from number 1 up */ - mach_write_to_8(sys_header + TRX_SYS_TRX_ID_STORE, 1); - /* Reset the rollback segment slots. Old versions of InnoDB (before MySQL 5.5) define TRX_SYS_N_RSEGS as 256 and expect that the whole array is initialized. */ - ptr = TRX_SYS_RSEGS + sys_header; + ptr = TRX_SYS + TRX_SYS_RSEGS + page; compile_time_assert(256 >= TRX_SYS_N_RSEGS); memset(ptr, 0xff, 256 * TRX_SYS_RSEG_SLOT_SIZE); ptr += 256 * TRX_SYS_RSEG_SLOT_SIZE; - ut_a(ptr <= page + (UNIV_PAGE_SIZE - FIL_PAGE_DATA_END)); + ut_a(ptr <= page + (srv_page_size - FIL_PAGE_DATA_END)); /* Initialize all of the page. This part used to be uninitialized. */ - memset(ptr, 0, UNIV_PAGE_SIZE - FIL_PAGE_DATA_END + page - ptr); + memset(ptr, 0, srv_page_size - FIL_PAGE_DATA_END + size_t(page - ptr)); - mlog_log_string(sys_header, UNIV_PAGE_SIZE - FIL_PAGE_DATA_END - + page - sys_header, mtr); + mlog_log_string(TRX_SYS + page, srv_page_size - FIL_PAGE_DATA_END + - TRX_SYS, mtr); /* Create the first rollback segment in the SYSTEM tablespace */ - slot_no = trx_sysf_rseg_find_free(mtr); - buf_block_t* rblock = trx_rseg_header_create(TRX_SYS_SPACE, ULINT_MAX, - slot_no, mtr); + slot_no = trx_sys_rseg_find_free(block); + buf_block_t* rblock = trx_rseg_header_create(fil_system.sys_space, + slot_no, block, mtr); ut_a(slot_no == TRX_SYS_SYSTEM_RSEG_ID); ut_a(rblock->page.id.page_no() == FSP_FIRST_RSEG_PAGE_NO); } -/** Initialize the transaction system main-memory data structures. 
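
trx_sys_rseg_find_free() above now takes the TRX_SYS page as a parameter and scans the fixed array of rollback segment slots for one whose page number is still FIL_NULL. A standalone sketch of that scan over a plain array, with TRX_SYS_N_RSEGS assumed to be 128 and FIL_NULL/ULINT_UNDEFINED replaced by local stand-ins:

/* Simplified model of the free-slot scan; not InnoDB code. */
#include <cstdint>
#include <cstdio>

static const size_t   N_RSEGS        = 128;          /* stand-in for TRX_SYS_N_RSEGS */
static const uint32_t PAGE_NONE      = 0xFFFFFFFFU;  /* stand-in for FIL_NULL */
static const size_t   SLOT_UNDEFINED = ~size_t(0);   /* stand-in for ULINT_UNDEFINED */

/* Return the first unallocated slot, or SLOT_UNDEFINED if all are taken. */
static size_t find_free_rseg_slot(const uint32_t (&slot_page_no)[N_RSEGS])
{
    for (size_t id = 0; id < N_RSEGS; id++) {
        if (slot_page_no[id] == PAGE_NONE) {
            return id;
        }
    }
    return SLOT_UNDEFINED;
}

int main()
{
    uint32_t slots[N_RSEGS];
    for (size_t i = 0; i < N_RSEGS; i++) {
        slots[i] = PAGE_NONE;     /* a freshly formatted TRX_SYS page: all slots free */
    }
    slots[0] = 6;                 /* pretend slot 0 already points at a header page */
    printf("first free slot: %zu\n", find_free_rseg_slot(slots));   /* prints 1 */
}
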
*/ -void -trx_sys_init_at_db_start() -{ - trx_sysf_t* sys_header; - ib_uint64_t rows_to_undo = 0; - const char* unit = ""; - - /* VERY important: after the database is started, max_trx_id value is - divisible by TRX_SYS_TRX_ID_WRITE_MARGIN, and the 'if' in - trx_sys_get_new_trx_id will evaluate to TRUE when the function - is first time called, and the value for trx id will be written - to the disk-based header! Thus trx id values will not overlap when - the database is repeatedly started! */ - - mtr_t mtr; - mtr.start(); - - sys_header = trx_sysf_get(&mtr); - - trx_sys->max_trx_id = 2 * TRX_SYS_TRX_ID_WRITE_MARGIN - + ut_uint64_align_up(mach_read_from_8(sys_header - + TRX_SYS_TRX_ID_STORE), - TRX_SYS_TRX_ID_WRITE_MARGIN); - - mtr.commit(); - ut_d(trx_sys->rw_max_trx_id = trx_sys->max_trx_id); - - trx_lists_init_at_db_start(); - - /* This mutex is not strictly required, it is here only to satisfy - the debug code (assertions). We are still running in single threaded - bootstrap mode. */ - - trx_sys_mutex_enter(); - - if (UT_LIST_GET_LEN(trx_sys->rw_trx_list) > 0) { - const trx_t* trx; - - for (trx = UT_LIST_GET_FIRST(trx_sys->rw_trx_list); - trx != NULL; - trx = UT_LIST_GET_NEXT(trx_list, trx)) { - - ut_ad(trx->is_recovered); - assert_trx_in_rw_list(trx); - - if (trx_state_eq(trx, TRX_STATE_ACTIVE)) { - rows_to_undo += trx->undo_no; - } - } - - if (rows_to_undo > 1000000000) { - unit = "M"; - rows_to_undo = rows_to_undo / 1000000; - } - - ib::info() << UT_LIST_GET_LEN(trx_sys->rw_trx_list) - << " transaction(s) which must be rolled back or" - " cleaned up in total " << rows_to_undo << unit - << " row operations to undo"; - - ib::info() << "Trx id counter is " << trx_sys->max_trx_id; - } - - trx_sys_mutex_exit(); - - trx_sys->mvcc->clone_oldest_view(&purge_sys->view); -} - -/*****************************************************************//** -Creates the trx_sys instance and initializes purge_queue and mutex. */ +/** Create the instance */ void -trx_sys_create(void) -/*================*/ +trx_sys_t::create() { - ut_ad(trx_sys == NULL); - - trx_sys = static_cast<trx_sys_t*>(ut_zalloc_nokey(sizeof(*trx_sys))); - - mutex_create(LATCH_ID_TRX_SYS, &trx_sys->mutex); - - UT_LIST_INIT(trx_sys->serialisation_list, &trx_t::no_list); - UT_LIST_INIT(trx_sys->rw_trx_list, &trx_t::trx_list); - UT_LIST_INIT(trx_sys->mysql_trx_list, &trx_t::mysql_trx_list); - - trx_sys->mvcc = UT_NEW_NOKEY(MVCC(1024)); - - new(&trx_sys->rw_trx_ids) trx_ids_t(ut_allocator<trx_id_t>( - mem_key_trx_sys_t_rw_trx_ids)); - - new(&trx_sys->rw_trx_set) TrxIdSet(); + ut_ad(this == &trx_sys); + ut_ad(!is_initialised()); + m_initialised = true; + mutex_create(LATCH_ID_TRX_SYS, &mutex); + UT_LIST_INIT(trx_list, &trx_t::trx_list); + my_atomic_store32(&rseg_history_len, 0); + + rw_trx_hash.init(); } /*****************************************************************//** @@ -577,260 +232,6 @@ trx_sys_create_sys_pages(void) mtr_commit(&mtr); } -/*****************************************************************//** -Update the file format tag. 
-@return always TRUE */ -static -ibool -trx_sys_file_format_max_write( -/*==========================*/ - ulint format_id, /*!< in: file format id */ - const char** name) /*!< out: max file format name, can - be NULL */ -{ - mtr_t mtr; - byte* ptr; - buf_block_t* block; - ib_uint64_t tag_value; - - mtr_start(&mtr); - - block = buf_page_get( - page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO), univ_page_size, - RW_X_LATCH, &mtr); - - file_format_max.id = format_id; - file_format_max.name = trx_sys_file_format_id_to_name(format_id); - - ptr = buf_block_get_frame(block) + TRX_SYS_FILE_FORMAT_TAG; - tag_value = format_id + TRX_SYS_FILE_FORMAT_TAG_MAGIC_N; - - if (name) { - *name = file_format_max.name; - } - - mlog_write_ull(ptr, tag_value, &mtr); - - mtr_commit(&mtr); - - return(TRUE); -} - -/*****************************************************************//** -Read the file format tag. -@return the file format or ULINT_UNDEFINED if not set. */ -static -ulint -trx_sys_file_format_max_read(void) -/*==============================*/ -{ - mtr_t mtr; - const byte* ptr; - const buf_block_t* block; - ib_id_t file_format_id; - - /* Since this is called during the startup phase it's safe to - read the value without a covering mutex. */ - mtr_start(&mtr); - - block = buf_page_get( - page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO), univ_page_size, - RW_X_LATCH, &mtr); - - ptr = buf_block_get_frame(block) + TRX_SYS_FILE_FORMAT_TAG; - file_format_id = mach_read_from_8(ptr); - - mtr_commit(&mtr); - - file_format_id -= TRX_SYS_FILE_FORMAT_TAG_MAGIC_N; - - if (file_format_id >= FILE_FORMAT_NAME_N) { - - /* Either it has never been tagged, or garbage in it. */ - return(ULINT_UNDEFINED); - } - - return((ulint) file_format_id); -} - -/*****************************************************************//** -Get the name representation of the file format from its id. -@return pointer to the name */ -const char* -trx_sys_file_format_id_to_name( -/*===========================*/ - const ulint id) /*!< in: id of the file format */ -{ - ut_a(id < FILE_FORMAT_NAME_N); - - return(file_format_name_map[id]); -} - -/*****************************************************************//** -Check for the max file format tag stored on disk. Note: If max_format_id -is == UNIV_FORMAT_MAX + 1 then we only print a warning. -@return DB_SUCCESS or error code */ -dberr_t -trx_sys_file_format_max_check( -/*==========================*/ - ulint max_format_id) /*!< in: max format id to check */ -{ - ulint format_id; - - /* Check the file format in the tablespace. Do not try to - recover if the file format is not supported by the engine - unless forced by the user. */ - format_id = trx_sys_file_format_max_read(); - if (format_id == ULINT_UNDEFINED) { - /* Format ID was not set. Set it to minimum possible - value. */ - format_id = UNIV_FORMAT_MIN; - } - - ib::info() << "Highest supported file format is " - << trx_sys_file_format_id_to_name(UNIV_FORMAT_MAX) << "."; - - if (format_id > UNIV_FORMAT_MAX) { - - ut_a(format_id < FILE_FORMAT_NAME_N); - - const std::string msg = std::string("The system" - " tablespace is in a file format that this version" - " doesn't support - ") - + trx_sys_file_format_id_to_name(format_id) - + "."; - - if (max_format_id <= UNIV_FORMAT_MAX) { - ib::error() << msg; - } else { - ib::warn() << msg; - } - - if (max_format_id <= UNIV_FORMAT_MAX) { - return(DB_ERROR); - } - } - - format_id = (format_id > max_format_id) ? 
format_id : max_format_id; - - /* We don't need a mutex here, as this function should only - be called once at start up. */ - file_format_max.id = format_id; - file_format_max.name = trx_sys_file_format_id_to_name(format_id); - - return(DB_SUCCESS); -} - -/*****************************************************************//** -Set the file format id unconditionally except if it's already the -same value. -@return TRUE if value updated */ -ibool -trx_sys_file_format_max_set( -/*========================*/ - ulint format_id, /*!< in: file format id */ - const char** name) /*!< out: max file format name or - NULL if not needed. */ -{ - ibool ret = FALSE; - - ut_a(format_id <= UNIV_FORMAT_MAX); - - mutex_enter(&file_format_max.mutex); - - /* Only update if not already same value. */ - if (format_id != file_format_max.id) { - - ret = trx_sys_file_format_max_write(format_id, name); - } - - mutex_exit(&file_format_max.mutex); - - return(ret); -} - -/********************************************************************//** -Tags the system table space with minimum format id if it has not been -tagged yet. -WARNING: This function is only called during the startup and AFTER the -redo log application during recovery has finished. */ -void -trx_sys_file_format_tag_init(void) -/*==============================*/ -{ - ulint format_id; - - format_id = trx_sys_file_format_max_read(); - - /* If format_id is not set then set it to the minimum. */ - if (format_id == ULINT_UNDEFINED) { - trx_sys_file_format_max_set(UNIV_FORMAT_MIN, NULL); - } -} - -/********************************************************************//** -Update the file format tag in the system tablespace only if the given -format id is greater than the known max id. -@return TRUE if format_id was bigger than the known max id */ -ibool -trx_sys_file_format_max_upgrade( -/*============================*/ - const char** name, /*!< out: max file format name */ - ulint format_id) /*!< in: file format identifier */ -{ - ibool ret = FALSE; - - ut_a(name); - ut_a(file_format_max.name != NULL); - ut_a(format_id <= UNIV_FORMAT_MAX); - - mutex_enter(&file_format_max.mutex); - - if (format_id > file_format_max.id) { - - ret = trx_sys_file_format_max_write(format_id, name); - } - - mutex_exit(&file_format_max.mutex); - - return(ret); -} - -/*****************************************************************//** -Get the name representation of the file format from its id. -@return pointer to the max format name */ -const char* -trx_sys_file_format_max_get(void) -/*=============================*/ -{ - return(file_format_max.name); -} - -/*****************************************************************//** -Initializes the tablespace tag system. */ -void -trx_sys_file_format_init(void) -/*==========================*/ -{ - mutex_create(LATCH_ID_FILE_FORMAT_MAX, &file_format_max.mutex); - - /* We don't need a mutex here, as this function should only - be called once at start up. */ - file_format_max.id = UNIV_FORMAT_MIN; - - file_format_max.name = trx_sys_file_format_id_to_name( - file_format_max.id); -} - -/*****************************************************************//** -Closes the tablespace tag system. */ -void -trx_sys_file_format_close(void) -/*===========================*/ -{ - mutex_free(&file_format_max.mutex); -} - /** Create the rollback segments. 
@return whether the creation succeeded */ bool @@ -909,141 +310,53 @@ trx_sys_create_rsegs() return(true); } -/********************************************************************* -Shutdown/Close the transaction system. */ +/** Close the transaction system on shutdown */ void -trx_sys_close(void) -/*===============*/ +trx_sys_t::close() { - ut_ad(trx_sys != NULL); ut_ad(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS); + if (!is_initialised()) { + return; + } - if (ulint size = trx_sys->mvcc->size()) { + if (size_t size = view_count()) { ib::error() << "All read views were not closed before" " shutdown: " << size << " read views open"; } - /* Only prepared transactions may be left in the system. Free them. */ - while (trx_t* trx = UT_LIST_GET_FIRST(trx_sys->rw_trx_list)) { - trx_free_prepared(trx); - } + rw_trx_hash.destroy(); /* There can't be any active transactions. */ for (ulint i = 0; i < TRX_SYS_N_RSEGS; ++i) { - if (trx_rseg_t* rseg = trx_sys->rseg_array[i]) { + if (trx_rseg_t* rseg = rseg_array[i]) { trx_rseg_mem_free(rseg); } - if (trx_rseg_t* rseg = trx_sys->temp_rsegs[i]) { + if (trx_rseg_t* rseg = temp_rsegs[i]) { trx_rseg_mem_free(rseg); } } - UT_DELETE(trx_sys->mvcc); - - ut_a(UT_LIST_GET_LEN(trx_sys->rw_trx_list) == 0); - ut_a(UT_LIST_GET_LEN(trx_sys->mysql_trx_list) == 0); - ut_a(UT_LIST_GET_LEN(trx_sys->serialisation_list) == 0); - - /* We used placement new to create this mutex. Call the destructor. */ - mutex_free(&trx_sys->mutex); - - trx_sys->rw_trx_ids.~trx_ids_t(); - - trx_sys->rw_trx_set.~TrxIdSet(); - - ut_free(trx_sys); - - trx_sys = NULL; -} - -/********************************************************************* -Check if there are any active (non-prepared) transactions. -This is only used to check if it's safe to shutdown. -@return total number of active transactions or 0 if none */ -ulint -trx_sys_any_active_transactions(void) -/*=================================*/ -{ - ulint total_trx = 0; - - trx_sys_mutex_enter(); - - for (trx_t* trx = UT_LIST_GET_FIRST(trx_sys->rw_trx_list); - trx != NULL; - trx = UT_LIST_GET_NEXT(trx_list, trx)) { - ut_ad(trx->in_rw_trx_list); - trx_mutex_enter(trx); - switch (trx->state) { - case TRX_STATE_NOT_STARTED: - DBUG_ASSERT(!"invalid state"); - /* fall through */ - case TRX_STATE_PREPARED: - case TRX_STATE_PREPARED_RECOVERED: - break; - case TRX_STATE_ACTIVE: - case TRX_STATE_COMMITTED_IN_MEMORY: - total_trx++; - } - trx_mutex_exit(trx); - } - - for (trx_t* trx = UT_LIST_GET_FIRST(trx_sys->mysql_trx_list); - trx != NULL; - trx = UT_LIST_GET_NEXT(mysql_trx_list, trx)) { - ut_ad(trx->in_mysql_trx_list); - trx_mutex_enter(trx); - /* This may count some ACTIVE transactions twice, - both in rw_trx_list and mysql_trx_list. */ - total_trx += trx->state == TRX_STATE_ACTIVE; - trx_mutex_exit(trx); - } - - trx_sys_mutex_exit(); - - return(total_trx); -} - -#ifdef UNIV_DEBUG -/*************************************************************//** -Validate the trx_ut_list_t. -@return true if valid. 
*/ -static -bool -trx_sys_validate_trx_list_low( -/*===========================*/ - trx_ut_list_t* trx_list) /*!< in: &trx_sys->rw_trx_list */ -{ - const trx_t* trx; - const trx_t* prev_trx = NULL; - - ut_ad(trx_sys_mutex_own()); - - ut_ad(trx_list == &trx_sys->rw_trx_list); - - for (trx = UT_LIST_GET_FIRST(*trx_list); - trx != NULL; - prev_trx = trx, trx = UT_LIST_GET_NEXT(trx_list, prev_trx)) { - - check_trx_state(trx); - ut_a(prev_trx == NULL || prev_trx->id > trx->id); - } - - return(true); + ut_a(UT_LIST_GET_LEN(trx_list) == 0); + mutex_free(&mutex); + m_initialised = false; } -/*************************************************************//** -Validate the trx_sys_t::rw_trx_list. -@return true if the list is valid. */ -bool -trx_sys_validate_trx_list() -/*=======================*/ +/** @return total number of active (non-prepared) transactions */ +ulint trx_sys_t::any_active_transactions() { - ut_ad(trx_sys_mutex_own()); - - ut_a(trx_sys_validate_trx_list_low(&trx_sys->rw_trx_list)); - - return(true); + uint32_t total_trx= 0; + + mutex_enter(&mutex); + for (trx_t* trx= UT_LIST_GET_FIRST(trx_sys.trx_list); + trx != NULL; + trx= UT_LIST_GET_NEXT(trx_list, trx)) + { + if (trx->state == TRX_STATE_COMMITTED_IN_MEMORY || + (trx->state == TRX_STATE_ACTIVE && trx->id)) + total_trx++; + } + mutex_exit(&mutex); + return total_trx; } -#endif /* UNIV_DEBUG */ diff --git a/storage/innobase/trx/trx0trx.cc b/storage/innobase/trx/trx0trx.cc index 42bd67cb24b..90ed4141633 100644 --- a/storage/innobase/trx/trx0trx.cc +++ b/storage/innobase/trx/trx0trx.cc @@ -37,7 +37,6 @@ Created 3/26/1996 Heikki Tuuri #include "log0log.h" #include "os0proc.h" #include "que0que.h" -#include "read0read.h" #include "srv0mon.h" #include "srv0srv.h" #include "srv0start.h" @@ -53,8 +52,16 @@ Created 3/26/1996 Heikki Tuuri #include <set> #include <new> -extern "C" -int thd_deadlock_victim_preference(const MYSQL_THD thd1, const MYSQL_THD thd2); +/** The bit pattern corresponding to TRX_ID_MAX */ +const byte trx_id_max_bytes[8] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +}; + +/** The bit pattern corresponding to max timestamp */ +const byte timestamp_max_bytes[7] = { + 0x7f, 0xff, 0xff, 0xff, 0x0f, 0x42, 0x3f +}; + static const ulint MAX_DETAILED_ERROR_LEN = 256; @@ -149,7 +156,7 @@ trx_init( trx->last_sql_stat_start.least_undo_no = 0; - ut_ad(!MVCC::is_view_active(trx->read_view)); + ut_ad(!trx->read_view.is_open()); trx->lock.rec_cached = 0; @@ -181,6 +188,9 @@ struct TrxFactory { new(&trx->lock.table_locks) lock_list(); + new(&trx->read_view) ReadView(); + + trx->rw_trx_hash_pins = 0; trx_init(trx); trx->dict_operation_lock_mode = 0; @@ -200,7 +210,6 @@ struct TrxFactory { &trx_named_savept_t::trx_savepoints); mutex_create(LATCH_ID_TRX, &trx->mutex); - mutex_create(LATCH_ID_TRX_UNDO, &trx->undo_mutex); } /** Release resources held by the transaction object. @@ -209,7 +218,7 @@ struct TrxFactory { { #ifdef __SANITIZE_ADDRESS__ /* Unpoison the memory for AddressSanitizer */ - MEM_UNDEFINED(trx, sizeof *trx); + MEM_MAKE_ADDRESSABLE(trx, sizeof *trx); #else /* Declare the contents as initialized for Valgrind; we checked this in trx_t::free(). 
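
The new trx_sys_t::any_active_transactions() above replaces the old two-list walk with a single pass over trx_sys.trx_list under the trx_sys mutex, counting transactions that are still being committed in memory or are active with an assigned id. A minimal sketch of that counting rule over a plain container (states and ids faked, no latching):

/* Simplified model of the counting rule; not InnoDB code and no locking. */
#include <cstdint>
#include <cstdio>
#include <vector>

enum trx_state { NOT_STARTED, ACTIVE, PREPARED, COMMITTED_IN_MEMORY };

struct fake_trx {
    trx_state state;
    uint64_t  id;        /* 0 = never registered as a read-write transaction */
};

static size_t any_active_transactions(const std::vector<fake_trx>& trx_list)
{
    size_t total = 0;
    for (const fake_trx& trx : trx_list) {
        if (trx.state == COMMITTED_IN_MEMORY
            || (trx.state == ACTIVE && trx.id != 0)) {
            total++;     /* prepared and read-only transactions are not counted */
        }
    }
    return total;
}

int main()
{
    std::vector<fake_trx> list = {
        { ACTIVE, 42 }, { ACTIVE, 0 }, { PREPARED, 43 }, { COMMITTED_IN_MEMORY, 44 }
    };
    printf("%zu\n", any_active_transactions(list));   /* prints 2 */
}
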
*/ @@ -217,8 +226,7 @@ struct TrxFactory { #endif ut_a(trx->magic_n == TRX_MAGIC_N); - ut_ad(!trx->in_rw_trx_list); - ut_ad(!trx->in_mysql_trx_list); + ut_ad(!trx->mysql_thd); ut_a(trx->lock.wait_lock == NULL); ut_a(trx->lock.wait_thr == NULL); @@ -235,13 +243,14 @@ struct TrxFactory { ut_free(trx->detailed_error); mutex_free(&trx->mutex); - mutex_free(&trx->undo_mutex); trx->mod_tables.~trx_mod_tables_t(); - ut_ad(trx->read_view == NULL); + ut_ad(!trx->read_view.is_open()); trx->lock.table_locks.~lock_list(); + + trx->read_view.~ReadView(); } }; @@ -319,15 +328,15 @@ trx_pool_close() trx_pools = 0; } -/** @return allocated transaction object for internal operations */ -trx_t *trx_allocate_for_background() +/** @return an allocated transaction */ +trx_t *trx_create() { trx_t* trx = trx_pools->get(); #ifdef __SANITIZE_ADDRESS__ /* Unpoison the memory for AddressSanitizer. It may have been poisoned in trx_t::free().*/ - MEM_UNDEFINED(trx, sizeof *trx); + MEM_MAKE_ADDRESSABLE(trx, sizeof *trx); #else /* Declare the memory initialized for Valgrind. The trx_t that are released to the pool are @@ -344,6 +353,7 @@ trx_t *trx_allocate_for_background() /* We just got trx from pool, it should be non locking */ ut_ad(trx->will_lock == 0); ut_ad(trx->state == TRX_STATE_NOT_STARTED); + ut_ad(!trx->rw_trx_hash_pins); DBUG_LOG("trx", "Create: " << trx); @@ -363,24 +373,31 @@ trx_t *trx_allocate_for_background() ut_ad(!trx->wsrep_UK_scan); #endif /* WITH_WSREP */ + trx_sys.register_trx(trx); + return(trx); } /** Free the memory to trx_pools */ -inline void trx_t::free() +void trx_t::free() { - assert_trx_is_inactive(this); - MEM_CHECK_DEFINED(this, sizeof *this); - ut_ad(!read_view); + ut_ad(!n_mysql_tables_in_use); + ut_ad(!mysql_n_tables_locked); + ut_ad(!internal); + ut_ad(!declared_to_be_inside_innodb); ut_ad(!will_lock); ut_ad(error_state == DB_SUCCESS); ut_ad(magic_n == TRX_MAGIC_N); ut_ad(!read_only); - ut_ad(!in_mysql_trx_list); ut_ad(!lock.wait_lock); + dict_operation= TRX_DICT_OP_NONE; + trx_sys.deregister_trx(this); + assert_trx_is_free(this); + trx_sys.rw_trx_hash.put_pins(this); + mysql_thd= NULL; mysql_log_file_name= NULL; @@ -406,7 +423,6 @@ inline void trx_t::free() #endif MEM_NOACCESS(&read_view, sizeof read_view); MEM_NOACCESS(&trx_list, sizeof trx_list); - MEM_NOACCESS(&no_list, sizeof no_list); MEM_NOACCESS(&lock, sizeof lock); MEM_NOACCESS(&op_info, sizeof op_info); MEM_NOACCESS(&isolation_level, sizeof isolation_level); @@ -430,27 +446,16 @@ inline void trx_t::free() MEM_NOACCESS(&mysql_log_offset, sizeof mysql_log_offset); MEM_NOACCESS(&n_mysql_tables_in_use, sizeof n_mysql_tables_in_use); MEM_NOACCESS(&mysql_n_tables_locked, sizeof mysql_n_tables_locked); -#ifdef UNIV_DEBUG - MEM_NOACCESS(&in_rw_trx_list, sizeof in_rw_trx_list); -#endif /* UNIV_DEBUG */ - MEM_NOACCESS(&mysql_trx_list, sizeof mysql_trx_list); -#ifdef UNIV_DEBUG - MEM_NOACCESS(&in_mysql_trx_list, sizeof in_mysql_trx_list); -#endif /* UNIV_DEBUG */ MEM_NOACCESS(&error_state, sizeof error_state); MEM_NOACCESS(&error_info, sizeof error_info); MEM_NOACCESS(&error_key_num, sizeof error_key_num); MEM_NOACCESS(&graph, sizeof graph); MEM_NOACCESS(&trx_savepoints, sizeof trx_savepoints); - /* do not poison undo_mutex */ MEM_NOACCESS(&undo_no, sizeof undo_no); - MEM_NOACCESS(&undo_rseg_space, sizeof undo_rseg_space); MEM_NOACCESS(&last_sql_stat_start, sizeof last_sql_stat_start); MEM_NOACCESS(&rsegs, sizeof rsegs); MEM_NOACCESS(&roll_limit, sizeof roll_limit); -#ifdef UNIV_DEBUG MEM_NOACCESS(&in_rollback, sizeof in_rollback); 
-#endif /* UNIV_DEBUG */ MEM_NOACCESS(&pages_undone, sizeof pages_undone); MEM_NOACCESS(&n_autoinc_rows, sizeof n_autoinc_rows); MEM_NOACCESS(&autoinc_locks, sizeof autoinc_locks); @@ -479,92 +484,12 @@ inline void trx_t::free() trx_pools->mem_free(this); } -/********************************************************************//** -Creates a transaction object for MySQL. -@return own: transaction object */ -trx_t* -trx_allocate_for_mysql(void) -/*========================*/ -{ - trx_t* trx; - - trx = trx_allocate_for_background(); - - trx_sys_mutex_enter(); - - ut_d(trx->in_mysql_trx_list = TRUE); - UT_LIST_ADD_FIRST(trx_sys->mysql_trx_list, trx); - - trx_sys_mutex_exit(); - - return(trx); -} - -/** Check state of transaction before freeing it. -@param trx trx object to validate */ -static -void -trx_validate_state_before_free(trx_t* trx) -{ - ut_ad(!trx->declared_to_be_inside_innodb); - ut_ad(!trx->n_mysql_tables_in_use); - ut_ad(!trx->mysql_n_tables_locked); - ut_ad(!trx->internal); - - if (UNIV_UNLIKELY(trx->declared_to_be_inside_innodb)) { - ib::error() << "Freeing a trx (" - << trx_get_id_for_print(trx) << ") which is declared" - " to be processing inside InnoDB"; - - trx_print(stderr, trx, 600); - putc('\n', stderr); - - /* This is an error but not a fatal error. We must keep - the counters like srv_conc.n_active accurate. */ - srv_conc_force_exit_innodb(trx); - } - - if (trx->n_mysql_tables_in_use != 0 - || trx->mysql_n_tables_locked != 0) { - - ib::error() << "MySQL is freeing a thd though" - " trx->n_mysql_tables_in_use is " - << trx->n_mysql_tables_in_use - << " and trx->mysql_n_tables_locked is " - << trx->mysql_n_tables_locked << "."; - - trx_print(stderr, trx, 600); - ut_print_buf(stderr, trx, sizeof(trx_t)); - putc('\n', stderr); - } - - trx->dict_operation = TRX_DICT_OP_NONE; - assert_trx_is_inactive(trx); -} - -/** Free and initialize a transaction object instantinated during recovery. -@param trx trx object to free and initialize during recovery */ -void -trx_free_resurrected(trx_t* trx) -{ - trx_validate_state_before_free(trx); - - trx_init(trx); - trx->free(); -} - -/** Free a transaction that was allocated by background or user threads. -@param trx trx object to free */ -void -trx_free_for_background(trx_t* trx) -{ - trx_validate_state_before_free(trx); - trx->free(); -} - /** Transition to committed state, to release implicit locks. */ inline void trx_t::commit_state() { + ut_ad(state == TRX_STATE_PREPARED + || state == TRX_STATE_PREPARED_RECOVERED + || state == TRX_STATE_ACTIVE); /* This makes the transaction committed in memory and makes its changes to data visible to other transactions. NOTE that there is a small discrepancy from the strict formal visibility rules here: a @@ -577,23 +502,9 @@ inline void trx_t::commit_state() makes modifications to the database, will get an lsn larger than the committing transaction T. In the case where the log flush fails, and T never gets committed, also T2 will never get committed. */ - ut_ad(trx_mutex_own(this)); - ut_ad(state != TRX_STATE_NOT_STARTED); - ut_ad(state != TRX_STATE_COMMITTED_IN_MEMORY - || (is_recovered && !UT_LIST_GET_LEN(lock.trx_locks))); + trx_mutex_enter(this); state= TRX_STATE_COMMITTED_IN_MEMORY; - - /* If the background thread trx_rollback_or_clean_recovered() - is still active then there is a chance that the rollback - thread may see this trx as COMMITTED_IN_MEMORY and goes ahead - to clean it up calling trx_cleanup_at_db_startup(). 
This can - happen in the case we are committing a trx here that is left - in PREPARED state during the crash. Note that commit of the - rollback of a PREPARED trx happens in the recovery thread - while the rollback of other transactions happen in the - background thread. To avoid this race we unconditionally unset - the is_recovered flag. */ - is_recovered= false; + trx_mutex_exit(this); ut_ad(id || !is_referenced()); } @@ -601,49 +512,41 @@ inline void trx_t::commit_state() inline void trx_t::release_locks() { DBUG_ASSERT(state == TRX_STATE_COMMITTED_IN_MEMORY); + DBUG_ASSERT(!is_referenced()); if (UT_LIST_GET_LEN(lock.trx_locks)) - lock_trx_release_locks(this); - else - lock.table_locks.clear(); + { + lock_release(this); + lock.n_rec_locks = 0; + ut_ad(UT_LIST_GET_LEN(lock.trx_locks) == 0); + ut_ad(ib_vector_is_empty(autoinc_locks)); + mem_heap_empty(lock.lock_heap); + } + + lock.table_locks.clear(); } -/********************************************************************//** -At shutdown, frees a transaction object that is in the PREPARED state. */ +/** At shutdown, frees a transaction object. */ void -trx_free_prepared( -/*==============*/ - trx_t* trx) /*!< in, own: trx object */ +trx_free_at_shutdown(trx_t *trx) { - trx_mutex_enter(trx); - ut_ad(trx->state == TRX_STATE_PREPARED - || trx->state == TRX_STATE_PREPARED_RECOVERED - || !srv_was_started - || srv_read_only_mode - || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO); + ut_ad(trx->is_recovered); ut_a(trx_state_eq(trx, TRX_STATE_PREPARED) || trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED) - || (trx->is_recovered - && (trx_state_eq(trx, TRX_STATE_ACTIVE) - || trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY)) + || (trx_state_eq(trx, TRX_STATE_ACTIVE) && (!srv_was_started || is_mariabackup_restore_or_export() || srv_read_only_mode - || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO))); + || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO + || (!srv_is_being_started + && !srv_undo_sources && srv_fast_shutdown)))); ut_a(trx->magic_n == TRX_MAGIC_N); trx->commit_state(); - trx_mutex_exit(trx); trx->release_locks(); - trx_undo_free_prepared(trx); - - assert_trx_in_rw_list(trx); + trx_undo_free_at_shutdown(trx); ut_a(!trx->read_only); - ut_ad(trx->in_rw_trx_list); - UT_LIST_REMOVE(trx_sys->rw_trx_list, trx); - ut_d(trx->in_rw_trx_list = false); - DBUG_LOG("trx", "Free prepared: " << trx); trx->state = TRX_STATE_NOT_STARTED; ut_ad(!UT_LIST_GET_LEN(trx->lock.trx_locks)); @@ -651,70 +554,20 @@ trx_free_prepared( trx->free(); } -/** Disconnect a transaction from MySQL and optionally mark it as if -it's been recovered. For the marking the transaction must be in prepared state. -The recovery-marked transaction is going to survive "alone" so its association -with the mysql handle is destroyed now rather than when it will be -finally freed. -@param[in,out] trx transaction -@param[in] prepared boolean value to specify whether trx is - for recovery or not. 
*/ -inline -void -trx_disconnect_from_mysql( - trx_t* trx, - bool prepared) -{ - trx_sys_mutex_enter(); - - ut_ad(trx->in_mysql_trx_list); - ut_d(trx->in_mysql_trx_list = FALSE); - - UT_LIST_REMOVE(trx_sys->mysql_trx_list, trx); - - if (trx->read_view != NULL) { - trx_sys->mvcc->view_close(trx->read_view, true); - } - - ut_ad(trx_sys_validate_trx_list()); - - if (prepared) { - ut_ad(trx_state_eq(trx, TRX_STATE_PREPARED)); - - trx->is_recovered = true; - trx->mysql_thd = NULL; - /* todo/fixme: suggest to do it at innodb prepare */ - trx->will_lock = 0; - } - - trx_sys_mutex_exit(); -} - -/** Disconnect a transaction from MySQL. -@param[in,out] trx transaction */ -inline -void -trx_disconnect_plain(trx_t* trx) -{ - trx_disconnect_from_mysql(trx, false); -} - -/** Disconnect a prepared transaction from MySQL. -@param[in,out] trx transaction */ -void -trx_disconnect_prepared(trx_t* trx) -{ - trx_disconnect_from_mysql(trx, true); -} - -/** Free a transaction object for MySQL. -@param[in,out] trx transaction */ -void -trx_free_for_mysql(trx_t* trx) +/** + Disconnect a prepared transaction from MySQL + @param[in,out] trx transaction +*/ +void trx_disconnect_prepared(trx_t *trx) { - trx_disconnect_plain(trx); - trx_free_for_background(trx); + ut_ad(trx_state_eq(trx, TRX_STATE_PREPARED)); + ut_ad(trx->mysql_thd); + trx->read_view.close(); + trx->is_recovered= true; + trx->mysql_thd= NULL; + /* todo/fixme: suggest to do it at innodb prepare */ + trx->will_lock= 0; } /****************************************************************//** @@ -724,8 +577,6 @@ void trx_resurrect_table_locks( /*======================*/ trx_t* trx, /*!< in/out: transaction */ - const trx_undo_ptr_t* undo_ptr, - /*!< in: pointer to undo segment. */ const trx_undo_t* undo) /*!< in: undo log */ { mtr_t mtr; @@ -733,10 +584,11 @@ trx_resurrect_table_locks( trx_undo_rec_t* undo_rec; table_id_set tables; - ut_ad(undo == undo_ptr->insert_undo || undo == undo_ptr->update_undo); - - if (trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY) || undo->empty) { + ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE) || + trx_state_eq(trx, TRX_STATE_PREPARED)); + ut_ad(undo->rseg == trx->rsegs.m_redo.rseg); + if (undo->empty()) { return; } @@ -745,7 +597,8 @@ trx_resurrect_table_locks( /* trx_rseg_mem_create() may have acquired an X-latch on this page, so we cannot acquire an S-latch. */ undo_page = trx_undo_page_get( - page_id_t(undo->space, undo->top_page_no), &mtr); + page_id_t(trx->rsegs.m_redo.rseg->space->id, + undo->top_page_no), &mtr); undo_rec = undo_page + undo->top_offset; @@ -788,185 +641,97 @@ trx_resurrect_table_locks( } if (trx->state == TRX_STATE_PREPARED) { - trx->mod_tables.insert(table); + trx->mod_tables.insert( + trx_mod_tables_t::value_type(table, + 0)); } lock_table_ix_resurrect(table, trx); - DBUG_PRINT("ib_trx", - ("resurrect" TRX_ID_FMT - " table '%s' IX lock from %s undo", - trx_get_id_for_print(trx), - table->name.m_name, - undo == undo_ptr->insert_undo - ? "insert" : "update")); + DBUG_LOG("ib_trx", + "resurrect " << ib::hex(trx->id) + << " IX lock on " << table->name); dict_table_close(table, FALSE, FALSE); } } } -/****************************************************************//** -Resurrect the transactions that were doing inserts the time of the -crash, they need to be undone. 
-@return trx_t instance */ -static -trx_t* -trx_resurrect_insert( -/*=================*/ - trx_undo_t* undo, /*!< in: entry to UNDO */ - trx_rseg_t* rseg) /*!< in: rollback segment */ -{ - trx_t* trx; - - trx = trx_allocate_for_background(); - - ut_d(trx->start_file = __FILE__); - ut_d(trx->start_line = __LINE__); - - trx->rsegs.m_redo.rseg = rseg; - *trx->xid = undo->xid; - trx->id = undo->trx_id; - trx->rsegs.m_redo.insert_undo = undo; - trx->is_recovered = true; - /* This is single-threaded startup code, we do not need the - protection of trx->mutex or trx_sys->mutex here. */ - - if (undo->state != TRX_UNDO_ACTIVE) { - - /* Prepared transactions are left in the prepared state - waiting for a commit or abort decision from MySQL */ - - if (undo->state == TRX_UNDO_PREPARED) { - - ib::info() << "Transaction " - << trx_get_id_for_print(trx) - << " was in the XA prepared state."; - - trx->state = TRX_STATE_PREPARED; - } else { - trx->state = TRX_STATE_COMMITTED_IN_MEMORY; - } - - /* We give a dummy value for the trx no; this should have no - relevance since purge is not interested in committed - transaction numbers, unless they are in the history - list, in which case it looks the number from the disk based - undo log structure */ - - trx->no = trx->id; - - } else { - trx->state = TRX_STATE_ACTIVE; - - /* A running transaction always has the number - field inited to TRX_ID_MAX */ - - trx->no = TRX_ID_MAX; - } - - /* trx_start_low() is not called with resurrect, so need to initialize - start time here.*/ - if (trx->state != TRX_STATE_COMMITTED_IN_MEMORY) { - trx->start_time = time(NULL); - trx->start_time_micro = microsecond_interval_timer(); - } - - if (undo->dict_operation) { - trx_set_dict_operation(trx, TRX_DICT_OP_TABLE); - trx->table_id = undo->table_id; - } - - if (!undo->empty) { - trx->undo_no = undo->top_undo_no + 1; - trx->undo_rseg_space = undo->rseg->space; - } - - return(trx); -} +/** + Resurrect the transactions that were doing inserts/updates the time of the + crash, they need to be undone. +*/ + +static void trx_resurrect(trx_undo_t *undo, trx_rseg_t *rseg, + time_t start_time, ulonglong start_time_micro, + uint64_t *rows_to_undo, + bool is_old_insert) +{ + trx_state_t state; + /* + This is single-threaded startup code, we do not need the + protection of trx->mutex or trx_sys.mutex here. + */ + switch (undo->state) + { + case TRX_UNDO_ACTIVE: + state= TRX_STATE_ACTIVE; + break; + case TRX_UNDO_PREPARED: + /* + Prepared transactions are left in the prepared state + waiting for a commit or abort decision from MySQL + */ + ib::info() << "Transaction " << undo->trx_id + << " was in the XA prepared state."; + + state= TRX_STATE_PREPARED; + break; + default: + if (is_old_insert && srv_force_recovery < SRV_FORCE_NO_TRX_UNDO) + trx_undo_commit_cleanup(undo, false); + return; + } -/****************************************************************//** -Prepared transactions are left in the prepared state waiting for a -commit or abort decision from MySQL */ -static -void -trx_resurrect_update_in_prepared_state( -/*===================================*/ - trx_t* trx, /*!< in,out: transaction */ - const trx_undo_t* undo) /*!< in: update UNDO record */ -{ - /* This is single-threaded startup code, we do not need the - protection of trx->mutex or trx_sys->mutex here. 
*/ + trx_t *trx= trx_create(); + trx->state= state; + ut_d(trx->start_file= __FILE__); + ut_d(trx->start_line= __LINE__); + ut_ad(trx->no == TRX_ID_MAX); - if (undo->state == TRX_UNDO_PREPARED) { - ib::info() << "Transaction " << trx_get_id_for_print(trx) - << " was in the XA prepared state."; - ut_ad(trx_state_eq(trx, TRX_STATE_NOT_STARTED) - || trx_state_eq(trx, TRX_STATE_PREPARED)); + if (is_old_insert) + trx->rsegs.m_redo.old_insert= undo; + else + trx->rsegs.m_redo.undo= undo; + + trx->undo_no= undo->top_undo_no + 1; + trx->rsegs.m_redo.rseg= rseg; + /* + For transactions with active data will not have rseg size = 1 + or will not qualify for purge limit criteria. So it is safe to increment + this trx_ref_count w/o mutex protection. + */ + ++trx->rsegs.m_redo.rseg->trx_ref_count; + *trx->xid= undo->xid; + trx->id= undo->trx_id; + trx->is_recovered= true; + trx->start_time= start_time; + trx->start_time_micro= start_time_micro; + + if (undo->dict_operation) + { + trx_set_dict_operation(trx, TRX_DICT_OP_TABLE); + if (!trx->table_id) + trx->table_id= undo->table_id; + } - trx->state = TRX_STATE_PREPARED; - } else { - trx->state = TRX_STATE_COMMITTED_IN_MEMORY; - } + trx_sys.rw_trx_hash.insert(trx); + trx_sys.rw_trx_hash.put_pins(trx); + trx_resurrect_table_locks(trx, undo); + if (trx_state_eq(trx, TRX_STATE_ACTIVE)) + *rows_to_undo+= trx->undo_no; } -/****************************************************************//** -Resurrect the transactions that were doing updates the time of the -crash, they need to be undone. */ -static -void -trx_resurrect_update( -/*=================*/ - trx_t* trx, /*!< in/out: transaction */ - trx_undo_t* undo, /*!< in/out: update UNDO record */ - trx_rseg_t* rseg) /*!< in/out: rollback segment */ -{ - trx->rsegs.m_redo.rseg = rseg; - *trx->xid = undo->xid; - trx->id = undo->trx_id; - trx->rsegs.m_redo.update_undo = undo; - trx->is_recovered = true; - - /* This is single-threaded startup code, we do not need the - protection of trx->mutex or trx_sys->mutex here. */ - - if (undo->state != TRX_UNDO_ACTIVE) { - trx_resurrect_update_in_prepared_state(trx, undo); - - /* We give a dummy value for the trx number */ - - trx->no = trx->id; - - } else { - trx->state = TRX_STATE_ACTIVE; - - /* A running transaction always has the number field inited to - TRX_ID_MAX */ - - trx->no = TRX_ID_MAX; - } - - /* trx_start_low() is not called with resurrect, so need to initialize - start time here.*/ - if (trx->state == TRX_STATE_ACTIVE - || trx->state == TRX_STATE_PREPARED) { - trx->start_time = time(NULL); - trx->start_time_micro = microsecond_interval_timer(); - } - - if (undo->dict_operation) { - trx_set_dict_operation(trx, TRX_DICT_OP_TABLE); - if (!trx->table_id) { - trx->table_id = undo->table_id; - } - } - - if (!undo->empty && undo->top_undo_no >= trx->undo_no) { - - trx->undo_no = undo->top_undo_no + 1; - trx->undo_rseg_space = undo->rseg->space; - } -} /** Initialize (resurrect) transactions at startup. */ void @@ -974,22 +739,31 @@ trx_lists_init_at_db_start() { ut_a(srv_is_being_started); ut_ad(!srv_was_started); - ut_ad(!purge_sys); - purge_sys = UT_NEW_NOKEY(purge_sys_t()); + if (srv_operation == SRV_OPERATION_RESTORE) { + /* mariabackup --prepare only deals with + the redo log and the data files, not with + transactions or the data dictionary. 
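
The unified trx_resurrect() above maps the on-disk undo log state to an in-memory transaction state: TRX_UNDO_ACTIVE yields an ACTIVE transaction that must be rolled back, TRX_UNDO_PREPARED a PREPARED one awaiting an XA decision, and anything else (an already committed old-style insert_undo log) is only cleaned up. A sketch of that mapping alone, with local enums standing in for the InnoDB constants:

/* Simplified model of the state mapping in the resurrection path; not InnoDB code. */
#include <cstdio>

enum undo_state { UNDO_ACTIVE, UNDO_PREPARED, UNDO_COMMITTED };   /* local stand-ins */
enum trx_state  { TRX_NONE, TRX_ACTIVE, TRX_PREPARED };

/* Decide whether an undo log needs a resurrected transaction, and if so
   in which state.  TRX_NONE means the log can simply be cleaned up. */
static trx_state resurrect_state(undo_state s)
{
    switch (s) {
    case UNDO_ACTIVE:   return TRX_ACTIVE;    /* must be rolled back */
    case UNDO_PREPARED: return TRX_PREPARED;  /* waits for XA commit/rollback */
    default:            return TRX_NONE;      /* nothing to resurrect */
    }
}

int main()
{
    printf("%d %d %d\n",
           resurrect_state(UNDO_ACTIVE),
           resurrect_state(UNDO_PREPARED),
           resurrect_state(UNDO_COMMITTED));  /* prints 1 2 0 */
}
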
*/ + trx_rseg_array_init(); + return; + } if (srv_force_recovery >= SRV_FORCE_NO_UNDO_LOG_SCAN) { return; } + purge_sys.create(); trx_rseg_array_init(); /* Look from the rollback segments if there exist undo logs for transactions. */ + const time_t start_time = time(NULL); + const ulonglong start_time_micro= microsecond_interval_timer(); + uint64_t rows_to_undo = 0; for (ulint i = 0; i < TRX_SYS_N_RSEGS; ++i) { trx_undo_t* undo; - trx_rseg_t* rseg = trx_sys->rseg_array[i]; + trx_rseg_t* rseg = trx_sys.rseg_array[i]; /* Some rollback segment may be unavailable, especially if the server was previously run with a @@ -998,76 +772,59 @@ trx_lists_init_at_db_start() continue; } - /* Resurrect transactions that were doing inserts. */ - for (undo = UT_LIST_GET_FIRST(rseg->insert_undo_list); - undo != NULL; - undo = UT_LIST_GET_NEXT(undo_list, undo)) { - - /* trx_purge() will not run before we return, - so we can safely increment this without - holding rseg->mutex. */ - ++rseg->trx_ref_count; - - trx_t* trx; - - trx = trx_resurrect_insert(undo, rseg); - - trx_sys_rw_trx_add(trx); - - trx_resurrect_table_locks( - trx, &trx->rsegs.m_redo, undo); + /* Resurrect transactions that were doing inserts + using the old separate insert_undo log. */ + undo = UT_LIST_GET_FIRST(rseg->old_insert_list); + while (undo) { + trx_undo_t* next = UT_LIST_GET_NEXT(undo_list, undo); + trx_resurrect(undo, rseg, start_time, start_time_micro, + &rows_to_undo, true); + undo = next; } - /* Ressurrect transactions that were doing updates. */ - for (undo = UT_LIST_GET_FIRST(rseg->update_undo_list); + /* Ressurrect other transactions. */ + for (undo = UT_LIST_GET_FIRST(rseg->undo_list); undo != NULL; undo = UT_LIST_GET_NEXT(undo_list, undo)) { - - /* Check the trx_sys->rw_trx_set first. */ - trx_sys_mutex_enter(); - - trx_t* trx = trx_get_rw_trx_by_id(undo->trx_id); - - trx_sys_mutex_exit(); - - if (trx == NULL) { - trx = trx_allocate_for_background(); - ++rseg->trx_ref_count; - - ut_d(trx->start_file = __FILE__); - ut_d(trx->start_line = __LINE__); + trx_t *trx = trx_sys.find(0, undo->trx_id, false); + if (!trx) { + trx_resurrect(undo, rseg, start_time, + start_time_micro, + &rows_to_undo, false); + } else { + ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE) || + trx_state_eq(trx, TRX_STATE_PREPARED)); + ut_ad(trx->start_time == start_time); + ut_ad(trx->is_recovered); + ut_ad(trx->rsegs.m_redo.rseg == rseg); + ut_ad(trx->rsegs.m_redo.rseg->trx_ref_count); + + trx->rsegs.m_redo.undo = undo; + if (undo->top_undo_no >= trx->undo_no) { + if (trx_state_eq(trx, + TRX_STATE_ACTIVE)) { + rows_to_undo -= trx->undo_no; + rows_to_undo += + undo->top_undo_no + 1; + } + + trx->undo_no = undo->top_undo_no + 1; + } + trx_resurrect_table_locks(trx, undo); } - - trx_resurrect_update(trx, undo, rseg); - - trx_sys_rw_trx_add(trx); - - trx_resurrect_table_locks( - trx, &trx->rsegs.m_redo, undo); } } - TrxIdSet::iterator end = trx_sys->rw_trx_set.end(); + if (trx_sys.rw_trx_hash.size()) { - for (TrxIdSet::iterator it = trx_sys->rw_trx_set.begin(); - it != end; - ++it) { + ib::info() << trx_sys.rw_trx_hash.size() + << " transaction(s) which must be rolled back or" + " cleaned up in total " << rows_to_undo + << " row operations to undo"; - ut_ad(it->m_trx->in_rw_trx_list); -#ifdef UNIV_DEBUG - if (it->m_trx->id > trx_sys->rw_max_trx_id) { - trx_sys->rw_max_trx_id = it->m_trx->id; - } -#endif /* UNIV_DEBUG */ - - if (it->m_trx->state == TRX_STATE_ACTIVE - || it->m_trx->state == TRX_STATE_PREPARED) { - - trx_sys->rw_trx_ids.push_back(it->m_id); - } - - 
UT_LIST_ADD_FIRST(trx_sys->rw_trx_list, it->m_trx); + ib::info() << "Trx id counter is " << trx_sys.get_max_trx_id(); } + trx_sys.clone_oldest_view(); } /** Assign a persistent rollback segment in a round-robin fashion, @@ -1082,7 +839,7 @@ static trx_rseg_t* trx_assign_rseg_low() } /* The first slot is always assigned to the system tablespace. */ - ut_ad(trx_sys->rseg_array[0]->space == TRX_SYS_SPACE); + ut_ad(trx_sys.rseg_array[0]->space == fil_system.sys_space); /* Choose a rollback segment evenly distributed between 0 and innodb_undo_logs-1 in a round-robin fashion, skipping those @@ -1105,7 +862,7 @@ static trx_rseg_t* trx_assign_rseg_low() do { for (;;) { - rseg = trx_sys->rseg_array[slot]; + rseg = trx_sys.rseg_array[slot]; #ifdef UNIV_DEBUG /* Ensure that we are not revisiting the same @@ -1124,14 +881,14 @@ static trx_rseg_t* trx_assign_rseg_low() ut_ad(rseg->is_persistent()); - if (rseg->space != TRX_SYS_SPACE) { + if (rseg->space != fil_system.sys_space) { if (rseg->skip_allocation || !srv_undo_tablespaces) { continue; } } else if (trx_rseg_t* next - = trx_sys->rseg_array[slot]) { - if (next->space != TRX_SYS_SPACE + = trx_sys.rseg_array[slot]) { + if (next->space != fil_system.sys_space && srv_undo_tablespaces > 0) { /** If dedicated innodb_undo_tablespaces have @@ -1161,11 +918,11 @@ static trx_rseg_t* trx_assign_rseg_low() } /** Set the innodb_log_optimize_ddl page flush observer -@param[in] space_id tablespace id -@param[in,out] stage performance_schema accounting */ -void trx_t::set_flush_observer(ulint space_id, ut_stage_alter_t* stage) +@param[in,out] space tablespace +@param[in,out] stage performance_schema accounting */ +void trx_t::set_flush_observer(fil_space_t* space, ut_stage_alter_t* stage) { - flush_observer = UT_NEW_NOKEY(FlushObserver(space_id, this, stage)); + flush_observer = UT_NEW_NOKEY(FlushObserver(space, this, stage)); } /** Remove the flush observer */ @@ -1190,17 +947,13 @@ trx_t::assign_temp_rseg() multiple transactions that start modifications concurrently will write their undo log to the same rollback segment. */ static ulong rseg_slot; - trx_rseg_t* rseg = trx_sys->temp_rsegs[ + trx_rseg_t* rseg = trx_sys.temp_rsegs[ rseg_slot++ & (TRX_SYS_N_RSEGS - 1)]; ut_ad(!rseg->is_persistent()); rsegs.m_noredo.rseg = rseg; if (id == 0) { - mutex_enter(&trx_sys->mutex); - id = trx_sys_get_new_trx_id(); - trx_sys->rw_trx_ids.push_back(id); - trx_sys->rw_trx_set.insert(TrxTrack(id, this)); - mutex_exit(&trx_sys->mutex); + trx_sys.register_rw(this); } ut_ad(!rseg->is_persistent()); @@ -1252,17 +1005,14 @@ trx_start_low( ut_a(ib_vector_is_empty(trx->autoinc_locks)); ut_a(trx->lock.table_locks.empty()); - /* If this transaction came from trx_allocate_for_mysql(), - trx->in_mysql_trx_list would hold. In that case, the trx->state - change must be protected by the trx_sys->mutex, so that - lock_print_info_all_transactions() will have a consistent view. */ - - ut_ad(!trx->in_rw_trx_list); + /* No other thread can access this trx object through rw_trx_hash, thus + we don't need trx_sys.mutex protection for that purpose. Still this + trx can be found through trx_sys.trx_list, which means state + change must be protected by e.g. trx->mutex. - /* We tend to over assert and that complicates the code somewhat. - e.g., the transaction state can be set earlier but we are forced to - set it under the protection of the trx_sys_t::mutex because some - trx list assertions are triggered unnecessarily. 
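
Both trx_assign_rseg_low() and trx_t::assign_temp_rseg() above hand out rollback segments round-robin from a static counter, wrapping with `counter++ & (TRX_SYS_N_RSEGS - 1)`; the mask trick only works because the slot count is a power of two. A tiny standalone sketch of that wrapping scheme (128 slots assumed, without the skip-allocation handling or the deliberately unsynchronised counter of the real code):

/* Simplified model of the round-robin slot pick; not InnoDB code. */
#include <cstdio>

static const unsigned long N_RSEGS = 128;    /* power of two, as the mask requires */

static unsigned long next_rseg_slot()
{
    static unsigned long rseg_slot = 0;
    return rseg_slot++ & (N_RSEGS - 1);      /* cheap modulo for a power-of-two size */
}

int main()
{
    for (int i = 0; i < 5; i++) {
        printf("%lu ", next_rseg_slot());    /* 0 1 2 3 4 ... then wraps after 127 */
    }
    printf("\n");
}
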
*/ + For now we update it without mutex protection, because original code + did it this way. It has to be reviewed and fixed properly. */ + trx->state = TRX_STATE_ACTIVE; /* By default all transactions are in the read-only list unless they are non-locking auto-commit read only transactions or background @@ -1273,37 +1023,14 @@ trx_start_low( if (!trx->read_only && (trx->mysql_thd == 0 || read_write || trx->ddl)) { - trx->rsegs.m_redo.rseg = trx_assign_rseg_low(); - /* Temporary rseg is assigned only if the transaction updates a temporary table */ - - trx_sys_mutex_enter(); - - trx->id = trx_sys_get_new_trx_id(); - - trx_sys->rw_trx_ids.push_back(trx->id); - - trx_sys_rw_trx_add(trx); - + trx->rsegs.m_redo.rseg = trx_assign_rseg_low(); ut_ad(trx->rsegs.m_redo.rseg != 0 || srv_read_only_mode || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO); - UT_LIST_ADD_FIRST(trx_sys->rw_trx_list, trx); - - ut_d(trx->in_rw_trx_list = true); -#ifdef UNIV_DEBUG - if (trx->id > trx_sys->rw_max_trx_id) { - trx_sys->rw_max_trx_id = trx->id; - } -#endif /* UNIV_DEBUG */ - - trx->state = TRX_STATE_ACTIVE; - - ut_ad(trx_sys_validate_trx_list()); - - trx_sys_mutex_exit(); + trx_sys.register_rw(trx); } else { if (!trx_is_autocommit_non_locking(trx)) { @@ -1312,26 +1039,11 @@ trx_start_low( to write to the temporary table. */ if (read_write) { - - trx_sys_mutex_enter(); - ut_ad(!srv_read_only_mode); - - trx->id = trx_sys_get_new_trx_id(); - - trx_sys->rw_trx_ids.push_back(trx->id); - - trx_sys->rw_trx_set.insert( - TrxTrack(trx->id, trx)); - - trx_sys_mutex_exit(); + trx_sys.register_rw(trx); } - - trx->state = TRX_STATE_ACTIVE; - } else { ut_ad(!read_write); - trx->state = TRX_STATE_ACTIVE; } } @@ -1346,52 +1058,36 @@ trx_start_low( } /** Set the serialisation number for a persistent committed transaction. -@param[in,out] trx committed transaction with persistent changes -@param[in,out] rseg rollback segment for update_undo, or NULL */ +@param[in,out] trx committed transaction with persistent changes */ static void -trx_serialise(trx_t* trx, trx_rseg_t* rseg) +trx_serialise(trx_t* trx) { - ut_ad(!rseg || rseg == trx->rsegs.m_redo.rseg); + trx_rseg_t *rseg = trx->rsegs.m_redo.rseg; + ut_ad(rseg); + ut_ad(mutex_own(&rseg->mutex)); - trx_sys_mutex_enter(); - - trx->no = trx_sys_get_new_trx_id(); + if (rseg->last_page_no == FIL_NULL) { + mutex_enter(&purge_sys.pq_mutex); + } - /* Track the minimum serialisation number. */ - UT_LIST_ADD_LAST(trx_sys->serialisation_list, trx); + trx_sys.assign_new_trx_no(trx); - /* If the rollack segment is not empty then the + /* If the rollback segment is not empty then the new trx_t::no can't be less than any trx_t::no already in the rollback segment. User threads only produce events when a rollback segment is empty. */ - if (rseg && rseg->last_page_no == FIL_NULL) { - TrxUndoRsegs elem(trx->no); - elem.push_back(rseg); - - mutex_enter(&purge_sys->pq_mutex); - - /* This is to reduce the pressure on the trx_sys_t::mutex - though in reality it should make very little (read no) - difference because this code path is only taken when the - rbs is empty. 
*/ - - trx_sys_mutex_exit(); - - purge_sys->purge_queue.push(elem); - - mutex_exit(&purge_sys->pq_mutex); - } else { - trx_sys_mutex_exit(); + if (rseg->last_page_no == FIL_NULL) { + purge_sys.purge_queue.push(TrxUndoRsegs(trx->no, *rseg)); + mutex_exit(&purge_sys.pq_mutex); } } /****************************************************************//** Assign the transaction its history serialisation number and write the -update UNDO log record to the assigned rollback segment. -@return true if a serialisation log was written */ +update UNDO log record to the assigned rollback segment. */ static -bool +void trx_write_serialisation_history( /*============================*/ trx_t* trx, /*!< in/out: transaction */ @@ -1422,70 +1118,43 @@ trx_write_serialisation_history( temp_mtr.commit(); } - if (!trx->rsegs.m_redo.rseg) { - ut_ad(!trx->rsegs.m_redo.insert_undo); - ut_ad(!trx->rsegs.m_redo.update_undo); - return false; + trx_rseg_t* rseg = trx->rsegs.m_redo.rseg; + if (!rseg) { + ut_ad(!trx->rsegs.m_redo.undo); + ut_ad(!trx->rsegs.m_redo.old_insert); + return; } - trx_undo_t* insert = trx->rsegs.m_redo.insert_undo; - trx_undo_t* update = trx->rsegs.m_redo.update_undo; + trx_undo_t*& undo = trx->rsegs.m_redo.undo; + trx_undo_t*& old_insert = trx->rsegs.m_redo.old_insert; - if (!insert && !update) { - return false; + if (!undo && !old_insert) { + return; } ut_ad(!trx->read_only); - trx_rseg_t* update_rseg = update ? trx->rsegs.m_redo.rseg : NULL; - mutex_enter(&trx->rsegs.m_redo.rseg->mutex); + ut_ad(!undo || undo->rseg == rseg); + ut_ad(!old_insert || old_insert->rseg == rseg); + mutex_enter(&rseg->mutex); /* Assign the transaction serialisation number and add any - update_undo log to the purge queue. */ - trx_serialise(trx, update_rseg); + undo log to the purge queue. */ + trx_serialise(trx); - /* It is not necessary to acquire trx->undo_mutex here because - only a single OS thread is allowed to commit this transaction. */ - if (insert) { - trx_undo_set_state_at_finish(insert, mtr); + if (UNIV_LIKELY_NULL(old_insert)) { + UT_LIST_REMOVE(rseg->old_insert_list, old_insert); + trx_purge_add_undo_to_history(trx, old_insert, mtr); } - if (update) { - /* The undo logs and possible delete-marked records - for updates and deletes will be purged later. */ - page_t* undo_hdr_page = trx_undo_set_state_at_finish( - update, mtr); - - trx_undo_update_cleanup(trx, undo_hdr_page, mtr); + if (undo) { + UT_LIST_REMOVE(rseg->undo_list, undo); + trx_purge_add_undo_to_history(trx, undo, mtr); } - mutex_exit(&trx->rsegs.m_redo.rseg->mutex); + mutex_exit(&rseg->mutex); MONITOR_INC(MONITOR_TRX_COMMIT_UNDO); - trx_sysf_t* sys_header = trx_sysf_get(mtr); -#ifdef WITH_WSREP - /* Update latest MySQL wsrep XID in trx sys header. 
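Note: trx_serialise() now takes purge_sys.pq_mutex before drawing the serialisation number, but only when the rollback segment was empty, so the entry it pushes is guaranteed to carry the smallest trx_t::no present in that segment. A simplified standalone model of that lock ordering, using std-library types only (purge_sys, TrxUndoRsegs and the rollback segment structures are not reproduced here):

#include <atomic>
#include <cstdint>
#include <functional>
#include <mutex>
#include <queue>
#include <utility>
#include <vector>

struct Segment {
    std::vector<std::uint64_t> trx_nos;  // serialisation numbers, kept ascending
    std::mutex mtx;                      // analogue of trx_rseg_t::mutex
};

using Entry = std::pair<std::uint64_t, Segment*>;  // (smallest trx_no, segment)

std::atomic<std::uint64_t> serialisation_counter{1};
std::priority_queue<Entry, std::vector<Entry>, std::greater<Entry>> purge_queue;
std::mutex pq_mtx;                       // analogue of purge_sys.pq_mutex

void serialise_commit(Segment& seg)
{
    std::lock_guard<std::mutex> seg_lock(seg.mtx);
    const bool was_empty = seg.trx_nos.empty();

    // Lock the queue *before* drawing the number, and only in the empty case,
    // so the pushed entry cannot be overtaken by a smaller number.
    std::unique_lock<std::mutex> pq_lock(pq_mtx, std::defer_lock);
    if (was_empty) {
        pq_lock.lock();
    }

    const std::uint64_t no = serialisation_counter.fetch_add(1);
    seg.trx_nos.push_back(no);           // numbers within one segment stay ascending

    if (was_empty) {
        purge_queue.emplace(no, &seg);   // one queue entry per non-empty segment
    }
}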
*/ - if (wsrep_is_wsrep_xid(trx->xid)) { - trx_sys_update_wsrep_checkpoint(trx->xid, sys_header, mtr); - } -#endif /* WITH_WSREP */ - - /* Update the latest MySQL binlog name and offset info - in trx sys header if MySQL binlogging is on or the database - server is a MySQL replication slave */ - - if (trx->mysql_log_file_name != NULL - && trx->mysql_log_file_name[0] != '\0') { - - trx_sys_update_mysql_binlog_offset( - trx->mysql_log_file_name, - trx->mysql_log_offset, - sys_header, - mtr); - - trx->mysql_log_file_name = NULL; - } - - return(true); + trx->mysql_log_file_name = NULL; } /******************************************************************** @@ -1602,9 +1271,6 @@ trx_update_mod_tables_timestamp( /*============================*/ trx_t* trx) /*!< in: transaction */ { - - ut_ad(trx->id != 0); - /* consider using trx->start_time if calling time() is too expensive here */ const time_t now = time(NULL); @@ -1623,58 +1289,13 @@ trx_update_mod_tables_timestamp( "garbage" in table->update_time is justified because protecting it with a latch here would be too performance intrusive. */ - (*it)->update_time = now; + dict_table_t* table = it->first; + table->update_time = now; } trx->mod_tables.clear(); } -/** -Erase the transaction from running transaction lists and serialization -list. Active RW transaction list of a MVCC snapshot(ReadView::prepare) -won't include this transaction after this call. All implicit locks are -also released by this call as trx is removed from rw_trx_list. -@param[in] trx Transaction to erase, must have an ID > 0 -@param[in] serialised true if serialisation log was written */ -static -void -trx_erase_lists( - trx_t* trx, - bool serialised) -{ - ut_ad(trx->id > 0); - trx_sys_mutex_enter(); - - if (serialised) { - UT_LIST_REMOVE(trx_sys->serialisation_list, trx); - } - - trx_ids_t::iterator it = std::lower_bound( - trx_sys->rw_trx_ids.begin(), - trx_sys->rw_trx_ids.end(), - trx->id); - ut_ad(*it == trx->id); - trx_sys->rw_trx_ids.erase(it); - - if (trx->read_only || trx->rsegs.m_redo.rseg == NULL) { - - ut_ad(!trx->in_rw_trx_list); - } else { - - UT_LIST_REMOVE(trx_sys->rw_trx_list, trx); - ut_d(trx->in_rw_trx_list = false); - ut_ad(trx_sys_validate_trx_list()); - - if (trx->read_view != NULL) { - trx_sys->mvcc->view_close(trx->read_view, true); - } - } - - trx_sys->rw_trx_set.erase(TrxTrack(trx->id)); - - trx_sys_mutex_exit(); -} - /****************************************************************//** Commits a transaction in memory. */ static @@ -1682,21 +1303,18 @@ void trx_commit_in_memory( /*=================*/ trx_t* trx, /*!< in/out: transaction */ - const mtr_t* mtr, /*!< in: mini-transaction of + const mtr_t* mtr) /*!< in: mini-transaction of trx_write_serialisation_history(), or NULL if the transaction did not modify anything */ - bool serialised) - /*!< in: true if serialisation log was - written */ { trx->must_flush_log_later = false; + trx->read_view.close(); if (trx_is_autocommit_non_locking(trx)) { ut_ad(trx->id == 0); ut_ad(trx->read_only); ut_a(!trx->is_recovered); ut_ad(trx->rsegs.m_redo.rseg == NULL); - ut_ad(!trx->in_rw_trx_list); /* Note: We are asserting without holding the lock mutex. But that is OK because this transaction is not waiting and cannot @@ -1709,15 +1327,11 @@ trx_commit_in_memory( there is an inherent race here around state transition during printouts. We ignore this race for the sake of efficiency. 
However, the trx_sys_t::mutex will protect the trx_t instance - and it cannot be removed from the mysql_trx_list and freed + and it cannot be removed from the trx_list and freed without first acquiring the trx_sys_t::mutex. */ ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE)); - if (trx->read_view != NULL) { - trx_sys->mvcc->view_close(trx->read_view, false); - } - MONITOR_INC(MONITOR_TRX_NL_RO_COMMIT); DBUG_LOG("trx", "Autocommit in memory: " << trx); @@ -1733,12 +1347,10 @@ trx_commit_in_memory( } } #endif /* UNIV_DEBUG */ - trx_mutex_enter(trx); trx->commit_state(); - trx_mutex_exit(trx); if (trx->id) { - trx_erase_lists(trx, serialised); + trx_sys.deregister_rw(trx); /* Wait for any implicit-to-explicit lock conversions to cease, so that there will be no @@ -1746,29 +1358,24 @@ trx_commit_in_memory( while (UNIV_UNLIKELY(trx->is_referenced())) { ut_delay(srv_spin_wait_delay); } - - trx->release_locks(); - trx->id = 0; } else { ut_ad(trx->read_only || !trx->rsegs.m_redo.rseg); - ut_ad(!trx->in_rw_trx_list); - trx->release_locks(); } - DEBUG_SYNC_C("after_trx_committed_in_memory"); - if (trx->read_only || !trx->rsegs.m_redo.rseg) { MONITOR_INC(MONITOR_TRX_RO_COMMIT); - if (trx->read_view) { - trx_sys->mvcc->view_close( - trx->read_view, false); - } } else { + trx_update_mod_tables_timestamp(trx); MONITOR_INC(MONITOR_TRX_RW_COMMIT); + trx->is_recovered = false; } + + trx->release_locks(); + trx->id = 0; + DEBUG_SYNC_C("after_trx_committed_in_memory"); } - ut_ad(!trx->rsegs.m_redo.update_undo); + ut_ad(!trx->rsegs.m_redo.undo); if (trx_rseg_t* rseg = trx->rsegs.m_redo.rseg) { mutex_enter(&rseg->mutex); @@ -1776,14 +1383,14 @@ trx_commit_in_memory( --rseg->trx_ref_count; mutex_exit(&rseg->mutex); - if (trx_undo_t*& insert = trx->rsegs.m_redo.insert_undo) { + if (trx_undo_t*& insert = trx->rsegs.m_redo.old_insert) { ut_ad(insert->rseg == rseg); trx_undo_commit_cleanup(insert, false); insert = NULL; } } - ut_ad(!trx->rsegs.m_redo.insert_undo); + ut_ad(!trx->rsegs.m_redo.old_insert); if (mtr != NULL) { if (trx_undo_t*& undo = trx->rsegs.m_noredo.undo) { @@ -1863,9 +1470,6 @@ trx_commit_in_memory( trx->wsrep = false; #endif - /* trx->in_mysql_trx_list would hold between - trx_allocate_for_mysql() and trx_free_for_mysql(). It does not - hold for recovered transactions or system transactions. */ assert_trx_is_free(trx); trx_init(trx); @@ -1876,19 +1480,18 @@ trx_commit_in_memory( srv_wake_purge_thread_if_not_active(); } -/****************************************************************//** -Commits a transaction and a mini-transaction. */ -void -trx_commit_low( -/*===========*/ - trx_t* trx, /*!< in/out: transaction */ - mtr_t* mtr) /*!< in/out: mini-transaction (will be committed), - or NULL if trx made no modifications */ +/** Commit a transaction and a mini-transaction. +@param[in,out] trx transaction +@param[in,out] mtr mini-transaction (NULL if no modifications) */ +void trx_commit_low(trx_t* trx, mtr_t* mtr) { assert_trx_nonlocking_or_in_list(trx); ut_ad(!trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY)); ut_ad(!mtr || mtr->is_active()); - ut_ad(!mtr == !trx->has_logged()); + ut_d(bool aborted = trx->in_rollback + && trx->error_state == DB_DEADLOCK); + ut_ad(!mtr == (aborted || !trx->has_logged_or_recovered())); + ut_ad(!mtr || !aborted); /* undo_no is non-zero if we're doing the final commit. 
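Note: the commit path above spins with ut_delay() until trx_t::is_referenced() drops to zero before releasing locks, because a concurrent implicit-to-explicit lock conversion may briefly pin the transaction object. A stripped-down illustration of that wait, with std::atomic and yield standing in for the InnoDB primitives:

#include <atomic>
#include <thread>

struct TrxRef {
    std::atomic<int> n_ref{0};           // incremented while another thread inspects us

    bool is_referenced() const {
        return n_ref.load(std::memory_order_acquire) > 0;
    }
};

// Called by the committing thread: once this returns, no concurrent
// implicit-to-explicit lock conversion can still be looking at the object,
// so its lock list may be released and the object reused.
void wait_until_unreferenced(const TrxRef& trx)
{
    while (trx.is_referenced()) {
        std::this_thread::yield();       // ut_delay(srv_spin_wait_delay) in InnoDB
    }
}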
*/ if (trx->fts_trx != NULL && trx->undo_no != 0) { @@ -1912,10 +1515,12 @@ trx_commit_low( } } - bool serialised; +#ifndef DBUG_OFF + const bool debug_sync = trx->mysql_thd && trx->has_logged_persistent(); +#endif if (mtr != NULL) { - serialised = trx_write_serialisation_history(trx, mtr); + trx_write_serialisation_history(trx, mtr); /* The following call commits the mini-transaction, making the whole transaction committed in the file-based world, at this @@ -1944,9 +1549,6 @@ trx_commit_low( DBUG_SUICIDE(); }); /*--------------*/ - - } else { - serialised = false; } #ifndef DBUG_OFF /* In case of this function is called from a stack executing @@ -1957,12 +1559,12 @@ trx_commit_low( thd->debug_sync_control defined any longer. However the stack is possible only with a prepared trx not updating any data. */ - if (trx->mysql_thd != NULL && trx->has_logged_persistent()) { + if (debug_sync) { DEBUG_SYNC_C("before_trx_state_committed_in_memory"); } #endif - trx_commit_in_memory(trx, mtr, serialised); + trx_commit_in_memory(trx, mtr); } /****************************************************************//** @@ -1978,7 +1580,7 @@ trx_commit( DBUG_EXECUTE_IF("ib_trx_commit_crash_before_trx_commit_start", DBUG_SUICIDE();); - if (trx->has_logged()) { + if (trx->has_logged_or_recovered()) { mtr = &local_mtr; mtr->start(); } else { @@ -1990,82 +1592,13 @@ trx_commit( } /****************************************************************//** -Cleans up a transaction at database startup. The cleanup is needed if -the transaction already got to the middle of a commit when the database -crashed, and we cannot roll it back. */ -void -trx_cleanup_at_db_startup( -/*======================*/ - trx_t* trx) /*!< in: transaction */ -{ - ut_ad(trx->is_recovered); - ut_ad(!trx->rsegs.m_noredo.undo); - ut_ad(!trx->rsegs.m_redo.update_undo); - - if (trx_undo_t*& undo = trx->rsegs.m_redo.insert_undo) { - ut_ad(undo->rseg == trx->rsegs.m_redo.rseg); - trx_undo_commit_cleanup(undo, false); - undo = NULL; - } - - memset(&trx->rsegs, 0x0, sizeof(trx->rsegs)); - trx->undo_no = 0; - trx->undo_rseg_space = 0; - trx->last_sql_stat_start.least_undo_no = 0; - - trx_sys_mutex_enter(); - - ut_a(!trx->read_only); - - UT_LIST_REMOVE(trx_sys->rw_trx_list, trx); - - ut_d(trx->in_rw_trx_list = FALSE); - - trx_sys_mutex_exit(); - - /* Change the transaction state without mutex protection, now - that it no longer is in the trx_list. Recovered transactions - are never placed in the mysql_trx_list. */ - ut_ad(trx->is_recovered); - ut_ad(!trx->in_rw_trx_list); - ut_ad(!trx->in_mysql_trx_list); - DBUG_LOG("trx", "Cleanup at startup: " << trx); - trx->id = 0; - trx->state = TRX_STATE_NOT_STARTED; -} - -/********************************************************************//** -Assigns a read view for a consistent read query. All the consistent reads -within the same transaction will get the same read view, which is created -when this function is first called for a new started transaction. -@return consistent read view */ -ReadView* -trx_assign_read_view( -/*=================*/ - trx_t* trx) /*!< in/out: active transaction */ -{ - ut_ad(trx->state == TRX_STATE_ACTIVE); - - if (srv_read_only_mode) { - - ut_ad(trx->read_view == NULL); - return(NULL); - - } else if (!MVCC::is_view_active(trx->read_view)) { - trx_sys->mvcc->view_open(trx->read_view, trx); - } - - return(trx->read_view); -} - -/****************************************************************//** Prepares a transaction for commit/rollback. 
*/ void trx_commit_or_rollback_prepare( /*===========================*/ trx_t* trx) /*!< in/out: transaction */ { - /* We are reading trx->state without holding trx_sys->mutex + /* We are reading trx->state without holding trx_sys.mutex here, because the commit or rollback should be invoked for a running (or recovered prepared) transaction that is associated with the current thread. */ @@ -2191,10 +1724,6 @@ trx_commit_for_mysql( case TRX_STATE_PREPARED_RECOVERED: trx->op_info = "committing"; - if (trx->id != 0) { - trx_update_mod_tables_timestamp(trx); - } - trx_commit(trx); MONITOR_DEC(MONITOR_TRX_ACTIVE); @@ -2243,7 +1772,6 @@ trx_mark_sql_stat_end( break; case TRX_STATE_NOT_STARTED: trx->undo_no = 0; - trx->undo_rseg_space = 0; /* fall through */ case TRX_STATE_ACTIVE: trx->last_sql_stat_start.least_undo_no = trx->undo_no; @@ -2259,8 +1787,7 @@ trx_mark_sql_stat_end( } /**********************************************************************//** -Prints info about a transaction. -Caller must hold trx_sys->mutex. */ +Prints info about a transaction. */ void trx_print_low( /*==========*/ @@ -2281,12 +1808,10 @@ trx_print_low( ibool newline; const char* op_info; - ut_ad(trx_sys_mutex_own()); - fprintf(f, "TRANSACTION " TRX_ID_FMT, trx_get_id_for_print(trx)); /* trx->state cannot change from or to NOT_STARTED while we - are holding the trx_sys->mutex. It may change from ACTIVE to + are holding the trx_sys.mutex. It may change from ACTIVE to PREPARED or COMMITTED. */ switch (trx->state) { case TRX_STATE_NOT_STARTED: @@ -2380,7 +1905,7 @@ state_ok: /**********************************************************************//** Prints info about a transaction. -The caller must hold lock_sys->mutex and trx_sys->mutex. +The caller must hold lock_sys.mutex. When possible, use trx_print() instead. */ void trx_print_latched( @@ -2391,7 +1916,6 @@ trx_print_latched( or 0 to use the default max length */ { ut_ad(lock_mutex_own()); - ut_ad(trx_sys_mutex_own()); trx_print_low(f, trx, max_query_len, lock_number_of_rows_locked(&trx->lock), @@ -2399,116 +1923,9 @@ trx_print_latched( mem_heap_get_size(trx->lock.lock_heap)); } -#ifdef WITH_WSREP -/**********************************************************************//** -Prints info about a transaction. -Transaction information may be retrieved without having trx_sys->mutex acquired -so it may not be completely accurate. The caller must own lock_sys->mutex -and the trx must have some locks to make sure that it does not escape -without locking lock_sys->mutex. 
*/ -UNIV_INTERN -void -wsrep_trx_print_locking( - FILE* f, - /*!< in: output stream */ - const trx_t* trx, - /*!< in: transaction */ - ulint max_query_len) - /*!< in: max query length to print, - or 0 to use the default max length */ -{ - ibool newline; - const char* op_info; - - ut_ad(lock_mutex_own()); - ut_ad(trx->lock.trx_locks.count > 0); - - fprintf(f, "TRANSACTION " TRX_ID_FMT, trx->id); - - /* trx->state may change since trx_sys->mutex is not required */ - switch (trx->state) { - case TRX_STATE_NOT_STARTED: - fputs(", not started", f); - goto state_ok; - case TRX_STATE_ACTIVE: - fprintf(f, ", ACTIVE %lu sec", - (ulong) difftime(time(NULL), trx->start_time)); - goto state_ok; - case TRX_STATE_PREPARED: - case TRX_STATE_PREPARED_RECOVERED: - fprintf(f, ", ACTIVE (PREPARED) %lu sec", - (ulong) difftime(time(NULL), trx->start_time)); - goto state_ok; - case TRX_STATE_COMMITTED_IN_MEMORY: - fputs(", COMMITTED IN MEMORY", f); - goto state_ok; - } - fprintf(f, ", state %lu", (ulong) trx->state); - ut_ad(0); -state_ok: - - /* prevent a race condition */ - op_info = trx->op_info; - - if (*op_info) { - putc(' ', f); - fputs(op_info, f); - } - - if (trx->is_recovered) { - fputs(" recovered trx", f); - } - - if (trx->declared_to_be_inside_innodb) { - fprintf(f, ", thread declared inside InnoDB %lu", - (ulong) trx->n_tickets_to_enter_innodb); - } - - putc('\n', f); - - if (trx->n_mysql_tables_in_use > 0 || trx->mysql_n_tables_locked > 0) { - fprintf(f, "mysql tables in use %lu, locked %lu\n", - (ulong) trx->n_mysql_tables_in_use, - (ulong) trx->mysql_n_tables_locked); - } - - newline = TRUE; - - /* trx->lock.que_state of an ACTIVE transaction may change - while we are not holding trx->mutex. We perform a dirty read - for performance reasons. */ - - switch (trx->lock.que_state) { - case TRX_QUE_RUNNING: - newline = FALSE; break; - case TRX_QUE_LOCK_WAIT: - fputs("LOCK WAIT ", f); break; - case TRX_QUE_ROLLING_BACK: - fputs("ROLLING BACK ", f); break; - case TRX_QUE_COMMITTING: - fputs("COMMITTING ", f); break; - default: - fprintf(f, "que state %lu ", (ulong) trx->lock.que_state); - } - - if (trx->undo_no != 0) { - newline = TRUE; - fprintf(f, ", undo log entries " TRX_ID_FMT, trx->undo_no); - } - - if (newline) { - putc('\n', f); - } - - if (trx->mysql_thd != NULL) { - innobase_mysql_print_thd( - f, trx->mysql_thd, static_cast<uint>(max_query_len)); - } -} -#endif /* WITH_WSREP */ /**********************************************************************//** Prints info about a transaction. -Acquires and releases lock_sys->mutex and trx_sys->mutex. */ +Acquires and releases lock_sys.mutex. */ void trx_print( /*======*/ @@ -2527,53 +1944,10 @@ trx_print( heap_size = mem_heap_get_size(trx->lock.lock_heap); lock_mutex_exit(); - mutex_enter(&trx_sys->mutex); - trx_print_low(f, trx, max_query_len, n_rec_locks, n_trx_locks, heap_size); - - mutex_exit(&trx_sys->mutex); } -#ifdef UNIV_DEBUG -/**********************************************************************//** -Asserts that a transaction has been started. -The caller must hold trx_sys->mutex. -@return TRUE if started */ -ibool -trx_assert_started( -/*===============*/ - const trx_t* trx) /*!< in: transaction */ -{ - ut_ad(trx_sys_mutex_own()); - - /* Non-locking autocommits should not hold any locks and this - function is only called from the locking code. */ - check_trx_state(trx); - - /* trx->state can change from or to NOT_STARTED while we are holding - trx_sys->mutex for non-locking autocommit selects but not for other - types of transactions. 
It may change from ACTIVE to PREPARED. Unless - we are holding lock_sys->mutex, it may also change to COMMITTED. */ - - switch (trx->state) { - case TRX_STATE_PREPARED: - case TRX_STATE_PREPARED_RECOVERED: - return(TRUE); - - case TRX_STATE_ACTIVE: - case TRX_STATE_COMMITTED_IN_MEMORY: - return(TRUE); - - case TRX_STATE_NOT_STARTED: - break; - } - - ut_error; - return(FALSE); -} -#endif /* UNIV_DEBUG */ - /*******************************************************************//** Compares the "weight" (or size) of two transactions. Transactions that have edited non-transactional tables are considered heavier than ones @@ -2616,11 +1990,10 @@ static lsn_t trx_prepare_low(trx_t* trx) { - mtr_t mtr; + ut_ad(!trx->rsegs.m_redo.old_insert); + ut_ad(!trx->is_recovered); - /* It is not necessary to acquire trx->undo_mutex here because - only the owning (connection) thread of the transaction is - allowed to perform XA PREPARE. */ + mtr_t mtr; if (trx_undo_t* undo = trx->rsegs.m_noredo.undo) { ut_ad(undo->rseg == trx->rsegs.m_noredo.rseg); @@ -2635,15 +2008,15 @@ trx_prepare_low(trx_t* trx) mtr.commit(); } - trx_undo_t* insert = trx->rsegs.m_redo.insert_undo; - trx_undo_t* update = trx->rsegs.m_redo.update_undo; + trx_undo_t* undo = trx->rsegs.m_redo.undo; - if (!insert && !update) { + if (!undo) { /* There were no changes to persistent tables. */ return(0); } trx_rseg_t* rseg = trx->rsegs.m_redo.rseg; + ut_ad(undo->rseg == rseg); mtr.start(); @@ -2653,17 +2026,7 @@ trx_prepare_low(trx_t* trx) world, at the serialization point of lsn. */ mutex_enter(&rseg->mutex); - - if (insert) { - ut_ad(insert->rseg == rseg); - trx_undo_set_state_at_prepare(trx, insert, false, &mtr); - } - - if (update) { - ut_ad(update->rseg == rseg); - trx_undo_set_state_at_prepare(trx, update, false, &mtr); - } - + trx_undo_set_state_at_prepare(trx, undo, false, &mtr); mutex_exit(&rseg->mutex); /* Make the XA PREPARE durable. */ @@ -2688,12 +2051,10 @@ trx_prepare( DBUG_EXECUTE_IF("ib_trx_crash_during_xa_prepare_step", DBUG_SUICIDE();); - /*--------------------------------------*/ ut_a(trx->state == TRX_STATE_ACTIVE); trx_mutex_enter(trx); trx->state = TRX_STATE_PREPARED; trx_mutex_exit(trx); - /*--------------------------------------*/ if (lsn) { /* Depending on the my.cnf options, we may now write the log @@ -2729,126 +2090,135 @@ void trx_prepare_for_mysql(trx_t* trx) trx->op_info = ""; } -/**********************************************************************//** -This function is used to find number of prepared transactions and -their transaction objects for a recovery. -@return number of prepared transactions stored in xid_list */ -int -trx_recover_for_mysql( -/*==================*/ - XID* xid_list, /*!< in/out: prepared transactions */ - ulint len) /*!< in: number of slots in xid_list */ -{ - trx_t* trx; - ulint count = 0; - - ut_ad(xid_list); - ut_ad(len); - - /* We should set those transactions which are in the prepared state - to the xid_list */ - - trx_sys_mutex_enter(); - for (trx = UT_LIST_GET_FIRST(trx_sys->rw_trx_list); - trx != NULL; - trx = UT_LIST_GET_NEXT(trx_list, trx)) { +struct trx_recover_for_mysql_callback_arg +{ + XID *xid_list; + uint len; + uint count; +}; - assert_trx_in_rw_list(trx); - /* The state of a read-write transaction cannot change - from or to NOT_STARTED while we are holding the - trx_sys->mutex. It may change to PREPARED, but not if - trx->is_recovered. It may also change to COMMITTED. 
*/ - if (trx_state_eq(trx, TRX_STATE_PREPARED)) { - trx->state = TRX_STATE_PREPARED_RECOVERED; - xid_list[count] = *trx->xid; +static my_bool trx_recover_for_mysql_callback(rw_trx_hash_element_t *element, + trx_recover_for_mysql_callback_arg *arg) +{ + DBUG_ASSERT(arg->len > 0); + mutex_enter(&element->mutex); + if (trx_t *trx= element->trx) + { + /* + The state of a read-write transaction can only change from ACTIVE to + PREPARED while we are holding the element->mutex. But since it is + executed at startup no state change should occur. + */ + if (trx_state_eq(trx, TRX_STATE_PREPARED)) + { + ut_ad(trx->is_recovered); + ut_ad(trx->id); + if (arg->count == 0) + ib::info() << "Starting recovery for XA transactions..."; + XID& xid= arg->xid_list[arg->count]; + if (arg->count++ < arg->len) + { + trx->state= TRX_STATE_PREPARED_RECOVERED; + ib::info() << "Transaction " << trx->id + << " in prepared state after recovery"; + ib::info() << "Transaction contains changes to " << trx->undo_no + << " rows"; + xid= *trx->xid; + } + } + } + mutex_exit(&element->mutex); + /* Do not terminate upon reaching arg->len; count all transactions */ + return false; +} - if (count == 0) { - ib::info() << "Starting recovery for" - " XA transactions..."; - } - ib::info() << "Transaction " - << trx_get_id_for_print(trx) - << " in prepared state after recovery"; +static my_bool trx_recover_reset_callback(rw_trx_hash_element_t *element, + void*) +{ + mutex_enter(&element->mutex); + if (trx_t *trx= element->trx) + { + if (trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED)) + trx->state= TRX_STATE_PREPARED; + } + mutex_exit(&element->mutex); + return false; +} - ib::info() << "Transaction contains changes to " - << trx->undo_no << " rows"; - count++; +/** + Find prepared transaction objects for recovery. - if (count == len) { - goto partial; - } - } - } + @param[out] xid_list prepared transactions + @param[in] len number of slots in xid_list - /* After returning the full list, reset the state, because - there will be a second call to recover the transactions. */ - for (trx = UT_LIST_GET_FIRST(trx_sys->rw_trx_list); - trx != NULL; - trx = UT_LIST_GET_NEXT(trx_list, trx)) { - if (trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED)) { - trx->state = TRX_STATE_PREPARED; - } - } + @return number of prepared transactions stored in xid_list +*/ -partial: - trx_sys_mutex_exit(); +int trx_recover_for_mysql(XID *xid_list, uint len) +{ + trx_recover_for_mysql_callback_arg arg= { xid_list, len, 0 }; - if (count > 0){ - ib::info() << count << " transactions in prepared state" - " after recovery"; - } + ut_ad(xid_list); + ut_ad(len); - return(int (count)); + /* Fill xid_list with PREPARED transactions. */ + trx_sys.rw_trx_hash.iterate_no_dups(reinterpret_cast<my_hash_walk_action> + (trx_recover_for_mysql_callback), &arg); + if (arg.count) + { + ib::info() << arg.count + << " transactions in prepared state after recovery"; + /* After returning the full list, reset the state, because + init_server_components() wants to recover the collection of + transactions twice, by first calling tc_log->open() and then + ha_recover() directly. */ + if (arg.count <= len) + trx_sys.rw_trx_hash.iterate(reinterpret_cast<my_hash_walk_action> + (trx_recover_reset_callback), NULL); + } + return int(std::min(arg.count, len)); } -/** Look up an X/Open distributed transaction in XA PREPARE state. 
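Note: trx_recover_for_mysql() now walks trx_sys.rw_trx_hash with a callback instead of scanning rw_trx_list. Each element is inspected under its own mutex, at most len XIDs are copied out, but counting continues past the end so the caller can report the full total. A self-contained sketch of that scan over plain std containers (the states, the Xid type and the per-element mutex are modelled here, not the real rw_trx_hash):

#include <algorithm>
#include <cstdint>
#include <mutex>
#include <unordered_map>

enum class State { ACTIVE, PREPARED, PREPARED_RECOVERED, COMMITTED };

struct Xid { unsigned char data[128]; };  // opaque stand-in for the MySQL XID

struct Trx {
    std::uint64_t id;
    State state;
    Xid xid;
    std::mutex mtx;                       // analogue of the rw_trx_hash element mutex
};

unsigned recover_prepared(std::unordered_map<std::uint64_t, Trx*>& trx_hash,
                          Xid* xid_list, unsigned len)
{
    unsigned count = 0;
    for (auto& [id, trx] : trx_hash) {
        (void)id;
        std::lock_guard<std::mutex> guard(trx->mtx);
        if (trx->state == State::PREPARED) {
            if (count < len) {            // only the first `len` XIDs fit in the array
                trx->state = State::PREPARED_RECOVERED;
                xid_list[count] = trx->xid;
            }
            ++count;                      // but every prepared trx is counted
        }
    }
    return std::min(count, len);          // mirrors int(std::min(arg.count, len))
}

As in the real function, a second pass (not shown) flips PREPARED_RECOVERED back to PREPARED when everything fitted, because init_server_components() recovers the collection twice.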
-@param[in] xid X/Open XA transaction identifier -@return trx on match, the trx->xid will be invalidated; -note that the trx may have been committed before the caller -acquires trx_t::mutex */ -static MY_ATTRIBUTE((warn_unused_result)) -trx_t* trx_get_trx_by_xid_low(const XID* xid) -{ - trx_t* trx; - ut_ad(trx_sys_mutex_own()); - - for (trx = UT_LIST_GET_FIRST(trx_sys->rw_trx_list); - trx != NULL; - trx = UT_LIST_GET_NEXT(trx_list, trx)) { - trx_mutex_enter(trx); - assert_trx_in_rw_list(trx); +struct trx_get_trx_by_xid_callback_arg +{ + const XID *xid; + trx_t *trx; +}; - /* Compare two X/Open XA transaction id's: their - length should be the same and binary comparison - of gtrid_length+bqual_length bytes should be - the same */ - if (trx->is_recovered - && (trx_state_eq(trx, TRX_STATE_PREPARED) - || trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED)) - && xid->eq(trx->xid)) { +static my_bool trx_get_trx_by_xid_callback(rw_trx_hash_element_t *element, + trx_get_trx_by_xid_callback_arg *arg) +{ + my_bool found= 0; + mutex_enter(&element->mutex); + if (trx_t *trx= element->trx) + { + trx_mutex_enter(trx); + if (trx->is_recovered && + (trx_state_eq(trx, TRX_STATE_PREPARED) || + trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED)) && + arg->xid->eq(reinterpret_cast<XID*>(trx->xid))) + { #ifdef WITH_WSREP - /* The commit of a prepared recovered Galera - transaction needs a valid trx->xid for - invoking trx_sys_update_wsrep_checkpoint(). */ - if (!wsrep_is_wsrep_xid(trx->xid)) + /* The commit of a prepared recovered Galera + transaction needs a valid trx->xid for + invoking trx_sys_update_wsrep_checkpoint(). */ + if (!wsrep_is_wsrep_xid(trx->xid)) #endif - /* Invalidate the XID, so that subsequent calls - will not find it. */ - trx->xid->null(); - trx_mutex_exit(trx); - break; - } - - trx_mutex_exit(trx); - } - - return(trx); + /* Invalidate the XID, so that subsequent calls will not find it. */ + trx->xid->null(); + arg->trx= trx; + found= 1; + } + trx_mutex_exit(trx); + } + mutex_exit(&element->mutex); + return found; } /** Look up an X/Open distributed transaction in XA PREPARE state. @@ -2859,24 +2229,15 @@ trx_t::mutex @retval NULL if no match */ trx_t* trx_get_trx_by_xid(const XID* xid) { - trx_t* trx; - - if (xid == NULL) { - - return(NULL); - } - - trx_sys_mutex_enter(); - - /* Recovered/Resurrected transactions are always only on the - trx_sys_t::rw_trx_list. */ - trx = trx_get_trx_by_xid_low(xid); - - trx_sys_mutex_exit(); + trx_get_trx_by_xid_callback_arg arg= { xid, 0 }; - return(trx); + if (xid) + trx_sys.rw_trx_hash.iterate(reinterpret_cast<my_hash_walk_action> + (trx_get_trx_by_xid_callback), &arg); + return arg.trx; } + /*************************************************************//** Starts the transaction if it is not yet started. */ void @@ -2895,7 +2256,7 @@ trx_start_if_not_started_xa_low( /* If the transaction is tagged as read-only then it can only write to temp tables and for such transactions we don't want to move them to the - trx_sys_t::rw_trx_list. */ + trx_sys_t::rw_trx_hash. */ if (!trx->read_only) { trx_set_rw_mode(trx); } @@ -2996,15 +2357,6 @@ trx_start_for_ddl_low( return; case TRX_STATE_ACTIVE: - - /* We have this start if not started idiom, therefore we - can't add stronger checks here. 
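Note: trx_get_trx_by_xid() has become a find-and-consume lookup: the callback stops iteration on the first match and nulls the stored XID, so a repeated XA COMMIT or ROLLBACK with the same identifier no longer finds the transaction. A small sketch of that pattern over an ordinary map, with illustrative types only:

#include <cstdint>
#include <cstring>
#include <mutex>
#include <unordered_map>

struct Xid {
    unsigned char data[128];
    bool is_null = false;

    bool eq(const Xid& other) const {
        return !is_null && !other.is_null
            && std::memcmp(data, other.data, sizeof data) == 0;
    }
    void set_null() { is_null = true; }
};

struct Trx {
    Xid xid;
    bool prepared = false;
    std::mutex mtx;
};

// Returns the matching prepared transaction at most once; the match consumes
// the XID, so a second call with the same identifier returns nullptr.
Trx* find_by_xid(std::unordered_map<std::uint64_t, Trx*>& trx_hash, const Xid& xid)
{
    for (auto& [id, trx] : trx_hash) {
        (void)id;
        std::lock_guard<std::mutex> guard(trx->mtx);
        if (trx->prepared && xid.eq(trx->xid)) {
            trx->xid.set_null();          // invalidate, as trx->xid->null() does
            return trx;                   // found: stop iterating
        }
    }
    return nullptr;
}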
*/ - trx->ddl = true; - - ut_ad(trx->dict_operation != TRX_DICT_OP_NONE); - ut_ad(trx->will_lock > 0); - return; - case TRX_STATE_PREPARED: case TRX_STATE_PREPARED_RECOVERED: case TRX_STATE_COMMITTED_IN_MEMORY: @@ -3027,48 +2379,28 @@ trx_set_rw_mode( trx_t* trx) /*!< in/out: transaction that is RW */ { ut_ad(trx->rsegs.m_redo.rseg == 0); - ut_ad(!trx->in_rw_trx_list); ut_ad(!trx_is_autocommit_non_locking(trx)); ut_ad(!trx->read_only); + ut_ad(trx->id == 0); if (high_level_read_only) { return; } /* Function is promoting existing trx from ro mode to rw mode. - In this process it has acquired trx_sys->mutex as it plan to + In this process it has acquired trx_sys.mutex as it plan to move trx from ro list to rw list. If in future, some other thread looks at this trx object while it is being promoted then ensure that both threads are synced by acquring trx->mutex to avoid decision based on in-consistent view formed during promotion. */ trx->rsegs.m_redo.rseg = trx_assign_rseg_low(); - ut_ad(trx->rsegs.m_redo.rseg != 0); - mutex_enter(&trx_sys->mutex); - - ut_ad(trx->id == 0); - trx->id = trx_sys_get_new_trx_id(); - - trx_sys->rw_trx_ids.push_back(trx->id); - - trx_sys->rw_trx_set.insert(TrxTrack(trx->id, trx)); + trx_sys.register_rw(trx); /* So that we can see our own changes. */ - if (MVCC::is_view_active(trx->read_view)) { - MVCC::set_view_creator_trx_id(trx->read_view, trx->id); + if (trx->read_view.is_open()) { + trx->read_view.set_creator_trx_id(trx->id); } - -#ifdef UNIV_DEBUG - if (trx->id > trx_sys->rw_max_trx_id) { - trx_sys->rw_max_trx_id = trx->id; - } -#endif /* UNIV_DEBUG */ - - UT_LIST_ADD_FIRST(trx_sys->rw_trx_list, trx); - - ut_d(trx->in_rw_trx_list = true); - - mutex_exit(&trx_sys->mutex); } diff --git a/storage/innobase/trx/trx0undo.cc b/storage/innobase/trx/trx0undo.cc index 5e8467af298..7254c830cde 100644 --- a/storage/innobase/trx/trx0undo.cc +++ b/storage/innobase/trx/trx0undo.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2014, 2018, MariaDB Corporation. +Copyright (c) 2014, 2020, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -74,16 +74,19 @@ can still remove old versions from the bottom of the stack. */ ------------------------------------------------------------------- latches? ------- -The contention of the trx_sys_t::mutex should be minimized. When a transaction +The contention of the trx_sys.mutex should be minimized. When a transaction does its first insert or modify in an index, an undo log is assigned for it. Then we must have an x-latch to the rollback segment header. - When the transaction does more modifys or rolls back, the undo log is -protected with undo_mutex in the transaction. - When the transaction commits, its insert undo log is either reset and -cached for a fast reuse, or freed. In these cases we must have an x-latch on -the rollback segment page. The update undo log is put to the history list. If -it is not suitable for reuse, its slot in the rollback segment is reset. In -both cases, an x-latch must be acquired on the rollback segment. + When the transaction performs modifications or rolls back, its +undo log is protected by undo page latches. +Only the thread that is associated with the transaction may hold multiple +undo page latches at a time. 
Undo pages are always private to a single +transaction. Other threads that are performing MVCC reads +or checking for implicit locks will lock at most one undo page at a time +in trx_undo_get_undo_rec_low(). + When the transaction commits, its persistent undo log is added +to the history list. If it is not suitable for reuse, its slot is reset. +In both cases, an x-latch must be acquired on the rollback segment header page. The purge operation steps through the history list without modifying it until a truncate operation occurs, which can remove undo logs from the end of the list and release undo log segments. In stepping through the list, @@ -91,16 +94,6 @@ s-latches on the undo log pages are enough, but in a truncate, x-latches must be obtained on the rollback segment and individual pages. */ /********************************************************************//** -Initializes the fields in an undo log segment page. */ -static -void -trx_undo_page_init( -/*===============*/ - page_t* undo_page, /*!< in: undo log segment page */ - ulint type, /*!< in: undo log segment type */ - mtr_t* mtr); /*!< in: mtr */ - -/********************************************************************//** Creates and initializes an undo log memory object. @return own: the undo log memory object */ static @@ -109,26 +102,58 @@ trx_undo_mem_create( /*================*/ trx_rseg_t* rseg, /*!< in: rollback segment memory object */ ulint id, /*!< in: slot index within rseg */ - ulint type, /*!< in: type of the log: TRX_UNDO_INSERT or - TRX_UNDO_UPDATE */ trx_id_t trx_id, /*!< in: id of the trx for which the undo log is created */ const XID* xid, /*!< in: X/Open XA transaction identification*/ ulint page_no,/*!< in: undo log header page number */ ulint offset);/*!< in: undo log header byte offset on page */ -/***************************************************************//** -Initializes a cached insert undo log header page for new use. NOTE that this -function has its own log record type MLOG_UNDO_HDR_REUSE. You must NOT change -the operation of this function! -@return undo log header byte offset on page */ + +/** Determine the start offset of undo log records of an undo log page. +@param[in] undo_page undo log page +@param[in] page_no undo log header page number +@param[in] offset undo log header offset +@return start offset */ static -ulint -trx_undo_insert_header_reuse( -/*=========================*/ - page_t* undo_page, /*!< in/out: insert undo log segment - header page, x-latched */ - trx_id_t trx_id, /*!< in: transaction id */ - mtr_t* mtr); /*!< in: mtr */ +uint16_t +trx_undo_page_get_start(const page_t* undo_page, ulint page_no, ulint offset) +{ + return page_no == page_get_page_no(undo_page) + ? mach_read_from_2(offset + TRX_UNDO_LOG_START + undo_page) + : TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE; +} + +/** Get the first undo log record on a page. +@param[in] page undo log page +@param[in] page_no undo log header page number +@param[in] offset undo log header page offset +@return pointer to first record +@retval NULL if none exists */ +static +trx_undo_rec_t* +trx_undo_page_get_first_rec(page_t* page, ulint page_no, ulint offset) +{ + ulint start = trx_undo_page_get_start(page, page_no, offset); + return start == trx_undo_page_get_end(page, page_no, offset) + ? NULL + : page + start; +} + +/** Get the last undo log record on a page. 
+@param[in] page undo log page +@param[in] page_no undo log header page number +@param[in] offset undo log header page offset +@return pointer to last record +@retval NULL if none exists */ +static +trx_undo_rec_t* +trx_undo_page_get_last_rec(page_t* page, ulint page_no, ulint offset) +{ + ulint end = trx_undo_page_get_end(page, page_no, offset); + + return trx_undo_page_get_start(page, page_no, offset) == end + ? NULL + : page + mach_read_from_2(page + end - 2); +} /***********************************************************************//** Gets the previous record in an undo log from the previous page. @@ -172,6 +197,31 @@ trx_undo_get_prev_rec_from_prev_page( return(trx_undo_page_get_last_rec(prev_page, page_no, offset)); } +/** Get the previous undo log record. +@param[in] rec undo log record +@param[in] page_no undo log header page number +@param[in] offset undo log header page offset +@return pointer to record +@retval NULL if none */ +static +trx_undo_rec_t* +trx_undo_page_get_prev_rec(trx_undo_rec_t* rec, ulint page_no, ulint offset) +{ + page_t* undo_page; + ulint start; + + undo_page = (page_t*) ut_align_down(rec, srv_page_size); + + start = trx_undo_page_get_start(undo_page, page_no, offset); + + if (start + undo_page == rec) { + + return(NULL); + } + + return(undo_page + mach_read_from_2(rec - 2)); +} + /***********************************************************************//** Gets the previous record in an undo log. @return undo log record, the page s-latched, NULL if none */ @@ -292,7 +342,7 @@ trx_undo_get_next_rec( @return undo log record, the page latched, NULL if none */ trx_undo_rec_t* trx_undo_get_first_rec( - ulint space, + fil_space_t* space, ulint page_no, ulint offset, ulint mode, @@ -301,7 +351,7 @@ trx_undo_get_first_rec( page_t* undo_page; trx_undo_rec_t* rec; - const page_id_t page_id(space, page_no); + const page_id_t page_id(space->id, page_no); if (mode == RW_S_LATCH) { undo_page = trx_undo_page_get_s_latched(page_id, mtr); @@ -315,176 +365,202 @@ trx_undo_get_first_rec( return(rec); } - return(trx_undo_get_next_rec_from_next_page(space, + return(trx_undo_get_next_rec_from_next_page(space->id, undo_page, page_no, offset, mode, mtr)); } /*============== UNDO LOG FILE COPY CREATION AND FREEING ==================*/ -/**********************************************************************//** -Writes the mtr log entry of an undo log page initialization. */ -UNIV_INLINE -void -trx_undo_page_init_log( -/*===================*/ - page_t* undo_page, /*!< in: undo log page */ - ulint type, /*!< in: undo log type */ - mtr_t* mtr) /*!< in: mtr */ +/** Parse MLOG_UNDO_INIT. +@param[in] ptr log record +@param[in] end_ptr end of log record buffer +@param[in,out] page page or NULL +@return end of log record +@retval NULL if the log record is incomplete */ +byte* +trx_undo_parse_page_init(const byte* ptr, const byte* end_ptr, page_t* page) { - mlog_write_initial_log_record(undo_page, MLOG_UNDO_INIT, mtr); + if (end_ptr <= ptr) { + return NULL; + } - mlog_catenate_ulint_compressed(mtr, type); + const ulint type = *ptr++; + + if (type > TRX_UNDO_UPDATE) { + recv_sys->found_corrupt_log = true; + } else if (page) { + /* Starting with MDEV-12288 in MariaDB 10.3.1, we use + type=0 for the combined insert/update undo log + pages. MariaDB 10.2 would use TRX_UNDO_INSERT or + TRX_UNDO_UPDATE. 
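Note: the new static helpers (trx_undo_page_get_start/_first_rec/_last_rec/_prev_rec) navigate an undo page purely through 16-bit big-endian offsets: the two bytes just below the end of the record area give the start of the last record, and the two bytes just before a record give the start of its predecessor. A standalone sketch of those reads, with the start/end offsets passed in as parameters rather than read from the real header constants:

#include <cstdint>

using byte = unsigned char;

// Big-endian 16-bit read, the moral equivalent of mach_read_from_2().
inline std::uint16_t read_u16(const byte* p)
{
    return std::uint16_t((p[0] << 8) | p[1]);
}

// `start` and `end` delimit the record area of the page, as byte offsets.
inline const byte* first_rec(const byte* page, std::uint16_t start, std::uint16_t end)
{
    return start == end ? nullptr : page + start;
}

inline const byte* last_rec(const byte* page, std::uint16_t start, std::uint16_t end)
{
    // The two bytes just before `end` store the offset of the last record.
    return start == end ? nullptr : page + read_u16(page + end - 2);
}

inline const byte* prev_rec(const byte* page, const byte* rec, std::uint16_t start)
{
    // The two bytes just before a record store the offset of the previous one.
    return page + start == rec ? nullptr : page + read_u16(rec - 2);
}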
*/ + mach_write_to_2(FIL_PAGE_TYPE + page, FIL_PAGE_UNDO_LOG); + mach_write_to_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE + page, + type); + mach_write_to_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_START + page, + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE); + mach_write_to_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE + page, + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE); + } + + return(const_cast<byte*>(ptr)); } -/***********************************************************//** -Parses the redo log entry of an undo log page initialization. +/** Parse MLOG_UNDO_HDR_REUSE for crash-upgrade from MariaDB 10.2. +@param[in] ptr redo log record +@param[in] end_ptr end of log buffer +@param[in,out] page undo log page or NULL @return end of log record or NULL */ byte* -trx_undo_parse_page_init( -/*=====================*/ - const byte* ptr, /*!< in: buffer */ - const byte* end_ptr,/*!< in: buffer end */ - page_t* page, /*!< in: page or NULL */ - mtr_t* mtr) /*!< in: mtr or NULL */ +trx_undo_parse_page_header_reuse( + const byte* ptr, + const byte* end_ptr, + page_t* undo_page) { - ulint type; + trx_id_t trx_id = mach_u64_parse_compressed(&ptr, end_ptr); - type = mach_parse_compressed(&ptr, end_ptr); + if (!ptr || !undo_page) { + return(const_cast<byte*>(ptr)); + } - if (ptr == NULL) { + compile_time_assert(TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE + + TRX_UNDO_LOG_XA_HDR_SIZE + < UNIV_PAGE_SIZE_MIN - 100); - return(NULL); - } + const ulint new_free = TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE + + TRX_UNDO_LOG_OLD_HDR_SIZE; - if (page) { - trx_undo_page_init(page, type, mtr); - } + /* Insert undo data is not needed after commit: we may free all + the space on the page */ - return(const_cast<byte*>(ptr)); -} + ut_ad(mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE + + undo_page) + == TRX_UNDO_INSERT); -/********************************************************************//** -Initializes the fields in an undo log segment page. */ -static -void -trx_undo_page_init( -/*===============*/ - page_t* undo_page, /*!< in: undo log segment page */ - ulint type, /*!< in: undo log segment type */ - mtr_t* mtr) /*!< in: mtr */ -{ - trx_upagef_t* page_hdr; + byte* page_hdr = undo_page + TRX_UNDO_PAGE_HDR; + mach_write_to_2(page_hdr + TRX_UNDO_PAGE_START, new_free); + mach_write_to_2(page_hdr + TRX_UNDO_PAGE_FREE, new_free); + mach_write_to_2(TRX_UNDO_SEG_HDR + TRX_UNDO_STATE + undo_page, + TRX_UNDO_ACTIVE); - page_hdr = undo_page + TRX_UNDO_PAGE_HDR; + byte* log_hdr = undo_page + TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE; - mach_write_to_2(page_hdr + TRX_UNDO_PAGE_TYPE, type); + mach_write_to_8(log_hdr + TRX_UNDO_TRX_ID, trx_id); + mach_write_to_2(log_hdr + TRX_UNDO_LOG_START, new_free); + + mach_write_to_1(log_hdr + TRX_UNDO_XID_EXISTS, FALSE); + mach_write_to_1(log_hdr + TRX_UNDO_DICT_TRANS, FALSE); + + return(const_cast<byte*>(ptr)); +} - mach_write_to_2(page_hdr + TRX_UNDO_PAGE_START, +/** Initialize the fields in an undo log segment page. 
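Note: trx_undo_parse_page_init() shows that the whole MLOG_UNDO_INIT body is now a single type byte. Parsing only has to verify that the byte is present and within range, then apply the page-header reset when a page image is supplied. A compilable sketch of that shape, with the actual page writes hidden behind a callback because the real header offsets are not repeated here:

#include <functional>

using byte = unsigned char;

// Parse a one-byte record body. Returns the position after the record, or
// nullptr when the body is missing (record split across log blocks).
const byte* parse_undo_init(const byte* ptr, const byte* end_ptr,
                            bool have_page, bool& corrupt_log,
                            const std::function<void(byte)>& apply_to_page)
{
    if (end_ptr <= ptr) {
        return nullptr;                   // incomplete record
    }
    const byte type = *ptr++;
    if (type > 2) {                       // 2 == TRX_UNDO_UPDATE; larger values are bogus
        corrupt_log = true;               // analogue of recv_sys->found_corrupt_log
    } else if (have_page) {
        apply_to_page(type);              // reset FIL_PAGE_TYPE and the page header fields
    }
    return ptr;
}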
+@param[in,out] undo_block undo page +@param[in,out] mtr mini-transaction */ +static void trx_undo_page_init(buf_block_t* undo_block, mtr_t* mtr) +{ + page_t* page = undo_block->frame; + mach_write_to_2(FIL_PAGE_TYPE + page, FIL_PAGE_UNDO_LOG); + mach_write_to_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE + page, 0); + mach_write_to_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_START + page, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE); - mach_write_to_2(page_hdr + TRX_UNDO_PAGE_FREE, + mach_write_to_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE + page, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE); - fil_page_set_type(undo_page, FIL_PAGE_UNDO_LOG); + mtr->set_modified(); + switch (mtr->get_log_mode()) { + case MTR_LOG_NONE: + case MTR_LOG_NO_REDO: + return; + case MTR_LOG_SHORT_INSERTS: + ut_ad(0); + /* fall through */ + case MTR_LOG_ALL: + break; + } - trx_undo_page_init_log(undo_page, type, mtr); + byte* log_ptr = mtr->get_log()->open(11 + 1); + log_ptr = mlog_write_initial_log_record_low( + MLOG_UNDO_INIT, + undo_block->page.id.space(), + undo_block->page.id.page_no(), + log_ptr, mtr); + *log_ptr++ = 0; + mlog_close(mtr, log_ptr); } -/***************************************************************//** -Creates a new undo log segment in file. -@return DB_SUCCESS if page creation OK possible error codes are: -DB_TOO_MANY_CONCURRENT_TRXS DB_OUT_OF_FILE_SPACE */ -static MY_ATTRIBUTE((warn_unused_result)) -dberr_t -trx_undo_seg_create( -/*================*/ - trx_rseg_t* rseg MY_ATTRIBUTE((unused)),/*!< in: rollback segment */ - trx_rsegf_t* rseg_hdr,/*!< in: rollback segment header, page - x-latched */ - ulint type, /*!< in: type of the segment: TRX_UNDO_INSERT or - TRX_UNDO_UPDATE */ - ulint* id, /*!< out: slot index within rseg header */ - page_t** undo_page, - /*!< out: segment header page x-latched, NULL - if there was an error */ - mtr_t* mtr) /*!< in: mtr */ +/** Create an undo log segment. +@param[in,out] space tablespace +@param[in,out] rseg_hdr rollback segment header (x-latched) +@param[out] id undo slot number +@param[out] err error code +@param[in,out] mtr mini-transaction +@return undo log block +@retval NULL on failure */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +buf_block_t* +trx_undo_seg_create(fil_space_t* space, trx_rsegf_t* rseg_hdr, ulint* id, + dberr_t* err, mtr_t* mtr) { ulint slot_no; - ulint space; buf_block_t* block; - trx_upagef_t* page_hdr; - trx_usegf_t* seg_hdr; ulint n_reserved; bool success; - dberr_t err = DB_SUCCESS; - ut_ad(mtr != NULL); - ut_ad(id != NULL); - ut_ad(rseg_hdr != NULL); - ut_ad(mutex_own(&(rseg->mutex))); - - /* fputs(type == TRX_UNDO_INSERT - ? "Creating insert undo log segment\n" - : "Creating update undo log segment\n", stderr); */ - slot_no = trx_rsegf_undo_find_free(rseg_hdr, mtr); + slot_no = trx_rsegf_undo_find_free(rseg_hdr); if (slot_no == ULINT_UNDEFINED) { ib::warn() << "Cannot find a free slot for an undo log. 
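Note: the new trx_undo_page_init() only emits its MLOG_UNDO_INIT record when the mini-transaction's log mode calls for it. A tiny sketch of that gate (enum and function names are illustrative; only the decision logic mirrors the switch above):

enum class LogMode { NONE, NO_REDO, SHORT_INSERTS, ALL };

// No-redo and disabled-logging mini-transactions skip the record entirely;
// SHORT_INSERTS is asserted against in debug builds but otherwise logs like ALL.
bool needs_redo_record(LogMode mode)
{
    switch (mode) {
    case LogMode::NONE:
    case LogMode::NO_REDO:
        return false;                     // the page change is not redo-logged
    case LogMode::SHORT_INSERTS:
        // invalid for page initialisation (ut_ad(0) in the real code),
        // but release builds fall through and still log
    case LogMode::ALL:
        return true;
    }
    return true;                          // unreachable with a valid enum value
}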
Do" " you have too many active transactions running" " concurrently?"; - return(DB_TOO_MANY_CONCURRENT_TRXS); + *err = DB_TOO_MANY_CONCURRENT_TRXS; + return NULL; } - space = page_get_space_id(page_align(rseg_hdr)); - success = fsp_reserve_free_extents(&n_reserved, space, 2, FSP_UNDO, mtr); if (!success) { - - return(DB_OUT_OF_FILE_SPACE); + *err = DB_OUT_OF_FILE_SPACE; + return NULL; } /* Allocate a new file segment for the undo log */ - block = fseg_create_general(space, - TRX_UNDO_SEG_HDR - + TRX_UNDO_FSEG_HEADER, TRUE, mtr, NULL); + block = fseg_create(space, TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER, + mtr, true); - fil_space_release_free_extents(space, n_reserved); + space->release_free_extents(n_reserved); if (block == NULL) { - /* No space left */ - - return(DB_OUT_OF_FILE_SPACE); + *err = DB_OUT_OF_FILE_SPACE; + return NULL; } buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE); - *undo_page = buf_block_get_frame(block); - - page_hdr = *undo_page + TRX_UNDO_PAGE_HDR; - seg_hdr = *undo_page + TRX_UNDO_SEG_HDR; - - trx_undo_page_init(*undo_page, type, mtr); + trx_undo_page_init(block, mtr); - mlog_write_ulint(page_hdr + TRX_UNDO_PAGE_FREE, + mlog_write_ulint(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE + block->frame, TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE, MLOG_2BYTES, mtr); - mlog_write_ulint(seg_hdr + TRX_UNDO_LAST_LOG, 0, MLOG_2BYTES, mtr); + mlog_write_ulint(TRX_UNDO_SEG_HDR + TRX_UNDO_LAST_LOG + block->frame, + 0, MLOG_2BYTES, mtr); - flst_init(seg_hdr + TRX_UNDO_PAGE_LIST, mtr); + flst_init(TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST + block->frame, mtr); - flst_add_last(seg_hdr + TRX_UNDO_PAGE_LIST, - page_hdr + TRX_UNDO_PAGE_NODE, mtr); + flst_add_last(TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST + block->frame, + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE + block->frame, + mtr); - trx_rsegf_set_nth_undo(rseg_hdr, slot_no, - page_get_page_no(*undo_page), mtr); *id = slot_no; + trx_rsegf_set_nth_undo(rseg_hdr, slot_no, block->page.id.page_no(), + mtr); MONITOR_INC(MONITOR_NUM_UNDO_SLOT_USED); - return(err); + *err = DB_SUCCESS; + return block; } /**********************************************************************//** @@ -537,7 +613,7 @@ trx_undo_header_create( new_free = free + TRX_UNDO_LOG_OLD_HDR_SIZE; - ut_a(free + TRX_UNDO_LOG_XA_HDR_SIZE < UNIV_PAGE_SIZE - 100); + ut_a(free + TRX_UNDO_LOG_XA_HDR_SIZE < srv_page_size - 100); mach_write_to_2(page_hdr + TRX_UNDO_PAGE_START, new_free); @@ -559,7 +635,7 @@ trx_undo_header_create( log_hdr = undo_page + free; - mach_write_to_2(log_hdr + TRX_UNDO_DEL_MARKS, TRUE); + mach_write_to_2(log_hdr + TRX_UNDO_NEEDS_PURGE, 1); mach_write_to_8(log_hdr + TRX_UNDO_TRX_ID, trx_id); mach_write_to_2(log_hdr + TRX_UNDO_LOG_START, new_free); @@ -607,10 +683,7 @@ trx_undo_write_xid( Read X/Open XA Transaction Identification (XID) from undo log header */ static void -trx_undo_read_xid( -/*==============*/ - trx_ulogf_t* log_hdr,/*!< in: undo log header */ - XID* xid) /*!< out: X/Open XA Transaction Identification */ +trx_undo_read_xid(const trx_ulogf_t* log_hdr, XID* xid) { xid->formatID=static_cast<long>(mach_read_from_4( log_hdr + TRX_UNDO_XA_FORMAT)); @@ -662,23 +735,7 @@ trx_undo_header_add_space_for_xid( MLOG_2BYTES, mtr); } -/**********************************************************************//** -Writes the mtr log entry of an undo log header reuse. 
*/ -UNIV_INLINE -void -trx_undo_insert_header_reuse_log( -/*=============================*/ - const page_t* undo_page, /*!< in: undo log header page */ - trx_id_t trx_id, /*!< in: transaction id */ - mtr_t* mtr) /*!< in: mtr */ -{ - mlog_write_initial_log_record(undo_page, MLOG_UNDO_HDR_REUSE, mtr); - - mlog_catenate_ull_compressed(mtr, trx_id); -} - -/** Parse the redo log entry of an undo log page header create or reuse. -@param[in] type MLOG_UNDO_HDR_CREATE or MLOG_UNDO_HDR_REUSE +/** Parse the redo log entry of an undo log page header create. @param[in] ptr redo log record @param[in] end_ptr end of log buffer @param[in,out] page page frame or NULL @@ -686,7 +743,6 @@ trx_undo_insert_header_reuse_log( @return end of log record or NULL */ byte* trx_undo_parse_page_header( - mlog_id_t type, const byte* ptr, const byte* end_ptr, page_t* page, @@ -695,93 +751,20 @@ trx_undo_parse_page_header( trx_id_t trx_id = mach_u64_parse_compressed(&ptr, end_ptr); if (ptr != NULL && page != NULL) { - switch (type) { - case MLOG_UNDO_HDR_CREATE: - trx_undo_header_create(page, trx_id, mtr); - return(const_cast<byte*>(ptr)); - case MLOG_UNDO_HDR_REUSE: - trx_undo_insert_header_reuse(page, trx_id, mtr); - return(const_cast<byte*>(ptr)); - default: - break; - } - ut_ad(0); + trx_undo_header_create(page, trx_id, mtr); + return(const_cast<byte*>(ptr)); } return(const_cast<byte*>(ptr)); } -/***************************************************************//** -Initializes a cached insert undo log header page for new use. NOTE that this -function has its own log record type MLOG_UNDO_HDR_REUSE. You must NOT change -the operation of this function! -@return undo log header byte offset on page */ -static -ulint -trx_undo_insert_header_reuse( -/*=========================*/ - page_t* undo_page, /*!< in/out: insert undo log segment - header page, x-latched */ - trx_id_t trx_id, /*!< in: transaction id */ - mtr_t* mtr) /*!< in: mtr */ -{ - trx_upagef_t* page_hdr; - trx_usegf_t* seg_hdr; - trx_ulogf_t* log_hdr; - ulint free; - ulint new_free; - - ut_ad(mtr && undo_page); - - page_hdr = undo_page + TRX_UNDO_PAGE_HDR; - seg_hdr = undo_page + TRX_UNDO_SEG_HDR; - - free = TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE; - - ut_a(free + TRX_UNDO_LOG_XA_HDR_SIZE < UNIV_PAGE_SIZE - 100); - - log_hdr = undo_page + free; - - new_free = free + TRX_UNDO_LOG_OLD_HDR_SIZE; - - /* Insert undo data is not needed after commit: we may free all - the space on the page */ - - ut_a(mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR - + TRX_UNDO_PAGE_TYPE) - == TRX_UNDO_INSERT); - - mach_write_to_2(page_hdr + TRX_UNDO_PAGE_START, new_free); - - mach_write_to_2(page_hdr + TRX_UNDO_PAGE_FREE, new_free); - - mach_write_to_2(seg_hdr + TRX_UNDO_STATE, TRX_UNDO_ACTIVE); - - log_hdr = undo_page + free; - - mach_write_to_8(log_hdr + TRX_UNDO_TRX_ID, trx_id); - mach_write_to_2(log_hdr + TRX_UNDO_LOG_START, new_free); - - mach_write_to_1(log_hdr + TRX_UNDO_XID_EXISTS, FALSE); - mach_write_to_1(log_hdr + TRX_UNDO_DICT_TRANS, FALSE); - - /* Write the log record MLOG_UNDO_HDR_REUSE */ - trx_undo_insert_header_reuse_log(undo_page, trx_id, mtr); - - return(free); -} - /** Allocate an undo log page. 
-@param[in,out] trx transaction @param[in,out] undo undo log @param[in,out] mtr mini-transaction that does not hold any page latch @return X-latched block if success @retval NULL on failure */ -buf_block_t* -trx_undo_add_page(trx_t* trx, trx_undo_t* undo, mtr_t* mtr) +buf_block_t* trx_undo_add_page(trx_undo_t* undo, mtr_t* mtr) { - ut_ad(mutex_own(&trx->undo_mutex)); - trx_rseg_t* rseg = undo->rseg; buf_block_t* new_block = NULL; ulint n_reserved; @@ -792,14 +775,11 @@ trx_undo_add_page(trx_t* trx, trx_undo_t* undo, mtr_t* mtr) counterpart of the tree latch, which is the rseg mutex. */ mutex_enter(&rseg->mutex); - if (rseg->curr_size == rseg->max_size) { - goto func_exit; - } header_page = trx_undo_page_get( - page_id_t(undo->space, undo->hdr_page_no), mtr); + page_id_t(undo->rseg->space->id, undo->hdr_page_no), mtr); - if (!fsp_reserve_free_extents(&n_reserved, undo->space, 1, + if (!fsp_reserve_free_extents(&n_reserved, undo->rseg->space, 1, FSP_UNDO, mtr)) { goto func_exit; } @@ -809,7 +789,7 @@ trx_undo_add_page(trx_t* trx, trx_undo_t* undo, mtr_t* mtr) + header_page, undo->top_page_no + 1, FSP_UP, TRUE, mtr, mtr); - fil_space_release_free_extents(undo->space, n_reserved); + rseg->space->release_free_extents(n_reserved); if (!new_block) { goto func_exit; @@ -819,7 +799,7 @@ trx_undo_add_page(trx_t* trx, trx_undo_t* undo, mtr_t* mtr) buf_block_dbg_add_level(new_block, SYNC_TRX_UNDO_PAGE); undo->last_page_no = new_block->page.id.page_no(); - trx_undo_page_init(new_block->frame, undo->type, mtr); + trx_undo_page_init(new_block, mtr); flst_add_last(TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST + header_page, @@ -842,9 +822,8 @@ ulint trx_undo_free_page( /*===============*/ trx_rseg_t* rseg, /*!< in: rollback segment */ - ibool in_history, /*!< in: TRUE if the undo log is in the history + bool in_history, /*!< in: TRUE if the undo log is in the history list */ - ulint space, /*!< in: space */ ulint hdr_page_no, /*!< in: header page number */ ulint page_no, /*!< in: page number to free: must not be the header page */ @@ -852,34 +831,30 @@ trx_undo_free_page( undo log page; the caller must have reserved the rollback segment mutex */ { - page_t* header_page; - page_t* undo_page; - fil_addr_t last_addr; - trx_rsegf_t* rseg_header; - ulint hist_size; + const ulint space = rseg->space->id; ut_a(hdr_page_no != page_no); ut_ad(mutex_own(&(rseg->mutex))); - undo_page = trx_undo_page_get(page_id_t(space, page_no), mtr); - - header_page = trx_undo_page_get(page_id_t(space, hdr_page_no), mtr); + page_t* undo_page = trx_undo_page_get(page_id_t(space, page_no), mtr); + page_t* header_page = trx_undo_page_get(page_id_t(space, hdr_page_no), + mtr); - flst_remove(header_page + TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST, - undo_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE, mtr); + flst_remove(TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST + header_page, + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE + undo_page, mtr); - fseg_free_page(header_page + TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER, - space, page_no, mtr); + fseg_free_page(TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER + header_page, + rseg->space, page_no, mtr); - last_addr = flst_get_last(header_page + TRX_UNDO_SEG_HDR - + TRX_UNDO_PAGE_LIST, mtr); + const fil_addr_t last_addr = flst_get_last( + TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST + header_page, mtr); rseg->curr_size--; if (in_history) { - rseg_header = trx_rsegf_get(space, rseg->page_no, mtr); - - hist_size = mtr_read_ulint(rseg_header + TRX_RSEG_HISTORY_SIZE, - MLOG_4BYTES, mtr); + trx_rsegf_t* rseg_header = trx_rsegf_get( + 
rseg->space, rseg->page_no, mtr); + uint32_t hist_size = mach_read_from_4( + rseg_header + TRX_RSEG_HISTORY_SIZE); ut_ad(hist_size > 0); mlog_write_ulint(rseg_header + TRX_RSEG_HISTORY_SIZE, hist_size - 1, MLOG_4BYTES, mtr); @@ -899,40 +874,11 @@ trx_undo_free_last_page(trx_undo_t* undo, mtr_t* mtr) ut_ad(undo->size > 0); undo->last_page_no = trx_undo_free_page( - undo->rseg, FALSE, undo->space, - undo->hdr_page_no, undo->last_page_no, mtr); + undo->rseg, false, undo->hdr_page_no, undo->last_page_no, mtr); undo->size--; } -/** Empties an undo log header page of undo records for that undo log. -Other undo logs may still have records on that page, if it is an update -undo log. -@param[in] space space -@param[in] hdr_page_no header page number -@param[in] hdr_offset header offset -@param[in,out] mtr mini-transaction */ -static -void -trx_undo_empty_header_page( - ulint space, - ulint hdr_page_no, - ulint hdr_offset, - mtr_t* mtr) -{ - page_t* header_page; - trx_ulogf_t* log_hdr; - ulint end; - - header_page = trx_undo_page_get(page_id_t(space, hdr_page_no), mtr); - - log_hdr = header_page + hdr_offset; - - end = trx_undo_page_get_end(header_page, hdr_page_no, hdr_offset); - - mlog_write_ulint(log_hdr + TRX_UNDO_LOG_START, end, MLOG_2BYTES, mtr); -} - /** Truncate the tail of an undo log during rollback. @param[in,out] undo undo log @param[in] limit all undo logs after this limit will be discarded @@ -952,7 +898,8 @@ trx_undo_truncate_end(trx_undo_t* undo, undo_no_t limit, bool is_temp) trx_undo_rec_t* trunc_here = NULL; page_t* undo_page = trx_undo_page_get( - page_id_t(undo->space, undo->last_page_no), &mtr); + page_id_t(undo->rseg->space->id, undo->last_page_no), + &mtr); trx_undo_rec_t* rec = trx_undo_page_get_last_rec( undo_page, undo->hdr_page_no, undo->hdr_offset); while (rec) { @@ -974,7 +921,7 @@ function_exit: if (trunc_here) { mlog_write_ulint(undo_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE, - trunc_here - undo_page, + ulint(trunc_here - undo_page), MLOG_2BYTES, &mtr); } @@ -1044,12 +991,18 @@ loop: page_no = page_get_page_no(undo_page); if (page_no == hdr_page_no) { - trx_undo_empty_header_page(rseg->space, - hdr_page_no, hdr_offset, - &mtr); + uint16_t end = mach_read_from_2(hdr_offset + TRX_UNDO_NEXT_LOG + + undo_page); + if (end == 0) { + end = mach_read_from_2(TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_FREE + + undo_page); + } + + mlog_write_ulint(undo_page + hdr_offset + TRX_UNDO_LOG_START, + end, MLOG_2BYTES, &mtr); } else { - trx_undo_free_page(rseg, TRUE, rseg->space, hdr_page_no, - page_no, &mtr); + trx_undo_free_page(rseg, true, hdr_page_no, page_no, &mtr); } mtr_commit(&mtr); @@ -1085,7 +1038,7 @@ trx_undo_seg_free( mutex_enter(&(rseg->mutex)); - seg_header = trx_undo_page_get(page_id_t(undo->space, + seg_header = trx_undo_page_get(page_id_t(undo->rseg->space->id, undo->hdr_page_no), &mtr) + TRX_UNDO_SEG_HDR; @@ -1111,181 +1064,108 @@ trx_undo_seg_free( /*========== UNDO LOG MEMORY COPY INITIALIZATION =====================*/ -/********************************************************************//** -Creates and initializes an undo log memory object according to the values -in the header in file, when the database is started. The memory object is -inserted in the appropriate list of rseg. 
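Note: trx_undo_truncate_end() walks the last undo page from the newest record backwards, remembering the earliest record whose undo_no is still >= limit, and then rewinds TRX_UNDO_PAGE_FREE to that record's offset. The same scan over a flat in-memory representation, as a sketch with records modelled by a vector instead of on-page data:

#include <cstdint>
#include <vector>

struct UndoRec {
    std::uint64_t undo_no;     // record sequence number within the transaction
    std::uint16_t offset;      // byte offset of the record on its page
};

// Return the offset the page "free" pointer should be rewound to so that every
// record with undo_no >= limit is discarded; 0 means nothing is truncated.
std::uint16_t truncation_offset(const std::vector<UndoRec>& page_recs /* oldest..newest */,
                                std::uint64_t limit)
{
    std::uint16_t trunc_here = 0;
    for (auto it = page_recs.rbegin(); it != page_recs.rend(); ++it) {
        if (it->undo_no >= limit) {
            trunc_here = it->offset;   // truncate at least this record, maybe more
        } else {
            break;                     // older records are kept
        }
    }
    return trunc_here;
}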
-@return own: the undo log memory object */ -static -trx_undo_t* -trx_undo_mem_create_at_db_start( -/*============================*/ - trx_rseg_t* rseg, /*!< in: rollback segment memory object */ - ulint id, /*!< in: slot index within rseg */ - ulint page_no,/*!< in: undo log segment page number */ - mtr_t* mtr) /*!< in: mtr */ +/** Read an undo log when starting up the database. +@param[in,out] rseg rollback segment +@param[in] id rollback segment slot +@param[in] page_no undo log segment page number +@param[in,out] max_trx_id the largest observed transaction ID +@return size of the undo log in pages */ +ulint +trx_undo_mem_create_at_db_start(trx_rseg_t* rseg, ulint id, ulint page_no, + trx_id_t& max_trx_id) { - page_t* undo_page; - trx_upagef_t* page_header; - trx_usegf_t* seg_header; - trx_ulogf_t* undo_header; - trx_undo_t* undo; - ulint type; - ulint state; - trx_id_t trx_id; - ulint offset; - fil_addr_t last_addr; - page_t* last_page; - trx_undo_rec_t* rec; + mtr_t mtr; XID xid; - ibool xid_exists = FALSE; - ut_a(id < TRX_RSEG_N_SLOTS); - - undo_page = trx_undo_page_get(page_id_t(rseg->space, page_no), mtr); - - page_header = undo_page + TRX_UNDO_PAGE_HDR; - - type = mtr_read_ulint(page_header + TRX_UNDO_PAGE_TYPE, MLOG_2BYTES, - mtr); - seg_header = undo_page + TRX_UNDO_SEG_HDR; + ut_ad(id < TRX_RSEG_N_SLOTS); - state = mach_read_from_2(seg_header + TRX_UNDO_STATE); - - offset = mach_read_from_2(seg_header + TRX_UNDO_LAST_LOG); - - undo_header = undo_page + offset; + mtr.start(); + const page_t* undo_page = trx_undo_page_get( + page_id_t(rseg->space->id, page_no), &mtr); + const ulint type = mach_read_from_2( + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE + undo_page); + ut_ad(type == 0 || type == TRX_UNDO_INSERT || type == TRX_UNDO_UPDATE); - trx_id = mach_read_from_8(undo_header + TRX_UNDO_TRX_ID); + uint state = mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_STATE + + undo_page); + uint offset = mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_LAST_LOG + + undo_page); - xid_exists = mtr_read_ulint(undo_header + TRX_UNDO_XID_EXISTS, - MLOG_1BYTE, mtr); + const trx_ulogf_t* undo_header = undo_page + offset; /* Read X/Open XA transaction identification if it exists, or set it to NULL. */ - xid.null(); - if (xid_exists == TRUE) { + if (undo_header[TRX_UNDO_XID_EXISTS]) { trx_undo_read_xid(undo_header, &xid); + } else { + xid.null(); } - mutex_enter(&(rseg->mutex)); - - undo = trx_undo_mem_create(rseg, id, type, trx_id, &xid, - page_no, offset); - mutex_exit(&(rseg->mutex)); - - undo->dict_operation = mtr_read_ulint( - undo_header + TRX_UNDO_DICT_TRANS, MLOG_1BYTE, mtr); - - undo->table_id = mach_read_from_8(undo_header + TRX_UNDO_TABLE_ID); - undo->state = state; - undo->size = flst_get_len(seg_header + TRX_UNDO_PAGE_LIST); - - /* If the log segment is being freed, the page list is inconsistent! 
*/ - if (state == TRX_UNDO_TO_FREE) { - - goto add_to_list; + trx_id_t trx_id = mach_read_from_8(undo_header + TRX_UNDO_TRX_ID); + if (trx_id > max_trx_id) { + max_trx_id = trx_id; } - last_addr = flst_get_last(seg_header + TRX_UNDO_PAGE_LIST, mtr); - - undo->last_page_no = last_addr.page; - undo->top_page_no = last_addr.page; - - last_page = trx_undo_page_get( - page_id_t(rseg->space, undo->last_page_no), mtr); + mutex_enter(&rseg->mutex); + trx_undo_t* undo = trx_undo_mem_create( + rseg, id, trx_id, &xid, page_no, offset); + mutex_exit(&rseg->mutex); - rec = trx_undo_page_get_last_rec(last_page, page_no, offset); + undo->dict_operation = undo_header[TRX_UNDO_DICT_TRANS]; + undo->table_id = mach_read_from_8(undo_header + TRX_UNDO_TABLE_ID); + undo->size = flst_get_len(TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST + + undo_page); - if (rec == NULL) { - undo->empty = TRUE; + if (UNIV_UNLIKELY(state == TRX_UNDO_TO_FREE)) { + /* This is an old-format insert_undo log segment that + is being freed. The page list is inconsistent. */ + ut_ad(type == TRX_UNDO_INSERT); + state = TRX_UNDO_TO_PURGE; } else { - undo->empty = FALSE; - undo->top_offset = rec - last_page; - undo->top_undo_no = trx_undo_rec_get_undo_no(rec); - } -add_to_list: - if (type == TRX_UNDO_INSERT) { - if (state != TRX_UNDO_CACHED) { + if (state == TRX_UNDO_TO_PURGE + || state == TRX_UNDO_CACHED) { + trx_id_t id = mach_read_from_8(TRX_UNDO_TRX_NO + + undo_header); + if (id > max_trx_id) { + max_trx_id = id; + } + } - UT_LIST_ADD_LAST(rseg->insert_undo_list, undo); - } else { + fil_addr_t last_addr = flst_get_last( + TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST + undo_page, + &mtr); - UT_LIST_ADD_LAST(rseg->insert_undo_cached, undo); + undo->last_page_no = last_addr.page; + undo->top_page_no = last_addr.page; - MONITOR_INC(MONITOR_NUM_UNDO_SLOT_CACHED); - } - } else { - ut_ad(type == TRX_UNDO_UPDATE); - if (state != TRX_UNDO_CACHED) { + page_t* last_page = trx_undo_page_get( + page_id_t(rseg->space->id, undo->last_page_no), &mtr); - UT_LIST_ADD_LAST(rseg->update_undo_list, undo); + if (const trx_undo_rec_t* rec = trx_undo_page_get_last_rec( + last_page, page_no, offset)) { + undo->top_offset = ulint(rec - last_page); + undo->top_undo_no = trx_undo_rec_get_undo_no(rec); + ut_ad(!undo->empty()); } else { - - UT_LIST_ADD_LAST(rseg->update_undo_cached, undo); - - MONITOR_INC(MONITOR_NUM_UNDO_SLOT_CACHED); + undo->top_undo_no = IB_ID_MAX; + ut_ad(undo->empty()); } } - return(undo); -} - -/********************************************************************//** -Initializes the undo log lists for a rollback segment memory copy. This -function is only called when the database is started or a new rollback -segment is created. 
-@return the combined size of undo log segments in pages */ -ulint -trx_undo_lists_init( -/*================*/ - trx_rseg_t* rseg) /*!< in: rollback segment memory object */ -{ - ulint size = 0; - trx_rsegf_t* rseg_header; - ulint i; - mtr_t mtr; - - mtr_start(&mtr); - - rseg_header = trx_rsegf_get_new(rseg->space, rseg->page_no, &mtr); - - for (i = 0; i < TRX_RSEG_N_SLOTS; i++) { - ulint page_no; - - page_no = trx_rsegf_get_nth_undo(rseg_header, i, &mtr); - - /* In forced recovery: try to avoid operations which look - at database pages; undo logs are rapidly changing data, and - the probability that they are in an inconsistent state is - high */ - - if (page_no != FIL_NULL - && srv_force_recovery < SRV_FORCE_NO_UNDO_LOG_SCAN) { - - trx_undo_t* undo; - - undo = trx_undo_mem_create_at_db_start( - rseg, i, page_no, &mtr); - - size += undo->size; - - mtr_commit(&mtr); - - mtr_start(&mtr); - - rseg_header = trx_rsegf_get( - rseg->space, rseg->page_no, &mtr); + undo->state = state; - /* Found a used slot */ - MONITOR_INC(MONITOR_NUM_UNDO_SLOT_USED); - } + if (state != TRX_UNDO_CACHED) { + UT_LIST_ADD_LAST(type == TRX_UNDO_INSERT + ? rseg->old_insert_list + : rseg->undo_list, undo); + } else { + UT_LIST_ADD_LAST(rseg->undo_cached, undo); + MONITOR_INC(MONITOR_NUM_UNDO_SLOT_CACHED); } - mtr_commit(&mtr); - - return(size); + mtr.commit(); + return undo->size; } /********************************************************************//** @@ -1297,8 +1177,6 @@ trx_undo_mem_create( /*================*/ trx_rseg_t* rseg, /*!< in: rollback segment memory object */ ulint id, /*!< in: slot index within rseg */ - ulint type, /*!< in: type of the log: TRX_UNDO_INSERT or - TRX_UNDO_UPDATE */ trx_id_t trx_id, /*!< in: id of the trx for which the undo log is created */ const XID* xid, /*!< in: X/Open transaction identification */ @@ -1319,9 +1197,7 @@ trx_undo_mem_create( } undo->id = id; - undo->type = type; undo->state = TRX_UNDO_ACTIVE; - undo->del_marks = FALSE; undo->trx_id = trx_id; undo->xid = *xid; @@ -1329,15 +1205,15 @@ trx_undo_mem_create( undo->rseg = rseg; - undo->space = rseg->space; undo->hdr_page_no = page_no; undo->hdr_offset = offset; undo->last_page_no = page_no; undo->size = 1; - undo->empty = TRUE; + undo->top_undo_no = IB_ID_MAX; undo->top_page_no = page_no; undo->guess_block = NULL; + ut_ad(undo->empty()); return(undo); } @@ -1359,201 +1235,198 @@ trx_undo_mem_init_for_reuse( ut_a(undo->id < TRX_RSEG_N_SLOTS); undo->state = TRX_UNDO_ACTIVE; - undo->del_marks = FALSE; undo->trx_id = trx_id; undo->xid = *xid; undo->dict_operation = FALSE; undo->hdr_offset = offset; - undo->empty = TRUE; -} - -/********************************************************************//** -Frees an undo log memory copy. */ -void -trx_undo_mem_free( -/*==============*/ - trx_undo_t* undo) /*!< in: the undo object to be freed */ -{ - ut_a(undo->id < TRX_RSEG_N_SLOTS); - - ut_free(undo); + undo->top_undo_no = IB_ID_MAX; + ut_ad(undo->empty()); } -/**********************************************************************//** -Creates a new undo log. -@return DB_SUCCESS if successful in creating the new undo lob object, -possible error codes are: DB_TOO_MANY_CONCURRENT_TRXS -DB_OUT_OF_FILE_SPACE DB_OUT_OF_MEMORY */ +/** Create an undo log. 
+@param[in,out] trx transaction +@param[in,out] rseg rollback segment +@param[out] undo undo log object +@param[out] err error code +@param[in,out] mtr mini-transaction +@return undo log block +@retval NULL on failure */ static MY_ATTRIBUTE((nonnull, warn_unused_result)) -dberr_t -trx_undo_create( -/*============*/ - trx_t* trx, /*!< in: transaction */ - trx_rseg_t* rseg, /*!< in: rollback segment memory copy */ - ulint type, /*!< in: type of the log: TRX_UNDO_INSERT or - TRX_UNDO_UPDATE */ - trx_id_t trx_id, /*!< in: id of the trx for which the undo log - is created */ - const XID* xid, /*!< in: X/Open transaction identification*/ - trx_undo_t** undo, /*!< out: the new undo log object, undefined - * if did not succeed */ - mtr_t* mtr) /*!< in: mtr */ +buf_block_t* +trx_undo_create(trx_t* trx, trx_rseg_t* rseg, trx_undo_t** undo, + dberr_t* err, mtr_t* mtr) { - trx_rsegf_t* rseg_header; - ulint page_no; - ulint offset; ulint id; - page_t* undo_page; - dberr_t err; ut_ad(mutex_own(&(rseg->mutex))); - if (rseg->curr_size == rseg->max_size) { + buf_block_t* block = trx_undo_seg_create( + rseg->space, + trx_rsegf_get(rseg->space, rseg->page_no, mtr), &id, err, mtr); - return(DB_OUT_OF_FILE_SPACE); + if (!block) { + return NULL; } rseg->curr_size++; - rseg_header = trx_rsegf_get(rseg->space, rseg->page_no, mtr); - - err = trx_undo_seg_create(rseg, rseg_header, type, &id, - &undo_page, mtr); - - if (err != DB_SUCCESS) { - /* Did not succeed */ + ulint offset = trx_undo_header_create(block->frame, trx->id, mtr); - rseg->curr_size--; + trx_undo_header_add_space_for_xid(block->frame, block->frame + offset, + mtr); - return(err); - } - - page_no = page_get_page_no(undo_page); - - offset = trx_undo_header_create(undo_page, trx_id, mtr); - - trx_undo_header_add_space_for_xid(undo_page, undo_page + offset, mtr); - - *undo = trx_undo_mem_create(rseg, id, type, trx_id, xid, - page_no, offset); + *undo = trx_undo_mem_create(rseg, id, trx->id, trx->xid, + block->page.id.page_no(), offset); if (*undo == NULL) { + *err = DB_OUT_OF_MEMORY; + /* FIXME: this will not free the undo block to the file */ + return NULL; + } else if (rseg != trx->rsegs.m_redo.rseg) { + return block; + } - err = DB_OUT_OF_MEMORY; + switch (trx_get_dict_operation(trx)) { + case TRX_DICT_OP_NONE: + break; + case TRX_DICT_OP_INDEX: + /* Do not discard the table on recovery. */ + trx->table_id = 0; + /* fall through */ + case TRX_DICT_OP_TABLE: + (*undo)->table_id = trx->table_id; + (*undo)->dict_operation = TRUE; + mlog_write_ulint(block->frame + offset + TRX_UNDO_DICT_TRANS, + TRUE, MLOG_1BYTE, mtr); + mlog_write_ull(block->frame + offset + TRX_UNDO_TABLE_ID, + trx->table_id, mtr); } - return(err); + *err = DB_SUCCESS; + return block; } /*================ UNDO LOG ASSIGNMENT AND CLEANUP =====================*/ -/********************************************************************//** -Reuses a cached undo log. -@return the undo log memory object, NULL if none cached */ +/** Reuse a cached undo log block. 
+@param[in,out] trx transaction +@param[in,out] rseg rollback segment +@param[out] pundo the undo log memory object +@param[in,out] mtr mini-transaction +@return the undo log block +@retval NULL if none cached */ static -trx_undo_t* -trx_undo_reuse_cached( -/*==================*/ - trx_t* trx, /*!< in: transaction */ - trx_rseg_t* rseg, /*!< in: rollback segment memory object */ - ulint type, /*!< in: type of the log: TRX_UNDO_INSERT or - TRX_UNDO_UPDATE */ - trx_id_t trx_id, /*!< in: id of the trx for which the undo log - is used */ - const XID* xid, /*!< in: X/Open XA transaction identification */ - mtr_t* mtr) /*!< in: mtr */ +buf_block_t* +trx_undo_reuse_cached(trx_t* trx, trx_rseg_t* rseg, trx_undo_t** pundo, + mtr_t* mtr) { - trx_undo_t* undo; - page_t* undo_page; - ulint offset; - - ut_ad(mutex_own(&(rseg->mutex))); - - if (type == TRX_UNDO_INSERT) { - - undo = UT_LIST_GET_FIRST(rseg->insert_undo_cached); - if (undo == NULL) { - - return(NULL); - } - - UT_LIST_REMOVE(rseg->insert_undo_cached, undo); - - MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_CACHED); - } else { - ut_ad(type == TRX_UNDO_UPDATE); - - undo = UT_LIST_GET_FIRST(rseg->update_undo_cached); - if (undo == NULL) { - - return(NULL); - } - - UT_LIST_REMOVE(rseg->update_undo_cached, undo); + ut_ad(mutex_own(&rseg->mutex)); - MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_CACHED); + trx_undo_t* undo = UT_LIST_GET_FIRST(rseg->undo_cached); + if (!undo) { + return NULL; } ut_ad(undo->size == 1); - ut_a(undo->id < TRX_RSEG_N_SLOTS); + ut_ad(undo->id < TRX_RSEG_N_SLOTS); - undo_page = trx_undo_page_get( - page_id_t(undo->space, undo->hdr_page_no), mtr); + buf_block_t* block = buf_page_get(page_id_t(undo->rseg->space->id, + undo->hdr_page_no), + univ_page_size, RW_X_LATCH, mtr); + if (!block) { + return NULL; + } - if (type == TRX_UNDO_INSERT) { - offset = trx_undo_insert_header_reuse(undo_page, trx_id, mtr); + buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE); - trx_undo_header_add_space_for_xid( - undo_page, undo_page + offset, mtr); - } else { - ut_a(mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR - + TRX_UNDO_PAGE_TYPE) - == TRX_UNDO_UPDATE); + UT_LIST_REMOVE(rseg->undo_cached, undo); + MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_CACHED); - offset = trx_undo_header_create(undo_page, trx_id, mtr); + *pundo = undo; - trx_undo_header_add_space_for_xid( - undo_page, undo_page + offset, mtr); + ulint offset = trx_undo_header_create(block->frame, trx->id, mtr); + /* Reset the TRX_UNDO_PAGE_TYPE in case this page is being + repurposed after upgrading to MariaDB 10.3. */ + if (ut_d(ulint type =) UNIV_UNLIKELY( + mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE + + block->frame))) { + ut_ad(type == TRX_UNDO_INSERT || type == TRX_UNDO_UPDATE); + mlog_write_ulint(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE + + block->frame, 0, MLOG_2BYTES, mtr); } - trx_undo_mem_init_for_reuse(undo, trx_id, xid, offset); - - return(undo); -} + trx_undo_header_add_space_for_xid(block->frame, block->frame + offset, + mtr); -/** Mark that an undo log header belongs to a data dictionary transaction. 
-@param[in] trx dictionary transaction -@param[in,out] undo undo log -@param[in,out] mtr mini-transaction */ -void trx_undo_mark_as_dict(const trx_t* trx, trx_undo_t* undo, mtr_t* mtr) -{ - ut_ad(undo == trx->rsegs.m_redo.insert_undo - || undo == trx->rsegs.m_redo.update_undo); + trx_undo_mem_init_for_reuse(undo, trx->id, trx->xid, offset); - page_t* hdr_page = trx_undo_page_get( - page_id_t(undo->space, undo->hdr_page_no), mtr); + if (rseg != trx->rsegs.m_redo.rseg) { + return block; + } switch (trx_get_dict_operation(trx)) { case TRX_DICT_OP_NONE: - ut_error; + return block; case TRX_DICT_OP_INDEX: /* Do not discard the table on recovery. */ - undo->table_id = 0; - break; + trx->table_id = 0; + /* fall through */ case TRX_DICT_OP_TABLE: undo->table_id = trx->table_id; - break; + undo->dict_operation = TRUE; + mlog_write_ulint(block->frame + offset + TRX_UNDO_DICT_TRANS, + TRUE, MLOG_1BYTE, mtr); + mlog_write_ull(block->frame + offset + TRX_UNDO_TABLE_ID, + trx->table_id, mtr); } - mlog_write_ulint(hdr_page + undo->hdr_offset - + TRX_UNDO_DICT_TRANS, - TRUE, MLOG_1BYTE, mtr); + return block; +} + +/** Assign an undo log for a persistent transaction. +A new undo log is created or a cached undo log reused. +@param[in,out] trx transaction +@param[out] err error code +@param[in,out] mtr mini-transaction +@return the undo log block +@retval NULL on error */ +buf_block_t* +trx_undo_assign(trx_t* trx, dberr_t* err, mtr_t* mtr) +{ + ut_ad(mtr->get_log_mode() == MTR_LOG_ALL); + + trx_undo_t* undo = trx->rsegs.m_redo.undo; - mlog_write_ull(hdr_page + undo->hdr_offset + TRX_UNDO_TABLE_ID, - undo->table_id, mtr); + if (undo) { + return buf_page_get_gen( + page_id_t(undo->rseg->space->id, undo->last_page_no), + univ_page_size, RW_X_LATCH, undo->guess_block, + BUF_GET, __FILE__, __LINE__, mtr, err); + } + + trx_rseg_t* rseg = trx->rsegs.m_redo.rseg; - undo->dict_operation = TRUE; + mutex_enter(&rseg->mutex); + buf_block_t* block = trx_undo_reuse_cached( + trx, rseg, &trx->rsegs.m_redo.undo, mtr); + + if (!block) { + block = trx_undo_create(trx, rseg, &trx->rsegs.m_redo.undo, + err, mtr); + ut_ad(!block == (*err != DB_SUCCESS)); + if (!block) { + goto func_exit; + } + } else { + *err = DB_SUCCESS; + } + + UT_LIST_ADD_FIRST(rseg->undo_list, trx->rsegs.m_redo.undo); + +func_exit: + mutex_exit(&rseg->mutex); + return block; } /** Assign an undo log for a transaction. @@ -1561,73 +1434,55 @@ A new undo log is created or a cached undo log reused. 
@param[in,out] trx transaction @param[in] rseg rollback segment @param[out] undo the undo log -@param[in] type TRX_UNDO_INSERT or TRX_UNDO_UPDATE -@retval DB_SUCCESS on success -@retval DB_TOO_MANY_CONCURRENT_TRXS -@retval DB_OUT_OF_FILE_SPACE -@retval DB_READ_ONLY -@retval DB_OUT_OF_MEMORY */ -dberr_t -trx_undo_assign_undo( - trx_t* trx, - trx_rseg_t* rseg, - trx_undo_t** undo, - ulint type) +@param[out] err error code +@param[in,out] mtr mini-transaction +@return the undo log block +@retval NULL on error */ +buf_block_t* +trx_undo_assign_low(trx_t* trx, trx_rseg_t* rseg, trx_undo_t** undo, + dberr_t* err, mtr_t* mtr) { - const bool is_temp = rseg == trx->rsegs.m_noredo.rseg; - mtr_t mtr; - dberr_t err = DB_SUCCESS; + const bool is_temp __attribute__((unused)) = rseg == trx->rsegs.m_noredo.rseg; - ut_ad(mutex_own(&trx->undo_mutex)); ut_ad(rseg == trx->rsegs.m_redo.rseg || rseg == trx->rsegs.m_noredo.rseg); - ut_ad(type == TRX_UNDO_INSERT || type == TRX_UNDO_UPDATE); - - mtr.start(); - - if (is_temp) { - mtr.set_log_mode(MTR_LOG_NO_REDO); - ut_ad(undo == &trx->rsegs.m_noredo.undo); - } else { - ut_ad(undo == (type == TRX_UNDO_INSERT - ? &trx->rsegs.m_redo.insert_undo - : &trx->rsegs.m_redo.update_undo)); + ut_ad(undo == (is_temp + ? &trx->rsegs.m_noredo.undo + : &trx->rsegs.m_redo.undo)); + ut_ad(mtr->get_log_mode() + == (is_temp ? MTR_LOG_NO_REDO : MTR_LOG_ALL)); + + if (*undo) { + return buf_page_get_gen( + page_id_t(rseg->space->id, (*undo)->last_page_no), + univ_page_size, RW_X_LATCH, (*undo)->guess_block, + BUF_GET, __FILE__, __LINE__, mtr, err); } - mutex_enter(&rseg->mutex); - DBUG_EXECUTE_IF( "ib_create_table_fail_too_many_trx", - err = DB_TOO_MANY_CONCURRENT_TRXS; - goto func_exit; + *err = DB_TOO_MANY_CONCURRENT_TRXS; return NULL; ); - *undo = trx_undo_reuse_cached(trx, rseg, type, trx->id, trx->xid, - &mtr); - if (*undo == NULL) { - err = trx_undo_create(trx, rseg, type, trx->id, trx->xid, - undo, &mtr); - if (err != DB_SUCCESS) { + mutex_enter(&rseg->mutex); + + buf_block_t* block = trx_undo_reuse_cached(trx, rseg, undo, mtr); + + if (!block) { + block = trx_undo_create(trx, rseg, undo, err, mtr); + ut_ad(!block == (*err != DB_SUCCESS)); + if (!block) { goto func_exit; } - } - - if (is_temp) { - UT_LIST_ADD_FIRST(rseg->insert_undo_list, *undo); } else { - UT_LIST_ADD_FIRST(type == TRX_UNDO_INSERT - ? rseg->insert_undo_list - : rseg->update_undo_list, *undo); - if (trx_get_dict_operation(trx) != TRX_DICT_OP_NONE) { - trx_undo_mark_as_dict(trx, *undo, &mtr); - } + *err = DB_SUCCESS; } + UT_LIST_ADD_FIRST(rseg->undo_list, *undo); + func_exit: mutex_exit(&rseg->mutex); - mtr.commit(); - - return(err); + return block; } /******************************************************************//** @@ -1647,7 +1502,7 @@ trx_undo_set_state_at_finish( ut_a(undo->id < TRX_RSEG_N_SLOTS); undo_page = trx_undo_page_get( - page_id_t(undo->space, undo->hdr_page_no), mtr); + page_id_t(undo->rseg->space->id, undo->hdr_page_no), mtr); seg_hdr = undo_page + TRX_UNDO_SEG_HDR; page_hdr = undo_page + TRX_UNDO_PAGE_HDR; @@ -1657,10 +1512,6 @@ trx_undo_set_state_at_finish( < TRX_UNDO_PAGE_REUSE_LIMIT) { state = TRX_UNDO_CACHED; - - } else if (undo->type == TRX_UNDO_INSERT) { - - state = TRX_UNDO_TO_FREE; } else { state = TRX_UNDO_TO_PURGE; } @@ -1674,7 +1525,7 @@ trx_undo_set_state_at_finish( /** Set the state of the undo log segment at a XA PREPARE or XA ROLLBACK. 
@param[in,out] trx transaction -@param[in,out] undo insert_undo or update_undo log +@param[in,out] undo undo log @param[in] rollback false=XA PREPARE, true=XA ROLLBACK @param[in,out] mtr mini-transaction @return undo log segment header page, x-latched */ @@ -1695,7 +1546,7 @@ trx_undo_set_state_at_prepare( ut_a(undo->id < TRX_RSEG_N_SLOTS); undo_page = trx_undo_page_get( - page_id_t(undo->space, undo->hdr_page_no), mtr); + page_id_t(undo->rseg->space->id, undo->hdr_page_no), mtr); seg_hdr = undo_page + TRX_UNDO_SEG_HDR; @@ -1726,43 +1577,7 @@ trx_undo_set_state_at_prepare( return(undo_page); } -/**********************************************************************//** -Adds the update undo log header as the first in the history list, and -frees the memory object, or puts it to the list of cached update undo log -segments. */ -void -trx_undo_update_cleanup( -/*====================*/ - trx_t* trx, /*!< in: trx owning the update - undo log */ - page_t* undo_page, /*!< in: update undo log header page, - x-latched */ - mtr_t* mtr) /*!< in: mtr */ -{ - trx_undo_t* undo = trx->rsegs.m_redo.update_undo; - trx_rseg_t* rseg = undo->rseg; - - ut_ad(mutex_own(&rseg->mutex)); - - trx_purge_add_update_undo_to_history(trx, undo_page, mtr); - - UT_LIST_REMOVE(rseg->update_undo_list, undo); - - trx->rsegs.m_redo.update_undo = NULL; - - if (undo->state == TRX_UNDO_CACHED) { - - UT_LIST_ADD_FIRST(rseg->update_undo_cached, undo); - - MONITOR_INC(MONITOR_NUM_UNDO_SLOT_CACHED); - } else { - ut_ad(undo->state == TRX_UNDO_TO_PURGE); - - trx_undo_mem_free(undo); - } -} - -/** Free an insert or temporary undo log after commit or rollback. +/** Free an old insert or temporary undo log after commit or rollback. The information is not needed after a commit or rollback, therefore the data can be discarded. @param[in,out] undo undo log @@ -1772,44 +1587,39 @@ trx_undo_commit_cleanup(trx_undo_t* undo, bool is_temp) { trx_rseg_t* rseg = undo->rseg; ut_ad(is_temp == !rseg->is_persistent()); + ut_ad(!is_temp || 0 == UT_LIST_GET_LEN(rseg->old_insert_list)); mutex_enter(&rseg->mutex); - UT_LIST_REMOVE(rseg->insert_undo_list, undo); + UT_LIST_REMOVE(is_temp ? rseg->undo_list : rseg->old_insert_list, + undo); if (undo->state == TRX_UNDO_CACHED) { - UT_LIST_ADD_FIRST(rseg->insert_undo_cached, undo); + UT_LIST_ADD_FIRST(rseg->undo_cached, undo); MONITOR_INC(MONITOR_NUM_UNDO_SLOT_CACHED); } else { - ut_ad(undo->state == TRX_UNDO_TO_FREE); + ut_ad(undo->state == TRX_UNDO_TO_PURGE); /* Delete first the undo log segment in the file */ mutex_exit(&rseg->mutex); - if (!srv_read_only_mode) { - trx_undo_seg_free(undo, is_temp); - } + trx_undo_seg_free(undo, is_temp); mutex_enter(&rseg->mutex); ut_ad(rseg->curr_size > undo->size); rseg->curr_size -= undo->size; - trx_undo_mem_free(undo); + ut_free(undo); } mutex_exit(&rseg->mutex); } -/********************************************************************//** -At shutdown, frees the undo logs of a PREPARED transaction. */ +/** At shutdown, frees the undo logs of a transaction. 
*/ void -trx_undo_free_prepared( -/*===================*/ - trx_t* trx) /*!< in/out: PREPARED transaction */ +trx_undo_free_at_shutdown(trx_t *trx) { - ut_ad(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS); - - if (trx->rsegs.m_redo.update_undo) { - switch (trx->rsegs.m_redo.update_undo->state) { + if (trx_undo_t*& undo = trx->rsegs.m_redo.undo) { + switch (undo->state) { case TRX_UNDO_PREPARED: break; case TRX_UNDO_CACHED: @@ -1820,10 +1630,7 @@ trx_undo_free_prepared( /* fall through */ case TRX_UNDO_ACTIVE: /* trx_t::commit_state() assigns - trx->is_recovered=false and - trx->state = TRX_STATE_COMMITTED_IN_MEMORY, - also for transactions that we faked - to TRX_STATE_PREPARED in trx_rollback_resurrected(). */ + trx->state = TRX_STATE_COMMITTED_IN_MEMORY. */ ut_a(!srv_was_started || srv_read_only_mode || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO @@ -1833,15 +1640,13 @@ trx_undo_free_prepared( ut_error; } - UT_LIST_REMOVE(trx->rsegs.m_redo.rseg->update_undo_list, - trx->rsegs.m_redo.update_undo); - trx_undo_mem_free(trx->rsegs.m_redo.update_undo); - - trx->rsegs.m_redo.update_undo = NULL; + UT_LIST_REMOVE(trx->rsegs.m_redo.rseg->undo_list, undo); + ut_free(undo); + undo = NULL; } - if (trx->rsegs.m_redo.insert_undo) { - switch (trx->rsegs.m_redo.insert_undo->state) { + if (trx_undo_t*& undo = trx->rsegs.m_redo.old_insert) { + switch (undo->state) { case TRX_UNDO_PREPARED: break; case TRX_UNDO_CACHED: @@ -1852,10 +1657,7 @@ trx_undo_free_prepared( /* fall through */ case TRX_UNDO_ACTIVE: /* trx_t::commit_state() assigns - trx->is_recovered=false and - trx->state = TRX_STATE_COMMITTED_IN_MEMORY, - also for transactions that we faked - to TRX_STATE_PREPARED in trx_rollback_resurrected(). */ + trx->state = TRX_STATE_COMMITTED_IN_MEMORY. */ ut_a(!srv_was_started || srv_read_only_mode || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO @@ -1865,19 +1667,16 @@ trx_undo_free_prepared( ut_error; } - UT_LIST_REMOVE(trx->rsegs.m_redo.rseg->insert_undo_list, - trx->rsegs.m_redo.insert_undo); - trx_undo_mem_free(trx->rsegs.m_redo.insert_undo); - - trx->rsegs.m_redo.insert_undo = NULL; + UT_LIST_REMOVE(trx->rsegs.m_redo.rseg->old_insert_list, undo); + ut_free(undo); + undo = NULL; } if (trx_undo_t*& undo = trx->rsegs.m_noredo.undo) { ut_a(undo->state == TRX_UNDO_PREPARED); - UT_LIST_REMOVE(trx->rsegs.m_noredo.rseg->insert_undo_list, - undo); - trx_undo_mem_free(undo); + UT_LIST_REMOVE(trx->rsegs.m_noredo.rseg->undo_list, undo); + ut_free(undo); undo = NULL; } } diff --git a/storage/innobase/ut/ut0crc32.cc b/storage/innobase/ut/ut0crc32.cc index 5c62309ee89..2c8aa3afe4d 100644 --- a/storage/innobase/ut/ut0crc32.cc +++ b/storage/innobase/ut/ut0crc32.cc @@ -90,18 +90,6 @@ mysys/my_perf.c, contributed by Facebook under the following license. #include <intrin.h> #endif -/** Pointer to CRC32 calculation function. */ -ut_crc32_func_t ut_crc32; - -#ifdef INNODB_BUG_ENDIAN_CRC32 -/** Pointer to CRC32 calculation function, which uses big-endian byte order -when converting byte strings to integers internally. */ -ut_crc32_func_t ut_crc32_legacy_big_endian; -#endif /* INNODB_BUG_ENDIAN_CRC32 */ - -/** Text description of CRC32 implementation */ -const char* ut_crc32_implementation; - /** Swap the byte order of an 8 byte integer. 
@param[in] i 8-byte integer @return 8-byte integer */ @@ -135,6 +123,13 @@ ut_crc32_power8( { return crc32c_vpmsum(0, buf, len); } + +ut_crc32_func_t ut_crc32 = ut_crc32_power8; +const char* ut_crc32_implementation = "Using POWER8 crc32 instructions"; +#else +uint32_t ut_crc32_sw(const byte* buf, ulint len); +ut_crc32_func_t ut_crc32 = ut_crc32_sw; +const char* ut_crc32_implementation = "Using generic crc32 instructions"; #endif #if (defined(__GNUC__) && defined(__x86_64__)) || defined(_MSC_VER) @@ -281,39 +276,6 @@ ut_crc32_64_hw( *len -= 8; } -#ifdef INNODB_BUG_ENDIAN_CRC32 -/** Calculate CRC32 over 64-bit byte string using a hardware/CPU instruction. -The byte string is converted to a 64-bit integer using big endian byte order. -@param[in,out] crc crc32 checksum so far when this function is called, -when the function ends it will contain the new checksum -@param[in,out] data data to be checksummed, the pointer will be advanced -with 8 bytes -@param[in,out] len remaining bytes, it will be decremented with 8 */ -inline -void -ut_crc32_64_legacy_big_endian_hw( - uint32_t* crc, - const byte** data, - ulint* len) -{ - uint64_t data_int = *reinterpret_cast<const uint64_t*>(*data); - -#ifndef WORDS_BIGENDIAN - data_int = ut_crc32_swap_byteorder(data_int); -#else - /* Currently we only support x86_64 (little endian) CPUs. In case - some big endian CPU supports a CRC32 instruction, then maybe we will - NOT need a byte order swap here. */ -#error Dont know how to handle big endian CPUs -#endif /* WORDS_BIGENDIAN */ - - *crc = ut_crc32_64_low_hw(*crc, data_int); - - *data += 8; - *len -= 8; -} -#endif /* INNODB_BUG_ENDIAN_CRC32 */ - /** Calculates CRC32 using hardware/CPU instructions. @param[in] buf data over which to calculate CRC32 @param[in] len data length @@ -400,58 +362,6 @@ ut_crc32_hw( return(~crc); } - -# ifdef INNODB_BUG_ENDIAN_CRC32 -/** Calculates CRC32 using hardware/CPU instructions. -This function uses big endian byte ordering when converting byte sequence to -integers. -@param[in] buf data over which to calculate CRC32 -@param[in] len data length -@return CRC-32C (polynomial 0x11EDC6F41) */ -uint32_t -ut_crc32_legacy_big_endian_hw( - const byte* buf, - ulint len) -{ - uint32_t crc = 0xFFFFFFFFU; - - /* Calculate byte-by-byte up to an 8-byte aligned address. After - this consume the input 8-bytes at a time. */ - while (len > 0 && (reinterpret_cast<uintptr_t>(buf) & 7) != 0) { - ut_crc32_8_hw(&crc, &buf, &len); - } - - while (len >= 128) { - /* This call is repeated 16 times. 16 * 8 = 128. 
*/ - ut_crc32_64_legacy_big_endian_hw(&crc, &buf, &len); - ut_crc32_64_legacy_big_endian_hw(&crc, &buf, &len); - ut_crc32_64_legacy_big_endian_hw(&crc, &buf, &len); - ut_crc32_64_legacy_big_endian_hw(&crc, &buf, &len); - ut_crc32_64_legacy_big_endian_hw(&crc, &buf, &len); - ut_crc32_64_legacy_big_endian_hw(&crc, &buf, &len); - ut_crc32_64_legacy_big_endian_hw(&crc, &buf, &len); - ut_crc32_64_legacy_big_endian_hw(&crc, &buf, &len); - ut_crc32_64_legacy_big_endian_hw(&crc, &buf, &len); - ut_crc32_64_legacy_big_endian_hw(&crc, &buf, &len); - ut_crc32_64_legacy_big_endian_hw(&crc, &buf, &len); - ut_crc32_64_legacy_big_endian_hw(&crc, &buf, &len); - ut_crc32_64_legacy_big_endian_hw(&crc, &buf, &len); - ut_crc32_64_legacy_big_endian_hw(&crc, &buf, &len); - ut_crc32_64_legacy_big_endian_hw(&crc, &buf, &len); - ut_crc32_64_legacy_big_endian_hw(&crc, &buf, &len); - } - - while (len >= 8) { - ut_crc32_64_legacy_big_endian_hw(&crc, &buf, &len); - } - - while (len > 0) { - ut_crc32_8_hw(&crc, &buf, &len); - } - - return(~crc); -} -# endif /* INNODB_BUG_ENDIAN_CRC32 */ #endif /* defined(__GNUC__) && defined(__x86_64__) || (_WIN64) */ /* CRC32 software implementation. */ @@ -649,10 +559,7 @@ integers. @param[in] buf data over which to calculate CRC32 @param[in] len data length @return CRC-32C (polynomial 0x11EDC6F41) */ -uint32_t -ut_crc32_legacy_big_endian_sw( - const byte* buf, - ulint len) +uint32_t ut_crc32_legacy_big_endian(const byte* buf, ulint len) { uint32_t crc = 0xFFFFFFFFU; @@ -704,11 +611,6 @@ ut_crc32_init() /*===========*/ { ut_crc32_slice8_table_init(); - ut_crc32 = ut_crc32_sw; -#ifdef INNODB_BUG_ENDIAN_CRC32 - ut_crc32_legacy_big_endian = ut_crc32_legacy_big_endian_sw; -#endif /* INNODB_BUG_ENDIAN_CRC32 */ - ut_crc32_implementation = "Using generic crc32 instructions"; #if (defined(__GNUC__) && defined(__x86_64__)) || defined(_MSC_VER) uint32_t vend[3]; @@ -728,10 +630,5 @@ ut_crc32_init() #endif /* INNODB_BUG_ENDIAN_CRC32 */ ut_crc32_implementation = "Using SSE2 crc32 instructions"; } - -#elif defined(HAVE_CRC32_VPMSUM) - ut_crc32 = ut_crc32_power8; - ut_crc32_implementation = "Using POWER8 crc32 instructions"; #endif - } diff --git a/storage/innobase/ut/ut0new.cc b/storage/innobase/ut/ut0new.cc index 05c7eac1d83..2a372ca9f63 100644 --- a/storage/innobase/ut/ut0new.cc +++ b/storage/innobase/ut/ut0new.cc @@ -42,7 +42,6 @@ PSI_memory_key mem_key_other; PSI_memory_key mem_key_row_log_buf; PSI_memory_key mem_key_row_merge_sort; PSI_memory_key mem_key_std; -PSI_memory_key mem_key_trx_sys_t_rw_trx_ids; #ifdef UNIV_PFS_MEMORY @@ -70,7 +69,6 @@ static PSI_memory_info pfs_info[] = { {&mem_key_row_log_buf, "row_log_buf", 0}, {&mem_key_row_merge_sort, "row_merge_sort", 0}, {&mem_key_std, "std", 0}, - {&mem_key_trx_sys_t_rw_trx_ids, "trx_sys_t::rw_trx_ids", 0}, }; /** Map used for default performance schema keys, based on file name of the diff --git a/storage/innobase/ut/ut0ut.cc b/storage/innobase/ut/ut0ut.cc index 1b3d100f9ee..7f7be193175 100644 --- a/storage/innobase/ut/ut0ut.cc +++ b/storage/innobase/ut/ut0ut.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1994, 2017, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2020, MariaDB Corporation. +Copyright (c) 2017, 2021, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -37,6 +37,10 @@ Created 5/11/1994 Heikki Tuuri #include "trx0trx.h" #include <string> #include "log.h" +#include "my_cpu.h" +#ifndef DBUG_OFF +#include "rem0rec.h" +#endif /**********************************************************//** Returns the number of milliseconds since some epoch. The @@ -136,27 +140,6 @@ ut_sprintf_timestamp( } /*************************************************************//** -Runs an idle loop on CPU. The argument gives the desired delay -in microseconds on 100 MHz Pentium + Visual C++. -@return dummy value */ -void -ut_delay( -/*=====*/ - ulint delay) /*!< in: delay in microseconds on 100 MHz Pentium */ -{ - ulint i; - - UT_LOW_PRIORITY_CPU(); - - for (i = 0; i < delay * 50; i++) { - UT_RELAX_CPU(); - UT_COMPILER_BARRIER(); - } - - UT_RESUME_PRIORITY_CPU(); -} - -/*************************************************************//** Prints the contents of a memory buffer in hex and ascii. */ void ut_print_buf( @@ -275,7 +258,7 @@ ut_get_name( name, strlen(name), trx ? trx->mysql_thd : NULL); buf[bufend - buf] = '\0'; - return(std::string(buf, 0, bufend - buf)); + return(std::string(buf, 0, size_t(bufend - buf))); } /**********************************************************************//** @@ -299,7 +282,7 @@ ut_print_name( name, strlen(name), trx ? trx->mysql_thd : NULL); - if (fwrite(buf, 1, bufend - buf, f) != (size_t) (bufend - buf)) { + if (fwrite(buf, 1, size_t(bufend - buf), f) != size_t(bufend - buf)) { perror("fwrite"); } } @@ -376,32 +359,6 @@ ut_copy_file( The returned string is static and should not be freed or modified. @param[in] num InnoDB internal error number @return string, describing the error */ -std::string -ut_get_name( -/*=========*/ - const trx_t* trx, /*!< in: transaction (NULL=no quotes) */ - ibool table_id,/*!< in: TRUE=print a table name, - FALSE=print other identifier */ - const char* name) /*!< in: name to print */ -{ - /* 2 * NAME_LEN for database and table name, - and some slack for the #mysql50# prefix and quotes */ - char buf[3 * NAME_LEN]; - const char* bufend; - ulint namelen = strlen(name); - - bufend = innobase_convert_name(buf, sizeof buf, - name, namelen, - trx ? trx->mysql_thd : NULL); - buf[bufend-buf]='\0'; - std::string str(buf); - return str; -} - -/** Convert an error number to a human readable text message. -The returned string is static and should not be freed or modified. 
-@param[in] num InnoDB internal error number -@return string, describing the error */ const char* ut_strerr( dberr_t num) @@ -673,4 +630,49 @@ fatal_or_error::~fatal_or_error() } // namespace ib +#ifndef DBUG_OFF +static char dbug_print_buf[1024]; + +const char * dbug_print_rec(const rec_t* rec, const rec_offs* offsets) +{ + rec_printer r(rec, offsets); + strmake(dbug_print_buf, r.str().c_str(), sizeof(dbug_print_buf) - 1); + return dbug_print_buf; +} + +const char * dbug_print_rec(const rec_t* rec, ulint info, const rec_offs* offsets) +{ + rec_printer r(rec, info, offsets); + strmake(dbug_print_buf, r.str().c_str(), sizeof(dbug_print_buf) - 1); + return dbug_print_buf; +} + +const char * dbug_print_rec(const dtuple_t* tuple) +{ + rec_printer r(tuple); + strmake(dbug_print_buf, r.str().c_str(), sizeof(dbug_print_buf) - 1); + return dbug_print_buf; +} + +const char * dbug_print_rec(const dfield_t* field, ulint n) +{ + rec_printer r(field, n); + strmake(dbug_print_buf, r.str().c_str(), sizeof(dbug_print_buf) - 1); + return dbug_print_buf; +} + +const char * dbug_print_rec(const rec_t* rec, dict_index_t* index) +{ + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + mem_heap_t* tmp_heap = NULL; + offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields, + ULINT_UNDEFINED, &tmp_heap); + rec_printer r(rec, offsets); + strmake(dbug_print_buf, r.str().c_str(), sizeof(dbug_print_buf) - 1); + return dbug_print_buf; +} +#endif /* !DBUG_OFF */ + #endif /* !UNIV_INNOCHECKSUM */
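The dbug_print_rec() overloads added to ut0ut.cc above all follow one pattern: render the record with rec_printer, copy the text into a static 1 KiB buffer via strmake(), and return a const char* that can be printed from a debugger or debug trace (an inference from the !DBUG_OFF guard and the naming). A minimal standalone sketch of that idiom follows; the Payload struct, the dbug_print_payload() name, and the use of std::snprintf() in place of MariaDB's strmake() are illustrative stand-ins, not part of the patch.

#include <cstdio>

// Stand-in for rec_t / dtuple_t; the real helpers format InnoDB records.
struct Payload {
	int         id;
	const char* name;
};

// Debug-only scratch buffer; deliberately static (and not thread-safe),
// mirroring dbug_print_buf[1024] in the patch.
static char dbug_print_buf[1024];

const char* dbug_print_payload(const Payload& p)
{
	// std::snprintf() truncates and NUL-terminates, which is the same
	// guarantee strmake(..., sizeof(buf) - 1) provides in the original.
	std::snprintf(dbug_print_buf, sizeof(dbug_print_buf),
		      "Payload{id=%d, name=%s}", p.id, p.name);
	return dbug_print_buf;
}

int main()
{
	Payload p = {42, "example"};
	// In InnoDB the returned pointer would typically be evaluated from a
	// debugger session; here it is simply written to stdout.
	std::printf("%s\n", dbug_print_payload(p));
	return 0;
}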
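Earlier in this patch, ut0crc32.cc gives the ut_crc32 function pointer a static default (ut_crc32_sw, or ut_crc32_power8 when HAVE_CRC32_VPMSUM is defined) instead of assigning it only inside ut_crc32_init(); the init function now only rebinds the pointer when the runtime CPU check finds hardware CRC32 support. A minimal standalone sketch of that select-at-startup dispatch follows; crc32_generic(), crc32_hw(), cpu_has_crc32() and the returned values are hypothetical stand-ins for ut_crc32_sw(), ut_crc32_hw() and the real CPU detection, not the actual CRC-32C code.

#include <cstddef>
#include <cstdint>
#include <cstdio>

typedef std::uint32_t (*crc32_func_t)(const unsigned char* buf,
				      std::size_t len);

// Stand-ins: the real functions compute CRC-32C (polynomial 0x11EDC6F41);
// these only exist to illustrate the dispatch mechanism.
static std::uint32_t crc32_generic(const unsigned char*, std::size_t)
{ return 0x11111111; }
static std::uint32_t crc32_hw(const unsigned char*, std::size_t)
{ return 0x22222222; }

// Static default binding, like "ut_crc32_func_t ut_crc32 = ut_crc32_sw;".
static crc32_func_t crc32_calc = crc32_generic;
static const char*  crc32_impl = "Using generic crc32 instructions";

// Stand-in for the CPU feature probe performed in ut_crc32_init().
static bool cpu_has_crc32() { return true; }

// Analogue of ut_crc32_init(): upgrade the binding if the CPU allows it.
static void crc32_init()
{
	if (cpu_has_crc32()) {
		crc32_calc = crc32_hw;
		crc32_impl = "Using SSE2 crc32 instructions";
	}
}

int main()
{
	crc32_init();
	const unsigned char buf[16] = {0};
	std::printf("%s: %08x\n", crc32_impl,
		    (unsigned) crc32_calc(buf, sizeof(buf)));
	return 0;
}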