diff options
author | Sergei Golubchik <serg@mariadb.org> | 2017-03-30 12:48:42 +0200 |
---|---|---|
committer | Sergei Golubchik <serg@mariadb.org> | 2017-03-30 12:48:42 +0200 |
commit | da4d71d10d23c1ac2d10b72baee14991ccb7a146 (patch) | |
tree | 7cdf3a8c8e72ca7c1c8105427c04123f025bd870 /storage/xtradb | |
parent | 9ec85009985d644ce7ae797bc3572d0ad0f69bb0 (diff) | |
parent | a00517ac9707ffd51c092f5af5d198c5ee789bb4 (diff) | |
download | mariadb-git-da4d71d10d23c1ac2d10b72baee14991ccb7a146.tar.gz |
Merge branch '10.1' into 10.2
Diffstat (limited to 'storage/xtradb')
92 files changed, 3382 insertions, 3067 deletions
diff --git a/storage/xtradb/btr/btr0btr.cc b/storage/xtradb/btr/btr0btr.cc index bce81f95ead..417eeb2c367 100644 --- a/storage/xtradb/btr/btr0btr.cc +++ b/storage/xtradb/btr/btr0btr.cc @@ -3571,8 +3571,6 @@ btr_level_list_remove_func( ulint prev_page_no; ulint next_page_no; - ut_ad(page != NULL); - ut_ad(mtr != NULL); ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX)); ut_ad(space == page_get_space_id(page)); /* Get the previous and next page numbers of page */ diff --git a/storage/xtradb/btr/btr0cur.cc b/storage/xtradb/btr/btr0cur.cc index 2acf5dfa6f7..454b085862c 100644 --- a/storage/xtradb/btr/btr0cur.cc +++ b/storage/xtradb/btr/btr0cur.cc @@ -1843,7 +1843,7 @@ btr_cur_pessimistic_insert( /*************************************************************//** For an update, checks the locks and does the undo logging. @return DB_SUCCESS, DB_WAIT_LOCK, or error number */ -UNIV_INLINE MY_ATTRIBUTE((warn_unused_result, nonnull(2,3,6,7))) +UNIV_INLINE MY_ATTRIBUTE((warn_unused_result)) dberr_t btr_cur_upd_lock_and_undo( /*======================*/ @@ -2073,7 +2073,6 @@ btr_cur_update_alloc_zip_func( const page_t* page = page_cur_get_page(cursor); ut_ad(page_zip == page_cur_get_page_zip(cursor)); - ut_ad(page_zip); ut_ad(!dict_index_is_ibuf(index)); ut_ad(rec_offs_validate(page_cur_get_rec(cursor), index, offsets)); @@ -3940,19 +3939,42 @@ inexact: return(n_rows); } -/*******************************************************************//** -Estimates the number of rows in a given index range. 
-@return estimated number of rows */ -UNIV_INTERN -ib_int64_t -btr_estimate_n_rows_in_range( -/*=========================*/ - dict_index_t* index, /*!< in: index */ - const dtuple_t* tuple1, /*!< in: range start, may also be empty tuple */ - ulint mode1, /*!< in: search mode for range start */ - const dtuple_t* tuple2, /*!< in: range end, may also be empty tuple */ - ulint mode2, /*!< in: search mode for range end */ - trx_t* trx) /*!< in: trx */ +/** If the tree gets changed too much between the two dives for the left +and right boundary then btr_estimate_n_rows_in_range_low() will retry +that many times before giving up and returning the value stored in +rows_in_range_arbitrary_ret_val. */ +static const unsigned rows_in_range_max_retries = 4; + +/** We pretend that a range has that many records if the tree keeps changing +for rows_in_range_max_retries retries while we try to estimate the records +in a given range. */ +static const int64_t rows_in_range_arbitrary_ret_val = 10; + +/** Estimates the number of rows in a given index range. +@param[in] index index +@param[in] tuple1 range start, may also be empty tuple +@param[in] mode1 search mode for range start +@param[in] tuple2 range end, may also be empty tuple +@param[in] mode2 search mode for range end +@param[in] trx trx +@param[in] nth_attempt if the tree gets modified too much while +we are trying to analyze it, then we will retry (this function will call +itself, incrementing this parameter) +@return estimated number of rows; if after rows_in_range_max_retries +retries the tree keeps changing, then we will just return +rows_in_range_arbitrary_ret_val as a result (if +nth_attempt >= rows_in_range_max_retries and the tree is modified between +the two dives). 
*/ +static +int64_t +btr_estimate_n_rows_in_range_low( + dict_index_t* index, + const dtuple_t* tuple1, + ulint mode1, + const dtuple_t* tuple2, + ulint mode2, + trx_t* trx, + unsigned nth_attempt) { btr_path_t path1[BTR_PATH_ARRAY_N_SLOTS]; btr_path_t path2[BTR_PATH_ARRAY_N_SLOTS]; @@ -3990,6 +4012,12 @@ btr_estimate_n_rows_in_range( mtr_start_trx(&mtr, trx); +#ifdef UNIV_DEBUG + if (!strcmp(index->name, "iC")) { + DEBUG_SYNC_C("btr_estimate_n_rows_in_range_between_dives"); + } +#endif + cursor.path_arr = path2; if (dtuple_get_n_fields(tuple2) > 0) { @@ -4056,6 +4084,33 @@ btr_estimate_n_rows_in_range( if (!diverged && slot1->nth_rec != slot2->nth_rec) { + /* If both slots do not point to the same page or if + the paths have crossed and the same page on both + apparently contains a different number of records, + this means that the tree must have changed between + the dive for slot1 and the dive for slot2 at the + beginning of this function. */ + if (slot1->page_no != slot2->page_no + || slot1->page_level != slot2->page_level + || (slot1->nth_rec >= slot2->nth_rec + && slot1->n_recs != slot2->n_recs)) { + + /* If the tree keeps changing even after a + few attempts, then just return some arbitrary + number. */ + if (nth_attempt >= rows_in_range_max_retries) { + return(rows_in_range_arbitrary_ret_val); + } + + const int64_t ret = + btr_estimate_n_rows_in_range_low( + index, tuple1, mode1, + tuple2, mode2, trx, + nth_attempt + 1); + + return(ret); + } + diverged = TRUE; if (slot1->nth_rec < slot2->nth_rec) { @@ -4074,7 +4129,7 @@ btr_estimate_n_rows_in_range( in this case slot1->nth_rec will point to the supr record and slot2->nth_rec will point to 6 */ - n_rows = 0; + return(0); } } else if (diverged && !diverged_lot) { @@ -4105,6 +4160,30 @@ btr_estimate_n_rows_in_range( } } +/** Estimates the number of rows in a given index range. 
+@param[in] index index +@param[in] tuple1 range start, may also be empty tuple +@param[in] mode1 search mode for range start +@param[in] tuple2 range end, may also be empty tuple +@param[in] mode2 search mode for range end +@param[in] trx trx +@return estimated number of rows */ +int64_t +btr_estimate_n_rows_in_range( + dict_index_t* index, + const dtuple_t* tuple1, + ulint mode1, + const dtuple_t* tuple2, + ulint mode2, + trx_t* trx) +{ + const int64_t ret = btr_estimate_n_rows_in_range_low( + index, tuple1, mode1, tuple2, mode2, trx, + 1 /* first attempt */); + + return(ret); +} + /*******************************************************************//** Record the number of non_null key values in a given index for each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index). @@ -4567,7 +4646,6 @@ btr_cur_disown_inherited_fields( ut_ad(rec_offs_validate(rec, index, offsets)); ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec)); ut_ad(rec_offs_any_extern(offsets)); - ut_ad(mtr); for (i = 0; i < rec_offs_n_fields(offsets); i++) { if (rec_offs_nth_extern(offsets, i) @@ -4630,9 +4708,6 @@ btr_push_update_extern_fields( ulint n; const upd_field_t* uf; - ut_ad(tuple); - ut_ad(update); - uf = update->fields; n = upd_get_n_fields(update); @@ -4816,7 +4891,6 @@ btr_store_big_rec_extern_fields( ut_ad(rec_offs_validate(rec, index, offsets)); ut_ad(rec_offs_any_extern(offsets)); - ut_ad(btr_mtr); ut_ad(mtr_memo_contains(btr_mtr, dict_index_get_lock(index), MTR_MEMO_X_LOCK)); ut_ad(mtr_memo_contains(btr_mtr, rec_block, MTR_MEMO_PAGE_X_FIX)); diff --git a/storage/xtradb/btr/btr0scrub.cc b/storage/xtradb/btr/btr0scrub.cc index 62a41d19768..560d2ece6c0 100644 --- a/storage/xtradb/btr/btr0scrub.cc +++ b/storage/xtradb/btr/btr0scrub.cc @@ -111,11 +111,18 @@ log_scrub_failure( Lock dict mutexes */ static bool -btr_scrub_lock_dict_func(ulint space, bool lock_to_close_table, +btr_scrub_lock_dict_func(ulint space_id, bool lock_to_close_table, const char * 
file, uint line) { - uint start = time(0); - uint last = start; + time_t start = time(0); + time_t last = start; + + /* FIXME: this is not the proper way of doing things. The + dict_sys->mutex should not be held by any thread for longer + than a few microseconds. It must not be held during I/O, + for example. So, what is the purpose for this busy-waiting? + This function should be rewritten as part of MDEV-8139: + Fix scrubbing tests. */ while (mutex_enter_nowait_func(&(dict_sys->mutex), file, line)) { /* if we lock to close a table, we wait forever @@ -123,19 +130,24 @@ btr_scrub_lock_dict_func(ulint space, bool lock_to_close_table, * is closing, and then instead give up */ if (lock_to_close_table == false) { - if (fil_crypt_is_closing(space)) { + fil_space_t* space = fil_space_acquire(space_id); + if (!space || space->stop_new_ops) { + if (space) { + fil_space_release(space); + } return false; } + fil_space_release(space); } os_thread_sleep(250000); - uint now = time(0); + time_t now = time(0); if (now >= last + 30) { fprintf(stderr, - "WARNING: %s:%u waited %u seconds for" + "WARNING: %s:%u waited %ld seconds for" " dict_sys lock, space: %lu" - " lock_to_close_table: %u\n", - file, line, now - start, space, + " lock_to_close_table: %d\n", + file, line, now - start, space_id, lock_to_close_table); last = now; @@ -181,16 +193,24 @@ void btr_scrub_table_close_for_thread( btr_scrub_t *scrub_data) { - if (scrub_data->current_table == NULL) + if (scrub_data->current_table == NULL) { return; + } - bool lock_for_close = true; - btr_scrub_lock_dict(scrub_data->space, lock_for_close); + fil_space_t* space = fil_space_acquire(scrub_data->space); - /* perform the actual closing */ - btr_scrub_table_close(scrub_data->current_table); + /* If tablespace is not marked as stopping perform + the actual close. 
*/ + if (space && !space->is_stopping()) { + mutex_enter(&dict_sys->mutex); + /* perform the actual closing */ + btr_scrub_table_close(scrub_data->current_table); + mutex_exit(&dict_sys->mutex); + } - btr_scrub_unlock_dict(); + if (space) { + fil_space_release(space); + } scrub_data->current_table = NULL; scrub_data->current_index = NULL; diff --git a/storage/xtradb/buf/buf0buddy.cc b/storage/xtradb/buf/buf0buddy.cc index 8cb880c1169..2ee39c6c992 100644 --- a/storage/xtradb/buf/buf0buddy.cc +++ b/storage/xtradb/buf/buf0buddy.cc @@ -485,7 +485,6 @@ buf_buddy_alloc_low( { buf_block_t* block; - ut_ad(lru); ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); ut_ad(!mutex_own(&buf_pool->zip_mutex)); ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN)); diff --git a/storage/xtradb/buf/buf0buf.cc b/storage/xtradb/buf/buf0buf.cc index 6d5776dc726..c9a3f6aa6ec 100644 --- a/storage/xtradb/buf/buf0buf.cc +++ b/storage/xtradb/buf/buf0buf.cc @@ -65,26 +65,9 @@ Created 11/5/1995 Heikki Tuuri #include "fil0pagecompress.h" #include "ha_prototypes.h" -/* Enable this for checksum error messages. */ -//#ifdef UNIV_DEBUG -//#define UNIV_DEBUG_LEVEL2 1 -//#endif - /* prototypes for new functions added to ha_innodb.cc */ trx_t* innobase_get_trx(); -/********************************************************************//** -Check if page is maybe compressed, encrypted or both when we encounter -corrupted page. Note that we can't be 100% sure if page is corrupted -or decrypt/decompress just failed. -*/ -static -ibool -buf_page_check_corrupt( -/*===================*/ - buf_page_t* bpage); /*!< in/out: buffer page read from - disk */ - static inline void _increment_page_get_statistics(buf_block_t* block, trx_t* trx) @@ -568,6 +551,7 @@ buf_block_alloc( /********************************************************************//** Checks if a page is all zeroes. 
@return TRUE if the page is all zeroes */ +UNIV_INTERN bool buf_page_is_zeroes( /*===============*/ @@ -590,7 +574,7 @@ buf_page_is_zeroes( @param[in] checksum_field1 new checksum field @param[in] checksum_field2 old checksum field @return true if the page is in crc32 checksum format */ -UNIV_INLINE +UNIV_INTERN bool buf_page_is_checksum_valid_crc32( const byte* read_buf, @@ -599,15 +583,15 @@ buf_page_is_checksum_valid_crc32( { ib_uint32_t crc32 = buf_calc_page_crc32(read_buf); -#ifdef UNIV_DEBUG_LEVEL2 if (!(checksum_field1 == crc32 && checksum_field2 == crc32)) { - ib_logf(IB_LOG_LEVEL_INFO, - "Page checksum crc32 not valid field1 %lu field2 %lu crc32 %lu.", - checksum_field1, checksum_field2, (ulint)crc32); + DBUG_PRINT("buf_checksum", + ("Page checksum crc32 not valid field1 " ULINTPF + " field2 " ULINTPF " crc32 %u.", + checksum_field1, checksum_field2, crc32)); + return (false); } -#endif - return(checksum_field1 == crc32 && checksum_field2 == crc32); + return (true); } /** Checks if the page is in innodb checksum format. 
@@ -615,7 +599,7 @@ buf_page_is_checksum_valid_crc32( @param[in] checksum_field1 new checksum field @param[in] checksum_field2 old checksum field @return true if the page is in innodb checksum format */ -UNIV_INLINE +UNIV_INTERN bool buf_page_is_checksum_valid_innodb( const byte* read_buf, @@ -634,13 +618,13 @@ buf_page_is_checksum_valid_innodb( if (checksum_field2 != mach_read_from_4(read_buf + FIL_PAGE_LSN) && checksum_field2 != buf_calc_page_old_checksum(read_buf)) { -#ifdef UNIV_DEBUG_LEVEL2 - ib_logf(IB_LOG_LEVEL_INFO, - "Page checksum innodb not valid field1 %lu field2 %lu crc32 %lu lsn %lu.", + + DBUG_PRINT("buf_checksum", + ("Page checksum innodb not valid field1 " ULINTPF + " field2 " ULINTPF "crc32 " ULINTPF " lsn " ULINTPF ".", checksum_field1, checksum_field2, buf_calc_page_old_checksum(read_buf), - mach_read_from_4(read_buf + FIL_PAGE_LSN) - ); -#endif + mach_read_from_4(read_buf + FIL_PAGE_LSN))); + return(false); } @@ -651,13 +635,13 @@ buf_page_is_checksum_valid_innodb( if (checksum_field1 != 0 && checksum_field1 != buf_calc_page_new_checksum(read_buf)) { -#ifdef UNIV_DEBUG_LEVEL2 - ib_logf(IB_LOG_LEVEL_INFO, - "Page checksum innodb not valid field1 %lu field2 %lu crc32 %lu lsn %lu.", + + DBUG_PRINT("buf_checksum", + ("Page checksum innodb not valid field1 " ULINTPF + " field2 " ULINTPF "crc32 " ULINTPF " lsn " ULINTPF ".", checksum_field1, checksum_field2, buf_calc_page_new_checksum(read_buf), - mach_read_from_4(read_buf + FIL_PAGE_LSN) - ); -#endif + mach_read_from_4(read_buf + FIL_PAGE_LSN))); + return(false); } @@ -669,22 +653,21 @@ buf_page_is_checksum_valid_innodb( @param[in] checksum_field1 new checksum field @param[in] checksum_field2 old checksum field @return true if the page is in none checksum format */ -UNIV_INLINE +UNIV_INTERN bool buf_page_is_checksum_valid_none( const byte* read_buf, ulint checksum_field1, ulint checksum_field2) { -#ifdef UNIV_DEBUG_LEVEL2 - if (!(checksum_field1 == checksum_field2 || checksum_field1 == 
BUF_NO_CHECKSUM_MAGIC)) { - ib_logf(IB_LOG_LEVEL_INFO, - "Page checksum none not valid field1 %lu field2 %lu crc32 %lu lsn %lu.", + + if (!(checksum_field1 == checksum_field2 && checksum_field1 == BUF_NO_CHECKSUM_MAGIC)) { + DBUG_PRINT("buf_checksum", + ("Page checksum none not valid field1 " ULINTPF + " field2 " ULINTPF "crc32 " ULINTPF " lsn " ULINTPF ".", checksum_field1, checksum_field2, BUF_NO_CHECKSUM_MAGIC, - mach_read_from_4(read_buf + FIL_PAGE_LSN) - ); + mach_read_from_4(read_buf + FIL_PAGE_LSN))); } -#endif return(checksum_field1 == checksum_field2 && checksum_field1 == BUF_NO_CHECKSUM_MAGIC); @@ -692,43 +675,42 @@ buf_page_is_checksum_valid_none( /********************************************************************//** Checks if a page is corrupt. -@return TRUE if corrupted */ +@param[in] check_lsn true if LSN should be checked +@param[in] read_buf Page to be checked +@param[in] zip_size compressed size or 0 +@param[in] space Pointer to tablespace +@return true if corrupted, false if not */ UNIV_INTERN -ibool +bool buf_page_is_corrupted( -/*==================*/ - bool check_lsn, /*!< in: true if we need to check - and complain about the LSN */ - const byte* read_buf, /*!< in: a database page */ - ulint zip_size) /*!< in: size of compressed page; - 0 for uncompressed pages */ + bool check_lsn, + const byte* read_buf, + ulint zip_size, + const fil_space_t* space) { ulint checksum_field1; ulint checksum_field2; ulint space_id = mach_read_from_4( read_buf + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); - ulint page_type = mach_read_from_4( + ulint page_type = mach_read_from_2( read_buf + FIL_PAGE_TYPE); - bool no_checksum = (page_type == FIL_PAGE_PAGE_COMPRESSED || - page_type == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED); - fil_space_crypt_t* crypt_data = fil_space_get_crypt_data(space_id); - - - /* Page is encrypted if encryption information is found from - tablespace and page contains used key_version. This is true - also for pages first compressed and then encrypted. 
*/ - if (crypt_data && - crypt_data->type != CRYPT_SCHEME_UNENCRYPTED && - fil_page_is_encrypted(read_buf)) { - no_checksum = true; - } - /* Return early if there is no checksum or END_LSN */ - if (no_checksum) { - return (FALSE); - } - - if (!no_checksum && !zip_size + /* We can trust page type if page compression is set on tablespace + flags because page compression flag means file must have been + created with 10.1 (later than 5.5 code base). In 10.1 page + compressed tables do not contain post compression checksum and + FIL_PAGE_END_LSN_OLD_CHKSUM field stored. Note that space can + be null if we are in fil_check_first_page() and first page + is not compressed or encrypted. Page checksum is verified + after decompression (i.e. normally pages are already + decompressed at this stage). */ + if ((page_type == FIL_PAGE_PAGE_COMPRESSED || + page_type == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED) + && space && FSP_FLAGS_HAS_PAGE_COMPRESSION(space->flags)) { + return (false); + } + + if (!zip_size && memcmp(read_buf + FIL_PAGE_LSN + 4, read_buf + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4)) { @@ -780,7 +762,7 @@ buf_page_is_corrupted( /* Check whether the checksum fields have correct values */ if (srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_NONE) { - return(FALSE); + return(false); } if (zip_size) { @@ -807,14 +789,14 @@ buf_page_is_corrupted( ib_logf(IB_LOG_LEVEL_INFO, "Checksum fields zero but page is not empty."); - return(TRUE); + return(true); } } - return(FALSE); + return(false); } - DBUG_EXECUTE_IF("buf_page_is_corrupt_failure", return(TRUE); ); + DBUG_EXECUTE_IF("buf_page_is_corrupt_failure", return(true); ); ulint page_no = mach_read_from_4(read_buf + FIL_PAGE_OFFSET); @@ -827,7 +809,7 @@ buf_page_is_corrupted( if (buf_page_is_checksum_valid_crc32(read_buf, checksum_field1, checksum_field2)) { - return(FALSE); + return(false); } if (buf_page_is_checksum_valid_none(read_buf, @@ -840,7 +822,7 @@ buf_page_is_corrupted( space_id, page_no); } - 
return(FALSE); + return(false); } if (buf_page_is_checksum_valid_innodb(read_buf, @@ -853,17 +835,17 @@ buf_page_is_corrupted( space_id, page_no); } - return(FALSE); + return(false); } - return(TRUE); + return(true); case SRV_CHECKSUM_ALGORITHM_INNODB: case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB: if (buf_page_is_checksum_valid_innodb(read_buf, checksum_field1, checksum_field2)) { - return(FALSE); + return(false); } if (buf_page_is_checksum_valid_none(read_buf, @@ -876,7 +858,7 @@ buf_page_is_corrupted( space_id, page_no); } - return(FALSE); + return(false); } if (buf_page_is_checksum_valid_crc32(read_buf, @@ -889,16 +871,16 @@ buf_page_is_corrupted( space_id, page_no); } - return(FALSE); + return(false); } - return(TRUE); + return(true); case SRV_CHECKSUM_ALGORITHM_STRICT_NONE: if (buf_page_is_checksum_valid_none(read_buf, checksum_field1, checksum_field2)) { - return(FALSE); + return(false); } if (buf_page_is_checksum_valid_crc32(read_buf, @@ -907,7 +889,7 @@ buf_page_is_corrupted( curr_algo, SRV_CHECKSUM_ALGORITHM_CRC32, space_id, page_no); - return(FALSE); + return(false); } if (buf_page_is_checksum_valid_innodb(read_buf, @@ -916,10 +898,10 @@ buf_page_is_corrupted( curr_algo, SRV_CHECKSUM_ALGORITHM_INNODB, space_id, page_no); - return(FALSE); + return(false); } - return(TRUE); + return(true); case SRV_CHECKSUM_ALGORITHM_NONE: /* should have returned FALSE earlier */ @@ -929,7 +911,7 @@ buf_page_is_corrupted( } ut_error; - return(FALSE); + return(false); } /********************************************************************//** @@ -1198,12 +1180,8 @@ buf_block_init( block->page.state = BUF_BLOCK_NOT_USED; block->page.buf_fix_count = 0; block->page.io_fix = BUF_IO_NONE; - block->page.key_version = 0; - block->page.page_encrypted = false; - block->page.page_compressed = false; block->page.encrypted = false; - block->page.stored_checksum = BUF_NO_CHECKSUM_MAGIC; - block->page.calculated_checksum = BUF_NO_CHECKSUM_MAGIC; + block->page.key_version = 0; 
block->page.real_size = 0; block->page.write_size = 0; block->modify_clock = 0; @@ -3026,14 +3004,14 @@ loop: } else if (retries < BUF_PAGE_READ_MAX_RETRIES) { ++retries; - bool corrupted = true; + bool corrupted = false; if (bpage) { corrupted = buf_page_check_corrupt(bpage); } /* Do not try again for encrypted pages */ - if (!corrupted) { + if (corrupted && bpage->encrypted) { ib_mutex_t* pmutex = buf_page_get_mutex(bpage); mutex_enter(&buf_pool->LRU_list_mutex); mutex_enter(pmutex); @@ -3062,14 +3040,14 @@ loop: retries = BUF_PAGE_READ_MAX_RETRIES; ); } else { - bool corrupted = true; + bool corrupted = false; if (bpage) { corrupted = buf_page_check_corrupt(bpage); } - if (corrupted) { - fprintf(stderr, "InnoDB: Error: Unable" + if (corrupted && !bpage->encrypted) { + ib_logf(IB_LOG_LEVEL_ERROR, "Unable" " to read tablespace %lu page no" " %lu into the buffer pool after" " %lu attempts\n" @@ -3880,12 +3858,8 @@ buf_page_init_low( bpage->newest_modification = 0; bpage->oldest_modification = 0; bpage->write_size = 0; - bpage->key_version = 0; - bpage->stored_checksum = BUF_NO_CHECKSUM_MAGIC; - bpage->calculated_checksum = BUF_NO_CHECKSUM_MAGIC; - bpage->page_encrypted = false; - bpage->page_compressed = false; bpage->encrypted = false; + bpage->key_version = 0; bpage->real_size = 0; HASH_INVALIDATE(bpage, hash); @@ -3924,15 +3898,6 @@ buf_page_init( /* Set the state of the block */ buf_block_set_file_page(block, space, offset); -#ifdef UNIV_DEBUG_VALGRIND - if (!space) { - /* Silence valid Valgrind warnings about uninitialized - data being written to data files. There are some unused - bytes on some pages that InnoDB does not initialize. */ - UNIV_MEM_VALID(block->frame, UNIV_PAGE_SIZE); - } -#endif /* UNIV_DEBUG_VALGRIND */ - buf_block_init_low(block); block->lock_hash_val = lock_rec_hash(space, offset); @@ -4598,78 +4563,80 @@ buf_mark_space_corrupt( Check if page is maybe compressed, encrypted or both when we encounter corrupted page. 
Note that we can't be 100% sure if page is corrupted or decrypt/decompress just failed. -*/ -static -ibool +@param[in,out] bpage Page +@return true if page corrupted, false if not */ +UNIV_INTERN +bool buf_page_check_corrupt( -/*===================*/ - buf_page_t* bpage) /*!< in/out: buffer page read from disk */ + buf_page_t* bpage) { ulint zip_size = buf_page_get_zip_size(bpage); byte* dst_frame = (zip_size) ? bpage->zip.data : ((buf_block_t*) bpage)->frame; - bool page_compressed = bpage->page_encrypted; - ulint stored_checksum = bpage->stored_checksum; - ulint calculated_checksum = bpage->calculated_checksum; - bool page_compressed_encrypted = bpage->page_compressed; - ulint space_id = mach_read_from_4( - dst_frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); - fil_space_crypt_t* crypt_data = fil_space_get_crypt_data(space_id); - fil_space_t* space = fil_space_found_by_id(space_id); - bool corrupted = true; - ulint key_version = bpage->key_version; - - if (key_version != 0 || page_compressed_encrypted) { - bpage->encrypted = true; + ulint space_id = bpage->space; + fil_space_t* space = fil_space_acquire_silent(space_id); + bool still_encrypted = false; + bool corrupted = false; + ulint page_type = mach_read_from_2(dst_frame + FIL_PAGE_TYPE); + fil_space_crypt_t* crypt_data = NULL; + + ut_ad(space); + crypt_data = space->crypt_data; + + /* In buf_decrypt_after_read we have either decrypted the page if + page post encryption checksum matches and used key_id is found + from the encryption plugin. If checksum did not match page was + not decrypted and it could be either encrypted and corrupted + or corrupted or good page. If we decrypted, there page could + still be corrupted if used key does not match. 
*/ + still_encrypted = (crypt_data && + crypt_data->type != CRYPT_SCHEME_UNENCRYPTED && + !bpage->encrypted && + fil_space_verify_crypt_checksum(dst_frame, zip_size, + space, bpage->offset)); + + if (!still_encrypted) { + /* If traditional checksums match, we assume that page is + not anymore encrypted. */ + corrupted = buf_page_is_corrupted(true, dst_frame, zip_size, space); + + if (!corrupted) { + bpage->encrypted = false; + } } - if (key_version != 0 || - (crypt_data && crypt_data->type != CRYPT_SCHEME_UNENCRYPTED) || - page_compressed || page_compressed_encrypted) { - - /* Page is really corrupted if post encryption stored - checksum does not match calculated checksum after page was - read. For pages compressed and then encrypted, there is no - checksum. */ - corrupted = (!page_compressed_encrypted && stored_checksum != calculated_checksum); + /* Pages that we think are unencrypted but do not match the checksum + checks could be corrupted or encrypted or both. */ + if (corrupted && !bpage->encrypted) { + ib_logf(IB_LOG_LEVEL_ERROR, + "%s: Block in space_id " ULINTPF " in file %s corrupted.", + page_type == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED ? "Maybe corruption" : "Corruption", + space_id, (space && space->name) ? space->name : "NULL"); + ib_logf(IB_LOG_LEVEL_ERROR, + "Based on page type %s (" ULINTPF ")", + fil_get_page_type_name(page_type), page_type); + } else if (still_encrypted || (bpage->encrypted && corrupted)) { + bpage->encrypted = true; + corrupted = true; - if (corrupted) { - ib_logf(IB_LOG_LEVEL_ERROR, - "%s: Block in space_id %lu in file %s corrupted.", - page_compressed_encrypted ? "Maybe corruption" : "Corruption", - space_id, space ? space->name : "NULL"); - ib_logf(IB_LOG_LEVEL_ERROR, - "Page based on contents %s encrypted.", - (key_version == 0 && page_compressed_encrypted == false) ? 
"not" : "maybe"); - if (stored_checksum != BUF_NO_CHECKSUM_MAGIC || calculated_checksum != BUF_NO_CHECKSUM_MAGIC) { - ib_logf(IB_LOG_LEVEL_ERROR, - "Page stored checksum %lu but calculated checksum %lu.", - stored_checksum, calculated_checksum); - } - ib_logf(IB_LOG_LEVEL_ERROR, - "Reason could be that key_version %lu in page " - "or in crypt_data %p could not be found.", - key_version, crypt_data); - ib_logf(IB_LOG_LEVEL_ERROR, - "Reason could be also that key management plugin is not found or" - " used encryption algorithm or method does not match."); - ib_logf(IB_LOG_LEVEL_ERROR, - "Based on page page compressed %d, compressed and encrypted %d.", - page_compressed, page_compressed_encrypted); - } else { - ib_logf(IB_LOG_LEVEL_ERROR, - "Block in space_id %lu in file %s encrypted.", - space_id, space ? space->name : "NULL"); - ib_logf(IB_LOG_LEVEL_ERROR, - "However key management plugin or used key_id %lu is not found or" + ib_logf(IB_LOG_LEVEL_ERROR, + "Block in space_id " ULINTPF " in file %s encrypted.", + space_id, (space && space->name) ? space->name : "NULL"); + ib_logf(IB_LOG_LEVEL_ERROR, + "However key management plugin or used key_version %u is not found or" " used encryption algorithm or method does not match.", - key_version); + bpage->key_version); + if (space_id > TRX_SYS_SPACE) { ib_logf(IB_LOG_LEVEL_ERROR, "Marking tablespace as missing. You may drop this table or" " install correct key management plugin and key file."); } } + if (space) { + fil_space_release(space); + } + return corrupted; } @@ -4689,6 +4656,8 @@ buf_page_io_complete( == BUF_BLOCK_FILE_PAGE); bool have_LRU_mutex = false; fil_space_t* space = NULL; + byte* frame = NULL; + bool corrupted = false; ut_a(buf_page_in_file(bpage)); @@ -4704,21 +4673,13 @@ buf_page_io_complete( if (io_type == BUF_IO_READ) { ulint read_page_no; ulint read_space_id; - byte* frame; - if (!buf_page_decrypt_after_read(bpage)) { - /* encryption error! 
*/ - if (buf_page_get_zip_size(bpage)) { - frame = bpage->zip.data; - } else { - frame = ((buf_block_t*) bpage)->frame; - } - - ib_logf(IB_LOG_LEVEL_INFO, - "Page %u in tablespace %u encryption error key_version %u.", - bpage->offset, bpage->space, bpage->key_version); + buf_page_decrypt_after_read(bpage); - goto database_corrupted; + if (buf_page_get_zip_size(bpage)) { + frame = bpage->zip.data; + } else { + frame = ((buf_block_t*) bpage)->frame; } if (buf_page_get_zip_size(bpage)) { @@ -4735,6 +4696,8 @@ buf_page_io_complete( "Page %u in tablespace %u zip_decompress failure.", bpage->offset, bpage->space); + corrupted = true; + goto database_corrupted; } os_atomic_decrement_ulint(&buf_pool->n_pend_unzip, 1); @@ -4773,7 +4736,7 @@ buf_page_io_complete( fprintf(stderr, " InnoDB: Error: space id and page n:o" " stored in the page\n" - "InnoDB: read in are %lu:%lu," + "InnoDB: read in are " ULINTPF ":" ULINTPF "," " should be %u:%u!\n", read_space_id, read_page_no, @@ -4783,121 +4746,116 @@ buf_page_io_complete( if (UNIV_LIKELY(!bpage->is_corrupt || !srv_pass_corrupt_table)) { - /* From version 3.23.38 up we store the page checksum - to the 4 first bytes of the page end lsn field */ - - if (buf_page_is_corrupted(true, frame, - buf_page_get_zip_size(bpage))) { - - /* Not a real corruption if it was triggered by - error injection */ - DBUG_EXECUTE_IF("buf_page_is_corrupt_failure", - if (bpage->space > TRX_SYS_SPACE - && buf_mark_space_corrupt(bpage)) { - ib_logf(IB_LOG_LEVEL_INFO, - "Simulated page corruption"); - return(true); - } - goto page_not_corrupt; - ;); + corrupted = buf_page_check_corrupt(bpage); + + } + database_corrupted: - bool corrupted = buf_page_check_corrupt(bpage); + if (corrupted) { + /* Not a real corruption if it was triggered by + error injection */ + + DBUG_EXECUTE_IF("buf_page_is_corrupt_failure", + if (bpage->space > TRX_SYS_SPACE + && buf_mark_space_corrupt(bpage)) { + ib_logf(IB_LOG_LEVEL_INFO, + "Simulated page corruption"); + return(true); 
+ } + goto page_not_corrupt; + ); - if (corrupted) { - fil_system_enter(); - space = fil_space_get_by_id(bpage->space); - fil_system_exit(); - ib_logf(IB_LOG_LEVEL_ERROR, - "Database page corruption on disk" - " or a failed"); - ib_logf(IB_LOG_LEVEL_ERROR, - "Space %u file %s read of page %u.", - bpage->space, - space ? space->name : "NULL", - bpage->offset); - ib_logf(IB_LOG_LEVEL_ERROR, - "You may have to recover" - " from a backup."); + if (!bpage->encrypted) { + fil_system_enter(); + space = fil_space_get_by_id(bpage->space); + fil_system_exit(); + ib_logf(IB_LOG_LEVEL_ERROR, + "Database page corruption on disk" + " or a failed"); + ib_logf(IB_LOG_LEVEL_ERROR, + "Space %u file %s read of page %u.", + bpage->space, + space->name ? space->name : "NULL", + bpage->offset); + ib_logf(IB_LOG_LEVEL_ERROR, + "You may have to recover" + " from a backup."); + buf_page_print(frame, buf_page_get_zip_size(bpage), + BUF_PAGE_PRINT_NO_CRASH); - buf_page_print(frame, buf_page_get_zip_size(bpage), - BUF_PAGE_PRINT_NO_CRASH); + ib_logf(IB_LOG_LEVEL_ERROR, + "It is also possible that your operating" + "system has corrupted its own file cache."); + ib_logf(IB_LOG_LEVEL_ERROR, + "and rebooting your computer removes the error."); + ib_logf(IB_LOG_LEVEL_ERROR, + "If the corrupt page is an index page you can also try to"); + ib_logf(IB_LOG_LEVEL_ERROR, + "fix the corruption by dumping, dropping, and reimporting"); + ib_logf(IB_LOG_LEVEL_ERROR, + "the corrupt table. 
You can use CHECK"); + ib_logf(IB_LOG_LEVEL_ERROR, + "TABLE to scan your table for corruption."); + ib_logf(IB_LOG_LEVEL_ERROR, + "See also " + REFMAN "forcing-innodb-recovery.html" + " about forcing recovery."); + } - ib_logf(IB_LOG_LEVEL_ERROR, - "It is also possible that your operating" - "system has corrupted its own file cache."); - ib_logf(IB_LOG_LEVEL_ERROR, - "and rebooting your computer removes the error."); - ib_logf(IB_LOG_LEVEL_ERROR, - "If the corrupt page is an index page you can also try to"); - ib_logf(IB_LOG_LEVEL_ERROR, - "fix the corruption by dumping, dropping, and reimporting"); - ib_logf(IB_LOG_LEVEL_ERROR, - "the corrupt table. You can use CHECK"); - ib_logf(IB_LOG_LEVEL_ERROR, - "TABLE to scan your table for corruption."); - ib_logf(IB_LOG_LEVEL_ERROR, - "See also " - REFMAN "forcing-innodb-recovery.html" - " about forcing recovery."); + if (srv_pass_corrupt_table && bpage->space != 0 + && bpage->space < SRV_LOG_SPACE_FIRST_ID) { + trx_t* trx; + + fprintf(stderr, + "InnoDB: space %u will be treated as corrupt.\n", + bpage->space); + fil_space_set_corrupt(bpage->space); + + trx = innobase_get_trx(); + + if (trx && trx->dict_operation_lock_mode == RW_X_LATCH) { + dict_table_set_corrupt_by_space(bpage->space, FALSE); + } else { + dict_table_set_corrupt_by_space(bpage->space, TRUE); } - if (srv_pass_corrupt_table && bpage->space != 0 - && bpage->space < SRV_LOG_SPACE_FIRST_ID) { - trx_t* trx; + bpage->is_corrupt = TRUE; + } - fprintf(stderr, - "InnoDB: space %u will be treated as corrupt.\n", - bpage->space); - fil_space_set_corrupt(bpage->space); + if (srv_force_recovery < SRV_FORCE_IGNORE_CORRUPT) { + /* If page space id is larger than TRX_SYS_SPACE + (0), we will attempt to mark the corresponding + table as corrupted instead of crashing server */ + if (bpage->space > TRX_SYS_SPACE + && buf_mark_space_corrupt(bpage)) { + return(false); + } else { + if (!bpage->encrypted) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Ending processing because of a corrupt 
database page."); - trx = innobase_get_trx(); - if (trx && trx->dict_operation_lock_mode == RW_X_LATCH) { - dict_table_set_corrupt_by_space(bpage->space, FALSE); - } else { - dict_table_set_corrupt_by_space(bpage->space, TRUE); + ut_error; } - bpage->is_corrupt = TRUE; - } - if (srv_force_recovery < SRV_FORCE_IGNORE_CORRUPT) { - /* If page space id is larger than TRX_SYS_SPACE - (0), we will attempt to mark the corresponding - table as corrupted instead of crashing server */ - if (bpage->space > TRX_SYS_SPACE - && buf_mark_space_corrupt(bpage)) { - return(false); + ib_push_warning(innobase_get_trx(), DB_DECRYPTION_FAILED, + "Table in tablespace %lu encrypted." + "However key management plugin or used key_id %lu is not found or" + " used encryption algorithm or method does not match." + " Can't continue opening the table.", + bpage->space, bpage->key_version); + + if (bpage->encrypted && bpage->space > TRX_SYS_SPACE) { + buf_mark_space_corrupt(bpage); } else { - corrupted = buf_page_check_corrupt(bpage); - ulint key_version = bpage->key_version; - - if (corrupted) { - ib_logf(IB_LOG_LEVEL_ERROR, - "Ending processing because of a corrupt database page."); - - ut_error; - } - - ib_push_warning(innobase_get_trx(), DB_DECRYPTION_FAILED, - "Table in tablespace %lu encrypted." - "However key management plugin or used key_id %lu is not found or" - " used encryption algorithm or method does not match." 
- " Can't continue opening the table.", - (ulint)bpage->space, key_version); - - if (bpage->space > TRX_SYS_SPACE) { - if (corrupted) { - buf_mark_space_corrupt(bpage); - } - } else { - ut_error; - } - return(false); + ut_error; } + + return(false); } } - } /**/ + } DBUG_EXECUTE_IF("buf_page_is_corrupt_failure", page_not_corrupt: bpage = bpage; ); @@ -4912,32 +4870,19 @@ database_corrupted: && fil_page_get_type(frame) == FIL_PAGE_INDEX && page_is_leaf(frame)) { - buf_block_t* block; - ibool update_ibuf_bitmap; - - if (UNIV_UNLIKELY(bpage->is_corrupt && - srv_pass_corrupt_table)) { - - block = NULL; - update_ibuf_bitmap = FALSE; - } else { - - block = (buf_block_t *) bpage; - update_ibuf_bitmap = TRUE; - } - if (bpage && bpage->encrypted) { - fprintf(stderr, - "InnoDB: Warning: Table in tablespace %lu encrypted." - "However key management plugin or used key_id %u is not found or" + ib_logf(IB_LOG_LEVEL_WARN, + "Table in tablespace %lu encrypted." + "However key management plugin or used key_version %u is not found or" " used encryption algorithm or method does not match." " Can't continue opening the table.\n", (ulint)bpage->space, bpage->key_version); } else { + ibuf_merge_or_delete_for_page( - block, bpage->space, + (buf_block_t*)bpage, bpage->space, bpage->offset, buf_page_get_zip_size(bpage), - update_ibuf_bitmap); + TRUE); } } @@ -5081,24 +5026,22 @@ buf_all_freed_instance( mutex_exit(&buf_pool->LRU_list_mutex); - if (UNIV_LIKELY_NULL(block)) { - if (block->page.key_version == 0) { - fil_space_t* space = fil_space_get(block->page.space); - ib_logf(IB_LOG_LEVEL_ERROR, - "Page %u %u still fixed or dirty.", - block->page.space, - block->page.offset); - ib_logf(IB_LOG_LEVEL_ERROR, - "Page oldest_modification %lu fix_count %d io_fix %d.", - (ulong) block->page.oldest_modification, - block->page.buf_fix_count, - buf_page_get_io_fix(&block->page)); - ib_logf(IB_LOG_LEVEL_ERROR, - "Page space_id %u name %s.", - block->page.space, - (space && space->name) ? 
space->name : "NULL"); - ut_error; - } + if (UNIV_LIKELY_NULL(block) && block->page.key_version == 0) { + fil_space_t* space = fil_space_get(block->page.space); + ib_logf(IB_LOG_LEVEL_ERROR, + "Page %u %u still fixed or dirty.", + block->page.space, + block->page.offset); + ib_logf(IB_LOG_LEVEL_ERROR, + "Page oldest_modification " LSN_PF + " fix_count %d io_fix %d.", + block->page.oldest_modification, + block->page.buf_fix_count, + buf_page_get_io_fix(&block->page)); + ib_logf(IB_LOG_LEVEL_FATAL, + "Page space_id %u name %s.", + block->page.space, + (space && space->name) ? space->name : "NULL"); } } @@ -6304,21 +6247,17 @@ buf_pool_reserve_tmp_slot( /********************************************************************//** Encrypts a buffer page right before it's flushed to disk +@param[in,out] bpage Page control block +@param[in,out] src_frame Source page +@param[in] space_id Tablespace id +@return either unencrypted source page or decrypted page. */ byte* buf_page_encrypt_before_write( -/*==========================*/ - buf_page_t* bpage, /*!< in/out: buffer page to be flushed */ - byte* src_frame, /*!< in: src frame */ - ulint space_id) /*!< in: space id */ + buf_page_t* bpage, + byte* src_frame, + ulint space_id) { - fil_space_crypt_t* crypt_data = fil_space_get_crypt_data(space_id); - ulint zip_size = buf_page_get_zip_size(bpage); - ulint page_size = (zip_size) ? zip_size : UNIV_PAGE_SIZE; - buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); - bool page_compressed = fil_space_is_page_compressed(bpage->space); - bool encrypted = true; - bpage->real_size = UNIV_PAGE_SIZE; fil_page_type_validate(src_frame); @@ -6335,7 +6274,20 @@ buf_page_encrypt_before_write( return src_frame; } - if (crypt_data != NULL && crypt_data->not_encrypted()) { + fil_space_t* space = fil_space_acquire_silent(space_id); + + /* Tablespace must exist during write operation */ + if (!space) { + /* This could be true on discard if we have injected a error + case e.g. 
in innodb.innodb-wl5522-debug-zip so that space + is already marked as stop_new_ops = true. */ + return src_frame; + } + + fil_space_crypt_t* crypt_data = space->crypt_data; + bool encrypted = true; + + if (space->crypt_data != NULL && space->crypt_data->not_encrypted()) { /* Encryption is disabled */ encrypted = false; } @@ -6352,11 +6304,17 @@ buf_page_encrypt_before_write( encrypted = false; } + bool page_compressed = fil_space_is_page_compressed(bpage->space); + if (!encrypted && !page_compressed) { /* No need to encrypt or page compress the page */ + fil_space_release(space); return src_frame; } + ulint zip_size = buf_page_get_zip_size(bpage); + ulint page_size = (zip_size) ? zip_size : UNIV_PAGE_SIZE; + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); /* Find free slot from temporary memory array */ buf_tmp_buffer_t* slot = buf_pool_reserve_tmp_slot(buf_pool, page_compressed); slot->out_buf = NULL; @@ -6366,11 +6324,10 @@ buf_page_encrypt_before_write( if (!page_compressed) { /* Encrypt page content */ - byte* tmp = fil_space_encrypt(bpage->space, + byte* tmp = fil_space_encrypt(space, bpage->offset, bpage->newest_modification, src_frame, - zip_size, dst_frame); ulint key_version = mach_read_from_4(dst_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION); @@ -6408,11 +6365,10 @@ buf_page_encrypt_before_write( if(encrypted) { /* And then we encrypt the page content */ - tmp = fil_space_encrypt(bpage->space, + tmp = fil_space_encrypt(space, bpage->offset, bpage->newest_modification, tmp, - zip_size, dst_frame); } @@ -6423,17 +6379,20 @@ buf_page_encrypt_before_write( fil_page_type_validate(dst_frame); #endif + fil_space_release(space); // return dst_frame which will be written return dst_frame; } /********************************************************************//** Decrypt page after it has been read from disk +@param[in,out] bpage Page control block +@return true if successfull, false if something went wrong */ -ibool +UNIV_INTERN +bool 
buf_page_decrypt_after_read( -/*========================*/ - buf_page_t* bpage) /*!< in/out: buffer page read from disk */ + buf_page_t* bpage) { ulint zip_size = buf_page_get_zip_size(bpage); ulint size = (zip_size) ? zip_size : UNIV_PAGE_SIZE; @@ -6446,53 +6405,25 @@ buf_page_decrypt_after_read( bool page_compressed_encrypted = fil_page_is_compressed_encrypted(dst_frame); buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); bool success = true; - ulint space_id = mach_read_from_4( - dst_frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); - fil_space_crypt_t* crypt_data = fil_space_get_crypt_data(space_id); - /* Page is encrypted if encryption information is found from - tablespace and page contains used key_version. This is true - also for pages first compressed and then encrypted. */ - if (!crypt_data || - (crypt_data && - crypt_data->type == CRYPT_SCHEME_UNENCRYPTED && - key_version != 0)) { - byte* frame = NULL; - - if (buf_page_get_zip_size(bpage)) { - frame = bpage->zip.data; - } else { - frame = ((buf_block_t*) bpage)->frame; - } + bpage->key_version = key_version; - /* If page is not corrupted at this point, page can't be - encrypted, thus set key_version to 0. If page is corrupted, - we assume at this point that it is encrypted as page - contained key_version != 0. Note that page could still be - really corrupted. This we will find out after decrypt by - checking page checksums. 
*/ - if (!buf_page_is_corrupted(false, frame, buf_page_get_zip_size(bpage))) { - key_version = 0; - } + if (bpage->offset == 0) { + /* File header pages are not encrypted/compressed */ + return (true); } - /* If page is encrypted read post-encryption checksum */ - if (!page_compressed_encrypted && key_version != 0) { - bpage->stored_checksum = mach_read_from_4(dst_frame + + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION + 4); - } + fil_space_t* space = fil_space_acquire(bpage->space); - ut_ad(bpage->key_version == 0); + fil_space_crypt_t* crypt_data = space->crypt_data; - if (bpage->offset == 0) { - /* File header pages are not encrypted/compressed */ - return (TRUE); + /* Page is encrypted if encryption information is found from + tablespace and page contains used key_version. This is true + also for pages first compressed and then encrypted. */ + if (!crypt_data) { + key_version = 0; } - /* Store these for corruption check */ - bpage->key_version = key_version; - bpage->page_encrypted = page_compressed_encrypted; - bpage->page_compressed = page_compressed; - if (page_compressed) { /* the page we read is unencrypted */ /* Find free slot from temporary memory array */ @@ -6519,6 +6450,13 @@ buf_page_decrypt_after_read( buf_tmp_buffer_t* slot = NULL; if (key_version) { + /* Verify encryption checksum before we even try to + decrypt. */ + if (!fil_space_verify_crypt_checksum(dst_frame, + zip_size, NULL, bpage->offset)) { + return (false); + } + /* Find free slot from temporary memory array */ slot = buf_pool_reserve_tmp_slot(buf_pool, page_compressed); @@ -6526,22 +6464,16 @@ buf_page_decrypt_after_read( fil_page_type_validate(dst_frame); #endif - /* Calculate checksum before decrypt, this will be - used later to find out if incorrect key was used. 
*/ - if (!page_compressed_encrypted) { - bpage->calculated_checksum = fil_crypt_calculate_checksum(zip_size, dst_frame); - } - /* decrypt using crypt_buf to dst_frame */ - byte* res = fil_space_decrypt(bpage->space, + byte* res = fil_space_decrypt(space, slot->crypt_buf, - size, - dst_frame); + dst_frame, + &bpage->encrypted); if (!res) { - bpage->encrypted = true; success = false; } + #ifdef UNIV_DEBUG fil_page_type_validate(dst_frame); #endif @@ -6572,7 +6504,6 @@ buf_page_decrypt_after_read( } } - bpage->key_version = key_version; - + fil_space_release(space); return (success); } diff --git a/storage/xtradb/buf/buf0dblwr.cc b/storage/xtradb/buf/buf0dblwr.cc index 68bb83e4903..b11c32064bf 100644 --- a/storage/xtradb/buf/buf0dblwr.cc +++ b/storage/xtradb/buf/buf0dblwr.cc @@ -382,13 +382,7 @@ buf_dblwr_init_or_load_pages( doublewrite = read_buf + TRX_SYS_DOUBLEWRITE; - if (mach_read_from_4(read_buf + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION) != 0) { - byte* tmp = fil_space_decrypt((ulint)TRX_SYS_SPACE, - read_buf + UNIV_PAGE_SIZE, - UNIV_PAGE_SIZE, /* page size */ - read_buf); - doublewrite = tmp + TRX_SYS_DOUBLEWRITE; - } + /* TRX_SYS_PAGE_NO is not encrypted see fil_crypt_rotate_page() */ if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC) == TRX_SYS_DOUBLEWRITE_MAGIC_N) { @@ -514,6 +508,7 @@ buf_dblwr_process() continue; } + fil_space_t* space = fil_space_found_by_id(space_id); ulint zip_size = fil_space_get_zip_size(space_id); ut_ad(!buf_page_is_zeroes(page, zip_size)); @@ -548,9 +543,9 @@ buf_dblwr_process() } if (fil_space_verify_crypt_checksum( - read_buf, zip_size) + read_buf, zip_size, NULL, page_no) || !buf_page_is_corrupted( - true, read_buf, zip_size)) { + true, read_buf, zip_size, space)) { /* The page is good; there is no need to consult the doublewrite buffer. 
*/ continue; @@ -573,8 +568,8 @@ buf_dblwr_process() NULL, page, UNIV_PAGE_SIZE, NULL, true); } - if (!fil_space_verify_crypt_checksum(page, zip_size) - && buf_page_is_corrupted(true, page, zip_size)) { + if (!fil_space_verify_crypt_checksum(page, zip_size, NULL, page_no) + && buf_page_is_corrupted(true, page, zip_size, space)) { if (!is_all_zero) { ib_logf(IB_LOG_LEVEL_WARN, "A doublewrite copy of page " diff --git a/storage/xtradb/buf/buf0dump.cc b/storage/xtradb/buf/buf0dump.cc index 6abf7375775..e728636042b 100644 --- a/storage/xtradb/buf/buf0dump.cc +++ b/storage/xtradb/buf/buf0dump.cc @@ -53,8 +53,8 @@ enum status_severity { /* Flags that tell the buffer pool dump/load thread which action should it take after being waked up. */ -static ibool buf_dump_should_start = FALSE; -static ibool buf_load_should_start = FALSE; +static volatile bool buf_dump_should_start; +static volatile bool buf_load_should_start; static ibool buf_load_abort_flag = FALSE; @@ -79,7 +79,7 @@ void buf_dump_start() /*============*/ { - buf_dump_should_start = TRUE; + buf_dump_should_start = true; os_event_set(srv_buf_dump_event); } @@ -93,7 +93,7 @@ void buf_load_start() /*============*/ { - buf_load_should_start = TRUE; + buf_load_should_start = true; os_event_set(srv_buf_dump_event); } @@ -699,15 +699,18 @@ DECLARE_THREAD(buf_dump_thread)(void*) os_event_wait(srv_buf_dump_event); if (buf_dump_should_start) { - buf_dump_should_start = FALSE; + buf_dump_should_start = false; buf_dump(TRUE /* quit on shutdown */); } if (buf_load_should_start) { - buf_load_should_start = FALSE; + buf_load_should_start = false; buf_load(); } + if (buf_dump_should_start || buf_load_should_start) { + continue; + } os_event_reset(srv_buf_dump_event); } diff --git a/storage/xtradb/buf/buf0flu.cc b/storage/xtradb/buf/buf0flu.cc index 09f07bbd696..e7ed7204920 100644 --- a/storage/xtradb/buf/buf0flu.cc +++ b/storage/xtradb/buf/buf0flu.cc @@ -1,7 +1,7 @@ 
/***************************************************************************** -Copyright (c) 1995, 2016, Oracle and/or its affiliates -Copyright (c) 2013, 2016, MariaDB +Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2017, MariaDB Corporation. All Rights Reserved. Copyright (c) 2013, 2014, Fusion-io This program is free software; you can redistribute it and/or modify it under diff --git a/storage/xtradb/buf/buf0lru.cc b/storage/xtradb/buf/buf0lru.cc index 579166753c4..dff67c0fad6 100644 --- a/storage/xtradb/buf/buf0lru.cc +++ b/storage/xtradb/buf/buf0lru.cc @@ -1301,6 +1301,71 @@ buf_LRU_check_size_of_non_data_objects( } } +/** Diagnose failure to get a free page and request InnoDB monitor output in +the error log if more than two seconds have been spent already. +@param[in] n_iterations how many buf_LRU_get_free_page iterations + already completed +@param[in] started_ms timestamp in ms of when the attempt to get the + free page started +@param[in] flush_failures how many times single-page flush, if allowed, + has failed +@param[out] mon_value_was previous srv_print_innodb_monitor value +@param[out] started_monitor whether InnoDB monitor print has been requested +*/ +static +void +buf_LRU_handle_lack_of_free_blocks(ulint n_iterations, ulint started_ms, + ulint flush_failures, + ibool *mon_value_was, + ibool *started_monitor) +{ + static ulint last_printout_ms = 0; + + /* Legacy algorithm started warning after at least 2 seconds, we + emulate this. */ + const ulint current_ms = ut_time_ms(); + + if ((current_ms > started_ms + 2000) + && (current_ms > last_printout_ms + 2000)) { + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Warning: difficult to find free blocks in\n" + "InnoDB: the buffer pool (%lu search iterations)!\n" + "InnoDB: %lu failed attempts to flush a page!" 
+ " Consider\n" + "InnoDB: increasing the buffer pool size.\n" + "InnoDB: It is also possible that" + " in your Unix version\n" + "InnoDB: fsync is very slow, or" + " completely frozen inside\n" + "InnoDB: the OS kernel. Then upgrading to" + " a newer version\n" + "InnoDB: of your operating system may help." + " Look at the\n" + "InnoDB: number of fsyncs in diagnostic info below.\n" + "InnoDB: Pending flushes (fsync) log: %lu;" + " buffer pool: %lu\n" + "InnoDB: %lu OS file reads, %lu OS file writes," + " %lu OS fsyncs\n" + "InnoDB: Starting InnoDB Monitor to print further\n" + "InnoDB: diagnostics to the standard output.\n", + (ulong) n_iterations, + (ulong) flush_failures, + (ulong) fil_n_pending_log_flushes, + (ulong) fil_n_pending_tablespace_flushes, + (ulong) os_n_file_reads, (ulong) os_n_file_writes, + (ulong) os_n_fsyncs); + + last_printout_ms = current_ms; + *mon_value_was = srv_print_innodb_monitor; + *started_monitor = TRUE; + srv_print_innodb_monitor = TRUE; + os_event_set(lock_sys->timeout_event); + } + +} + /** The maximum allowed backoff sleep time duration, microseconds */ #define MAX_FREE_LIST_BACKOFF_SLEEP 10000 @@ -1348,6 +1413,7 @@ buf_LRU_get_free_block( ulint flush_failures = 0; ibool mon_value_was = FALSE; ibool started_monitor = FALSE; + ulint started_ms = 0; ut_ad(!mutex_own(&buf_pool->LRU_list_mutex)); @@ -1356,7 +1422,24 @@ loop: buf_LRU_check_size_of_non_data_objects(buf_pool); /* If there is a block in the free list, take it */ - block = buf_LRU_get_free_only(buf_pool); + if (DBUG_EVALUATE_IF("simulate_lack_of_pages", true, false)) { + + block = NULL; + + if (srv_debug_monitor_printed) + DBUG_SET("-d,simulate_lack_of_pages"); + + } else if (DBUG_EVALUATE_IF("simulate_recovery_lack_of_pages", + recv_recovery_on, false)) { + + block = NULL; + + if (srv_debug_monitor_printed) + DBUG_SUICIDE(); + } else { + + block = buf_LRU_get_free_only(buf_pool); + } if (block) { @@ -1371,6 +1454,9 @@ loop: return(block); } + if (!started_ms) + started_ms 
= ut_time_ms(); + if (srv_empty_free_list_algorithm == SRV_EMPTY_FREE_LIST_BACKOFF && buf_lru_manager_is_active && (srv_shutdown_state == SRV_SHUTDOWN_NONE @@ -1408,11 +1494,17 @@ loop: : FREE_LIST_BACKOFF_LOW_PRIO_DIVIDER)); } - /* In case of backoff, do not ever attempt single page flushes - and wait for the cleaner to free some pages instead. */ + buf_LRU_handle_lack_of_free_blocks(n_iterations, started_ms, + flush_failures, + &mon_value_was, + &started_monitor); n_iterations++; + srv_stats.buf_pool_wait_free.add(n_iterations, 1); + + /* In case of backoff, do not ever attempt single page flushes + and wait for the cleaner to free some pages instead. */ goto loop; } else { @@ -1444,6 +1536,12 @@ loop: mutex_exit(&buf_pool->flush_state_mutex); + if (DBUG_EVALUATE_IF("simulate_recovery_lack_of_pages", true, false) + || DBUG_EVALUATE_IF("simulate_lack_of_pages", true, false)) { + + buf_pool->try_LRU_scan = false; + } + freed = FALSE; if (buf_pool->try_LRU_scan || n_iterations > 0) { @@ -1469,41 +1567,9 @@ loop: } - if (n_iterations > 20) { - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: Warning: difficult to find free blocks in\n" - "InnoDB: the buffer pool (%lu search iterations)!\n" - "InnoDB: %lu failed attempts to flush a page!" - " Consider\n" - "InnoDB: increasing the buffer pool size.\n" - "InnoDB: It is also possible that" - " in your Unix version\n" - "InnoDB: fsync is very slow, or" - " completely frozen inside\n" - "InnoDB: the OS kernel. Then upgrading to" - " a newer version\n" - "InnoDB: of your operating system may help." 
- " Look at the\n" - "InnoDB: number of fsyncs in diagnostic info below.\n" - "InnoDB: Pending flushes (fsync) log: %lu;" - " buffer pool: %lu\n" - "InnoDB: %lu OS file reads, %lu OS file writes," - " %lu OS fsyncs\n" - "InnoDB: Starting InnoDB Monitor to print further\n" - "InnoDB: diagnostics to the standard output.\n", - (ulong) n_iterations, - (ulong) flush_failures, - (ulong) fil_n_pending_log_flushes, - (ulong) fil_n_pending_tablespace_flushes, - (ulong) os_n_file_reads, (ulong) os_n_file_writes, - (ulong) os_n_fsyncs); - - mon_value_was = srv_print_innodb_monitor; - started_monitor = TRUE; - srv_print_innodb_monitor = TRUE; - os_event_set(srv_monitor_event); - } + buf_LRU_handle_lack_of_free_blocks(n_iterations, started_ms, + flush_failures, &mon_value_was, + &started_monitor); /* If we have scanned the whole LRU and still are unable to find a free block then we should sleep here to let the diff --git a/storage/xtradb/buf/buf0mtflu.cc b/storage/xtradb/buf/buf0mtflu.cc index f0480cfc169..f90b1e46c1e 100644 --- a/storage/xtradb/buf/buf0mtflu.cc +++ b/storage/xtradb/buf/buf0mtflu.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (C) 2013, 2014, Fusion-io. All Rights Reserved. -Copyright (C) 2013, 2015, MariaDB Corporation. All Rights Reserved. +Copyright (C) 2013, 2017, MariaDB Corporation. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -486,14 +486,13 @@ buf_mtflu_handler_init( mtflush_heap2 = mem_heap_create(0); ut_a(mtflush_heap2 != NULL); - mtflush_ctx = (thread_sync_t *)mem_heap_alloc(mtflush_heap, + mtflush_ctx = (thread_sync_t *)mem_heap_zalloc(mtflush_heap, sizeof(thread_sync_t)); - memset(mtflush_ctx, 0, sizeof(thread_sync_t)); + ut_a(mtflush_ctx != NULL); - mtflush_ctx->thread_data = (thread_data_t*)mem_heap_alloc( + mtflush_ctx->thread_data = (thread_data_t*)mem_heap_zalloc( mtflush_heap, sizeof(thread_data_t) * n_threads); ut_a(mtflush_ctx->thread_data); - memset(mtflush_ctx->thread_data, 0, sizeof(thread_data_t) * n_threads); mtflush_ctx->n_threads = n_threads; mtflush_ctx->wq = ib_wqueue_create(); diff --git a/storage/xtradb/dict/dict0dict.cc b/storage/xtradb/dict/dict0dict.cc index 90936f6667b..49de1cf7ef8 100644 --- a/storage/xtradb/dict/dict0dict.cc +++ b/storage/xtradb/dict/dict0dict.cc @@ -6214,7 +6214,6 @@ dict_set_corrupted( row_mysql_lock_data_dictionary(trx); } - ut_ad(index); ut_ad(mutex_own(&dict_sys->mutex)); ut_ad(!dict_table_is_comp(dict_sys->sys_tables)); ut_ad(!dict_table_is_comp(dict_sys->sys_indexes)); diff --git a/storage/xtradb/dict/dict0stats.cc b/storage/xtradb/dict/dict0stats.cc index c13d4583fef..6a28f3cdf8f 100644 --- a/storage/xtradb/dict/dict0stats.cc +++ b/storage/xtradb/dict/dict0stats.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 2009, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2009, 2016, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -1161,7 +1161,8 @@ dict_stats_analyze_index_level( them away) which brings non-determinism. 
We skip only leaf-level delete marks because delete marks on non-leaf level do not make sense. */ - if (level == 0 && + + if (level == 0 && srv_stats_include_delete_marked? 0: rec_get_deleted_flag( rec, page_is_comp(btr_pcur_get_page(&pcur)))) { @@ -1347,8 +1348,12 @@ enum page_scan_method_t { the given page and count the number of distinct ones, also ignore delete marked records */ - QUIT_ON_FIRST_NON_BORING/* quit when the first record that differs + QUIT_ON_FIRST_NON_BORING,/* quit when the first record that differs from its right neighbor is found */ + COUNT_ALL_NON_BORING_INCLUDE_DEL_MARKED/* scan all records on + the given page and count the number of + distinct ones, include delete marked + records */ }; /* @} */ @@ -1624,6 +1629,8 @@ dict_stats_analyze_index_below_cur( offsets_rec = dict_stats_scan_page( &rec, offsets1, offsets2, index, page, n_prefix, + srv_stats_include_delete_marked ? + COUNT_ALL_NON_BORING_INCLUDE_DEL_MARKED: COUNT_ALL_NON_BORING_AND_SKIP_DEL_MARKED, n_diff, n_external_pages); diff --git a/storage/xtradb/dict/dict0stats_bg.cc b/storage/xtradb/dict/dict0stats_bg.cc index c2265d6abd6..55d34ff6ae1 100644 --- a/storage/xtradb/dict/dict0stats_bg.cc +++ b/storage/xtradb/dict/dict0stats_bg.cc @@ -41,8 +41,9 @@ Created Apr 25, 2012 Vasil Dimov #define SHUTTING_DOWN() (srv_shutdown_state != SRV_SHUTDOWN_NONE) -/** Event to wake up the stats thread */ -UNIV_INTERN os_event_t dict_stats_event = NULL; +/** Event to wake up dict_stats_thread on dict_stats_recalc_pool_add() +or shutdown. Not protected by any mutex. */ +UNIV_INTERN os_event_t dict_stats_event; /** This mutex protects the "recalc_pool" variable. 
*/ static ib_mutex_t recalc_pool_mutex; diff --git a/storage/xtradb/dyn/dyn0dyn.cc b/storage/xtradb/dyn/dyn0dyn.cc index 3ef5297a7c9..dd1f6863c14 100644 --- a/storage/xtradb/dyn/dyn0dyn.cc +++ b/storage/xtradb/dyn/dyn0dyn.cc @@ -40,7 +40,6 @@ dyn_array_add_block( mem_heap_t* heap; dyn_block_t* block; - ut_ad(arr); ut_ad(arr->magic_n == DYN_BLOCK_MAGIC_N); if (arr->heap == NULL) { diff --git a/storage/xtradb/fil/fil0crypt.cc b/storage/xtradb/fil/fil0crypt.cc index 131d03ea17a..60ab067d105 100644 --- a/storage/xtradb/fil/fil0crypt.cc +++ b/storage/xtradb/fil/fil0crypt.cc @@ -37,7 +37,6 @@ Modified Jan Lindström jan.lindstrom@mariadb.com #include "fsp0fsp.h" #include "fil0pagecompress.h" #include "ha_prototypes.h" // IB_LOG_ - #include <my_crypt.h> /** Mutex for keys */ @@ -59,7 +58,7 @@ UNIV_INTERN uint srv_n_fil_crypt_threads = 0; UNIV_INTERN uint srv_n_fil_crypt_threads_started = 0; /** At this age or older a space/page will be rotated */ -UNIV_INTERN uint srv_fil_crypt_rotate_key_age = 1; +UNIV_INTERN uint srv_fil_crypt_rotate_key_age; /** Event to signal FROM the key rotation threads. */ static os_event_t fil_crypt_event; @@ -67,11 +66,11 @@ static os_event_t fil_crypt_event; /** Event to signal TO the key rotation threads. */ UNIV_INTERN os_event_t fil_crypt_threads_event; -/** Event for waking up threads throttle */ +/** Event for waking up threads throttle. */ static os_event_t fil_crypt_throttle_sleep_event; -/** Mutex for key rotation threads */ -static ib_mutex_t fil_crypt_threads_mutex; +/** Mutex for key rotation threads. 
*/ +UNIV_INTERN ib_mutex_t fil_crypt_threads_mutex; #ifdef UNIV_PFS_MUTEX static mysql_pfs_key_t fil_crypt_threads_mutex_key; @@ -104,9 +103,12 @@ static mysql_pfs_key_t fil_crypt_stat_mutex_key; UNIV_INTERN mysql_pfs_key_t fil_crypt_data_mutex_key; #endif +/** Is background scrubbing enabled, defined on btr0scrub.cc */ +extern my_bool srv_background_scrub_data_uncompressed; +extern my_bool srv_background_scrub_data_compressed; + static bool fil_crypt_needs_rotation( -/*=====================*/ fil_encryption_t encrypt_mode, /*!< in: Encryption mode */ uint key_version, /*!< in: Key version */ @@ -118,7 +120,6 @@ Init space crypt */ UNIV_INTERN void fil_space_crypt_init() -/*==================*/ { mutex_create(fil_crypt_key_mutex_key, &fil_crypt_key_mutex, SYNC_NO_ORDER_CHECK); @@ -127,6 +128,7 @@ fil_space_crypt_init() mutex_create(fil_crypt_stat_mutex_key, &crypt_stat_mutex, SYNC_NO_ORDER_CHECK); + memset(&crypt_stat, 0, sizeof(crypt_stat)); } @@ -135,9 +137,9 @@ Cleanup space crypt */ UNIV_INTERN void fil_space_crypt_cleanup() -/*=====================*/ { os_event_free(fil_crypt_throttle_sleep_event); + fil_crypt_throttle_sleep_event = NULL; mutex_free(&fil_crypt_key_mutex); mutex_free(&crypt_stat_mutex); } @@ -146,7 +148,7 @@ fil_space_crypt_cleanup() Get latest key version from encryption plugin. 
@return key version or ENCRYPTION_KEY_VERSION_INVALID */ uint -fil_space_crypt_struct::key_get_latest_version(void) +fil_space_crypt_t::key_get_latest_version(void) { uint key_version = key_found; @@ -160,12 +162,12 @@ fil_space_crypt_struct::key_get_latest_version(void) } /****************************************************************** -Get the latest(key-version), waking the encrypt thread, if needed */ +Get the latest(key-version), waking the encrypt thread, if needed +@param[in,out] crypt_data Crypt data */ static inline uint fil_crypt_get_latest_key_version( -/*=============================*/ - fil_space_crypt_t* crypt_data) /*!< in: crypt data */ + fil_space_crypt_t* crypt_data) { ut_ad(crypt_data != NULL); @@ -204,28 +206,31 @@ crypt_data_scheme_locker( /****************************************************************** Create a fil_space_crypt_t object +@param[in] type CRYPT_SCHEME_UNENCRYPTE or + CRYPT_SCHEME_1 +@param[in] encrypt_mode FIL_ENCRYPTION_DEFAULT or + FIL_ENCRYPTION_ON or + FIL_ENCRYPTION_OFF +@param[in] min_key_version key_version or 0 +@param[in] key_id Used key id @return crypt object */ static fil_space_crypt_t* fil_space_create_crypt_data( -/*========================*/ uint type, fil_encryption_t encrypt_mode, uint min_key_version, - uint key_id, - ulint offset) + uint key_id) { - const uint sz = sizeof(fil_space_crypt_t); - void* buf = mem_zalloc(sz); + void* buf = mem_zalloc(sizeof(fil_space_crypt_t)); fil_space_crypt_t* crypt_data = NULL; if (buf) { crypt_data = new(buf) - fil_space_crypt_struct( + fil_space_crypt_t( type, min_key_version, key_id, - offset, encrypt_mode); } @@ -234,25 +239,30 @@ fil_space_create_crypt_data( /****************************************************************** Create a fil_space_crypt_t object +@param[in] encrypt_mode FIL_ENCRYPTION_DEFAULT or + FIL_ENCRYPTION_ON or + FIL_ENCRYPTION_OFF + +@param[in] key_id Encryption key id @return crypt object */ UNIV_INTERN fil_space_crypt_t* 
fil_space_create_crypt_data( -/*========================*/ - fil_encryption_t encrypt_mode, /*!< in: encryption mode */ - uint key_id) /*!< in: encryption key id */ + fil_encryption_t encrypt_mode, + uint key_id) { - return (fil_space_create_crypt_data(0, encrypt_mode, 0, key_id, 0)); + return (fil_space_create_crypt_data(0, encrypt_mode, 0, key_id)); } /****************************************************************** -Merge fil_space_crypt_t object */ +Merge fil_space_crypt_t object +@param[in,out] dst Destination cryp data +@param[in] src Source crypt data */ UNIV_INTERN void fil_space_merge_crypt_data( -/*=======================*/ - fil_space_crypt_t* dst,/*!< out: Crypt data */ - const fil_space_crypt_t* src)/*!< in: Crypt data */ + fil_space_crypt_t* dst, + const fil_space_crypt_t* src) { mutex_enter(&dst->mutex); @@ -267,21 +277,22 @@ fil_space_merge_crypt_data( dst->type = src->type; dst->min_key_version = src->min_key_version; dst->keyserver_requests += src->keyserver_requests; - dst->closing = src->closing; mutex_exit(&dst->mutex); } /****************************************************************** Read crypt data from a page (0) -@return crypt data from page 0. */ +@param[in] space space_id +@param[in] page Page 0 +@param[in] offset Offset to crypt data +@return crypt data from page 0 or NULL. */ UNIV_INTERN fil_space_crypt_t* fil_space_read_crypt_data( -/*======================*/ - ulint space, /*!< in: file space id*/ - const byte* page, /*!< in: page 0 */ - ulint offset) /*!< in: offset */ + ulint space, + const byte* page, + ulint offset) { if (memcmp(page + offset, CRYPT_MAGIC, MAGIC_SZ) != 0) { /* Crypt data is not stored. 
*/ @@ -294,8 +305,8 @@ fil_space_read_crypt_data( type == CRYPT_SCHEME_1)) { ib_logf(IB_LOG_LEVEL_ERROR, - "Found non sensible crypt scheme: %lu for space %lu " - " offset: %lu bytes: " + "Found non sensible crypt scheme: " ULINTPF " for space " ULINTPF + " offset: " ULINTPF " bytes: " "[ %.2x %.2x %.2x %.2x %.2x %.2x ].", type, space, offset, page[offset + 0 + MAGIC_SZ], @@ -346,43 +357,37 @@ fil_space_read_crypt_data( } /****************************************************************** -Free a crypt data object */ +Free a crypt data object +@param[in,out] crypt_data crypt data to be freed */ UNIV_INTERN void fil_space_destroy_crypt_data( -/*=========================*/ - fil_space_crypt_t **crypt_data) /*!< out: crypt data */ + fil_space_crypt_t **crypt_data) { if (crypt_data != NULL && (*crypt_data) != NULL) { fil_space_crypt_t* c = *crypt_data; - c->~fil_space_crypt_struct(); + c->~fil_space_crypt_t(); mem_free(c); *crypt_data = NULL; } } /****************************************************************** -Write crypt data to a page (0) */ -static +Write crypt data to a page (0) +@param[in,out] page0 Page 0 where to write +@param[in,out] mtr Minitransaction */ +UNIV_INTERN void -fil_space_write_crypt_data_low( -/*===========================*/ - fil_space_crypt_t* crypt_data, /*<! out: crypt data */ - ulint type, /*<! in: crypt scheme */ - byte* page, /*<! in: page 0 */ - ulint offset, /*<! in: offset */ - ulint maxsize, /*<! in: size of crypt data */ - mtr_t* mtr) /*<! 
in: minitransaction */ +fil_space_crypt_t::write_page0( + byte* page, + mtr_t* mtr) { - ut_a(offset > 0 && offset < UNIV_PAGE_SIZE); ulint space_id = mach_read_from_4( page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); - const uint len = sizeof(crypt_data->iv); - const uint min_key_version = crypt_data->min_key_version; - const uint key_id = crypt_data->key_id; - const fil_encryption_t encryption = crypt_data->encryption; - crypt_data->page0_offset = offset; - ut_a(2 + len + 4 + 1 + 4 + MAGIC_SZ < maxsize); + const uint len = sizeof(iv); + ulint zip_size = fsp_header_get_zip_size(page); + const ulint offset = fsp_header_get_crypt_offset(zip_size); + page0_offset = offset; /* redo log this as bytewise updates to page 0 @@ -392,7 +397,7 @@ fil_space_write_crypt_data_low( mlog_write_string(page + offset, CRYPT_MAGIC, MAGIC_SZ, mtr); mlog_write_ulint(page + offset + MAGIC_SZ + 0, type, MLOG_1BYTE, mtr); mlog_write_ulint(page + offset + MAGIC_SZ + 1, len, MLOG_1BYTE, mtr); - mlog_write_string(page + offset + MAGIC_SZ + 2, crypt_data->iv, len, + mlog_write_string(page + offset + MAGIC_SZ + 2, iv, len, mtr); mlog_write_ulint(page + offset + MAGIC_SZ + 2 + len, min_key_version, MLOG_4BYTES, mtr); @@ -424,44 +429,61 @@ fil_space_write_crypt_data_low( log_ptr += 1; mlog_close(mtr, log_ptr); - mlog_catenate_string(mtr, crypt_data->iv, len); + mlog_catenate_string(mtr, iv, len); } } /****************************************************************** -Write crypt data to a page (0) */ -UNIV_INTERN -void -fil_space_write_crypt_data( -/*=======================*/ - ulint space, /*<! in: file space */ - byte* page, /*<! in: page 0 */ - ulint offset, /*<! in: offset */ - ulint maxsize, /*<! in: size of crypt data */ - mtr_t* mtr) /*<! 
in: minitransaction */ +Set crypt data for a tablespace +@param[in,out] space Tablespace +@param[in,out] crypt_data Crypt data to be set +@return crypt_data in tablespace */ +static +fil_space_crypt_t* +fil_space_set_crypt_data( + fil_space_t* space, + fil_space_crypt_t* crypt_data) { - fil_space_crypt_t* crypt_data = fil_space_get_crypt_data(space); + fil_space_crypt_t* free_crypt_data = NULL; + fil_space_crypt_t* ret_crypt_data = NULL; + + /* Provided space is protected using fil_space_acquire() + from concurrent operations. */ + if (space->crypt_data != NULL) { + /* There is already crypt data present, + merge new crypt_data */ + fil_space_merge_crypt_data(space->crypt_data, + crypt_data); + ret_crypt_data = space->crypt_data; + free_crypt_data = crypt_data; + } else { + space->crypt_data = crypt_data; + ret_crypt_data = space->crypt_data; + } - /* If no crypt data is stored on memory cache for this space, - then do not continue writing crypt data to page 0. */ - if (crypt_data == NULL) { - return; + if (free_crypt_data != NULL) { + /* there was already crypt data present and the new crypt + * data provided as argument to this function has been merged + * into that => free new crypt data + */ + fil_space_destroy_crypt_data(&free_crypt_data); } - fil_space_write_crypt_data_low(crypt_data, crypt_data->type, - page, offset, maxsize, mtr); + return ret_crypt_data; } /****************************************************************** Parse a MLOG_FILE_WRITE_CRYPT_DATA log entry +@param[in] ptr Log entry start +@param[in] end_ptr Log entry end +@param[in] block buffer block @return position on log buffer */ UNIV_INTERN -byte* +const byte* fil_parse_write_crypt_data( -/*=======================*/ - byte* ptr, /*!< in: Log entry start */ - byte* end_ptr,/*!< in: Log entry end */ - buf_block_t* block) /*!< in: buffer block */ + const byte* ptr, + const byte* end_ptr, + const buf_block_t* block) { /* check that redo log entry is complete */ size_t entry_size = @@ -473,7 
+495,7 @@ fil_parse_write_crypt_data( 4 + // size of key_id 1; // fil_encryption_t - if ((size_t) (end_ptr - ptr) < entry_size){ + if (ptr + entry_size > end_ptr) { return NULL; } @@ -499,7 +521,7 @@ fil_parse_write_crypt_data( fil_encryption_t encryption = (fil_encryption_t)mach_read_from_1(ptr); ptr +=1; - if ((size_t) (end_ptr - ptr) < len) { + if (ptr + len > end_ptr) { return NULL; } @@ -512,47 +534,36 @@ fil_parse_write_crypt_data( ptr += len; /* update fil_space memory cache with crypt_data */ - fil_space_set_crypt_data(space_id, crypt_data); + fil_space_t* space = fil_space_acquire_silent(space_id); - return ptr; -} + if (space) { + crypt_data = fil_space_set_crypt_data(space, crypt_data); + fil_space_release(space); + } -/****************************************************************** -Clear crypt data from a page (0) */ -UNIV_INTERN -void -fil_space_clear_crypt_data( -/*=======================*/ - byte* page, /*!< in/out: Page 0 */ - ulint offset) /*!< in: Offset */ -{ - //TODO(jonaso): pass crypt-data and read len from there - ulint len = CRYPT_SCHEME_1_IV_LEN; - ulint size = - sizeof(CRYPT_MAGIC) + - 1 + // type - 1 + // len - len + // iv - 4 + // min key version - 4 + // key id - 1; // fil_encryption_t - memset(page + offset, 0, size); + return ptr; } /****************************************************************** -Encrypt a buffer */ +Encrypt a buffer +@param[in,out] crypt_data Crypt data +@param[in] space space_id +@param[in] offset Page offset +@param[in] lsn Log sequence number +@param[in] src_frame Page to encrypt +@param[in] zip_size Compressed size or 0 +@param[in,out] dst_frame Output buffer +@return encrypted buffer or NULL */ UNIV_INTERN byte* fil_encrypt_buf( -/*============*/ - fil_space_crypt_t* crypt_data, /*!< in: crypt data */ - ulint space, /*!< in: Space id */ - ulint offset, /*!< in: Page offset */ - lsn_t lsn, /*!< in: lsn */ - byte* src_frame, /*!< in: Source page to be encrypted */ - ulint zip_size, /*!< in: compressed size 
if - row format compressed */ - byte* dst_frame) /*!< in: outbut buffer */ + fil_space_crypt_t* crypt_data, + ulint space, + ulint offset, + lsn_t lsn, + const byte* src_frame, + ulint zip_size, + byte* dst_frame) { ulint page_size = (zip_size) ? zip_size : UNIV_PAGE_SIZE; uint key_version = fil_crypt_get_latest_key_version(crypt_data); @@ -625,46 +636,48 @@ fil_encrypt_buf( // store the post-encryption checksum after the key-version mach_write_to_4(dst_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION + 4, checksum); + ut_ad(fil_space_verify_crypt_checksum(dst_frame, zip_size, NULL, offset)); + srv_stats.pages_encrypted.inc(); return dst_frame; } /****************************************************************** -Encrypt a page */ +Encrypt a page + +@param[in] space Tablespace +@param[in] offset Page offset +@param[in] lsn Log sequence number +@param[in] src_frame Page to encrypt +@param[in,out] dst_frame Output buffer +@return encrypted buffer or NULL */ UNIV_INTERN byte* fil_space_encrypt( -/*==============*/ - ulint space, /*!< in: Space id */ - ulint offset, /*!< in: Page offset */ - lsn_t lsn, /*!< in: lsn */ - byte* src_frame, /*!< in: Source page to be encrypted */ - ulint zip_size, /*!< in: compressed size if - row_format compressed */ - byte* dst_frame) /*!< in: outbut buffer */ + const fil_space_t* space, + ulint offset, + lsn_t lsn, + byte* src_frame, + byte* dst_frame) { - fil_space_crypt_t* crypt_data = NULL; - ulint orig_page_type = mach_read_from_2(src_frame+FIL_PAGE_TYPE); if (orig_page_type==FIL_PAGE_TYPE_FSP_HDR || orig_page_type==FIL_PAGE_TYPE_XDES) { /* File space header or extent descriptor do not need to be encrypted. 
*/ - return src_frame; + return (src_frame); } - /* Get crypt data from file space */ - crypt_data = fil_space_get_crypt_data(space); - - if (crypt_data == NULL) { - return src_frame; + if (!space->crypt_data || !space->crypt_data->is_encrypted()) { + return (src_frame); } - ut_a(crypt_data != NULL && crypt_data->is_encrypted()); - - byte* tmp = fil_encrypt_buf(crypt_data, space, offset, lsn, src_frame, zip_size, dst_frame); + fil_space_crypt_t* crypt_data = space->crypt_data; + ut_ad(space->n_pending_ops); + ulint zip_size = fsp_flags_get_zip_size(space->flags); + byte* tmp = fil_encrypt_buf(crypt_data, space->id, offset, lsn, src_frame, zip_size, dst_frame); #ifdef UNIV_DEBUG if (tmp) { @@ -685,7 +698,7 @@ fil_space_encrypt( src = uncomp_mem; } - bool corrupted1 = buf_page_is_corrupted(true, src, zip_size); + bool corrupted1 = buf_page_is_corrupted(true, src, zip_size, space); bool ok = fil_space_decrypt(crypt_data, tmp_mem, size, tmp, &err); /* Need to decompress the page if it was also compressed */ @@ -694,18 +707,17 @@ fil_space_encrypt( fil_decompress_page(tmp_mem, comp_mem, UNIV_PAGE_SIZE, NULL); } - bool corrupted = buf_page_is_corrupted(true, tmp_mem, zip_size); + bool corrupted = buf_page_is_corrupted(true, tmp_mem, zip_size, space); bool different = memcmp(src, tmp_mem, size); if (!ok || corrupted || corrupted1 || err != DB_SUCCESS || different) { - fprintf(stderr, "JAN: ok %d corrupted %d corrupted1 %d err %d different %d\n", ok , corrupted, corrupted1, err, different); - fprintf(stderr, "JAN1: src_frame\n"); + fprintf(stderr, "ok %d corrupted %d corrupted1 %d err %d different %d\n", ok , corrupted, corrupted1, err, different); + fprintf(stderr, "src_frame\n"); buf_page_print(src_frame, zip_size, BUF_PAGE_PRINT_NO_CRASH); - fprintf(stderr, "JAN2: encrypted_frame\n"); + fprintf(stderr, "encrypted_frame\n"); buf_page_print(tmp, zip_size, BUF_PAGE_PRINT_NO_CRASH); - fprintf(stderr, "JAN1: decrypted_frame\n"); - buf_page_print(tmp_mem, zip_size, 
BUF_PAGE_PRINT_NO_CRASH); - ut_error; + fprintf(stderr, "decrypted_frame\n"); + buf_page_print(tmp_mem, zip_size, 0); } free(tmp_mem); @@ -724,45 +736,22 @@ fil_space_encrypt( return tmp; } -/********************************************************************* -Check if extra buffer shall be allocated for decrypting after read -@return true if fil space has encryption data. */ -UNIV_INTERN -bool -fil_space_check_encryption_read( -/*=============================*/ - ulint space) /*!< in: tablespace id */ -{ - fil_space_crypt_t* crypt_data = fil_space_get_crypt_data(space); - - if (crypt_data == NULL) { - return false; - } - - if (crypt_data->type == CRYPT_SCHEME_UNENCRYPTED) { - return false; - } - - if (crypt_data->not_encrypted()) { - return false; - } - - return true; -} - /****************************************************************** Decrypt a page +@param[in] crypt_data crypt_data +@param[in] tmp_frame Temporary buffer +@param[in] page_size Page size +@param[in,out] src_frame Page to decrypt +@param[out] err DB_SUCCESS or DB_DECRYPTION_FAILED @return true if page decrypted, false if not.*/ UNIV_INTERN bool fil_space_decrypt( -/*==============*/ - fil_space_crypt_t* crypt_data, /*!< in: crypt data */ - byte* tmp_frame, /*!< in: temporary buffer */ - ulint page_size, /*!< in: page size */ - byte* src_frame, /*!< in: out: page buffer */ - dberr_t* err) /*!< in: out: DB_SUCCESS or - error code */ + fil_space_crypt_t* crypt_data, + byte* tmp_frame, + ulint page_size, + byte* src_frame, + dberr_t* err) { ulint page_type = mach_read_from_2(src_frame+FIL_PAGE_TYPE); uint key_version = mach_read_from_4(src_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION); @@ -770,6 +759,7 @@ fil_space_decrypt( ulint offset = mach_read_from_4(src_frame + FIL_PAGE_OFFSET); ulint space = mach_read_from_4(src_frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); ib_uint64_t lsn = mach_read_from_8(src_frame + FIL_PAGE_LSN); + *err = DB_SUCCESS; if (key_version == ENCRYPTION_KEY_NOT_ENCRYPTED) { @@ 
-784,12 +774,12 @@ fil_space_decrypt( first page in a system tablespace data file (ibdata*, not *.ibd), if not clear it. */ -#ifdef UNIV_DEBUG - ib_logf(IB_LOG_LEVEL_WARN, - "Page on space %lu offset %lu has key_version %u" + + DBUG_PRINT("ib_crypt", + ("Page on space %lu offset %lu has key_version %u" " when it shoud be undefined.", - space, offset, key_version); -#endif + space, offset, key_version)); + mach_write_to_4(src_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, 0); } return false; @@ -858,32 +848,43 @@ fil_space_decrypt( /****************************************************************** Decrypt a page -@return encrypted page, or original not encrypted page if encryption is -not needed. */ +@param[in] space Tablespace +@param[in] tmp_frame Temporary buffer used for decrypting +@param[in] page_size Page size +@param[in,out] src_frame Page to decrypt +@param[out] decrypted true if page was decrypted +@return decrypted page, or original not encrypted page if decryption is +not needed.*/ UNIV_INTERN byte* fil_space_decrypt( -/*==============*/ - ulint space, /*!< in: Fil space id */ - byte* tmp_frame, /*!< in: temporary buffer */ - ulint page_size, /*!< in: page size */ - byte* src_frame) /*!< in/out: page buffer */ + const fil_space_t* space, + byte* tmp_frame, + byte* src_frame, + bool* decrypted) { dberr_t err = DB_SUCCESS; byte* res = NULL; + ulint zip_size = fsp_flags_get_zip_size(space->flags); + ulint size = zip_size ? zip_size : UNIV_PAGE_SIZE; + *decrypted = false; + + ut_ad(space->crypt_data != NULL && space->crypt_data->is_encrypted()); + ut_ad(space->n_pending_ops > 0); bool encrypted = fil_space_decrypt( - fil_space_get_crypt_data(space), + space->crypt_data, tmp_frame, - page_size, + size, src_frame, &err); if (err == DB_SUCCESS) { if (encrypted) { + *decrypted = true; /* Copy the decrypted page back to page buffer, not really any other options. 
*/ - memcpy(src_frame, tmp_frame, page_size); + memcpy(src_frame, tmp_frame, size); } res = src_frame; @@ -894,14 +895,15 @@ fil_space_decrypt( /****************************************************************** Calculate post encryption checksum +@param[in] zip_size zip_size or 0 +@param[in] dst_frame Block where checksum is calculated @return page checksum or BUF_NO_CHECKSUM_MAGIC not needed. */ UNIV_INTERN ulint fil_crypt_calculate_checksum( -/*=========================*/ - ulint zip_size, /*!< in: zip_size or 0 */ - byte* dst_frame) /*!< in: page where to calculate */ + ulint zip_size, + const byte* dst_frame) { ib_uint32_t checksum = 0; srv_checksum_algorithm_t algorithm = @@ -934,83 +936,133 @@ fil_crypt_calculate_checksum( } /********************************************************************* -Verify checksum for a page (iff it's encrypted) -NOTE: currently this function can only be run in single threaded mode -as it modifies srv_checksum_algorithm (temporarily) +Verify that post encryption checksum match calculated checksum. +This function should be called only if tablespace contains crypt_data +metadata (this is strong indication that tablespace is encrypted). +Function also verifies that traditional checksum does not match +calculated checksum as if it does page could be valid unencrypted, +encrypted, or corrupted. 
+ +@param[in] page Page to verify +@param[in] zip_size zip size +@param[in] space Tablespace +@param[in] pageno Page no @return true if page is encrypted AND OK, false otherwise */ UNIV_INTERN bool fil_space_verify_crypt_checksum( -/*============================*/ - const byte* src_frame, /*!< in: page the verify */ - ulint zip_size) /*!< in: compressed size if - row_format compressed */ + byte* page, + ulint zip_size, + const fil_space_t* space, + ulint pageno) { - // key version - uint key_version = mach_read_from_4( - src_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION); + uint key_version = mach_read_from_4(page+ FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION); + /* If page is not encrypted, return false */ if (key_version == 0) { - return false; // unencrypted page + return false; + } + + srv_checksum_algorithm_t algorithm = + static_cast<srv_checksum_algorithm_t>(srv_checksum_algorithm); + /* If no checksum is used, can't continue checking. */ + if (algorithm == SRV_CHECKSUM_ALGORITHM_NONE) { + return(true); } - /* "trick" the normal checksum routines by storing the post-encryption - * checksum into the normal checksum field allowing for reuse of - * the normal routines */ + /* Read stored post encryption checksum. */ + ib_uint32_t checksum = mach_read_from_4( + page + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION + 4); - // post encryption checksum - ib_uint32_t stored_post_encryption = mach_read_from_4( - src_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION + 4); + /* Declare empty pages non-corrupted */ + if (checksum == 0 + && *reinterpret_cast<const ib_uint64_t*>(page + FIL_PAGE_LSN) == 0 + && buf_page_is_zeroes(page, zip_size)) { + return(true); + } - // save pre encryption checksum for restore in end of this function - ib_uint32_t stored_pre_encryption = mach_read_from_4( - src_frame + FIL_PAGE_SPACE_OR_CHKSUM); + /* Compressed and encrypted pages do not have checksum. Assume not + corrupted. 
Page verification happens after decompression in + buf_page_io_complete() using buf_page_is_corrupted(). */ + if (mach_read_from_2(page+FIL_PAGE_TYPE) == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED) { + return (true); + } - ib_uint32_t checksum_field2 = mach_read_from_4( - src_frame + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM); + /* Compressed pages use different checksum method. We first store + the post encryption checksum on checksum location and after function + restore the original. */ + if (zip_size) { + ib_uint32_t old = static_cast<ib_uint32_t>(mach_read_from_4( + page + FIL_PAGE_SPACE_OR_CHKSUM)); - /** prepare frame for usage of normal checksum routines */ - mach_write_to_4(const_cast<byte*>(src_frame) + FIL_PAGE_SPACE_OR_CHKSUM, - stored_post_encryption); + mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, checksum); - /* NOTE: this function is (currently) only run when restoring - * dblwr-buffer, server is single threaded so it's safe to modify - * srv_checksum_algorithm */ - srv_checksum_algorithm_t save_checksum_algorithm = - (srv_checksum_algorithm_t)srv_checksum_algorithm; + bool valid = page_zip_verify_checksum(page, zip_size); - if (zip_size == 0 && - (save_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_STRICT_INNODB || - save_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_INNODB)) { - /* handle ALGORITHM_INNODB specially, - * "downgrade" to ALGORITHM_INNODB and store BUF_NO_CHECKSUM_MAGIC - * checksum_field2 is sort of pointless anyway... - */ - srv_checksum_algorithm = SRV_CHECKSUM_ALGORITHM_INNODB; - mach_write_to_4(const_cast<byte*>(src_frame) + - UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM, - BUF_NO_CHECKSUM_MAGIC); + mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, old); + + return (valid); } - /* verify checksums */ - ibool corrupted = buf_page_is_corrupted(false, src_frame, zip_size); + /* If stored checksum matches one of the calculated checksums + page is not corrupted. 
*/ - /** restore frame & algorithm */ - srv_checksum_algorithm = save_checksum_algorithm; + ib_uint32_t cchecksum1 = buf_calc_page_crc32(page); + ib_uint32_t cchecksum2 = (ib_uint32_t) buf_calc_page_new_checksum( + page); + bool encrypted = (checksum == cchecksum1 || checksum == cchecksum2 + || checksum == BUF_NO_CHECKSUM_MAGIC); - mach_write_to_4(const_cast<byte*>(src_frame) + - FIL_PAGE_SPACE_OR_CHKSUM, - stored_pre_encryption); + /* MySQL 5.6 and MariaDB 10.0 and 10.1 will write an LSN to the + first page of each system tablespace file at + FIL_PAGE_FILE_FLUSH_LSN offset. On other pages and in other files, + the field might have been uninitialized until MySQL 5.5. In MySQL 5.7 + (and MariaDB Server 10.2.2) WL#7990 stopped writing the field for other + than page 0 of the system tablespace. - mach_write_to_4(const_cast<byte*>(src_frame) + - UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM, - checksum_field2); + Starting from MariaDB 10.1 the field has been repurposed for + encryption key_version. - if (!corrupted) { - return true; // page was encrypted and checksum matched - } else { - return false; // page was encrypted but checksum didn't match + Starting with MySQL 5.7 (and MariaDB Server 10.2), the + field has been repurposed for SPATIAL INDEX pages for + FIL_RTREE_SPLIT_SEQ_NUM. + + Note that FIL_PAGE_FILE_FLUSH_LSN is not included in the InnoDB page + checksum. + + Thus, FIL_PAGE_FILE_FLUSH_LSN could contain any value. While the + field would usually be 0 for pages that are not encrypted, we cannot + assume that a nonzero value means that the page is encrypted. + Therefore we must validate the page both as encrypted and unencrypted + when FIL_PAGE_FILE_FLUSH_LSN does not contain 0. 
+ */ + + ulint checksum1 = mach_read_from_4( + page + FIL_PAGE_SPACE_OR_CHKSUM); + + ulint checksum2 = mach_read_from_4( + page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM); + + + bool valid = (buf_page_is_checksum_valid_crc32(page,checksum1,checksum2) + || buf_page_is_checksum_valid_none(page,checksum1,checksum2) + || buf_page_is_checksum_valid_innodb(page,checksum1, checksum2)); + + if (encrypted && valid) { + /* If page is encrypted and traditional checksums match, + page could be still encrypted, or not encrypted and valid or + corrupted. */ + ib_logf(IB_LOG_LEVEL_ERROR, + " Page %lu in space %s (%lu) maybe corrupted." + " Post encryption checksum %u stored [%lu:%lu] key_version %u", + pageno, + space ? space->name : "N/A", + mach_read_from_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID), + checksum, checksum1, checksum2, key_version); + encrypted = false; } + + return(encrypted); } /***********************************************************************/ @@ -1029,12 +1081,13 @@ struct key_state_t { }; /*********************************************************************** -Copy global key state */ +Copy global key state +@param[in,out] new_state key state +@param[in] crypt_data crypt data */ static void fil_crypt_get_key_state( -/*====================*/ - key_state_t* new_state, /*!< out: key state */ - fil_space_crypt_t* crypt_data) /*!< in, out: crypt_data */ + key_state_t* new_state, + fil_space_crypt_t* crypt_data) { if (srv_encrypt_tables) { new_state->key_version = crypt_data->key_get_latest_version(); @@ -1049,15 +1102,17 @@ fil_crypt_get_key_state( /*********************************************************************** Check if a key needs rotation given a key_state +@param[in] encrypt_mode Encryption mode +@param[in] key_version Current key version +@param[in] latest_key_version Latest key version +@param[in] rotate_key_age when to rotate @return true if key needs rotation, false if not */ static bool fil_crypt_needs_rotation( 
-/*=====================*/ - fil_encryption_t encrypt_mode, /*!< in: Encryption - mode */ - uint key_version, /*!< in: Key version */ - uint latest_key_version, /*!< in: Latest key version */ - uint rotate_key_age) /*!< in: When to rotate */ + fil_encryption_t encrypt_mode, + uint key_version, + uint latest_key_version, + uint rotate_key_age) { if (key_version == ENCRYPTION_KEY_VERSION_INVALID) { return false; @@ -1070,7 +1125,7 @@ fil_crypt_needs_rotation( } if (latest_key_version == 0 && key_version != 0) { - if (encrypt_mode == FIL_SPACE_ENCRYPTION_DEFAULT) { + if (encrypt_mode == FIL_ENCRYPTION_DEFAULT) { /* this is rotation encrypted => unencrypted */ return true; } @@ -1087,59 +1142,34 @@ fil_crypt_needs_rotation( } /*********************************************************************** -Check if a space is closing (i.e just before drop) -@return true if space is closing, false if not. */ -UNIV_INTERN -bool -fil_crypt_is_closing( -/*=================*/ - ulint space) /*!< in: FIL space id */ -{ - bool closing=true; - fil_space_crypt_t *crypt_data = fil_space_get_crypt_data(space); - - if (crypt_data) { - closing = crypt_data->is_closing(false); - } - - return closing; -} - -/*********************************************************************** Start encrypting a space -@return true if a pending op (fil_inc_pending_ops/fil_decr_pending_ops) is held -*/ +@param[in,out] space Tablespace +@return true if a recheck is needed */ static bool fil_crypt_start_encrypting_space( -/*=============================*/ - ulint space, /*!< in: FIL space id */ - bool* recheck)/*!< out: true if recheck needed */ + fil_space_t* space) { - - /* we have a pending op when entering function */ - bool pending_op = true; - + bool recheck = false; mutex_enter(&fil_crypt_threads_mutex); - fil_space_crypt_t *crypt_data = fil_space_get_crypt_data(space); - ibool page_encrypted = (crypt_data != NULL); + fil_space_crypt_t *crypt_data = space->crypt_data; - /*If spage is not encrypted and 
encryption is not enabled, then + /* If space is not encrypted and encryption is not enabled, then do not continue encrypting the space. */ - if (!page_encrypted && !srv_encrypt_tables) { + if (!crypt_data && !srv_encrypt_tables) { mutex_exit(&fil_crypt_threads_mutex); - return pending_op; + return false; } if (crypt_data != NULL || fil_crypt_start_converting) { /* someone beat us to it */ if (fil_crypt_start_converting) { - *recheck = true; + recheck = true; } mutex_exit(&fil_crypt_threads_mutex); - return pending_op; + return recheck; } /* NOTE: we need to write and flush page 0 before publishing @@ -1148,10 +1178,11 @@ fil_crypt_start_encrypting_space( * crypt data in page 0 */ /* 1 - create crypt data */ - crypt_data = fil_space_create_crypt_data(FIL_SPACE_ENCRYPTION_DEFAULT, FIL_DEFAULT_ENCRYPTION_KEY); + crypt_data = fil_space_create_crypt_data(FIL_ENCRYPTION_DEFAULT, FIL_DEFAULT_ENCRYPTION_KEY); + if (crypt_data == NULL) { mutex_exit(&fil_crypt_threads_mutex); - return pending_op; + return false; } crypt_data->type = CRYPT_SCHEME_UNENCRYPTED; @@ -1169,87 +1200,44 @@ fil_crypt_start_encrypting_space( do { - if (fil_crypt_is_closing(space) || - fil_space_found_by_id(space) == NULL) { - break; - } - mtr_t mtr; mtr_start(&mtr); /* 2 - get page 0 */ - ulint offset = 0; - ulint zip_size = fil_space_get_zip_size(space); - buf_block_t* block = buf_page_get_gen(space, zip_size, offset, + ulint zip_size = fsp_flags_get_zip_size(space->flags); + buf_block_t* block = buf_page_get_gen(space->id, zip_size, 0, RW_X_LATCH, NULL, BUF_GET, __FILE__, __LINE__, &mtr); - if (fil_crypt_is_closing(space) || - fil_space_found_by_id(space) == NULL) { - mtr_commit(&mtr); - break; - } - /* 3 - compute location to store crypt data */ + /* 3 - write crypt data to page 0 */ byte* frame = buf_block_get_frame(block); - ulint maxsize; - ut_ad(crypt_data); - crypt_data->page0_offset = - fsp_header_get_crypt_offset(zip_size, &maxsize); - - /* 4 - write crypt data to page 0 */ - 
fil_space_write_crypt_data_low(crypt_data, - CRYPT_SCHEME_1, - frame, - crypt_data->page0_offset, - maxsize, &mtr); + crypt_data->type = CRYPT_SCHEME_1; + crypt_data->write_page0(frame, &mtr); - mtr_commit(&mtr); - if (fil_crypt_is_closing(space) || - fil_space_found_by_id(space) == NULL) { - break; - } + mtr_commit(&mtr); /* record lsn of update */ lsn_t end_lsn = mtr.end_lsn; /* 4 - sync tablespace before publishing crypt data */ - /* release "lock" while syncing */ - fil_decr_pending_ops(space); - pending_op = false; - bool success = false; - ulint n_pages = 0; ulint sum_pages = 0; + do { + ulint n_pages = 0; success = buf_flush_list(ULINT_MAX, end_lsn, &n_pages); buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST); sum_pages += n_pages; - } while (!success && - !fil_crypt_is_closing(space) && - !fil_space_found_by_id(space)); - - /* try to reacquire pending op */ - if (fil_inc_pending_ops(space, true)) { - break; - } - - /* pending op reacquired! */ - pending_op = true; - - if (fil_crypt_is_closing(space) || - fil_space_found_by_id(space) == NULL) { - break; - } + } while (!success); /* 5 - publish crypt data */ mutex_enter(&fil_crypt_threads_mutex); - ut_ad(crypt_data); mutex_enter(&crypt_data->mutex); crypt_data->type = CRYPT_SCHEME_1; ut_a(crypt_data->rotate_state.active_threads == 1); @@ -1260,10 +1248,9 @@ fil_crypt_start_encrypting_space( mutex_exit(&crypt_data->mutex); mutex_exit(&fil_crypt_threads_mutex); - return pending_op; + return recheck; } while (0); - ut_ad(crypt_data); mutex_enter(&crypt_data->mutex); ut_a(crypt_data->rotate_state.active_threads == 1); crypt_data->rotate_state.active_threads = 0; @@ -1273,7 +1260,7 @@ fil_crypt_start_encrypting_space( fil_crypt_start_converting = false; mutex_exit(&fil_crypt_threads_mutex); - return pending_op; + return recheck; } /** State of a rotation thread */ @@ -1287,7 +1274,7 @@ struct rotate_thread_t { uint thread_no; bool first; /*!< is position before first space */ - ulint space; /*!< current space */ + 
fil_space_t* space; /*!< current space or NULL */ ulint offset; /*!< current offset */ ulint batch; /*!< #pages to rotate */ uint min_key_version_found;/*!< min key version found but not rotated */ @@ -1322,54 +1309,41 @@ struct rotate_thread_t { /*********************************************************************** Check if space needs rotation given a key_state +@param[in,out] state Key rotation state +@param[in,out] key_state Key state +@param[in,out] recheck needs recheck ? @return true if space needs key rotation */ static bool fil_crypt_space_needs_rotation( -/*===========================*/ - rotate_thread_t* state, /*!< in: Key rotation state */ - key_state_t* key_state, /*!< in: Key state */ - bool* recheck) /*!< out: needs recheck ? */ + rotate_thread_t* state, + key_state_t* key_state, + bool* recheck) { - ulint space = state->space; - - /* Make sure that tablespace is found and it is normal tablespace */ - if (fil_space_found_by_id(space) == NULL || - fil_space_get_type(space) != FIL_TABLESPACE) { - return false; - } + fil_space_t* space = state->space; - if (fil_inc_pending_ops(space, true)) { - /* tablespace being dropped */ + /* Make sure that tablespace is normal tablespace */ + if (space->purpose != FIL_TABLESPACE) { return false; } - /* keep track of if we have pending op */ - bool pending_op = true; + ut_ad(space->n_pending_ops > 0); - fil_space_crypt_t *crypt_data = fil_space_get_crypt_data(space); + fil_space_crypt_t *crypt_data = space->crypt_data; if (crypt_data == NULL) { /** * space has no crypt data * start encrypting it... 
*/ - pending_op = fil_crypt_start_encrypting_space(space, recheck); - - crypt_data = fil_space_get_crypt_data(space); + *recheck = fil_crypt_start_encrypting_space(space); + crypt_data = space->crypt_data; if (crypt_data == NULL) { - if (pending_op) { - fil_decr_pending_ops(space); - } return false; } crypt_data->key_get_latest_version(); - - if (!crypt_data->is_key_found()) { - return false; - } } /* If used key_id is not found from encryption plugin we can't @@ -1389,7 +1363,7 @@ fil_crypt_space_needs_rotation( } /* prevent threads from starting to rotate space */ - if (crypt_data->is_closing(true)) { + if (space->is_stopping()) { break; } @@ -1413,39 +1387,39 @@ fil_crypt_space_needs_rotation( key_state->key_version, key_state->rotate_key_age); crypt_data->rotate_state.scrubbing.is_active = - btr_scrub_start_space(space, &state->scrub_data); + btr_scrub_start_space(space->id, &state->scrub_data); time_t diff = time(0) - crypt_data->rotate_state.scrubbing. last_scrub_completed; bool need_scrubbing = + (srv_background_scrub_data_uncompressed || + srv_background_scrub_data_compressed) && crypt_data->rotate_state.scrubbing.is_active - && diff >= (time_t) srv_background_scrub_data_interval; + && diff >= 0 + && ulint(diff) >= srv_background_scrub_data_interval; if (need_key_rotation == false && need_scrubbing == false) { break; } mutex_exit(&crypt_data->mutex); - /* NOTE! 
fil_decr_pending_ops is performed outside */ + return true; } while (0); mutex_exit(&crypt_data->mutex); - if (pending_op) { - fil_decr_pending_ops(space); - } return false; } /*********************************************************************** -Update global statistics with thread statistics */ +Update global statistics with thread statistics +@param[in,out] state key rotation statistics */ static void fil_crypt_update_total_stat( -/*========================*/ - rotate_thread_t *state) /*!< in: Key rotation status */ + rotate_thread_t *state) { mutex_enter(&crypt_stat_mutex); crypt_stat.pages_read_from_cache += @@ -1469,15 +1443,19 @@ fil_crypt_update_total_stat( /*********************************************************************** Allocate iops to thread from global setting, used before starting to rotate a space. +@param[in,out] state Rotation state @return true if allocation succeeded, false if failed */ static bool fil_crypt_alloc_iops( -/*=================*/ - rotate_thread_t *state) /*!< in: Key rotation status */ + rotate_thread_t *state) { ut_ad(state->allocated_iops == 0); + /* We have not yet selected the space to rotate, thus + state might not contain space and we can't check + its status yet. 
*/ + uint max_iops = state->estimated_max_iops; mutex_enter(&fil_crypt_threads_mutex); @@ -1503,12 +1481,12 @@ fil_crypt_alloc_iops( /*********************************************************************** Reallocate iops to thread, -used when inside a space */ +used when inside a space +@param[in,out] state Rotation state */ static void fil_crypt_realloc_iops( -/*===================*/ - rotate_thread_t *state) /*!< in: Key rotation status */ + rotate_thread_t *state) { ut_a(state->allocated_iops > 0); @@ -1517,13 +1495,12 @@ fil_crypt_realloc_iops( uint avg_wait_time_us = state->sum_waited_us / state->cnt_waited; -#if DEBUG_KEYROTATION_THROTTLING - ib_logf(IB_LOG_LEVEL_INFO, - "thr_no: %u - update estimated_max_iops from %u to %u.", + DBUG_PRINT("ib_crypt", + ("thr_no: %u - update estimated_max_iops from %u to %u.", state->thread_no, state->estimated_max_iops, - 1000000 / avg_wait_time_us); -#endif + 1000000 / avg_wait_time_us)); + if (avg_wait_time_us == 0) { avg_wait_time_us = 1; // prevent division by zero } @@ -1532,12 +1509,11 @@ fil_crypt_realloc_iops( state->cnt_waited = 0; state->sum_waited_us = 0; } else { -#if DEBUG_KEYROTATION_THROTTLING - ib_logf(IB_LOG_LEVEL_INFO, - "thr_no: %u only waited %lu%% skip re-estimate.", + + DBUG_PRINT("ib_crypt", + ("thr_no: %u only waited %lu%% skip re-estimate.", state->thread_no, - (100 * state->cnt_waited) / state->batch); -#endif + (100 * state->cnt_waited) / state->batch)); } if (state->estimated_max_iops <= state->allocated_iops) { @@ -1563,8 +1539,9 @@ fil_crypt_realloc_iops( state->allocated_iops ++; n_fil_crypt_iops_allocated ++; } - mutex_exit(&fil_crypt_threads_mutex); + os_event_set(fil_crypt_threads_event); + mutex_exit(&fil_crypt_threads_mutex); } } else { /* see if there are more to get */ @@ -1581,13 +1558,13 @@ fil_crypt_realloc_iops( } n_fil_crypt_iops_allocated += extra; state->allocated_iops += extra; -#if DEBUG_KEYROTATION_THROTTLING - ib_logf(IB_LOG_LEVEL_INFO, - "thr_no: %u increased iops from %u to 
%u.", + + DBUG_PRINT("ib_crypt", + ("thr_no: %u increased iops from %u to %u.", state->thread_no, state->allocated_iops - extra, - state->allocated_iops); -#endif + state->allocated_iops)); + } mutex_exit(&fil_crypt_threads_mutex); } @@ -1596,12 +1573,12 @@ fil_crypt_realloc_iops( } /*********************************************************************** -Return allocated iops to global */ +Return allocated iops to global +@param[in,out] state Rotation state */ static void fil_crypt_return_iops( -/*==================*/ - rotate_thread_t *state) /*!< in: Key rotation status */ + rotate_thread_t *state) { if (state->allocated_iops > 0) { uint iops = state->allocated_iops; @@ -1614,25 +1591,27 @@ fil_crypt_return_iops( ut_ad(0); iops = 0; } + n_fil_crypt_iops_allocated -= iops; - mutex_exit(&fil_crypt_threads_mutex); state->allocated_iops = 0; os_event_set(fil_crypt_threads_event); + mutex_exit(&fil_crypt_threads_mutex); } fil_crypt_update_total_stat(state); } /*********************************************************************** -Search for a space needing rotation */ -UNIV_INTERN +Search for a space needing rotation +@param[in,out] key_state Key state +@param[in,out] state Rotation state +@param[in,out] recheck recheck ? 
*/ +static bool fil_crypt_find_space_to_rotate( -/*===========================*/ - key_state_t* key_state, /*!< in: Key state */ - rotate_thread_t* state, /*!< in: Key rotation state */ - bool* recheck) /*!< out: true if recheck - needed */ + key_state_t* key_state, + rotate_thread_t* state, + bool* recheck) { /* we need iops to start rotating */ while (!state->should_shutdown() && !fil_crypt_alloc_iops(state)) { @@ -1641,30 +1620,44 @@ fil_crypt_find_space_to_rotate( } if (state->should_shutdown()) { + if (state->space) { + fil_space_release(state->space); + state->space = NULL; + } return false; } if (state->first) { state->first = false; - state->space = fil_get_first_space_safe(); - } else { - state->space = fil_get_next_space_safe(state->space); + if (state->space) { + fil_space_release(state->space); + } + state->space = NULL; } - while (!state->should_shutdown() && state->space != ULINT_UNDEFINED) { - fil_space_t* space = fil_space_found_by_id(state->space); + /* If key rotation is enabled (default) we iterate all tablespaces. + If key rotation is not enabled we iterate only the tablespaces + added to keyrotation list. 
*/ + if (srv_fil_crypt_rotate_key_age) { + state->space = fil_space_next(state->space); + } else { + state->space = fil_space_keyrotate_next(state->space); + } - if (space) { - if (fil_crypt_space_needs_rotation(state, key_state, recheck)) { - ut_ad(key_state->key_id); - /* init state->min_key_version_found before - * starting on a space */ - state->min_key_version_found = key_state->key_version; - return true; - } + while (!state->should_shutdown() && state->space) { + if (fil_crypt_space_needs_rotation(state, key_state, recheck)) { + ut_ad(key_state->key_id); + /* init state->min_key_version_found before + * starting on a space */ + state->min_key_version_found = key_state->key_version; + return true; } - state->space = fil_get_next_space_safe(state->space); + if (srv_fil_crypt_rotate_key_age) { + state->space = fil_space_next(state->space); + } else { + state->space = fil_space_keyrotate_next(state->space); + } } /* if we didn't find any space return iops */ @@ -1675,16 +1668,16 @@ fil_crypt_find_space_to_rotate( } /*********************************************************************** -Start rotating a space */ +Start rotating a space +@param[in] key_state Key state +@param[in,out] state Rotation state */ static void fil_crypt_start_rotate_space( -/*=========================*/ - const key_state_t* key_state, /*!< in: Key state */ - rotate_thread_t* state) /*!< in: Key rotation state */ + const key_state_t* key_state, + rotate_thread_t* state) { - ulint space = state->space; - fil_space_crypt_t *crypt_data = fil_space_get_crypt_data(space); + fil_space_crypt_t *crypt_data = state->space->crypt_data; ut_ad(crypt_data); mutex_enter(&crypt_data->mutex); @@ -1695,8 +1688,9 @@ fil_crypt_start_rotate_space( crypt_data->rotate_state.next_offset = 1; // skip page 0 /* no need to rotate beyond current max * if space extends, it will be encrypted with newer version */ - crypt_data->rotate_state.max_offset = fil_space_get_size(space); - + /* FIXME: max_offset could be 
removed and instead + space->size consulted.*/ + crypt_data->rotate_state.max_offset = state->space->size; crypt_data->rotate_state.end_lsn = 0; crypt_data->rotate_state.min_key_version_found = key_state->key_version; @@ -1724,26 +1718,34 @@ fil_crypt_start_rotate_space( /*********************************************************************** Search for batch of pages needing rotation +@param[in] key_state Key state +@param[in,out] state Rotation state @return true if page needing key rotation found, false if not found */ static bool fil_crypt_find_page_to_rotate( -/*==========================*/ - const key_state_t* key_state, /*!< in: Key state */ - rotate_thread_t* state) /*!< in: Key rotation state */ + const key_state_t* key_state, + rotate_thread_t* state) { ulint batch = srv_alloc_time * state->allocated_iops; - ulint space = state->space; - fil_space_crypt_t *crypt_data = fil_space_get_crypt_data(space); + fil_space_t* space = state->space; + + ut_ad(!space || space->n_pending_ops > 0); + + /* If space is marked to be dropped stop rotation. 
*/ + if (!space || space->is_stopping()) { + return false; + } + + fil_space_crypt_t *crypt_data = space->crypt_data; /* Space might already be dropped */ if (crypt_data) { mutex_enter(&crypt_data->mutex); ut_ad(key_state->key_id == crypt_data->key_id); - if (!crypt_data->is_closing(true) && - crypt_data->rotate_state.next_offset < - crypt_data->rotate_state.max_offset) { + if (crypt_data->rotate_state.next_offset < + crypt_data->rotate_state.max_offset) { state->offset = crypt_data->rotate_state.next_offset; ulint remaining = crypt_data->rotate_state.max_offset - @@ -1768,59 +1770,47 @@ fil_crypt_find_page_to_rotate( /*********************************************************************** Check if a page is uninitialized (doesn't need to be rotated) -@return true if page is uninitialized, false if not.*/ -static +@param[in] frame Page to check +@param[in] zip_size zip_size or 0 +@return true if page is uninitialized, false if not. */ +static inline bool fil_crypt_is_page_uninitialized( -/*============================*/ - const byte *frame, /*!< in: Page */ - uint zip_size) /*!< in: compressed size if - row_format compressed */ + const byte *frame, + uint zip_size) { - if (zip_size) { - ulint stored_checksum = mach_read_from_4( - frame + FIL_PAGE_SPACE_OR_CHKSUM); - /* empty pages aren't encrypted */ - if (stored_checksum == 0) { - return true; - } - } else { - ulint size = UNIV_PAGE_SIZE; - ulint checksum_field1 = mach_read_from_4( - frame + FIL_PAGE_SPACE_OR_CHKSUM); - ulint checksum_field2 = mach_read_from_4( - frame + size - FIL_PAGE_END_LSN_OLD_CHKSUM); - /* empty pages are not encrypted */ - if (checksum_field1 == 0 && checksum_field2 == 0 - && mach_read_from_4(frame + FIL_PAGE_LSN) == 0) { - return true; - } - } - return false; + return (buf_page_is_zeroes(frame, zip_size)); } -#define fil_crypt_get_page_throttle(state,space,zip_size,offset,mtr,sleeptime_ms) \ - fil_crypt_get_page_throttle_func(state, space, zip_size, offset, mtr, \ +#define 
fil_crypt_get_page_throttle(state,offset,mtr,sleeptime_ms) \ + fil_crypt_get_page_throttle_func(state, offset, mtr, \ sleeptime_ms, __FILE__, __LINE__) /*********************************************************************** Get a page and compute sleep time -@return page */ +@param[in,out] state Rotation state +@param[in] zip_size compressed size or 0 +@param[in] offset Page offset +@param[in,out] mtr Minitransaction +@param[out] sleeptime_ms Sleep time +@param[in] file File where called +@param[in] line Line where called +@return page or NULL*/ static buf_block_t* fil_crypt_get_page_throttle_func( -/*=============================*/ - rotate_thread_t* state, /*!< in/out: Key rotation state */ - ulint space, /*!< in: FIL space id */ - uint zip_size, /*!< in: compressed size if - row_format compressed */ - ulint offset, /*!< in: page offsett */ - mtr_t* mtr, /*!< in/out: minitransaction */ - ulint* sleeptime_ms, /*!< out: sleep time */ - const char* file, /*!< in: file name */ - ulint line) /*!< in: file line */ + rotate_thread_t* state, + ulint offset, + mtr_t* mtr, + ulint* sleeptime_ms, + const char* file, + ulint line) { - buf_block_t* block = buf_page_try_get_func(space, offset, RW_X_LATCH, + fil_space_t* space = state->space; + ulint zip_size = fsp_flags_get_zip_size(space->flags); + ut_ad(space->n_pending_ops > 0); + + buf_block_t* block = buf_page_try_get_func(space->id, offset, RW_X_LATCH, true, file, line, mtr); if (block != NULL) { @@ -1831,16 +1821,14 @@ fil_crypt_get_page_throttle_func( /* Before reading from tablespace we need to make sure that tablespace exists and is not is just being dropped. 
*/ - - if (fil_crypt_is_closing(space) || - fil_space_found_by_id(space) == NULL) { + if (space->is_stopping()) { return NULL; } state->crypt_stat.pages_read_from_disk++; ullint start = ut_time_us(NULL); - block = buf_page_get_gen(space, zip_size, offset, + block = buf_page_get_gen(space->id, zip_size, offset, RW_X_LATCH, NULL, BUF_GET_POSSIBLY_FREED, file, line, mtr); @@ -1866,6 +1854,7 @@ fil_crypt_get_page_throttle_func( } *sleeptime_ms += add_sleeptime_ms; + return block; } @@ -1875,27 +1864,35 @@ Get block and allocation status note: innodb locks fil_space_latch and then block when allocating page but locks block and then fil_space_latch when freeing page. -@return block + +@param[in,out] state Rotation state +@param[in] zip_size Compressed size or 0 +@param[in] offset Page offset +@param[in,out] mtr Minitransaction +@param[out] allocation_status Allocation status +@param[out] sleeptime_ms Sleep time +@return block or NULL */ static buf_block_t* btr_scrub_get_block_and_allocation_status( -/*======================================*/ - rotate_thread_t* state, /*!< in/out: Key rotation state */ - ulint space, /*!< in: FIL space id */ - uint zip_size, /*!< in: compressed size if - row_format compressed */ - ulint offset, /*!< in: page offsett */ - mtr_t* mtr, /*!< in/out: minitransaction - */ + rotate_thread_t* state, + uint zip_size, + ulint offset, + mtr_t* mtr, btr_scrub_page_allocation_status_t *allocation_status, - /*!< in/out: allocation status */ - ulint* sleeptime_ms) /*!< out: sleep time */ + ulint* sleeptime_ms) { mtr_t local_mtr; buf_block_t *block = NULL; + fil_space_t* space = state->space; + + ut_ad(space->n_pending_ops > 0); + ut_ad(zip_size == fsp_flags_get_zip_size(space->flags)); + mtr_start(&local_mtr); - *allocation_status = fsp_page_is_free(space, offset, &local_mtr) ? + + *allocation_status = fsp_page_is_free(space->id, offset, &local_mtr) ? 
BTR_SCRUB_PAGE_FREE : BTR_SCRUB_PAGE_ALLOCATED; @@ -1903,7 +1900,6 @@ btr_scrub_get_block_and_allocation_status( /* this is easy case, we lock fil_space_latch first and then block */ block = fil_crypt_get_page_throttle(state, - space, zip_size, offset, mtr, sleeptime_ms); mtr_commit(&local_mtr); @@ -1920,7 +1916,6 @@ btr_scrub_get_block_and_allocation_status( */ block = fil_crypt_get_page_throttle(state, - space, zip_size, offset, mtr, sleeptime_ms); } @@ -1930,21 +1925,29 @@ btr_scrub_get_block_and_allocation_status( /*********************************************************************** -Rotate one page */ +Rotate one page +@param[in,out] key_state Key state +@param[in,out] state Rotation state */ static void fil_crypt_rotate_page( -/*==================*/ - const key_state_t* key_state, /*!< in: Key state */ - rotate_thread_t* state) /*!< in: Key rotation state */ + const key_state_t* key_state, + rotate_thread_t* state) { - ulint space = state->space; + fil_space_t*space = state->space; + ulint space_id = space->id; ulint offset = state->offset; - const uint zip_size = fil_space_get_zip_size(space); + const uint zip_size = fsp_flags_get_zip_size(space->flags); ulint sleeptime_ms = 0; + fil_space_crypt_t *crypt_data = space->crypt_data; - /* check if tablespace is closing before reading page */ - if (fil_crypt_is_closing(space) || fil_space_found_by_id(space) == NULL) { + ut_ad(space->n_pending_ops > 0); + + /* In fil_crypt_thread where key rotation is done we have + acquired space and checked that this space is not yet + marked to be dropped. Similarly, in fil_crypt_find_page_to_rotate(). + Check here also to give DROP TABLE or similar a change. 
*/ + if (space->is_stopping()) { return; } @@ -1956,7 +1959,6 @@ fil_crypt_rotate_page( mtr_t mtr; mtr_start(&mtr); buf_block_t* block = fil_crypt_get_page_throttle(state, - space, zip_size, offset, &mtr, &sleeptime_ms); @@ -1968,9 +1970,8 @@ fil_crypt_rotate_page( uint kv = block->page.key_version; /* check if tablespace is closing after reading page */ - if (!fil_crypt_is_closing(space)) { + if (space->is_stopping()) { byte* frame = buf_block_get_frame(block); - fil_space_crypt_t *crypt_data = fil_space_get_crypt_data(space); if (kv == 0 && fil_crypt_is_page_uninitialized(frame, zip_size)) { @@ -1990,7 +1991,7 @@ fil_crypt_rotate_page( /* force rotation by dummy updating page */ mlog_write_ulint(frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, - space, MLOG_4BYTES, &mtr); + space_id, MLOG_4BYTES, &mtr); /* update block */ block->page.key_version = key_state->key_version; @@ -2023,7 +2024,7 @@ fil_crypt_rotate_page( */ btr_scrub_page_allocation_status_t allocated; block = btr_scrub_get_block_and_allocation_status( - state, space, zip_size, offset, &mtr, + state, zip_size, offset, &mtr, &allocated, &sleeptime_ms); @@ -2037,7 +2038,7 @@ fil_crypt_rotate_page( /* we need to refetch it once more now that we have * index locked */ block = btr_scrub_get_block_and_allocation_status( - state, space, zip_size, offset, &mtr, + state, zip_size, offset, &mtr, &allocated, &sleeptime_ms); @@ -2068,7 +2069,6 @@ fil_crypt_rotate_page( if (needs_scrubbing == BTR_SCRUB_TURNED_OFF) { /* if we just detected that scrubbing was turned off * update global state to reflect this */ - fil_space_crypt_t *crypt_data = fil_space_get_crypt_data(space); ut_ad(crypt_data); mutex_enter(&crypt_data->mutex); crypt_data->rotate_state.scrubbing.is_active = false; @@ -2096,17 +2096,20 @@ fil_crypt_rotate_page( } /*********************************************************************** -Rotate a batch of pages */ +Rotate a batch of pages +@param[in,out] key_state Key state +@param[in,out] state Rotation state 
*/ static void fil_crypt_rotate_pages( -/*===================*/ - const key_state_t* key_state, /*!< in: Key state */ - rotate_thread_t* state) /*!< in: Key rotation state */ + const key_state_t* key_state, + rotate_thread_t* state) { - ulint space = state->space; + ulint space = state->space->id; ulint end = state->offset + state->batch; + ut_ad(state->space->n_pending_ops > 0); + for (; state->offset < end; state->offset++) { /* we can't rotate pages in dblwr buffer as @@ -2127,20 +2130,23 @@ fil_crypt_rotate_pages( } /*********************************************************************** -Flush rotated pages and then update page 0 */ +Flush rotated pages and then update page 0 + +@param[in,out] state rotation state */ static void fil_crypt_flush_space( -/*==================*/ - rotate_thread_t* state, /*!< in: Key rotation state */ - ulint space) /*!< in: FIL space id */ + rotate_thread_t* state) { - fil_space_crypt_t *crypt_data = fil_space_get_crypt_data(space); + fil_space_t* space = state->space; + fil_space_crypt_t *crypt_data = space->crypt_data; + + ut_ad(space->n_pending_ops > 0); /* flush tablespace pages so that there are no pages left with old key */ lsn_t end_lsn = crypt_data->rotate_state.end_lsn; - if (end_lsn > 0 && !fil_crypt_is_closing(space)) { + if (end_lsn > 0 && !space->is_stopping()) { bool success = false; ulint n_pages = 0; ulint sum_pages = 0; @@ -2150,7 +2156,7 @@ fil_crypt_flush_space( success = buf_flush_list(ULINT_MAX, end_lsn, &n_pages); buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST); sum_pages += n_pages; - } while (!success && !fil_crypt_is_closing(space)); + } while (!success && !space->is_stopping()); ullint end = ut_time_us(NULL); @@ -2168,40 +2174,38 @@ fil_crypt_flush_space( } /* update page 0 */ - if (!fil_crypt_is_closing(space)) { - mtr_t mtr; - mtr_start(&mtr); - ulint offset = 0; // page 0 - const uint zip_size = fil_space_get_zip_size(space); - buf_block_t* block = buf_page_get_gen(space, zip_size, offset, - 
RW_X_LATCH, NULL, BUF_GET, - __FILE__, __LINE__, &mtr); - byte* frame = buf_block_get_frame(block); - ulint maxsize; - crypt_data->page0_offset = - fsp_header_get_crypt_offset(zip_size, &maxsize); + mtr_t mtr; + mtr_start(&mtr); - fil_space_write_crypt_data(space, frame, - crypt_data->page0_offset, - ULINT_MAX, &mtr); - mtr_commit(&mtr); - } + const uint zip_size = fsp_flags_get_zip_size(state->space->flags); + + buf_block_t* block = buf_page_get_gen(space->id, zip_size, 0, + RW_X_LATCH, NULL, BUF_GET, + __FILE__, __LINE__, &mtr); + byte* frame = buf_block_get_frame(block); + + crypt_data->write_page0(frame, &mtr); + + mtr_commit(&mtr); } /*********************************************************************** -Complete rotating a space */ +Complete rotating a space +@param[in,out] key_state Key state +@param[in,out] state Rotation state */ static void fil_crypt_complete_rotate_space( -/*============================*/ - const key_state_t* key_state, /*!< in: Key state */ - rotate_thread_t* state) /*!< in: Key rotation state */ + const key_state_t* key_state, + rotate_thread_t* state) { - ulint space = state->space; - fil_space_crypt_t *crypt_data = fil_space_get_crypt_data(space); + fil_space_crypt_t *crypt_data = state->space->crypt_data; + + ut_ad(crypt_data); + ut_ad(state->space->n_pending_ops > 0); /* Space might already be dropped */ - if (crypt_data != NULL && !crypt_data->is_closing(false)) { + if (!state->space->is_stopping()) { mutex_enter(&crypt_data->mutex); /** @@ -2259,9 +2263,8 @@ fil_crypt_complete_rotate_space( } if (should_flush) { - fil_crypt_flush_space(state, space); + fil_crypt_flush_space(state); - ut_ad(crypt_data); mutex_enter(&crypt_data->mutex); crypt_data->rotate_state.flushing = false; mutex_exit(&crypt_data->mutex); @@ -2284,8 +2287,8 @@ DECLARE_THREAD(fil_crypt_thread)( mutex_enter(&fil_crypt_threads_mutex); uint thread_no = srv_n_fil_crypt_threads_started; srv_n_fil_crypt_threads_started++; - mutex_exit(&fil_crypt_threads_mutex); 
os_event_set(fil_crypt_event); /* signal that we started */ + mutex_exit(&fil_crypt_threads_mutex); /* state of this thread */ rotate_thread_t thr(thread_no); @@ -2305,6 +2308,7 @@ DECLARE_THREAD(fil_crypt_thread)( * i.e either new key version of change or * new rotate_key_age */ os_event_reset(fil_crypt_threads_event); + if (os_event_wait_time(fil_crypt_threads_event, 1000000) == 0) { break; } @@ -2318,7 +2322,12 @@ DECLARE_THREAD(fil_crypt_thread)( time_t waited = time(0) - wait_start; - if (waited >= (time_t) srv_background_scrub_data_check_interval) { + /* Break if we have waited the background scrub + internal and background scrubbing is enabled */ + if (waited >= 0 + && ulint(waited) >= srv_background_scrub_data_check_interval + && (srv_background_scrub_data_uncompressed + || srv_background_scrub_data_compressed)) { break; } } @@ -2333,29 +2342,32 @@ DECLARE_THREAD(fil_crypt_thread)( /* we found a space to rotate */ fil_crypt_start_rotate_space(&new_state, &thr); - /* decrement pending ops that was incremented in - * fil_crypt_space_needs_rotation - * (called from fil_crypt_find_space_to_rotate), - * this makes sure that tablespace won't be dropped - * just after we decided to start processing it. */ - fil_decr_pending_ops(thr.space); - /* iterate all pages (cooperativly with other threads) */ - while (!thr.should_shutdown() && + while (!thr.should_shutdown() && thr.space && fil_crypt_find_page_to_rotate(&new_state, &thr)) { /* rotate a (set) of pages */ fil_crypt_rotate_pages(&new_state, &thr); + /* If space is marked as stopping, release + space and stop rotation. 
*/ + if (thr.space->is_stopping()) { + fil_space_release(thr.space); + thr.space = NULL; + break; + } + /* realloc iops */ fil_crypt_realloc_iops(&thr); } /* complete rotation */ - fil_crypt_complete_rotate_space(&new_state, &thr); + if (thr.space) { + fil_crypt_complete_rotate_space(&new_state, &thr); + } /* force key state refresh */ - new_state.key_id= 0; + new_state.key_id = 0; /* return iops */ fil_crypt_return_iops(&thr); @@ -2365,10 +2377,16 @@ DECLARE_THREAD(fil_crypt_thread)( /* return iops if shutting down */ fil_crypt_return_iops(&thr); + /* release current space if shutting down */ + if (thr.space) { + fil_space_release(thr.space); + thr.space = NULL; + } + mutex_enter(&fil_crypt_threads_mutex); srv_n_fil_crypt_threads_started--; - mutex_exit(&fil_crypt_threads_mutex); os_event_set(fil_crypt_event); /* signal that we stopped */ + mutex_exit(&fil_crypt_threads_mutex); /* We count the number of threads in os_thread_exit(). A created thread should always use that to exit and not use return() to exit. 
*/ @@ -2379,23 +2397,26 @@ DECLARE_THREAD(fil_crypt_thread)( } /********************************************************************* -Adjust thread count for key rotation */ +Adjust thread count for key rotation +@param[in] enw_cnt Number of threads to be used */ UNIV_INTERN void fil_crypt_set_thread_cnt( -/*=====================*/ - uint new_cnt) /*!< in: New key rotation thread count */ + const uint new_cnt) { if (!fil_crypt_threads_inited) { fil_crypt_threads_init(); } + mutex_enter(&fil_crypt_threads_mutex); + if (new_cnt > srv_n_fil_crypt_threads) { uint add = new_cnt - srv_n_fil_crypt_threads; srv_n_fil_crypt_threads = new_cnt; for (uint i = 0; i < add; i++) { os_thread_id_t rotation_thread_id; os_thread_create(fil_crypt_thread, NULL, &rotation_thread_id); + ib_logf(IB_LOG_LEVEL_INFO, "Creating #%d thread id %lu total threads %u.", i+1, os_thread_pf(rotation_thread_id), new_cnt); @@ -2405,6 +2426,8 @@ fil_crypt_set_thread_cnt( os_event_set(fil_crypt_threads_event); } + mutex_exit(&fil_crypt_threads_mutex); + while(srv_n_fil_crypt_threads_started != srv_n_fil_crypt_threads) { os_event_reset(fil_crypt_event); os_event_wait_time(fil_crypt_event, 1000000); @@ -2412,39 +2435,39 @@ fil_crypt_set_thread_cnt( } /********************************************************************* -Adjust max key age */ +Adjust max key age +@param[in] val New max key age */ UNIV_INTERN void fil_crypt_set_rotate_key_age( -/*=========================*/ - uint val) /*!< in: New max key age */ + uint val) { srv_fil_crypt_rotate_key_age = val; os_event_set(fil_crypt_threads_event); } /********************************************************************* -Adjust rotation iops */ +Adjust rotation iops +@param[in] val New max roation iops */ UNIV_INTERN void fil_crypt_set_rotation_iops( -/*========================*/ - uint val) /*!< in: New iops setting */ + uint val) { srv_n_fil_crypt_iops = val; os_event_set(fil_crypt_threads_event); } 
/********************************************************************* -Adjust encrypt tables */ +Adjust encrypt tables +@param[in] val New setting for innodb-encrypt-tables */ UNIV_INTERN void fil_crypt_set_encrypt_tables( -/*=========================*/ - uint val) /*!< in: New srv_encrypt_tables setting */ + uint val) { - srv_encrypt_tables = val; - os_event_set(fil_crypt_threads_event); + srv_encrypt_tables = val; + os_event_set(fil_crypt_threads_event); } /********************************************************************* @@ -2452,7 +2475,6 @@ Init threads for key rotation */ UNIV_INTERN void fil_crypt_threads_init() -/*====================*/ { ut_ad(mutex_own(&fil_system->mutex)); if (!fil_crypt_threads_inited) { @@ -2473,75 +2495,40 @@ Clean up key rotation threads resources */ UNIV_INTERN void fil_crypt_threads_cleanup() -/*=======================*/ { if (!fil_crypt_threads_inited) { return; } ut_a(!srv_n_fil_crypt_threads_started); os_event_free(fil_crypt_event); + fil_crypt_event = NULL; os_event_free(fil_crypt_threads_event); + fil_crypt_threads_event = NULL; mutex_free(&fil_crypt_threads_mutex); fil_crypt_threads_inited = false; } /********************************************************************* -Mark a space as closing */ -UNIV_INTERN -void -fil_space_crypt_mark_space_closing( -/*===============================*/ - ulint space, /*!< in: tablespace id */ - fil_space_crypt_t* crypt_data) /*!< in: crypt_data or NULL */ -{ - if (!fil_crypt_threads_inited) { - return; - } - - mutex_enter(&fil_crypt_threads_mutex); - - if (!crypt_data) { - crypt_data = fil_space_get_crypt_data(space); - } - - if (crypt_data == NULL) { - mutex_exit(&fil_crypt_threads_mutex); - return; - } - - mutex_enter(&crypt_data->mutex); - mutex_exit(&fil_crypt_threads_mutex); - crypt_data->closing = true; - mutex_exit(&crypt_data->mutex); -} - -/********************************************************************* -Wait for crypt threads to stop accessing space */ +Wait for crypt 
threads to stop accessing space +@param[in] space Tablespace */ UNIV_INTERN void fil_space_crypt_close_tablespace( -/*=============================*/ - ulint space) /*!< in: Space id */ + const fil_space_t* space) { - if (!srv_encrypt_tables) { + if (!srv_encrypt_tables || !space->crypt_data) { return; } mutex_enter(&fil_crypt_threads_mutex); - fil_space_crypt_t* crypt_data = fil_space_get_crypt_data(space); - - if (crypt_data == NULL || crypt_data->is_closing(false)) { - mutex_exit(&fil_crypt_threads_mutex); - return; - } + fil_space_crypt_t* crypt_data = space->crypt_data; - uint start = time(0); - uint last = start; + time_t start = time(0); + time_t last = start; mutex_enter(&crypt_data->mutex); mutex_exit(&fil_crypt_threads_mutex); - crypt_data->closing = true; uint cnt = crypt_data->rotate_state.active_threads; bool flushing = crypt_data->rotate_state.flushing; @@ -2551,20 +2538,22 @@ fil_space_crypt_close_tablespace( /* release dict mutex so that scrub threads can release their * table references */ dict_mutex_exit_for_mysql(); + /* wakeup throttle (all) sleepers */ os_event_set(fil_crypt_throttle_sleep_event); + os_thread_sleep(20000); dict_mutex_enter_for_mysql(); mutex_enter(&crypt_data->mutex); cnt = crypt_data->rotate_state.active_threads; flushing = crypt_data->rotate_state.flushing; - uint now = time(0); + time_t now = time(0); if (now >= last + 30) { ib_logf(IB_LOG_LEVEL_WARN, - "Waited %u seconds to drop space: %lu.", - now - start, space); + "Waited %ld seconds to drop space: %s(" ULINTPF ").", + now - start, space->name, space->id); last = now; } } @@ -2574,22 +2563,23 @@ fil_space_crypt_close_tablespace( /********************************************************************* Get crypt status for a space (used by information_schema) -return 0 if crypt data present */ +@param[in] space Tablespace +@param[out] status Crypt status */ UNIV_INTERN -int +void fil_space_crypt_get_status( -/*=======================*/ - ulint id, /*!< in: space id */ - 
struct fil_space_crypt_status_t* status) /*!< out: status */ + const fil_space_t* space, + struct fil_space_crypt_status_t* status) { - fil_space_crypt_t* crypt_data = fil_space_get_crypt_data(id); - memset(status, 0, sizeof(*status)); + ut_ad(space->n_pending_ops > 0); + fil_space_crypt_t* crypt_data = space->crypt_data; + status->space = space->id; + if (crypt_data != NULL) { - status->space = id; - status->scheme = crypt_data->type; mutex_enter(&crypt_data->mutex); + status->scheme = crypt_data->type; status->keyserver_requests = crypt_data->keyserver_requests; status->min_key_version = crypt_data->min_key_version; status->key_id = crypt_data->key_id; @@ -2603,8 +2593,6 @@ fil_space_crypt_get_status( crypt_data->rotate_state.next_offset; status->rotate_max_page_number = crypt_data->rotate_state.max_offset; - } else { - status->rotating = false; } mutex_exit(&crypt_data->mutex); @@ -2612,25 +2600,17 @@ fil_space_crypt_get_status( if (srv_encrypt_tables || crypt_data->min_key_version) { status->current_key_version = fil_crypt_get_latest_key_version(crypt_data); - } else { - status->current_key_version = 0; - } - } else { - if (srv_encrypt_tables) { - os_event_set(fil_crypt_threads_event); } } - - return crypt_data == NULL ? 
1 : 0; } /********************************************************************* -Return crypt statistics */ +Return crypt statistics +@param[out] stat Crypt statistics */ UNIV_INTERN void fil_crypt_total_stat( -/*=================*/ - fil_crypt_stat_t *stat) /*!< out: Crypt statistics */ + fil_crypt_stat_t *stat) { mutex_enter(&crypt_stat_mutex); *stat = crypt_stat; @@ -2639,21 +2619,24 @@ fil_crypt_total_stat( /********************************************************************* Get scrub status for a space (used by information_schema) -return 0 if data found */ + +@param[in] space Tablespace +@param[out] status Scrub status */ UNIV_INTERN -int +void fil_space_get_scrub_status( -/*=======================*/ - ulint id, /*!< in: space id */ - struct fil_space_scrub_status_t* status) /*!< out: status */ + const fil_space_t* space, + struct fil_space_scrub_status_t* status) { - fil_space_crypt_t* crypt_data = fil_space_get_crypt_data(id); - memset(status, 0, sizeof(*status)); + ut_ad(space->n_pending_ops > 0); + fil_space_crypt_t* crypt_data = space->crypt_data; + + status->space = space->id; + if (crypt_data != NULL) { - status->space = id; - status->compressed = fil_space_get_zip_size(id) > 0; + status->compressed = fsp_flags_get_zip_size(space->flags) > 0; mutex_enter(&crypt_data->mutex); status->last_scrub_completed = crypt_data->rotate_state.scrubbing.last_scrub_completed; @@ -2668,12 +2651,8 @@ fil_space_get_scrub_status( crypt_data->rotate_state.next_offset; status->current_scrub_max_page_number = crypt_data->rotate_state.max_offset; - } else { - status->scrubbing = false; } mutex_exit(&crypt_data->mutex); } - - return crypt_data == NULL ? 
1 : 0; } diff --git a/storage/xtradb/fil/fil0fil.cc b/storage/xtradb/fil/fil0fil.cc index f4301d47028..a116bfad99d 100644 --- a/storage/xtradb/fil/fil0fil.cc +++ b/storage/xtradb/fil/fil0fil.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2013, 2017, MariaDB Corporation. +Copyright (c) 2014, 2017, MariaDB Corporation. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -157,7 +157,11 @@ UNIV_INTERN mysql_pfs_key_t fil_space_latch_key; /** The tablespace memory cache. This variable is NULL before the module is initialized. */ -fil_system_t* fil_system = NULL; +UNIV_INTERN fil_system_t* fil_system = NULL; + +/** At this age or older a space/page will be rotated */ +UNIV_INTERN extern uint srv_fil_crypt_rotate_key_age; +UNIV_INTERN extern ib_mutex_t fil_crypt_threads_mutex; /** Determine if (i) is a user tablespace id or not. 
*/ # define fil_is_user_tablespace_id(i) ((i) > srv_undo_tablespaces_open) @@ -169,7 +173,7 @@ fil_system_t* fil_system = NULL; && srv_unix_file_flush_method == SRV_UNIX_O_DIRECT_NO_FSYNC)\ || ((s)->purpose == FIL_LOG \ && srv_unix_file_flush_method == SRV_UNIX_ALL_O_DIRECT)) - + #else /* __WIN__ */ # define fil_buffering_disabled(s) (0) #endif /* __WIN__ */ @@ -601,7 +605,6 @@ fil_node_open_file( ibool success; byte* buf2; byte* page; - ulint page_size; ut_ad(mutex_own(&(system->mutex))); ut_a(node->n_pending == 0); @@ -619,6 +622,7 @@ fil_node_open_file( node->handle = os_file_create_simple_no_error_handling( innodb_file_data_key, node->name, OS_FILE_OPEN, OS_FILE_READ_ONLY, &success, 0); + if (!success) { /* The following call prints an error message */ os_file_get_last_error(true); @@ -670,6 +674,16 @@ fil_node_open_file( const ulint space_id = fsp_header_get_space_id(page); ulint flags = fsp_header_get_flags(page); + /* Try to read crypt_data from page 0 if it is not yet + read. */ + if (!node->space->page_0_crypt_read) { + ulint offset = fsp_header_get_crypt_offset( + fsp_flags_get_zip_size(flags)); + ut_ad(node->space->crypt_data == NULL); + node->space->crypt_data = fil_space_read_crypt_data(space_id, page, offset); + node->space->page_0_crypt_read = true; + } + ut_free(buf2); os_file_close(node->handle); @@ -687,8 +701,6 @@ fil_node_open_file( flags = cflags; } - page_size = fsp_flags_get_page_size(flags); - if (UNIV_UNLIKELY(space_id != space->id)) { ib_logf(IB_LOG_LEVEL_ERROR, "tablespace id is " ULINTPF " in the data dictionary" @@ -697,17 +709,10 @@ fil_node_open_file( return(false); } - if (size_bytes >= (1024*1024)) { - /* Truncate the size to whole extent size. 
*/ - size_bytes = ut_2pow_round(size_bytes, (1024*1024)); - } - - if (!fsp_flags_is_compressed(flags)) { - node->size = (ulint) (size_bytes / UNIV_PAGE_SIZE); + if (ulint zip_size = fsp_flags_get_zip_size(flags)) { + node->size = ulint(size_bytes / zip_size); } else { - node->size = (ulint) - (size_bytes - / fsp_flags_get_zip_size(flags)); + node->size = ulint(size_bytes / UNIV_PAGE_SIZE); } #ifdef UNIV_HOTBACKUP @@ -1041,8 +1046,8 @@ fil_space_extend_must_retry( we have set the node->being_extended flag. */ mutex_exit(&fil_system->mutex); - ulint start_page_no = space->size; - ulint file_start_page_no = start_page_no - node->size; + ulint start_page_no = space->size; + const ulint file_start_page_no = start_page_no - node->size; /* Determine correct file block size */ if (node->file_block_size == 0) { @@ -1052,64 +1057,126 @@ fil_space_extend_must_retry( } ulint page_size = fsp_flags_get_zip_size(space->flags); - ulint pages_added = 0; - if (!page_size) { page_size = UNIV_PAGE_SIZE; } -#ifdef HAVE_POSIX_FALLOCATE +#ifdef _WIN32 + const ulint io_completion_type = OS_FILE_READ; + /* Logically or physically extend the file with zero bytes, + depending on whether it is sparse. */ + + /* FIXME: Call DeviceIoControl(node->handle, FSCTL_SET_SPARSE, ...) + when opening a file when FSP_FLAGS_HAS_PAGE_COMPRESSION(). */ + { + FILE_END_OF_FILE_INFO feof; + /* fil_read_first_page() expects UNIV_PAGE_SIZE bytes. + fil_node_open_file() expects at least 4 * UNIV_PAGE_SIZE bytes. + Do not shrink short ROW_FORMAT=COMPRESSED files. 
*/ + feof.EndOfFile.QuadPart = std::max( + os_offset_t(size - file_start_page_no) * page_size, + os_offset_t(FIL_IBD_FILE_INITIAL_SIZE + * UNIV_PAGE_SIZE)); + *success = SetFileInformationByHandle(node->handle, + FileEndOfFileInfo, + &feof, sizeof feof); + if (!*success) { + ib_logf(IB_LOG_LEVEL_ERROR, "extending file %s" + " from " INT64PF + " to " INT64PF " bytes failed with %u", + node->name, + os_offset_t(node->size) * page_size, + feof.EndOfFile.QuadPart, GetLastError()); + } else { + start_page_no = size; + } + } +#else + /* We will logically extend the file with ftruncate() if + page_compression is enabled, because the file is expected to + be sparse in that case. Make sure that ftruncate() can deal + with large files. */ + const bool is_sparse = sizeof(off_t) >= 8 + && FSP_FLAGS_HAS_PAGE_COMPRESSION(space->flags); + +# ifdef HAVE_POSIX_FALLOCATE /* We must complete the I/O request after invoking posix_fallocate() to avoid an assertion failure at shutdown. Because no actual writes were dispatched, a read operation will suffice. */ const ulint io_completion_type = srv_use_posix_fallocate - ? OS_FILE_READ : OS_FILE_WRITE; + || is_sparse ? OS_FILE_READ : OS_FILE_WRITE; + + if (srv_use_posix_fallocate && !is_sparse) { + const os_offset_t start_offset + = os_offset_t(start_page_no - file_start_page_no) + * page_size; + const ulint n_pages = size - start_page_no; + const os_offset_t len = os_offset_t(n_pages) * page_size; - if (srv_use_posix_fallocate) { - const os_offset_t start_offset = static_cast<os_offset_t>( - start_page_no) * page_size; - const os_offset_t len = static_cast<os_offset_t>( - pages_added) * page_size; + int err; + do { + err = posix_fallocate(node->handle, start_offset, len); + } while (err == EINTR + && srv_shutdown_state == SRV_SHUTDOWN_NONE); - *success = !posix_fallocate(node->handle, start_offset, len); + *success = !err; if (!*success) { - ib_logf(IB_LOG_LEVEL_ERROR, "preallocating file " - "space for file \'%s\' failed. 
Current size " - INT64PF ", desired size " INT64PF, - node->name, start_offset, len+start_offset); - os_file_handle_error_no_exit( - node->name, "posix_fallocate", - FALSE, __FILE__, __LINE__); + ib_logf(IB_LOG_LEVEL_ERROR, "extending file %s" + " from " INT64PF " to " INT64PF " bytes" + " failed with error %d", + node->name, start_offset, len + start_offset, + err); } DBUG_EXECUTE_IF("ib_os_aio_func_io_failure_28", - *success = FALSE; errno = 28; + *success = FALSE; os_has_said_disk_full = TRUE;); if (*success) { os_has_said_disk_full = FALSE; - } else { - pages_added = 0; + start_page_no = size; } } else -#else - const ulint io_completion_type = OS_FILE_WRITE; -#endif - { - byte* buf2; - byte* buf; - ulint buf_size; - +# else + const ulint io_completion_type = is_sparse + ? OS_FILE_READ : OS_FILE_WRITE; +# endif + if (is_sparse) { + /* fil_read_first_page() expects UNIV_PAGE_SIZE bytes. + fil_node_open_file() expects at least 4 * UNIV_PAGE_SIZE bytes. + Do not shrink short ROW_FORMAT=COMPRESSED files. 
*/ + off_t s = std::max(off_t(size - file_start_page_no) + * off_t(page_size), + off_t(FIL_IBD_FILE_INITIAL_SIZE + * UNIV_PAGE_SIZE)); + *success = !ftruncate(node->handle, s); + if (!*success) { + ib_logf(IB_LOG_LEVEL_ERROR, "ftruncate of file %s" + " from " INT64PF " to " INT64PF " bytes" + " failed with error %d", + node->name, + os_offset_t(start_page_no - file_start_page_no) + * page_size, os_offset_t(s), errno); + } else { + start_page_no = size; + } + } else { /* Extend at most 64 pages at a time */ - buf_size = ut_min(64, size - start_page_no) + ulint buf_size = ut_min(64, size - start_page_no) * page_size; - buf2 = static_cast<byte*>(mem_alloc(buf_size + page_size)); - buf = static_cast<byte*>(ut_align(buf2, page_size)); - - memset(buf, 0, buf_size); - - while (start_page_no < size) { + byte* buf2 = static_cast<byte*>( + calloc(1, buf_size + page_size)); + *success = buf2 != NULL; + if (!buf2) { + ib_logf(IB_LOG_LEVEL_ERROR, "Cannot allocate " ULINTPF + " bytes to extend file", + buf_size + page_size); + } + byte* const buf = static_cast<byte*>( + ut_align(buf2, page_size)); + + while (*success && start_page_no < size) { ulint n_pages = ut_min(buf_size / page_size, size - start_page_no); @@ -1118,50 +1185,40 @@ fil_space_extend_must_retry( start_page_no - file_start_page_no) * page_size; - const char* name = node->name == NULL - ? 
space->name : node->name; - *success = os_aio(OS_FILE_WRITE, 0, OS_AIO_SYNC, - name, node->handle, buf, + node->name, node->handle, buf, offset, page_size * n_pages, page_size, node, NULL, space->id, NULL, 0); DBUG_EXECUTE_IF("ib_os_aio_func_io_failure_28", - *success = FALSE; errno = 28; + *success = FALSE; os_has_said_disk_full = TRUE;); if (*success) { os_has_said_disk_full = FALSE; - } else { - /* Let us measure the size of the file - to determine how much we were able to - extend it */ - os_offset_t size; - - size = os_file_get_size(node->handle); - ut_a(size != (os_offset_t) -1); - - n_pages = ((ulint) (size / page_size)) - - node->size - pages_added; - - pages_added += n_pages; - break; } + /* Let us measure the size of the file + to determine how much we were able to + extend it */ + os_offset_t fsize = os_file_get_size(node->handle); + ut_a(fsize != os_offset_t(-1)); - start_page_no += n_pages; - pages_added += n_pages; + start_page_no = ulint(fsize / page_size) + + file_start_page_no; } - mem_free(buf2); + free(buf2); } - +#endif mutex_enter(&fil_system->mutex); ut_a(node->being_extended); + ut_a(start_page_no - file_start_page_no >= node->size); - space->size += pages_added; - node->size += pages_added; + ulint file_size = start_page_no - file_start_page_no; + space->size += file_size - node->size; + node->size = file_size; fil_node_complete_io(node, fil_system, io_completion_type); @@ -1449,17 +1506,24 @@ fil_space_contains_node( /*******************************************************************//** Creates a space memory object and puts it to the 'fil system' hash table. If there is an error, prints an error message to the .err log. 
+@param[in] name Space name +@param[in] id Space id +@param[in] flags Tablespace flags +@param[in] purpose FIL_TABLESPACE or FIL_LOG if log +@param[in] crypt_data Encryption information +@param[in] create_table True if this is create table +@param[in] mode Encryption mode @return TRUE if success */ UNIV_INTERN -ibool +bool fil_space_create( -/*=============*/ - const char* name, /*!< in: space name */ - ulint id, /*!< in: space id */ - ulint flags, /*!< in: tablespace flags */ - ulint purpose,/*!< in: FIL_TABLESPACE, or FIL_LOG if log */ - fil_space_crypt_t* crypt_data, /*!< in: crypt data */ - bool create_table) /*!< in: true if create table */ + const char* name, + ulint id, + ulint flags, + ulint purpose, + fil_space_crypt_t* crypt_data, + bool create_table, + fil_encryption_t mode) { fil_space_t* space; @@ -1483,7 +1547,7 @@ fil_space_create( mutex_exit(&fil_system->mutex); - return(FALSE); + return(false); } ib_logf(IB_LOG_LEVEL_WARN, @@ -1510,7 +1574,7 @@ fil_space_create( mutex_exit(&fil_system->mutex); - return(FALSE); + return(false); } space = static_cast<fil_space_t*>(mem_zalloc(sizeof(*space))); @@ -1541,17 +1605,6 @@ fil_space_create( space->flags = flags; space->magic_n = FIL_SPACE_MAGIC_N; - space->printed_compression_failure = false; - - rw_lock_create(fil_space_latch_key, &space->latch, SYNC_FSP); - - HASH_INSERT(fil_space_t, hash, fil_system->spaces, id, space); - - HASH_INSERT(fil_space_t, name_hash, fil_system->name_hash, - ut_fold_string(name), space); - space->is_in_unflushed_spaces = false; - - space->is_corrupt = FALSE; space->crypt_data = crypt_data; /* In create table we write page 0 so we have already @@ -1570,11 +1623,33 @@ fil_space_create( space->crypt_data ? 
space->crypt_data->encryption : 0); #endif + rw_lock_create(fil_space_latch_key, &space->latch, SYNC_FSP); + + HASH_INSERT(fil_space_t, hash, fil_system->spaces, id, space); + + HASH_INSERT(fil_space_t, name_hash, fil_system->name_hash, + ut_fold_string(name), space); + UT_LIST_ADD_LAST(space_list, fil_system->space_list, space); - mutex_exit(&fil_system->mutex); + /* Inform key rotation that there could be something + to do */ + if (purpose == FIL_TABLESPACE && !srv_fil_crypt_rotate_key_age && fil_crypt_threads_event && + (mode == FIL_ENCRYPTION_ON || mode == FIL_ENCRYPTION_OFF || + srv_encrypt_tables)) { + /* Key rotation is not enabled, need to inform background + encryption threads. */ + UT_LIST_ADD_LAST(rotation_list, fil_system->rotation_list, space); + space->is_in_rotation_list = true; + mutex_exit(&fil_system->mutex); + mutex_enter(&fil_crypt_threads_mutex); + os_event_set(fil_crypt_threads_event); + mutex_exit(&fil_crypt_threads_mutex); + } else { + mutex_exit(&fil_system->mutex); + } - return(TRUE); + return(true); } /*******************************************************************//** @@ -1686,6 +1761,12 @@ fil_space_free( space); } + if (space->is_in_rotation_list) { + space->is_in_rotation_list = false; + ut_a(UT_LIST_GET_LEN(fil_system->rotation_list) > 0); + UT_LIST_REMOVE(rotation_list, fil_system->rotation_list, space); + } + UT_LIST_REMOVE(space_list, fil_system->space_list, space); ut_a(space->magic_n == FIL_SPACE_MAGIC_N); @@ -2309,7 +2390,7 @@ fil_check_first_page(const page_t* page, ulint space_id, ulint flags) } if (buf_page_is_corrupted( - false, page, fsp_flags_get_zip_size(flags))) { + false, page, fsp_flags_get_zip_size(flags), NULL)) { return("checksum mismatch"); } @@ -2348,7 +2429,6 @@ fil_read_first_page( const char* check_msg = NULL; fil_space_crypt_t* cdata; - buf = static_cast<byte*>(ut_malloc(2 * UNIV_PAGE_SIZE)); /* Align the memory for a possible read from a raw device */ @@ -2388,7 +2468,7 @@ fil_read_first_page( ulint 
space = fsp_header_get_space_id(page); ulint offset = fsp_header_get_crypt_offset( - fsp_flags_get_zip_size(*flags), NULL); + fsp_flags_get_zip_size(*flags)); cdata = fil_space_read_crypt_data(space, page, offset); @@ -2767,7 +2847,7 @@ fil_op_log_parse_or_replay( space_id, name, path, flags, DICT_TF2_USE_TABLESPACE, FIL_IBD_FILE_INITIAL_SIZE, - FIL_SPACE_ENCRYPTION_DEFAULT, + FIL_ENCRYPTION_DEFAULT, FIL_DEFAULT_ENCRYPTION_KEY) != DB_SUCCESS) { ut_error; } @@ -2891,16 +2971,27 @@ fil_check_pending_operations( *space = 0; - /* Wait for crypt threads to stop accessing space */ - fil_space_crypt_close_tablespace(id); - mutex_enter(&fil_system->mutex); fil_space_t* sp = fil_space_get_by_id(id); + if (sp) { sp->stop_new_ops = TRUE; + /* space could be freed by other threads as soon + as n_pending_ops reaches 0, thus increment pending + ops here. */ + sp->n_pending_ops++; } + mutex_exit(&fil_system->mutex); + /* Wait for crypt threads to stop accessing space */ + if (sp) { + fil_space_crypt_close_tablespace(sp); + /* We have "acquired" this space and must + free it now as below we compare n_pending_ops. */ + fil_space_release(sp); + } + /* Check for pending change buffer merges. */ do { @@ -3852,7 +3943,23 @@ fil_create_new_single_table_tablespace( goto error_exit_3; } - ret = os_file_set_size(path, file, size * UNIV_PAGE_SIZE); + { + /* fil_read_first_page() expects UNIV_PAGE_SIZE bytes. + fil_node_open_file() expects at least 4 * UNIV_PAGE_SIZE bytes. + Do not create too short ROW_FORMAT=COMPRESSED files. */ + const ulint zip_size = fsp_flags_get_zip_size(flags); + const ulint page_size = zip_size ? zip_size : UNIV_PAGE_SIZE; + const os_offset_t fsize = std::max( + os_offset_t(size) * page_size, + os_offset_t(FIL_IBD_FILE_INITIAL_SIZE + * UNIV_PAGE_SIZE)); + /* ROW_FORMAT=COMPRESSED files never use page_compression + (are never sparse). 
*/ + ut_ad(!zip_size || !FSP_FLAGS_HAS_PAGE_COMPRESSION(flags)); + + ret = os_file_set_size(path, file, fsize, + FSP_FLAGS_HAS_PAGE_COMPRESSION(flags)); + } if (!ret) { err = DB_OUT_OF_FILE_SPACE; @@ -3880,14 +3987,8 @@ fil_create_new_single_table_tablespace( fsp_header_init_fields(page, space_id, flags); mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, space_id); - if (!(fsp_flags_is_compressed(flags))) { - buf_flush_init_for_writing(page, NULL, 0); - ret = os_file_write(path, file, page, 0, UNIV_PAGE_SIZE); - } else { + if (const ulint zip_size = fsp_flags_get_zip_size(flags)) { page_zip_des_t page_zip; - ulint zip_size; - - zip_size = fsp_flags_get_zip_size(flags); page_zip_set_size(&page_zip, zip_size); page_zip.data = page + UNIV_PAGE_SIZE; @@ -3898,6 +3999,9 @@ fil_create_new_single_table_tablespace( page_zip.n_blobs = 0; buf_flush_init_for_writing(page, &page_zip, 0); ret = os_file_write(path, file, page_zip.data, 0, zip_size); + } else { + buf_flush_init_for_writing(page, NULL, 0); + ret = os_file_write(path, file, page, 0, UNIV_PAGE_SIZE); } ut_free(buf2); @@ -3930,13 +4034,13 @@ fil_create_new_single_table_tablespace( /* Create crypt data if the tablespace is either encrypted or user has requested it to remain unencrypted. */ - if (mode == FIL_SPACE_ENCRYPTION_ON || mode == FIL_SPACE_ENCRYPTION_OFF || + if (mode == FIL_ENCRYPTION_ON || mode == FIL_ENCRYPTION_OFF || srv_encrypt_tables) { crypt_data = fil_space_create_crypt_data(mode, key_id); } success = fil_space_create(tablename, space_id, flags, FIL_TABLESPACE, - crypt_data, true); + crypt_data, true, mode); if (!success || !fil_node_create(path, size, space_id, FALSE)) { err = DB_ERROR; @@ -4547,13 +4651,13 @@ fil_user_tablespace_find_space_id( to UNIV_PAGE_SIZE. 
*/ if (page_size == UNIV_PAGE_SIZE) { uncompressed_ok = !buf_page_is_corrupted( - false, page, 0); + false, page, 0, NULL); } bool compressed_ok = false; if (page_size <= UNIV_PAGE_SIZE_DEF) { compressed_ok = !buf_page_is_corrupted( - false, page, page_size); + false, page, page_size, NULL); } if (uncompressed_ok || compressed_ok) { @@ -6644,7 +6748,8 @@ fil_iterate( page_type == FIL_PAGE_PAGE_COMPRESSED); /* If tablespace is encrypted, we need to decrypt - the page. */ + the page. Note that tablespaces are not in + fil_system during import. */ if (encrypted) { decrypted = fil_space_decrypt( iter.crypt_data, @@ -6897,8 +7002,11 @@ fil_tablespace_iterate( iter.n_io_buffers = n_io_buffers; iter.page_size = callback.get_page_size(); + /* In MariaDB/MySQL 5.6 tablespace does not exist + during import, therefore we can't use space directly + here. */ ulint crypt_data_offset = fsp_header_get_crypt_offset( - callback.get_zip_size(), 0); + callback.get_zip_size()); /* read (optional) crypt data */ iter.crypt_data = fil_space_read_crypt_data( @@ -6940,7 +7048,7 @@ fil_tablespace_iterate( mem_free(io_buffer); - if (iter.crypt_data != NULL) { + if (crypt_io_buffer != NULL) { mem_free(crypt_io_buffer); iter.crypt_io_buffer = NULL; fil_space_destroy_crypt_data(&iter.crypt_data); @@ -7199,33 +7307,12 @@ fil_space_set_corrupt( space = fil_space_get_by_id(space_id); if (space) { - space->is_corrupt = TRUE; + space->is_corrupt = true; } mutex_exit(&fil_system->mutex); } -/****************************************************************//** -Acquire fil_system mutex */ -void -fil_system_enter(void) -/*==================*/ -{ - ut_ad(!mutex_own(&fil_system->mutex)); - mutex_enter(&fil_system->mutex); -} - -/****************************************************************//** -Release fil_system mutex */ -void -fil_system_exit(void) -/*=================*/ -{ - ut_ad(mutex_own(&fil_system->mutex)); - mutex_exit(&fil_system->mutex); -} - - 
/****************************************************************** Get id of first tablespace or ULINT_UNDEFINED if none */ UNIV_INTERN @@ -7256,36 +7343,6 @@ fil_get_first_space() } /****************************************************************** -Get id of first tablespace that has node or ULINT_UNDEFINED if none */ -UNIV_INTERN -ulint -fil_get_first_space_safe() -/*======================*/ -{ - ulint out_id = ULINT_UNDEFINED; - fil_space_t* space; - - mutex_enter(&fil_system->mutex); - - space = UT_LIST_GET_FIRST(fil_system->space_list); - if (space != NULL) { - do - { - if (!space->stop_new_ops && UT_LIST_GET_LEN(space->chain) > 0) { - out_id = space->id; - break; - } - - space = UT_LIST_GET_NEXT(space_list, space); - } while (space != NULL); - } - - mutex_exit(&fil_system->mutex); - - return out_id; -} - -/****************************************************************** Get id of next tablespace or ULINT_UNDEFINED if none */ UNIV_INTERN ulint @@ -7326,165 +7383,207 @@ fil_get_next_space( return out_id; } -/****************************************************************** -Get id of next tablespace that has node or ULINT_UNDEFINED if none */ -UNIV_INTERN -ulint -fil_get_next_space_safe( -/*====================*/ - ulint id) /*!< in: previous space id */ +/** Acquire a tablespace when it could be dropped concurrently. +Used by background threads that do not necessarily hold proper locks +for concurrency control. 
+@param[in] id tablespace ID +@param[in] silent whether to silently ignore missing tablespaces +@return the tablespace, or NULL if missing or being deleted */ +inline +fil_space_t* +fil_space_acquire_low( + ulint id, + bool silent) { - bool found; - fil_space_t* space; - ulint out_id = ULINT_UNDEFINED; + fil_space_t* space; mutex_enter(&fil_system->mutex); space = fil_space_get_by_id(id); - if (space == NULL) { - /* we didn't find it...search for space with space->id > id */ - found = false; - space = UT_LIST_GET_FIRST(fil_system->space_list); - } else { - /* we found it, take next available space */ - found = true; - } - - while ((space = UT_LIST_GET_NEXT(space_list, space)) != NULL) { - - if (!found && space->id <= id) - continue; - if (!space->stop_new_ops) { - /* inc reference to prevent drop */ - out_id = space->id; - break; + if (space == NULL) { + if (!silent) { + ib_logf(IB_LOG_LEVEL_WARN, "Trying to access missing" + " tablespace " ULINTPF ".", id); + ut_error; } + } else if (space->stop_new_ops) { + space = NULL; + } else { + space->n_pending_ops++; } mutex_exit(&fil_system->mutex); - return out_id; + return(space); } -/****************************************************************** -Get crypt data for a tablespace */ -UNIV_INTERN -fil_space_crypt_t* -fil_space_get_crypt_data( -/*=====================*/ - ulint id) /*!< in: space id */ +/** Acquire a tablespace when it could be dropped concurrently. +Used by background threads that do not necessarily hold proper locks +for concurrency control. +@param[in] id tablespace ID +@return the tablespace, or NULL if missing or being deleted */ +fil_space_t* +fil_space_acquire( + ulint id) { - fil_space_t* space; - fil_space_crypt_t* crypt_data = NULL; + return(fil_space_acquire_low(id, false)); +} - ut_ad(fil_system); +/** Acquire a tablespace that may not exist. +Used by background threads that do not necessarily hold proper locks +for concurrency control. 
+@param[in] id tablespace ID +@return the tablespace, or NULL if missing or being deleted */ +fil_space_t* +fil_space_acquire_silent( + ulint id) +{ + return(fil_space_acquire_low(id, true)); +} +/** Release a tablespace acquired with fil_space_acquire(). +@param[in,out] space tablespace to release */ +void +fil_space_release( + fil_space_t* space) +{ mutex_enter(&fil_system->mutex); + ut_ad(space->magic_n == FIL_SPACE_MAGIC_N); + ut_ad(space->n_pending_ops > 0); + space->n_pending_ops--; + mutex_exit(&fil_system->mutex); +} - space = fil_space_get_by_id(id); +/** Return the next fil_space_t. +Once started, the caller must keep calling this until it returns NULL. +fil_space_acquire() and fil_space_release() are invoked here which +blocks a concurrent operation from dropping the tablespace. +@param[in] prev_space Pointer to the previous fil_space_t. +If NULL, use the first fil_space_t on fil_system->space_list. +@return pointer to the next fil_space_t. +@retval NULL if this was the last*/ +fil_space_t* +fil_space_next( + fil_space_t* prev_space) +{ + fil_space_t* space=prev_space; - mutex_exit(&fil_system->mutex); + mutex_enter(&fil_system->mutex); - if (space != NULL) { - /* If we have not yet read the page0 - of this tablespace we will do it now. 
*/ - if (!space->crypt_data && !space->page_0_crypt_read) { - ulint space_id = space->id; - fil_node_t* node; - - ut_a(space->crypt_data == NULL); - node = UT_LIST_GET_FIRST(space->chain); - - byte *buf = static_cast<byte*>(ut_malloc(2 * UNIV_PAGE_SIZE)); - byte *page = static_cast<byte*>(ut_align(buf, UNIV_PAGE_SIZE)); - fil_read(true, space_id, 0, 0, 0, UNIV_PAGE_SIZE, page, - NULL, NULL); - ulint offset = fsp_header_get_crypt_offset( - fsp_header_get_zip_size(page), NULL); - space->crypt_data = fil_space_read_crypt_data(space_id, page, offset); - ut_free(buf); + if (prev_space == NULL) { + space = UT_LIST_GET_FIRST(fil_system->space_list); -#ifdef UNIV_DEBUG - ib_logf(IB_LOG_LEVEL_INFO, - "Read page 0 from tablespace for space %lu name %s key_id %u encryption %d handle %d.", - space_id, - space->name, - space->crypt_data ? space->crypt_data->key_id : 0, - space->crypt_data ? space->crypt_data->encryption : 0, - node->handle); -#endif + /* We can trust that space is not NULL because at least the + system tablespace is always present and loaded first. */ + space->n_pending_ops++; + } else { + ut_ad(space->n_pending_ops > 0); - ut_a(space->id == space_id); + /* Move on to the next fil_space_t */ + space->n_pending_ops--; + space = UT_LIST_GET_NEXT(space_list, space); - space->page_0_crypt_read = true; + /* Skip spaces that are being created by + fil_ibd_create(), or dropped, or !tablespace. */ + while (space != NULL + && (UT_LIST_GET_LEN(space->chain) == 0 + || space->stop_new_ops + || space->purpose != FIL_TABLESPACE)) { + space = UT_LIST_GET_NEXT(space_list, space); } - crypt_data = space->crypt_data; - - if (!space->page_0_crypt_read) { - ib_logf(IB_LOG_LEVEL_WARN, - "Space %lu name %s contains encryption %d information for key_id %u but page0 is not read.", - space->id, - space->name, - space->crypt_data ? space->crypt_data->encryption : 0, - space->crypt_data ? 
space->crypt_data->key_id : 0); + if (space != NULL) { + space->n_pending_ops++; } } - return(crypt_data); + mutex_exit(&fil_system->mutex); + + return(space); } -/****************************************************************** -Get crypt data for a tablespace */ -UNIV_INTERN -fil_space_crypt_t* -fil_space_set_crypt_data( -/*=====================*/ - ulint id, /*!< in: space id */ - fil_space_crypt_t* crypt_data) /*!< in: crypt data */ +/** +Remove space from key rotation list if there are no more +pending operations. +@param[in] space Tablespace */ +static +void +fil_space_remove_from_keyrotation( + fil_space_t* space) { - fil_space_t* space; - fil_space_crypt_t* free_crypt_data = NULL; - fil_space_crypt_t* ret_crypt_data = NULL; + ut_ad(mutex_own(&fil_system->mutex)); + ut_ad(space); - ut_ad(fil_system); + if (space->n_pending_ops == 0 && space->is_in_rotation_list) { + space->is_in_rotation_list = false; + ut_a(UT_LIST_GET_LEN(fil_system->rotation_list) > 0); + UT_LIST_REMOVE(rotation_list, fil_system->rotation_list, space); + } +} - mutex_enter(&fil_system->mutex); - space = fil_space_get_by_id(id); +/** Return the next fil_space_t from key rotation list. +Once started, the caller must keep calling this until it returns NULL. +fil_space_acquire() and fil_space_release() are invoked here which +blocks a concurrent operation from dropping the tablespace. +@param[in] prev_space Pointer to the previous fil_space_t. +If NULL, use the first fil_space_t on fil_system->space_list. +@return pointer to the next fil_space_t. +@retval NULL if this was the last*/ +fil_space_t* +fil_space_keyrotate_next( + fil_space_t* prev_space) +{ + fil_space_t* space = prev_space; + fil_space_t* old = NULL; - if (space != NULL) { - if (space->crypt_data != NULL) { - /* Here we need to release fil_system mutex to - avoid mutex deadlock assertion. 
Here we would - take mutexes in order fil_system, crypt_data and - in fil_crypt_start_encrypting_space we would - take them in order crypt_data, fil_system - at fil_space_get_flags -> fil_space_get_space */ - mutex_exit(&fil_system->mutex); - fil_space_merge_crypt_data(space->crypt_data, - crypt_data); - ret_crypt_data = space->crypt_data; - free_crypt_data = crypt_data; - } else { - space->crypt_data = crypt_data; - ret_crypt_data = space->crypt_data; - mutex_exit(&fil_system->mutex); + mutex_enter(&fil_system->mutex); + + if (UT_LIST_GET_LEN(fil_system->rotation_list) == 0) { + if (space) { + ut_ad(space->n_pending_ops > 0); + space->n_pending_ops--; + fil_space_remove_from_keyrotation(space); } - } else { - /* there is a small risk that tablespace has been deleted */ - free_crypt_data = crypt_data; mutex_exit(&fil_system->mutex); + return(NULL); } - if (free_crypt_data != NULL) { - /* there was already crypt data present and the new crypt - * data provided as argument to this function has been merged - * into that => free new crypt data - */ - fil_space_destroy_crypt_data(&free_crypt_data); + if (prev_space == NULL) { + space = UT_LIST_GET_FIRST(fil_system->rotation_list); + + /* We can trust that space is not NULL because we + checked list length above */ + } else { + ut_ad(space->n_pending_ops > 0); + + /* Move on to the next fil_space_t */ + space->n_pending_ops--; + + old = space; + space = UT_LIST_GET_NEXT(rotation_list, space); + + fil_space_remove_from_keyrotation(old); } - return ret_crypt_data; + /* Skip spaces that are being created by fil_ibd_create(), + or dropped. Note that rotation_list contains only + space->purpose == FIL_TABLESPACE. 
*/ + while (space != NULL + && (UT_LIST_GET_LEN(space->chain) == 0 + || space->stop_new_ops)) { + + old = space; + space = UT_LIST_GET_NEXT(rotation_list, space); + fil_space_remove_from_keyrotation(old); + } + + if (space != NULL) { + space->n_pending_ops++; + } + + mutex_exit(&fil_system->mutex); + + return(space); } diff --git a/storage/xtradb/fil/fil0pagecompress.cc b/storage/xtradb/fil/fil0pagecompress.cc index fe4f64d88ca..303ab5102fb 100644 --- a/storage/xtradb/fil/fil0pagecompress.cc +++ b/storage/xtradb/fil/fil0pagecompress.cc @@ -366,7 +366,7 @@ fil_compress_page( fil_decompress_page(uncomp_page, comp_page, len, NULL); - if(buf_page_is_corrupted(false, uncomp_page, 0)) { + if(buf_page_is_corrupted(false, uncomp_page, 0, space)) { buf_page_print(uncomp_page, 0, BUF_PAGE_PRINT_NO_CRASH); ut_error; } diff --git a/storage/xtradb/fsp/fsp0fsp.cc b/storage/xtradb/fsp/fsp0fsp.cc index c32fddaabbe..934824c6462 100644 --- a/storage/xtradb/fsp/fsp0fsp.cc +++ b/storage/xtradb/fsp/fsp0fsp.cc @@ -133,7 +133,7 @@ fsp_fill_free_list( ulint space, /*!< in: space */ fsp_header_t* header, /*!< in/out: space header */ mtr_t* mtr) /*!< in/out: mini-transaction */ - UNIV_COLD MY_ATTRIBUTE((nonnull)); + UNIV_COLD; /**********************************************************************//** Allocates a single free page from a segment. This function implements the intelligent allocation strategy which tries to minimize file space @@ -162,7 +162,7 @@ fseg_alloc_free_page_low( in which the page should be initialized. If init_mtr!=mtr, but the page is already latched in mtr, do not initialize the page. 
*/ - MY_ATTRIBUTE((warn_unused_result, nonnull)); + MY_ATTRIBUTE((warn_unused_result)); #endif /* !UNIV_HOTBACKUP */ /**********************************************************************//** @@ -680,7 +680,7 @@ UNIV_INTERN void fsp_header_init( /*============*/ - ulint space, /*!< in: space id */ + ulint space_id, /*!< in: space id */ ulint size, /*!< in: current size in blocks */ mtr_t* mtr) /*!< in/out: mini-transaction */ { @@ -692,11 +692,11 @@ fsp_header_init( ut_ad(mtr); - mtr_x_lock(fil_space_get_latch(space, &flags), mtr); + mtr_x_lock(fil_space_get_latch(space_id, &flags), mtr); zip_size = fsp_flags_get_zip_size(flags); - block = buf_page_create(space, 0, zip_size, mtr); - buf_page_get(space, zip_size, 0, RW_X_LATCH, mtr); + block = buf_page_create(space_id, 0, zip_size, mtr); + buf_page_get(space_id, zip_size, 0, RW_X_LATCH, mtr); buf_block_dbg_add_level(block, SYNC_FSP_PAGE); /* The prior contents of the file page should be ignored */ @@ -709,7 +709,7 @@ fsp_header_init( header = FSP_HEADER_OFFSET + page; - mlog_write_ulint(header + FSP_SPACE_ID, space, MLOG_4BYTES, mtr); + mlog_write_ulint(header + FSP_SPACE_ID, space_id, MLOG_4BYTES, mtr); mlog_write_ulint(header + FSP_NOT_USED, 0, MLOG_4BYTES, mtr); mlog_write_ulint(header + FSP_SIZE, size, MLOG_4BYTES, mtr); @@ -725,18 +725,23 @@ fsp_header_init( flst_init(header + FSP_SEG_INODES_FREE, mtr); mlog_write_ull(header + FSP_SEG_ID, 1, mtr); - if (space == 0) { - fsp_fill_free_list(FALSE, space, header, mtr); + if (space_id == 0) { + fsp_fill_free_list(FALSE, space_id, header, mtr); btr_create(DICT_CLUSTERED | DICT_UNIVERSAL | DICT_IBUF, - 0, 0, DICT_IBUF_ID_MIN + space, + 0, 0, DICT_IBUF_ID_MIN + space_id, dict_ind_redundant, mtr); } else { - fsp_fill_free_list(TRUE, space, header, mtr); + fsp_fill_free_list(TRUE, space_id, header, mtr); + } + + fil_space_t* space = fil_space_acquire(space_id); + ut_ad(space); + + if (space->crypt_data) { + space->crypt_data->write_page0(page, mtr); } - ulint maxsize = 
0; - ulint offset = fsp_header_get_crypt_offset(zip_size, &maxsize); - fil_space_write_crypt_data(space, page, offset, maxsize, mtr); + fil_space_release(space); } #endif /* !UNIV_HOTBACKUP */ @@ -1074,8 +1079,6 @@ fsp_fill_free_list( ulint i; mtr_t ibuf_mtr; - ut_ad(header != NULL); - ut_ad(mtr != NULL); ut_ad(page_offset(header) == FSP_HEADER_OFFSET); /* Check if we can fill free list from above the free list limit */ @@ -1338,7 +1341,7 @@ Allocates a single free page from a space. The page is marked as used. @retval block, rw_lock_x_lock_count(&block->lock) == 1 if allocation succeeded (init_mtr == mtr, or the page was not previously freed in mtr) @retval block (not allocated or initialized) otherwise */ -static MY_ATTRIBUTE((nonnull, warn_unused_result)) +static MY_ATTRIBUTE((warn_unused_result)) buf_block_t* fsp_alloc_free_page( /*================*/ @@ -1358,9 +1361,6 @@ fsp_alloc_free_page( ulint page_no; ulint space_size; - ut_ad(mtr); - ut_ad(init_mtr); - header = fsp_get_space_header(space, zip_size, mtr); /* Get the hinted descriptor */ @@ -2379,7 +2379,6 @@ fseg_alloc_free_page_low( ibool success; ulint n; - ut_ad(mtr); ut_ad((direction >= FSP_UP) && (direction <= FSP_NO_DIR)); ut_ad(mach_read_from_4(seg_inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE); @@ -2817,6 +2816,7 @@ try_again: } } else { ut_a(alloc_type == FSP_CLEANING); + reserve = 0; } success = fil_space_reserve_free_extents(space, n_free, n_ext); @@ -4154,12 +4154,11 @@ fsp_print( /**********************************************************************//** Compute offset after xdes where crypt data can be stored +@param[in] zip_size Compressed size or 0 @return offset */ ulint fsp_header_get_crypt_offset( -/*========================*/ - ulint zip_size, /*!< in: zip_size */ - ulint* max_size) /*!< out: free space available for crypt data */ + const ulint zip_size) { ulint pageno = 0; /* compute first page_no that will have xdes stored on page != 0*/ @@ -4174,12 +4173,6 @@ 
fsp_header_get_crypt_offset( ulint iv_offset = XDES_ARR_OFFSET + XDES_SIZE * (1 + xdes_calc_descriptor_index(zip_size, pageno)); - if (max_size != NULL) { - /* return how much free space there is available on page */ - *max_size = (zip_size ? zip_size : UNIV_PAGE_SIZE) - - (FSP_HEADER_OFFSET + iv_offset + FIL_PAGE_DATA_END); - } - return FSP_HEADER_OFFSET + iv_offset; } diff --git a/storage/xtradb/fts/fts0fts.cc b/storage/xtradb/fts/fts0fts.cc index a9c4d175715..e1a95bcd427 100644 --- a/storage/xtradb/fts/fts0fts.cc +++ b/storage/xtradb/fts/fts0fts.cc @@ -1989,7 +1989,7 @@ fts_create_one_index_table( dict_mem_table_add_col(new_table, heap, "ilist", DATA_BLOB, 4130048, 0); - error = row_create_table_for_mysql(new_table, trx, false, FIL_SPACE_ENCRYPTION_DEFAULT, FIL_DEFAULT_ENCRYPTION_KEY); + error = row_create_table_for_mysql(new_table, trx, false, FIL_ENCRYPTION_DEFAULT, FIL_DEFAULT_ENCRYPTION_KEY); if (error != DB_SUCCESS) { trx->error_state = error; diff --git a/storage/xtradb/fts/fts0opt.cc b/storage/xtradb/fts/fts0opt.cc index ed882d33548..cb30122adcb 100644 --- a/storage/xtradb/fts/fts0opt.cc +++ b/storage/xtradb/fts/fts0opt.cc @@ -579,9 +579,6 @@ fts_zip_read_word( fts_zip_t* zip, /*!< in: Zip state + data */ fts_string_t* word) /*!< out: uncompressed word */ { -#ifdef UNIV_DEBUG - ulint i; -#endif short len = 0; void* null = NULL; byte* ptr = word->f_str; @@ -656,10 +653,9 @@ fts_zip_read_word( } } -#ifdef UNIV_DEBUG /* All blocks must be freed at end of inflate. */ if (zip->status != Z_OK) { - for (i = 0; i < ib_vector_size(zip->blocks); ++i) { + for (ulint i = 0; i < ib_vector_size(zip->blocks); ++i) { if (ib_vector_getp(zip->blocks, i)) { ut_free(ib_vector_getp(zip->blocks, i)); ib_vector_set(zip->blocks, i, &null); @@ -670,7 +666,6 @@ fts_zip_read_word( if (ptr != NULL) { ut_ad(word->f_len == strlen((char*) ptr)); } -#endif /* UNIV_DEBUG */ return(zip->status == Z_OK || zip->status == Z_STREAM_END ? 
ptr : NULL); } diff --git a/storage/xtradb/handler/ha_innodb.cc b/storage/xtradb/handler/ha_innodb.cc index 00f13172db1..742c03b5404 100644 --- a/storage/xtradb/handler/ha_innodb.cc +++ b/storage/xtradb/handler/ha_innodb.cc @@ -1295,6 +1295,9 @@ static SHOW_VAR innodb_status_variables[]= { {"encryption_rotation_estimated_iops", (char*) &export_vars.innodb_encryption_rotation_estimated_iops, SHOW_LONG}, + {"encryption_key_rotation_list_length", + (char*)&export_vars.innodb_key_rotation_list_length, + SHOW_LONGLONG}, /* Scrubing feature */ {"scrub_background_page_reorganizations", @@ -1748,6 +1751,7 @@ static bool innobase_purge_archive_logs( } #endif + /*************************************************************//** Check for a valid value of innobase_commit_concurrency. @return 0 for valid innodb_commit_concurrency */ @@ -1982,6 +1986,15 @@ thd_has_edited_nontrans_tables( return((ibool) thd_non_transactional_update(thd)); } +/* Return high resolution timestamp for the start of the current query */ +UNIV_INTERN +unsigned long long +thd_query_start_micro( + const THD* thd) /*!< in: thread handle */ +{ + return thd_start_utime(thd); +} + /******************************************************************//** Returns true if the thread is executing a SELECT statement. @return true if thd is executing SELECT */ @@ -4348,14 +4361,9 @@ innobase_change_buffering_inited_ok: innobase_commit_concurrency_init_default(); -#ifndef EXTENDED_FOR_KILLIDLE - srv_kill_idle_transaction = 0; -#endif - #ifdef HAVE_POSIX_FALLOCATE srv_use_posix_fallocate = (ibool) innobase_use_fallocate; #endif - /* Do not enable backoff algorithm for small buffer pool. 
*/ if (!innodb_empty_free_list_algorithm_backoff_allowed( static_cast<srv_empty_free_list_t>( @@ -12392,7 +12400,7 @@ ha_innobase::check_table_options( atomic_writes_t awrites = (atomic_writes_t)options->atomic_writes; fil_encryption_t encrypt = (fil_encryption_t)options->encryption; - if (encrypt != FIL_SPACE_ENCRYPTION_DEFAULT && !use_tablespace) { + if (encrypt != FIL_ENCRYPTION_DEFAULT && !use_tablespace) { push_warning( thd, Sql_condition::WARN_LEVEL_WARN, HA_WRONG_CREATE_OPTION, @@ -12400,7 +12408,7 @@ ha_innobase::check_table_options( return "ENCRYPTED"; } - if (encrypt == FIL_SPACE_ENCRYPTION_OFF && srv_encrypt_tables == 2) { + if (encrypt == FIL_ENCRYPTION_OFF && srv_encrypt_tables == 2) { push_warning( thd, Sql_condition::WARN_LEVEL_WARN, HA_WRONG_CREATE_OPTION, @@ -12481,8 +12489,8 @@ ha_innobase::check_table_options( } /* If encryption is set up make sure that used key_id is found */ - if (encrypt == FIL_SPACE_ENCRYPTION_ON || - (encrypt == FIL_SPACE_ENCRYPTION_DEFAULT && srv_encrypt_tables)) { + if (encrypt == FIL_ENCRYPTION_ON || + (encrypt == FIL_ENCRYPTION_DEFAULT && srv_encrypt_tables)) { if (!encryption_key_id_exists((unsigned int)options->encryption_key_id)) { push_warning_printf( thd, Sql_condition::WARN_LEVEL_WARN, @@ -12496,7 +12504,7 @@ ha_innobase::check_table_options( } /* Ignore nondefault key_id if encryption is set off */ - if (encrypt == FIL_SPACE_ENCRYPTION_OFF && + if (encrypt == FIL_ENCRYPTION_OFF && options->encryption_key_id != THDVAR(thd, default_encryption_key_id)) { push_warning_printf( thd, Sql_condition::WARN_LEVEL_WARN, @@ -12509,7 +12517,7 @@ ha_innobase::check_table_options( /* If default encryption is used make sure that used kay is found from key file. 
*/ - if (encrypt == FIL_SPACE_ENCRYPTION_DEFAULT && + if (encrypt == FIL_ENCRYPTION_DEFAULT && !srv_encrypt_tables && options->encryption_key_id != FIL_DEFAULT_ENCRYPTION_KEY) { if (!encryption_key_id_exists((unsigned int)options->encryption_key_id)) { @@ -14101,9 +14109,13 @@ ha_innobase::info_low( /* If this table is already queued for background analyze, remove it from the queue as we are about to do the same */ - dict_mutex_enter_for_mysql(); - dict_stats_recalc_pool_del(ib_table); - dict_mutex_exit_for_mysql(); + if (!srv_read_only_mode) { + + dict_mutex_enter_for_mysql(); + dict_stats_recalc_pool_del( + ib_table); + dict_mutex_exit_for_mysql(); + } opt = DICT_STATS_RECALC_PERSISTENT; } else { @@ -16642,6 +16654,37 @@ ha_innobase::get_auto_increment( ulonglong col_max_value = innobase_get_int_col_max_value( table->next_number_field); + /** The following logic is needed to avoid duplicate key error + for autoincrement column. + + (1) InnoDB gives the current autoincrement value with respect + to increment and offset value. + + (2) Basically it does compute_next_insert_id() logic inside InnoDB + to avoid the current auto increment value changed by handler layer. + + (3) It is restricted only for insert operations. */ + + if (increment > 1 && thd_sql_command(user_thd) != SQLCOM_ALTER_TABLE + && autoinc < col_max_value) { + + ulonglong prev_auto_inc = autoinc; + + autoinc = ((autoinc - 1) + increment - offset)/ increment; + + autoinc = autoinc * increment + offset; + + /* If autoinc exceeds the col_max_value then reset + to old autoinc value. Because in case of non-strict + sql mode, boundary value is not considered as error. */ + + if (autoinc >= col_max_value) { + autoinc = prev_auto_inc; + } + + ut_ad(autoinc > 0); + } + /* Called for the first time ? 
*/ if (trx->n_autoinc_rows == 0) { @@ -19154,32 +19197,6 @@ innobase_fts_retrieve_ranking( } /*********************************************************************** -functions for kill session of idle transaction */ -ibool -innobase_thd_is_idle( -/*=================*/ - const void* thd) /*!< in: thread handle (THD*) */ -{ -#ifdef EXTENDED_FOR_KILLIDLE - return(thd_command((const THD*) thd) == COM_SLEEP); -#else - return(FALSE); -#endif -} - -ib_int64_t -innobase_thd_get_start_time( -/*========================*/ - const void* thd) /*!< in: thread handle (THD*) */ -{ -#ifdef EXTENDED_FOR_KILLIDLE - return((ib_int64_t)thd_start_time((const THD*) thd)); -#else - return(0); /*dummy value*/ -#endif -} - -/*********************************************************************** Free the memory for the FTS handler */ UNIV_INTERN void @@ -19198,19 +19215,6 @@ innobase_fts_close_ranking( return; } -UNIV_INTERN -void -innobase_thd_kill( -/*==============*/ - ulong thd_id) -{ -#ifdef EXTENDED_FOR_KILLIDLE - thd_kill(thd_id); -#else - return; -#endif -} - /*********************************************************************** Find and Retrieve the FTS Relevance Ranking result for doc with doc_id of prebuilt->fts_doc_id @@ -19408,16 +19412,6 @@ innobase_fts_retrieve_docid( } -ulong -innobase_thd_get_thread_id( -/*=======================*/ - const void* thd) -{ - return(thd_get_thread_id((const THD*) thd)); -} - - - /*********************************************************************** Find and retrieve the size of the current result @return number of matching rows */ @@ -19651,19 +19645,21 @@ wsrep_innobase_kill_one_trx( if (!thd) { DBUG_PRINT("wsrep", ("no thd for conflicting lock")); - WSREP_WARN("no THD for trx: %lu", victim_trx->id); + WSREP_WARN("no THD for trx: " TRX_ID_FMT, victim_trx->id); DBUG_RETURN(1); } if (!bf_thd) { DBUG_PRINT("wsrep", ("no BF thd for conflicting lock")); - WSREP_WARN("no BF THD for trx: %lu", (bf_trx) ? 
bf_trx->id : 0); + WSREP_WARN("no BF THD for trx: " TRX_ID_FMT, + bf_trx ? bf_trx->id : 0); DBUG_RETURN(1); } WSREP_LOG_CONFLICT(bf_thd, thd, TRUE); - WSREP_DEBUG("BF kill (%lu, seqno: %lld), victim: (%lu) trx: %lu", + WSREP_DEBUG("BF kill (%lu, seqno: %lld), victim: (%lu) trx: " + TRX_ID_FMT, signal, (long long)bf_seqno, thd_get_thread_id(thd), victim_trx->id); @@ -19683,13 +19679,14 @@ wsrep_innobase_kill_one_trx( if (wsrep_thd_query_state(thd) == QUERY_EXITING) { - WSREP_DEBUG("kill trx EXITING for %lu", victim_trx->id); + WSREP_DEBUG("kill trx EXITING for " TRX_ID_FMT, + victim_trx->id); wsrep_thd_UNLOCK(thd); DBUG_RETURN(0); } if(wsrep_thd_exec_mode(thd) != LOCAL_STATE) { - WSREP_DEBUG("withdraw for BF trx: %lu, state: %d", + WSREP_DEBUG("withdraw for BF trx: " TRX_ID_FMT ", state: %d", victim_trx->id, wsrep_thd_get_conflict_state(thd)); } @@ -19699,7 +19696,7 @@ wsrep_innobase_kill_one_trx( wsrep_thd_set_conflict_state(thd, MUST_ABORT); break; case MUST_ABORT: - WSREP_DEBUG("victim %lu in MUST ABORT state", + WSREP_DEBUG("victim " TRX_ID_FMT " in MUST ABORT state", victim_trx->id); wsrep_thd_UNLOCK(thd); wsrep_thd_awake(thd, signal); @@ -19708,7 +19705,7 @@ wsrep_innobase_kill_one_trx( case ABORTED: case ABORTING: // fall through default: - WSREP_DEBUG("victim %lu in state %d", + WSREP_DEBUG("victim " TRX_ID_FMT " in state %d", victim_trx->id, wsrep_thd_get_conflict_state(thd)); wsrep_thd_UNLOCK(thd); DBUG_RETURN(0); @@ -19721,7 +19718,7 @@ wsrep_innobase_kill_one_trx( WSREP_DEBUG("kill query for: %ld", thd_get_thread_id(thd)); - WSREP_DEBUG("kill trx QUERY_COMMITTING for %lu", + WSREP_DEBUG("kill trx QUERY_COMMITTING for " TRX_ID_FMT, victim_trx->id); if (wsrep_thd_exec_mode(thd) == REPL_RECV) { @@ -19736,7 +19733,8 @@ wsrep_innobase_kill_one_trx( switch (rcode) { case WSREP_WARNING: - WSREP_DEBUG("cancel commit warning: %lu", + WSREP_DEBUG("cancel commit warning: " + TRX_ID_FMT, victim_trx->id); wsrep_thd_UNLOCK(thd); wsrep_thd_awake(thd, signal); @@ 
-19746,7 +19744,8 @@ wsrep_innobase_kill_one_trx( break; default: WSREP_ERROR( - "cancel commit bad exit: %d %lu", + "cancel commit bad exit: %d " + TRX_ID_FMT, rcode, victim_trx->id); /* unable to interrupt, must abort */ @@ -19764,7 +19763,8 @@ wsrep_innobase_kill_one_trx( /* it is possible that victim trx is itself waiting for some * other lock. We need to cancel this waiting */ - WSREP_DEBUG("kill trx QUERY_EXEC for %lu", victim_trx->id); + WSREP_DEBUG("kill trx QUERY_EXEC for " TRX_ID_FMT, + victim_trx->id); victim_trx->lock.was_chosen_as_deadlock_victim= TRUE; if (victim_trx->lock.wait_lock) { @@ -19799,7 +19799,7 @@ wsrep_innobase_kill_one_trx( break; case QUERY_IDLE: { - WSREP_DEBUG("kill IDLE for %lu", victim_trx->id); + WSREP_DEBUG("kill IDLE for " TRX_ID_FMT, victim_trx->id); if (wsrep_thd_exec_mode(thd) == REPL_RECV) { WSREP_DEBUG("kill BF IDLE, seqno: %lld", @@ -20072,6 +20072,12 @@ static MYSQL_SYSVAR_BOOL(doublewrite, innobase_use_doublewrite, "Disable with --skip-innodb-doublewrite.", NULL, NULL, TRUE); +static MYSQL_SYSVAR_BOOL(stats_include_delete_marked, + srv_stats_include_delete_marked, + PLUGIN_VAR_OPCMDARG, + "Scan delete marked records for persistent stat", + NULL, NULL, FALSE); + static MYSQL_SYSVAR_BOOL(use_atomic_writes, innobase_use_atomic_writes, PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, "Prevent partial page writes, via atomic writes (beta). 
" @@ -20782,13 +20788,6 @@ static MYSQL_SYSVAR_ULONG(force_recovery, srv_force_recovery, "Helps to save your data in case the disk image of the database becomes corrupt.", NULL, NULL, 0, 0, 6, 0); -#ifndef DBUG_OFF -static MYSQL_SYSVAR_ULONG(force_recovery_crash, srv_force_recovery_crash, - PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, - "Kills the server during crash recovery.", - NULL, NULL, 0, 0, 10, 0); -#endif /* !DBUG_OFF */ - static MYSQL_SYSVAR_ULONG(page_size, srv_page_size, PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY, "Page size to use for all InnoDB tablespaces.", @@ -21308,10 +21307,11 @@ static MYSQL_SYSVAR_UINT(encryption_rotate_key_age, PLUGIN_VAR_RQCMDARG, "Key rotation - re-encrypt in background " "all pages that were encrypted with a key that " - "many (or more) versions behind", + "many (or more) versions behind. Value 0 indicates " + "that key rotation is disabled.", NULL, innodb_encryption_rotate_key_age_update, - srv_fil_crypt_rotate_key_age, 0, UINT_MAX32, 0); + 1, 0, UINT_MAX32, 0); static MYSQL_SYSVAR_UINT(encryption_rotation_iops, srv_n_fil_crypt_iops, PLUGIN_VAR_RQCMDARG, @@ -21430,6 +21430,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(data_file_path), MYSQL_SYSVAR(data_home_dir), MYSQL_SYSVAR(doublewrite), + MYSQL_SYSVAR(stats_include_delete_marked), MYSQL_SYSVAR(api_enable_binlog), MYSQL_SYSVAR(api_enable_mdl), MYSQL_SYSVAR(api_disable_rowlock), @@ -21448,9 +21449,6 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(use_global_flush_log_at_trx_commit), MYSQL_SYSVAR(flush_method), MYSQL_SYSVAR(force_recovery), -#ifndef DBUG_OFF - MYSQL_SYSVAR(force_recovery_crash), -#endif /* !DBUG_OFF */ MYSQL_SYSVAR(ft_cache_size), MYSQL_SYSVAR(ft_total_cache_size), MYSQL_SYSVAR(ft_result_cache_limit), @@ -22274,8 +22272,9 @@ innodb_encrypt_tables_validate( for update function */ struct st_mysql_value* value) /*!< in: incoming string */ { - if (check_sysvar_enum(thd, var, save, value)) + if 
(check_sysvar_enum(thd, var, save, value)) { return 1; + } ulong encrypt_tables = *(ulong*)save; @@ -22287,6 +22286,17 @@ innodb_encrypt_tables_validate( "encryption plugin is not available"); return 1; } + + if (!srv_fil_crypt_rotate_key_age) { + const char *msg = (encrypt_tables ? "enable" : "disable"); + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + HA_ERR_UNSUPPORTED, + "InnoDB: cannot %s encryption, " + "innodb_encryption_rotate_key_age=0" + " i.e. key rotation disabled", msg); + return 1; + } + return 0; } diff --git a/storage/xtradb/handler/ha_innodb.h b/storage/xtradb/handler/ha_innodb.h index 783077ceaf1..62b80c492a1 100644 --- a/storage/xtradb/handler/ha_innodb.h +++ b/storage/xtradb/handler/ha_innodb.h @@ -167,6 +167,8 @@ class ha_innobase: public handler int index_first(uchar * buf); int index_last(uchar * buf); + bool has_gap_locks() const { return true; } + int rnd_init(bool scan); int rnd_end(); int rnd_next(uchar *buf); @@ -426,6 +428,15 @@ int thd_slave_thread(const MYSQL_THD thd); int thd_non_transactional_update(const MYSQL_THD thd); /** + Get high resolution timestamp for the current query start time. + The timestamp is not anchored to any specific point in time, + but can be used for comparison. 
+ + @retval timestamp in microseconds precision +*/ +unsigned long long thd_start_utime(const MYSQL_THD thd); + +/** Get the user thread's binary logging format @param thd user thread @return Value to be used as index into the binlog_format_names array diff --git a/storage/xtradb/handler/handler0alter.cc b/storage/xtradb/handler/handler0alter.cc index c62dc5bc837..8aaf5cd83bc 100644 --- a/storage/xtradb/handler/handler0alter.cc +++ b/storage/xtradb/handler/handler0alter.cc @@ -1868,6 +1868,7 @@ innobase_fts_check_doc_id_index_in_def( return(FTS_NOT_EXIST_DOC_ID_INDEX); } + /*******************************************************************//** Create an index table where indexes are ordered as follows: @@ -1936,26 +1937,11 @@ innobase_create_key_defs( (only prefix/part of the column is indexed), MySQL will treat the index as a PRIMARY KEY unless the table already has one. */ - if (n_add > 0 && !new_primary && got_default_clust - && (key_info[*add].flags & HA_NOSAME) - && !(key_info[*add].flags & HA_KEY_HAS_PART_KEY_SEG)) { - uint key_part = key_info[*add].user_defined_key_parts; - - new_primary = true; + ut_ad(altered_table->s->primary_key == 0 + || altered_table->s->primary_key == MAX_KEY); - while (key_part--) { - const uint maybe_null - = key_info[*add].key_part[key_part].key_type - & FIELDFLAG_MAYBE_NULL; - DBUG_ASSERT(!maybe_null - == !key_info[*add].key_part[key_part]. 
- field->real_maybe_null()); - - if (maybe_null) { - new_primary = false; - break; - } - } + if (got_default_clust && !new_primary) { + new_primary = (altered_table->s->primary_key != MAX_KEY); } const bool rebuild = new_primary || add_fts_doc_id @@ -1974,8 +1960,14 @@ innobase_create_key_defs( ulint primary_key_number; if (new_primary) { - DBUG_ASSERT(n_add > 0); - primary_key_number = *add; + if (n_add == 0) { + DBUG_ASSERT(got_default_clust); + DBUG_ASSERT(altered_table->s->primary_key + == 0); + primary_key_number = 0; + } else { + primary_key_number = *add; + } } else if (got_default_clust) { /* Create the GEN_CLUST_INDEX */ index_def_t* index = indexdef++; @@ -2899,9 +2891,11 @@ prepare_inplace_alter_table_dict( ulint n_cols; dtuple_t* add_cols; ulint key_id = FIL_DEFAULT_ENCRYPTION_KEY; - fil_encryption_t mode = FIL_SPACE_ENCRYPTION_DEFAULT; + fil_encryption_t mode = FIL_ENCRYPTION_DEFAULT; - crypt_data = fil_space_get_crypt_data(ctx->prebuilt->table->space); + fil_space_t* space = fil_space_acquire(ctx->prebuilt->table->space); + crypt_data = space->crypt_data; + fil_space_release(space); if (crypt_data) { key_id = crypt_data->key_id; @@ -3097,6 +3091,8 @@ prepare_inplace_alter_table_dict( ctx->add_cols = add_cols; } else { DBUG_ASSERT(!innobase_need_rebuild(ha_alter_info, old_table)); + DBUG_ASSERT(old_table->s->primary_key + == altered_table->s->primary_key); if (!ctx->new_table->fts && innobase_fulltext_exist(altered_table)) { @@ -4142,6 +4138,27 @@ found_col: add_fts_doc_id_idx, prebuilt)); } +/** Get the name of an erroneous key. 
+@param[in] error_key_num InnoDB number of the erroneus key +@param[in] ha_alter_info changes that were being performed +@param[in] table InnoDB table +@return the name of the erroneous key */ +static +const char* +get_error_key_name( + ulint error_key_num, + const Alter_inplace_info* ha_alter_info, + const dict_table_t* table) +{ + if (error_key_num == ULINT_UNDEFINED) { + return(FTS_DOC_ID_INDEX_NAME); + } else if (ha_alter_info->key_count == 0) { + return(dict_table_get_first_index(table)->name); + } else { + return(ha_alter_info->key_info_buffer[error_key_num].name); + } +} + /** Alter the table structure in-place with operations specified using Alter_inplace_info. The level of concurrency allowed during this operation depends @@ -4264,17 +4281,13 @@ oom: case DB_ONLINE_LOG_TOO_BIG: DBUG_ASSERT(ctx->online); my_error(ER_INNODB_ONLINE_LOG_TOO_BIG, MYF(0), - (prebuilt->trx->error_key_num == ULINT_UNDEFINED) - ? FTS_DOC_ID_INDEX_NAME - : ha_alter_info->key_info_buffer[ - prebuilt->trx->error_key_num].name); + get_error_key_name(prebuilt->trx->error_key_num, + ha_alter_info, prebuilt->table)); break; case DB_INDEX_CORRUPT: my_error(ER_INDEX_CORRUPT, MYF(0), - (prebuilt->trx->error_key_num == ULINT_UNDEFINED) - ? FTS_DOC_ID_INDEX_NAME - : ha_alter_info->key_info_buffer[ - prebuilt->trx->error_key_num].name); + get_error_key_name(prebuilt->trx->error_key_num, + ha_alter_info, prebuilt->table)); break; case DB_DECRYPTION_FAILED: { String str; @@ -5094,7 +5107,6 @@ innobase_update_foreign_cache( "Foreign key constraints for table '%s'" " are loaded with charset check off", user_table->name); - } } @@ -5194,14 +5206,13 @@ commit_try_rebuild( DBUG_RETURN(true); case DB_ONLINE_LOG_TOO_BIG: my_error(ER_INNODB_ONLINE_LOG_TOO_BIG, MYF(0), - ha_alter_info->key_info_buffer[0].name); + get_error_key_name(err_key, ha_alter_info, + rebuilt_table)); DBUG_RETURN(true); case DB_INDEX_CORRUPT: my_error(ER_INDEX_CORRUPT, MYF(0), - (err_key == ULINT_UNDEFINED) - ? 
FTS_DOC_ID_INDEX_NAME - : ha_alter_info->key_info_buffer[err_key] - .name); + get_error_key_name(err_key, ha_alter_info, + rebuilt_table)); DBUG_RETURN(true); default: my_error_innodb(error, table_name, user_table->flags); diff --git a/storage/xtradb/handler/i_s.cc b/storage/xtradb/handler/i_s.cc index 420dff83a40..086d5642dbb 100644 --- a/storage/xtradb/handler/i_s.cc +++ b/storage/xtradb/handler/i_s.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 2007, 2016, Oracle and/or its affiliates. -Copyrigth (c) 2014, 2016, MariaDB Corporation +Copyrigth (c) 2014, 2017, MariaDB Corporation This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -8324,29 +8324,7 @@ i_s_innodb_changed_pages_fill( while(log_online_bitmap_iterator_next(&i) && (!srv_max_changed_pages || - output_rows_num < srv_max_changed_pages) && - /* - There is no need to compare both start LSN and end LSN fields - with maximum value. It's enough to compare only start LSN. - Example: - - max_lsn = 100 - \\\\\\\\\\\\\\\\\\\\\\\\\|\\\\\\\\ - Query 1 - I------I I-------I I-------------I I----I - ////////////////// | - Query 2 - 1 2 3 4 - - Query 1: - SELECT * FROM INNODB_CHANGED_PAGES WHERE start_lsn < 100 - will select 1,2,3 bitmaps - Query 2: - SELECT * FROM INNODB_CHANGED_PAGES WHERE end_lsn < 100 - will select 1,2 bitmaps - - The condition start_lsn <= 100 will be false after reading - 1,2,3 bitmaps which suits for both cases. 
- */ - LOG_BITMAP_ITERATOR_START_LSN(i) <= max_lsn) + output_rows_num < srv_max_changed_pages)) { if (!LOG_BITMAP_ITERATOR_PAGE_CHANGED(i)) continue; @@ -8514,22 +8492,31 @@ static ST_FIELD_INFO innodb_tablespaces_encryption_fields_info[] = STRUCT_FLD(old_name, ""), STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, +#define TABLESPACES_ENCRYPTION_ROTATING_OR_FLUSHING 9 + {STRUCT_FLD(field_name, "ROTATING_OR_FLUSHING"), + STRUCT_FLD(field_length, 1), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + END_OF_ST_FIELD_INFO }; /**********************************************************************//** Function to fill INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION -with information collected by scanning SYS_TABLESPACES table and then use -fil_space() +with information collected by scanning SYS_TABLESPACES table. +@param[in] thd thread handle +@param[in] space Tablespace +@param[in] table_to_fill I_S table to fill @return 0 on success */ static int i_s_dict_fill_tablespaces_encryption( -/*==========================*/ - THD* thd, /*!< in: thread */ - ulint space, /*!< in: space ID */ - const char* name, /*!< in: tablespace name */ - TABLE* table_to_fill) /*!< in/out: fill this table */ + THD* thd, + fil_space_t* space, + TABLE* table_to_fill) { Field** fields; struct fil_space_crypt_status_t status; @@ -8539,10 +8526,11 @@ i_s_dict_fill_tablespaces_encryption( fields = table_to_fill->field; fil_space_crypt_get_status(space, &status); - OK(fields[TABLESPACES_ENCRYPTION_SPACE]->store(space)); + + OK(fields[TABLESPACES_ENCRYPTION_SPACE]->store(space->id)); OK(field_store_string(fields[TABLESPACES_ENCRYPTION_NAME], - name)); + space->name)); OK(fields[TABLESPACES_ENCRYPTION_ENCRYPTION_SCHEME]->store( status.scheme)); @@ -8554,6 +8542,9 @@ i_s_dict_fill_tablespaces_encryption( status.current_key_version)); 
OK(fields[TABLESPACES_ENCRYPTION_CURRENT_KEY_ID]->store( status.key_id)); + OK(fields[TABLESPACES_ENCRYPTION_ROTATING_OR_FLUSHING]->store( + (status.rotating || status.flushing) ? 1 : 0)); + if (status.rotating) { fields[TABLESPACES_ENCRYPTION_KEY_ROTATION_PAGE_NUMBER]->set_notnull(); OK(fields[TABLESPACES_ENCRYPTION_KEY_ROTATION_PAGE_NUMBER]->store( @@ -8567,6 +8558,7 @@ i_s_dict_fill_tablespaces_encryption( fields[TABLESPACES_ENCRYPTION_KEY_ROTATION_MAX_PAGE_NUMBER] ->set_null(); } + OK(schema_table_store_record(thd, table_to_fill)); DBUG_RETURN(0); @@ -8606,30 +8598,36 @@ i_s_tablespaces_encryption_fill_table( while (rec) { const char* err_msg; - ulint space; + ulint space_id; const char* name; ulint flags; /* Extract necessary information from a SYS_TABLESPACES row */ err_msg = dict_process_sys_tablespaces( - heap, rec, &space, &name, &flags); + heap, rec, &space_id, &name, &flags); mtr_commit(&mtr); mutex_exit(&dict_sys->mutex); - if (space == 0) { + if (space_id == 0) { found_space_0 = true; } - if (!err_msg) { + fil_space_t* space = fil_space_acquire_silent(space_id); + + if (!err_msg && space) { i_s_dict_fill_tablespaces_encryption( - thd, space, name, tables->table); + thd, space, tables->table); } else { push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, ER_CANT_FIND_SYSTEM_REC, "%s", err_msg); } + if (space) { + fil_space_release(space); + } + mem_heap_empty(heap); /* Get the next record */ @@ -8645,10 +8643,13 @@ i_s_tablespaces_encryption_fill_table( if (found_space_0 == false) { /* space 0 does for what ever unknown reason not show up * in iteration above, add it manually */ - ulint space = 0; - const char* name = NULL; + + fil_space_t* space = fil_space_acquire_silent(0); + i_s_dict_fill_tablespaces_encryption( - thd, space, name, tables->table); + thd, space, tables->table); + + fil_space_release(space); } DBUG_RETURN(0); @@ -8802,17 +8803,18 @@ static ST_FIELD_INFO innodb_tablespaces_scrubbing_fields_info[] = 
/**********************************************************************//** Function to fill INFORMATION_SCHEMA.INNODB_TABLESPACES_SCRUBBING -with information collected by scanning SYS_TABLESPACES table and then use -fil_space() +with information collected by scanning SYS_TABLESPACES table and +fil_space. +@param[in] thd Thread handle +@param[in] space Tablespace +@param[in] table_to_fill I_S table @return 0 on success */ static int i_s_dict_fill_tablespaces_scrubbing( -/*==========================*/ - THD* thd, /*!< in: thread */ - ulint space, /*!< in: space ID */ - const char* name, /*!< in: tablespace name */ - TABLE* table_to_fill) /*!< in/out: fill this table */ + THD* thd, + fil_space_t* space, + TABLE* table_to_fill) { Field** fields; struct fil_space_scrub_status_t status; @@ -8822,10 +8824,11 @@ i_s_dict_fill_tablespaces_scrubbing( fields = table_to_fill->field; fil_space_get_scrub_status(space, &status); - OK(fields[TABLESPACES_SCRUBBING_SPACE]->store(space)); + + OK(fields[TABLESPACES_SCRUBBING_SPACE]->store(space->id)); OK(field_store_string(fields[TABLESPACES_SCRUBBING_NAME], - name)); + space->name)); OK(fields[TABLESPACES_SCRUBBING_COMPRESSED]->store( status.compressed ? 
1 : 0)); @@ -8845,6 +8848,7 @@ i_s_dict_fill_tablespaces_scrubbing( TABLESPACES_SCRUBBING_CURRENT_SCRUB_ACTIVE_THREADS, TABLESPACES_SCRUBBING_CURRENT_SCRUB_PAGE_NUMBER, TABLESPACES_SCRUBBING_CURRENT_SCRUB_MAX_PAGE_NUMBER }; + if (status.scrubbing) { for (uint i = 0; i < array_elements(field_numbers); i++) { fields[field_numbers[i]]->set_notnull(); @@ -8864,6 +8868,7 @@ i_s_dict_fill_tablespaces_scrubbing( fields[field_numbers[i]]->set_null(); } } + OK(schema_table_store_record(thd, table_to_fill)); DBUG_RETURN(0); @@ -8903,30 +8908,36 @@ i_s_tablespaces_scrubbing_fill_table( while (rec) { const char* err_msg; - ulint space; + ulint space_id; const char* name; ulint flags; /* Extract necessary information from a SYS_TABLESPACES row */ err_msg = dict_process_sys_tablespaces( - heap, rec, &space, &name, &flags); + heap, rec, &space_id, &name, &flags); mtr_commit(&mtr); mutex_exit(&dict_sys->mutex); - if (space == 0) { + if (space_id == 0) { found_space_0 = true; } - if (!err_msg) { + fil_space_t* space = fil_space_acquire_silent(space_id); + + if (!err_msg && space) { i_s_dict_fill_tablespaces_scrubbing( - thd, space, name, tables->table); + thd, space, tables->table); } else { push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, ER_CANT_FIND_SYSTEM_REC, "%s", err_msg); } + if (space) { + fil_space_release(space); + } + mem_heap_empty(heap); /* Get the next record */ @@ -8942,10 +8953,12 @@ i_s_tablespaces_scrubbing_fill_table( if (found_space_0 == false) { /* space 0 does for what ever unknown reason not show up * in iteration above, add it manually */ - ulint space = 0; - const char* name = NULL; + fil_space_t* space = fil_space_acquire_silent(0); + i_s_dict_fill_tablespaces_scrubbing( - thd, space, name, tables->table); + thd, space, tables->table); + + fil_space_release(space); } DBUG_RETURN(0); diff --git a/storage/xtradb/ibuf/ibuf0ibuf.cc b/storage/xtradb/ibuf/ibuf0ibuf.cc index c1d735eecdd..e66568565e1 100644 --- a/storage/xtradb/ibuf/ibuf0ibuf.cc +++ 
b/storage/xtradb/ibuf/ibuf0ibuf.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2016, MariaDB Corporation. +Copyright (c) 2016, 2017, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -4596,7 +4596,7 @@ ibuf_merge_or_delete_for_page( buf_block_t* block, /*!< in: if page has been read from disk, pointer to the page x-latched, else NULL */ - ulint space, /*!< in: space id of the index page */ + ulint space_id,/*!< in: space id of the index page */ ulint page_no,/*!< in: page number of the index page */ ulint zip_size,/*!< in: compressed page size in bytes, or 0 */ @@ -4613,21 +4613,21 @@ ibuf_merge_or_delete_for_page( ulint volume = 0; #endif page_zip_des_t* page_zip = NULL; - ibool tablespace_being_deleted = FALSE; ibool corruption_noticed = FALSE; mtr_t mtr; + fil_space_t* space = NULL; /* Counts for merged & discarded operations. */ ulint mops[IBUF_OP_COUNT]; ulint dops[IBUF_OP_COUNT]; - ut_ad(!block || buf_block_get_space(block) == space); + ut_ad(!block || buf_block_get_space(block) == space_id); ut_ad(!block || buf_block_get_page_no(block) == page_no); ut_ad(!block || buf_block_get_zip_size(block) == zip_size); ut_ad(!block || buf_block_get_io_fix_unlocked(block) == BUF_IO_READ); if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE - || trx_sys_hdr_page(space, page_no)) { + || trx_sys_hdr_page(space_id, page_no)) { return; } @@ -4641,7 +4641,7 @@ ibuf_merge_or_delete_for_page( uncompressed page size always is a power-of-2 multiple of the compressed page size. 
*/ - if (ibuf_fixed_addr_page(space, 0, page_no) + if (ibuf_fixed_addr_page(space_id, 0, page_no) || fsp_descr_page(0, page_no)) { return; } @@ -4649,19 +4649,19 @@ ibuf_merge_or_delete_for_page( if (UNIV_LIKELY(update_ibuf_bitmap)) { ut_a(ut_is_2pow(zip_size)); - if (ibuf_fixed_addr_page(space, zip_size, page_no) + if (ibuf_fixed_addr_page(space_id, zip_size, page_no) || fsp_descr_page(zip_size, page_no)) { return; } - /* If the following returns FALSE, we get the counter + /* If the following returns space, we get the counter incremented, and must decrement it when we leave this function. When the counter is > 0, that prevents tablespace from being dropped. */ - tablespace_being_deleted = fil_inc_pending_ops(space, true); + space = fil_space_acquire(space_id); - if (UNIV_UNLIKELY(tablespace_being_deleted)) { + if (UNIV_UNLIKELY(!space)) { /* Do not try to read the bitmap page from space; just delete the ibuf records for the page */ @@ -4674,7 +4674,7 @@ ibuf_merge_or_delete_for_page( ibuf_mtr_start(&mtr); bitmap_page = ibuf_bitmap_get_map_page( - space, page_no, zip_size, &mtr); + space_id, page_no, zip_size, &mtr); if (bitmap_page && fil_page_get_type(bitmap_page) != FIL_PAGE_TYPE_ALLOCATED) { @@ -4688,15 +4688,15 @@ ibuf_merge_or_delete_for_page( if (!bitmap_bits) { /* No inserts buffered for this page */ - if (!tablespace_being_deleted) { - fil_decr_pending_ops(space); + if (space) { + fil_space_release(space); } return; } } } else if (block - && (ibuf_fixed_addr_page(space, zip_size, page_no) + && (ibuf_fixed_addr_page(space_id, zip_size, page_no) || fsp_descr_page(zip_size, page_no))) { return; @@ -4704,7 +4704,7 @@ ibuf_merge_or_delete_for_page( heap = mem_heap_create(512); - search_tuple = ibuf_search_tuple_build(space, page_no, heap); + search_tuple = ibuf_search_tuple_build(space_id, page_no, heap); if (block) { /* Move the ownership of the x-latch on the page to this OS @@ -4730,7 +4730,7 @@ ibuf_merge_or_delete_for_page( fputs(" InnoDB: Dump of the 
ibuf bitmap page:\n", stderr); - bitmap_page = ibuf_bitmap_get_map_page(space, page_no, + bitmap_page = ibuf_bitmap_get_map_page(space_id, page_no, zip_size, &mtr); if (bitmap_page == NULL) { @@ -4814,7 +4814,7 @@ loop: /* Check if the entry is for this index page */ if (ibuf_rec_get_page_no(&mtr, rec) != page_no - || ibuf_rec_get_space(&mtr, rec) != space) { + || ibuf_rec_get_space(&mtr, rec) != space_id) { if (block) { page_header_reset_last_insert( @@ -4881,7 +4881,7 @@ loop: ut_ad(page_rec_is_user_rec(rec)); ut_ad(ibuf_rec_get_page_no(&mtr, rec) == page_no); - ut_ad(ibuf_rec_get_space(&mtr, rec) == space); + ut_ad(ibuf_rec_get_space(&mtr, rec) == space_id); /* Mark the change buffer record processed, so that it will not be merged again in case @@ -4911,7 +4911,7 @@ loop: buf_block_dbg_add_level( block, SYNC_IBUF_TREE_NODE); - if (!ibuf_restore_pos(space, page_no, + if (!ibuf_restore_pos(space_id, page_no, search_tuple, BTR_MODIFY_LEAF, &pcur, &mtr)) { @@ -4935,7 +4935,7 @@ loop: } /* Delete the record from ibuf */ - if (ibuf_delete_rec(space, page_no, &pcur, search_tuple, + if (ibuf_delete_rec(space_id, page_no, &pcur, search_tuple, &mtr)) { /* Deletion was pessimistic and mtr was committed: we start from the beginning again */ @@ -4955,7 +4955,7 @@ reset_bit: page_t* bitmap_page; bitmap_page = ibuf_bitmap_get_map_page( - space, page_no, zip_size, &mtr); + space_id, page_no, zip_size, &mtr); ibuf_bitmap_page_set_bits( bitmap_page, page_no, zip_size, @@ -4996,13 +4996,12 @@ reset_bit: mutex_exit(&ibuf_mutex); #endif /* HAVE_ATOMIC_BUILTINS */ - if (update_ibuf_bitmap && !tablespace_being_deleted) { - - fil_decr_pending_ops(space); + if (space) { + fil_space_release(space); } #ifdef UNIV_IBUF_COUNT_DEBUG - ut_a(ibuf_count_get(space, page_no) == 0); + ut_a(ibuf_count_get(space_id, page_no) == 0); #endif } diff --git a/storage/xtradb/include/btr0cur.h b/storage/xtradb/include/btr0cur.h index f485d072c4c..960bd55d3d9 100644 --- a/storage/xtradb/include/btr0cur.h +++ 
b/storage/xtradb/include/btr0cur.h @@ -294,11 +294,7 @@ btr_cur_update_alloc_zip_func( false=update-in-place */ mtr_t* mtr, /*!< in/out: mini-transaction */ trx_t* trx) /*!< in: NULL or transaction */ -#ifdef UNIV_DEBUG - MY_ATTRIBUTE((nonnull (1, 2, 3, 4, 7), warn_unused_result)); -#else - MY_ATTRIBUTE((nonnull (1, 2, 3, 6), warn_unused_result)); -#endif + MY_ATTRIBUTE((warn_unused_result)); #ifdef UNIV_DEBUG # define btr_cur_update_alloc_zip(page_zip,cursor,index,offsets,len,cr,mtr,trx) \ @@ -428,7 +424,7 @@ btr_cur_del_mark_set_clust_rec( const ulint* offsets,/*!< in: rec_get_offsets(rec) */ que_thr_t* thr, /*!< in: query thread */ mtr_t* mtr) /*!< in/out: mini-transaction */ - MY_ATTRIBUTE((nonnull, warn_unused_result)); + MY_ATTRIBUTE((warn_unused_result)); /***********************************************************//** Sets a secondary index record delete mark to TRUE or FALSE. @return DB_SUCCESS, DB_LOCK_WAIT, or error number */ @@ -441,7 +437,7 @@ btr_cur_del_mark_set_sec_rec( ibool val, /*!< in: value to set */ que_thr_t* thr, /*!< in: query thread */ mtr_t* mtr) /*!< in/out: mini-transaction */ - MY_ATTRIBUTE((nonnull, warn_unused_result)); + MY_ATTRIBUTE((warn_unused_result)); /*************************************************************//** Tries to compress a page of the tree if it seems useful. It is assumed that mtr holds an x-latch on the tree and on the cursor page. To avoid @@ -609,8 +605,7 @@ btr_cur_disown_inherited_fields( dict_index_t* index, /*!< in: index of the page */ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ const upd_t* update, /*!< in: update vector */ - mtr_t* mtr) /*!< in/out: mini-transaction */ - MY_ATTRIBUTE((nonnull(2,3,4,5,6))); + mtr_t* mtr); /*!< in/out: mini-transaction */ /** Operation code for btr_store_big_rec_extern_fields(). */ enum blob_op { @@ -655,7 +650,7 @@ btr_store_big_rec_extern_fields( mtr_t* btr_mtr, /*!< in: mtr containing the latches to the clustered index */ enum blob_op op) /*! 
in: operation code */ - MY_ATTRIBUTE((nonnull, warn_unused_result)); + MY_ATTRIBUTE((warn_unused_result)); /*******************************************************************//** Frees the space in an externally stored field to the file space @@ -751,8 +746,7 @@ btr_push_update_extern_fields( /*==========================*/ dtuple_t* tuple, /*!< in/out: data tuple */ const upd_t* update, /*!< in: update vector */ - mem_heap_t* heap) /*!< in: memory heap */ - MY_ATTRIBUTE((nonnull)); + mem_heap_t* heap); /*!< in: memory heap */ /***********************************************************//** Sets a secondary index record's delete mark to the given value. This function is only used by the insert buffer merge mechanism. */ diff --git a/storage/xtradb/include/btr0sea.h b/storage/xtradb/include/btr0sea.h index db7b477fae1..66c27607013 100644 --- a/storage/xtradb/include/btr0sea.h +++ b/storage/xtradb/include/btr0sea.h @@ -200,7 +200,7 @@ hash_table_t* btr_search_get_hash_table( /*======================*/ const dict_index_t* index) /*!< in: index */ - MY_ATTRIBUTE((pure,warn_unused_result)); + MY_ATTRIBUTE((warn_unused_result)); /********************************************************************//** Returns the adaptive hash index latch for a given index key. @@ -210,7 +210,7 @@ prio_rw_lock_t* btr_search_get_latch( /*=================*/ const dict_index_t* index) /*!< in: index */ - MY_ATTRIBUTE((pure,warn_unused_result)); + MY_ATTRIBUTE((warn_unused_result)); /*********************************************************************//** Returns the AHI partition number corresponding to a given index ID. */ @@ -227,8 +227,7 @@ UNIV_INLINE void btr_search_index_init( /*===============*/ - dict_index_t* index) /*!< in: index */ - MY_ATTRIBUTE((nonnull)); + dict_index_t* index); /*!< in: index */ /********************************************************************//** Latches all adaptive hash index latches in exclusive mode. 
*/ diff --git a/storage/xtradb/include/btr0sea.ic b/storage/xtradb/include/btr0sea.ic index 3cbcff75f31..e963d8a8449 100644 --- a/storage/xtradb/include/btr0sea.ic +++ b/storage/xtradb/include/btr0sea.ic @@ -90,7 +90,6 @@ btr_search_get_hash_table( /*======================*/ const dict_index_t* index) /*!< in: index */ { - ut_ad(index); ut_ad(index->search_table); return(index->search_table); @@ -105,7 +104,6 @@ btr_search_get_latch( /*=================*/ const dict_index_t* index) /*!< in: index */ { - ut_ad(index); ut_ad(index->search_latch >= btr_search_latch_arr && index->search_latch < btr_search_latch_arr + btr_search_index_num); @@ -132,8 +130,6 @@ btr_search_index_init( /*===============*/ dict_index_t* index) /*!< in: index */ { - ut_ad(index); - index->search_latch = &btr_search_latch_arr[btr_search_get_key(index->id)]; index->search_table = diff --git a/storage/xtradb/include/buf0buddy.ic b/storage/xtradb/include/buf0buddy.ic index 9bc8e9e8762..a5fb510dd19 100644 --- a/storage/xtradb/include/buf0buddy.ic +++ b/storage/xtradb/include/buf0buddy.ic @@ -50,7 +50,7 @@ buf_buddy_alloc_low( allocated from the LRU list and buf_pool->LRU_list_mutex was temporarily released */ - MY_ATTRIBUTE((malloc, nonnull)); + MY_ATTRIBUTE((malloc)); /**********************************************************************//** Deallocate a block. */ diff --git a/storage/xtradb/include/buf0buf.h b/storage/xtradb/include/buf0buf.h index 6924481af49..1774d9445ff 100644 --- a/storage/xtradb/include/buf0buf.h +++ b/storage/xtradb/include/buf0buf.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2013, 2016, MariaDB Corporation. +Copyright (c) 2013, 2017, MariaDB Corporation. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -243,8 +243,7 @@ buf_relocate( buf_page_t* bpage, /*!< in/out: control block being relocated; buf_page_get_state(bpage) must be BUF_BLOCK_ZIP_DIRTY or BUF_BLOCK_ZIP_PAGE */ - buf_page_t* dpage) /*!< in/out: destination control block */ - MY_ATTRIBUTE((nonnull)); + buf_page_t* dpage); /*!< in/out: destination control block */ /*********************************************************************//** Gets the current size of buffer buf_pool in bytes. @return size in bytes */ @@ -639,19 +638,68 @@ buf_block_unfix( #else /* !UNIV_HOTBACKUP */ # define buf_block_modify_clock_inc(block) ((void) 0) #endif /* !UNIV_HOTBACKUP */ + +/** Checks if the page is in crc32 checksum format. +@param[in] read_buf database page +@param[in] checksum_field1 new checksum field +@param[in] checksum_field2 old checksum field +@return true if the page is in crc32 checksum format */ +bool +buf_page_is_checksum_valid_crc32( + const byte* read_buf, + ulint checksum_field1, + ulint checksum_field2) + MY_ATTRIBUTE((warn_unused_result)); + +/** Checks if the page is in innodb checksum format. +@param[in] read_buf database page +@param[in] checksum_field1 new checksum field +@param[in] checksum_field2 old checksum field +@return true if the page is in innodb checksum format */ +bool +buf_page_is_checksum_valid_innodb( + const byte* read_buf, + ulint checksum_field1, + ulint checksum_field2) + MY_ATTRIBUTE((warn_unused_result)); + +/** Checks if the page is in none checksum format. 
+@param[in] read_buf database page +@param[in] checksum_field1 new checksum field +@param[in] checksum_field2 old checksum field +@return true if the page is in none checksum format */ +bool +buf_page_is_checksum_valid_none( + const byte* read_buf, + ulint checksum_field1, + ulint checksum_field2) + MY_ATTRIBUTE((warn_unused_result)); + /********************************************************************//** Checks if a page is corrupt. -@return TRUE if corrupted */ -UNIV_INTERN -ibool +@param[in] check_lsn true if LSN should be checked +@param[in] read_buf Page to be checked +@param[in] zip_size compressed size or 0 +@param[in] space Pointer to tablespace +@return true if corrupted, false if not */ +bool buf_page_is_corrupted( -/*==================*/ - bool check_lsn, /*!< in: true if we need to check the - and complain about the LSN */ - const byte* read_buf, /*!< in: a database page */ - ulint zip_size) /*!< in: size of compressed page; - 0 for uncompressed pages */ - MY_ATTRIBUTE((nonnull, warn_unused_result)); + bool check_lsn, + const byte* read_buf, + ulint zip_size, + const fil_space_t* space) + MY_ATTRIBUTE((warn_unused_result)); +/********************************************************************//** +Check if page is maybe compressed, encrypted or both when we encounter +corrupted page. Note that we can't be 100% sure if page is corrupted +or decrypt/decompress just failed. +@param[in] bpage Page +@return true if page corrupted, false if not */ +bool +buf_page_check_corrupt( + buf_page_t* bpage) /*!< in/out: buffer page read from disk */ + MY_ATTRIBUTE(( warn_unused_result)); + /********************************************************************//** Checks if a page is all zeroes. 
@return TRUE if the page is all zeroes */ @@ -742,7 +790,7 @@ buf_page_print( ulint flags) /*!< in: 0 or BUF_PAGE_PRINT_NO_CRASH or BUF_PAGE_PRINT_NO_FULL */ - UNIV_COLD MY_ATTRIBUTE((nonnull)); + UNIV_COLD; /********************************************************************//** Decompress a block. @return TRUE if successful */ @@ -1524,7 +1572,7 @@ The hook that is called just after a page is read from disk. The function decrypt disk content into buf_page_t and releases the temporary buffer that was allocated in buf_page_decrypt_before_read */ UNIV_INTERN -ibool +bool buf_page_decrypt_after_read( /*========================*/ buf_page_t* page); /*!< in/out: buffer page read from disk */ @@ -1630,15 +1678,8 @@ struct buf_page_t{ if written again we check is TRIM operation needed. */ - unsigned key_version; /*!< key version for this block */ - bool page_encrypted; /*!< page is page encrypted */ - bool page_compressed;/*!< page is page compressed */ - ulint stored_checksum;/*!< stored page checksum if page - encrypted */ - bool encrypted; /*!< page is still encrypted */ - ulint calculated_checksum; - /*!< calculated checksum if page - encrypted */ + unsigned key_version; /*!< key version for this block */ + bool encrypted; /*!< page is still encrypted */ ulint real_size; /*!< Real size of the page Normal pages == UNIV_PAGE_SIZE @@ -2070,7 +2111,10 @@ struct buf_pool_t{ os_event_t no_flush[BUF_FLUSH_N_TYPES]; /*!< this is in the set state when there is no flush batch - of the given type running */ + of the given type running; + os_event_set() and os_event_reset() + are protected by + buf_pool_t::flush_state_mutex */ ib_rbt_t* flush_rbt; /*!< a red-black tree is used exclusively during recovery to speed up insertions in the @@ -2318,7 +2362,6 @@ buf_pool_mutex_exit( /*================*/ buf_pool_t* buf_pool); /*!< in: buffer pool */ - #ifndef UNIV_NONINL #include "buf0buf.ic" #endif diff --git a/storage/xtradb/include/buf0dblwr.h b/storage/xtradb/include/buf0dblwr.h 
index a62a6400d97..5582778825c 100644 --- a/storage/xtradb/include/buf0dblwr.h +++ b/storage/xtradb/include/buf0dblwr.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, MariaDB Corporation. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -134,11 +135,13 @@ struct buf_dblwr_t{ ulint b_reserved;/*!< number of slots currently reserved for batch flush. */ os_event_t b_event;/*!< event where threads wait for a - batch flush to end. */ + batch flush to end; + os_event_set() and os_event_reset() + are protected by buf_dblwr_t::mutex */ ulint s_reserved;/*!< number of slots currently reserved for single page flushes. */ os_event_t s_event;/*!< event where threads wait for a - single page flush slot. */ + single page flush slot. Protected by mutex. */ bool* in_use; /*!< flag used to indicate if a slot is in use. Only used for single page flushes. */ diff --git a/storage/xtradb/include/dict0dict.h b/storage/xtradb/include/dict0dict.h index cfaf3e12e82..1622b927a76 100644 --- a/storage/xtradb/include/dict0dict.h +++ b/storage/xtradb/include/dict0dict.h @@ -762,7 +762,7 @@ ulint dict_index_is_clust( /*================*/ const dict_index_t* index) /*!< in: index */ - MY_ATTRIBUTE((nonnull, pure, warn_unused_result)); + MY_ATTRIBUTE((warn_unused_result)); /********************************************************************//** Check whether the index is unique. 
@return nonzero for unique index, zero for other indexes */ @@ -771,7 +771,7 @@ ulint dict_index_is_unique( /*=================*/ const dict_index_t* index) /*!< in: index */ - MY_ATTRIBUTE((nonnull, pure, warn_unused_result)); + MY_ATTRIBUTE((warn_unused_result)); /********************************************************************//** Check whether the index is the insert buffer tree. @return nonzero for insert buffer, zero for other indexes */ @@ -780,7 +780,7 @@ ulint dict_index_is_ibuf( /*===============*/ const dict_index_t* index) /*!< in: index */ - MY_ATTRIBUTE((nonnull, pure, warn_unused_result)); + MY_ATTRIBUTE((warn_unused_result)); /********************************************************************//** Check whether the index is a secondary index or the insert buffer tree. @return nonzero for insert buffer, zero for other indexes */ @@ -789,7 +789,7 @@ ulint dict_index_is_sec_or_ibuf( /*======================*/ const dict_index_t* index) /*!< in: index */ - MY_ATTRIBUTE((nonnull, pure, warn_unused_result)); + MY_ATTRIBUTE((warn_unused_result)); /************************************************************************ Gets the all the FTS indexes for the table. NOTE: must not be called for @@ -811,7 +811,7 @@ ulint dict_table_get_n_user_cols( /*=======================*/ const dict_table_t* table) /*!< in: table */ - MY_ATTRIBUTE((nonnull, pure, warn_unused_result)); + MY_ATTRIBUTE((warn_unused_result)); /********************************************************************//** Gets the number of system columns in a table in the dictionary cache. @return number of system (e.g., ROW_ID) columns of a table */ @@ -830,7 +830,7 @@ ulint dict_table_get_n_cols( /*==================*/ const dict_table_t* table) /*!< in: table */ - MY_ATTRIBUTE((nonnull, pure, warn_unused_result)); + MY_ATTRIBUTE((warn_unused_result)); /********************************************************************//** Gets the approximately estimated number of rows in the table. 
@return estimated number of rows */ @@ -1784,7 +1784,7 @@ ulint dict_index_is_corrupted( /*====================*/ const dict_index_t* index) /*!< in: index */ - MY_ATTRIBUTE((nonnull, warn_unused_result)); + MY_ATTRIBUTE((warn_unused_result)); #endif /* !UNIV_HOTBACKUP */ /**********************************************************************//** @@ -1797,7 +1797,7 @@ dict_set_corrupted( dict_index_t* index, /*!< in/out: index */ trx_t* trx, /*!< in/out: transaction */ const char* ctx) /*!< in: context */ - UNIV_COLD MY_ATTRIBUTE((nonnull)); + UNIV_COLD; /**********************************************************************//** Flags an index corrupted in the data dictionary cache only. This @@ -1808,8 +1808,7 @@ void dict_set_corrupted_index_cache_only( /*================================*/ dict_index_t* index, /*!< in/out: index */ - dict_table_t* table) /*!< in/out: table */ - MY_ATTRIBUTE((nonnull)); + dict_table_t* table); /*!< in/out: table */ /**********************************************************************//** Flags a table with specified space_id corrupted in the table dictionary diff --git a/storage/xtradb/include/dict0dict.ic b/storage/xtradb/include/dict0dict.ic index 2b63ddea51d..81da2fa5580 100644 --- a/storage/xtradb/include/dict0dict.ic +++ b/storage/xtradb/include/dict0dict.ic @@ -267,7 +267,6 @@ dict_index_is_clust( /*================*/ const dict_index_t* index) /*!< in: index */ { - ut_ad(index); ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); return(index->type & DICT_CLUSTERED); @@ -281,7 +280,6 @@ dict_index_is_unique( /*=================*/ const dict_index_t* index) /*!< in: index */ { - ut_ad(index); ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); return(index->type & DICT_UNIQUE); @@ -296,7 +294,6 @@ dict_index_is_ibuf( /*===============*/ const dict_index_t* index) /*!< in: index */ { - ut_ad(index); ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); return(index->type & DICT_IBUF); @@ -328,7 +325,6 @@ dict_index_is_sec_or_ibuf( { ulint type; - 
ut_ad(index); ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); type = index->type; @@ -346,7 +342,6 @@ dict_table_get_n_user_cols( /*=======================*/ const dict_table_t* table) /*!< in: table */ { - ut_ad(table); ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); return(table->n_cols - DATA_N_SYS_COLS); @@ -378,7 +373,6 @@ dict_table_get_n_cols( /*==================*/ const dict_table_t* table) /*!< in: table */ { - ut_ad(table); ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); return(table->n_cols); @@ -1550,7 +1544,6 @@ dict_index_is_corrupted( /*====================*/ const dict_index_t* index) /*!< in: index */ { - ut_ad(index); ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); return((index->type & DICT_CORRUPT) diff --git a/storage/xtradb/include/dict0stats_bg.h b/storage/xtradb/include/dict0stats_bg.h index 34dc4657829..d5f0870718d 100644 --- a/storage/xtradb/include/dict0stats_bg.h +++ b/storage/xtradb/include/dict0stats_bg.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 2012, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, MariaDB Corporation. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -32,7 +33,8 @@ Created Apr 26, 2012 Vasil Dimov #include "os0sync.h" /* os_event_t */ #include "os0thread.h" /* DECLARE_THREAD */ -/** Event to wake up the stats thread */ +/** Event to wake up dict_stats_thread on dict_stats_recalc_pool_add() +or shutdown. Not protected by any mutex. 
*/ extern os_event_t dict_stats_event; /*****************************************************************//** diff --git a/storage/xtradb/include/dyn0dyn.h b/storage/xtradb/include/dyn0dyn.h index 1bd10b6bf58..20963a1472b 100644 --- a/storage/xtradb/include/dyn0dyn.h +++ b/storage/xtradb/include/dyn0dyn.h @@ -46,9 +46,8 @@ UNIV_INLINE dyn_array_t* dyn_array_create( /*=============*/ - dyn_array_t* arr) /*!< in/out memory buffer of + dyn_array_t* arr); /*!< in/out memory buffer of size sizeof(dyn_array_t) */ - MY_ATTRIBUTE((nonnull)); /************************************************************//** Frees a dynamic array. */ UNIV_INLINE @@ -69,7 +68,7 @@ dyn_array_open( dyn_array_t* arr, /*!< in: dynamic array */ ulint size) /*!< in: size in bytes of the buffer; MUST be smaller than DYN_ARRAY_DATA_SIZE! */ - MY_ATTRIBUTE((nonnull, warn_unused_result)); + MY_ATTRIBUTE((warn_unused_result)); /*********************************************************************//** Closes the buffer returned by dyn_array_open. */ UNIV_INLINE @@ -77,8 +76,7 @@ void dyn_array_close( /*============*/ dyn_array_t* arr, /*!< in: dynamic array */ - const byte* ptr) /*!< in: end of used space */ - MY_ATTRIBUTE((nonnull)); + const byte* ptr); /*!< in: end of used space */ /*********************************************************************//** Makes room on top of a dyn array and returns a pointer to the added element. The caller must copy the element to @@ -90,7 +88,7 @@ dyn_array_push( /*===========*/ dyn_array_t* arr, /*!< in/out: dynamic array */ ulint size) /*!< in: size in bytes of the element */ - MY_ATTRIBUTE((nonnull, warn_unused_result)); + MY_ATTRIBUTE((warn_unused_result)); /************************************************************//** Returns pointer to an element in dyn array. 
@return pointer to element */ @@ -101,7 +99,7 @@ dyn_array_get_element( const dyn_array_t* arr, /*!< in: dyn array */ ulint pos) /*!< in: position of element in bytes from array start */ - MY_ATTRIBUTE((nonnull, warn_unused_result)); + MY_ATTRIBUTE((warn_unused_result)); /************************************************************//** Returns the size of stored data in a dyn array. @return data size in bytes */ @@ -110,7 +108,7 @@ ulint dyn_array_get_data_size( /*====================*/ const dyn_array_t* arr) /*!< in: dyn array */ - MY_ATTRIBUTE((nonnull, warn_unused_result, pure)); + MY_ATTRIBUTE((warn_unused_result)); /************************************************************//** Gets the first block in a dyn array. @param arr dyn array @@ -144,7 +142,7 @@ ulint dyn_block_get_used( /*===============*/ const dyn_block_t* block) /*!< in: dyn array block */ - MY_ATTRIBUTE((nonnull, warn_unused_result, pure)); + MY_ATTRIBUTE((warn_unused_result)); /********************************************************************//** Gets pointer to the start of data in a dyn array block. @return pointer to data */ @@ -153,7 +151,7 @@ byte* dyn_block_get_data( /*===============*/ const dyn_block_t* block) /*!< in: dyn array block */ - MY_ATTRIBUTE((nonnull, warn_unused_result, pure)); + MY_ATTRIBUTE((warn_unused_result)); /********************************************************//** Pushes n bytes to a dyn array. */ UNIV_INLINE diff --git a/storage/xtradb/include/dyn0dyn.ic b/storage/xtradb/include/dyn0dyn.ic index f18f2e6dff9..6e97649245e 100644 --- a/storage/xtradb/include/dyn0dyn.ic +++ b/storage/xtradb/include/dyn0dyn.ic @@ -36,7 +36,7 @@ dyn_block_t* dyn_array_add_block( /*================*/ dyn_array_t* arr) /*!< in/out: dyn array */ - MY_ATTRIBUTE((nonnull, warn_unused_result)); + MY_ATTRIBUTE((warn_unused_result)); /********************************************************************//** Gets the number of used bytes in a dyn array block. 
@@ -47,8 +47,6 @@ dyn_block_get_used( /*===============*/ const dyn_block_t* block) /*!< in: dyn array block */ { - ut_ad(block); - return((block->used) & ~DYN_BLOCK_FULL_FLAG); } @@ -76,7 +74,6 @@ dyn_array_create( dyn_array_t* arr) /*!< in/out: memory buffer of size sizeof(dyn_array_t) */ { - ut_ad(arr); #if DYN_ARRAY_DATA_SIZE >= DYN_BLOCK_FULL_FLAG # error "DYN_ARRAY_DATA_SIZE >= DYN_BLOCK_FULL_FLAG" #endif @@ -119,7 +116,6 @@ dyn_array_push( dyn_block_t* block; ulint used; - ut_ad(arr); ut_ad(arr->magic_n == DYN_BLOCK_MAGIC_N); ut_ad(size <= DYN_ARRAY_DATA_SIZE); ut_ad(size); @@ -159,7 +155,6 @@ dyn_array_open( { dyn_block_t* block; - ut_ad(arr); ut_ad(arr->magic_n == DYN_BLOCK_MAGIC_N); ut_ad(size <= DYN_ARRAY_DATA_SIZE); ut_ad(size); @@ -195,7 +190,6 @@ dyn_array_close( { dyn_block_t* block; - ut_ad(arr); ut_ad(arr->magic_n == DYN_BLOCK_MAGIC_N); block = dyn_array_get_last_block(arr); @@ -222,7 +216,6 @@ dyn_array_get_element( { const dyn_block_t* block; - ut_ad(arr); ut_ad(arr->magic_n == DYN_BLOCK_MAGIC_N); /* Get the first array block */ @@ -260,7 +253,6 @@ dyn_array_get_data_size( const dyn_block_t* block; ulint sum = 0; - ut_ad(arr); ut_ad(arr->magic_n == DYN_BLOCK_MAGIC_N); if (arr->heap == NULL) { diff --git a/storage/xtradb/include/fil0crypt.h b/storage/xtradb/include/fil0crypt.h index 42cdafde4d0..cfc2d850883 100644 --- a/storage/xtradb/include/fil0crypt.h +++ b/storage/xtradb/include/fil0crypt.h @@ -39,14 +39,6 @@ static const unsigned char CRYPT_MAGIC[MAGIC_SZ] = { /* This key will be used if nothing else is given */ #define FIL_DEFAULT_ENCRYPTION_KEY ENCRYPTION_KEY_SYSTEM_DATA -/** Enum values for encryption table option */ -typedef enum { - FIL_SPACE_ENCRYPTION_DEFAULT = 0, /* Tablespace encrypted if - srv_encrypt_tables = ON */ - FIL_SPACE_ENCRYPTION_ON = 1, /* Tablespace is encrypted always */ - FIL_SPACE_ENCRYPTION_OFF = 2 /* Tablespace is not encrypted */ -} fil_encryption_t; - extern os_event_t fil_crypt_threads_event; /** @@ -110,23 
+102,21 @@ struct fil_space_rotate_state_t } scrubbing; }; -struct fil_space_crypt_struct : st_encryption_scheme +struct fil_space_crypt_t : st_encryption_scheme { public: /** Constructor. Does not initialize the members! The object is expected to be placed in a buffer that has been zero-initialized. */ - fil_space_crypt_struct( + fil_space_crypt_t( ulint new_type, uint new_min_key_version, uint new_key_id, - ulint offset, fil_encryption_t new_encryption) : st_encryption_scheme(), min_key_version(new_min_key_version), - page0_offset(offset), + page0_offset(0), encryption(new_encryption), - closing(false), key_found(), rotate_state() { @@ -138,9 +128,9 @@ struct fil_space_crypt_struct : st_encryption_scheme locker = crypt_data_scheme_locker; type = new_type; - if (new_encryption == FIL_SPACE_ENCRYPTION_OFF || + if (new_encryption == FIL_ENCRYPTION_OFF || (!srv_encrypt_tables && - new_encryption == FIL_SPACE_ENCRYPTION_DEFAULT)) { + new_encryption == FIL_ENCRYPTION_DEFAULT)) { type = CRYPT_SCHEME_UNENCRYPTED; } else { type = CRYPT_SCHEME_1; @@ -149,9 +139,8 @@ struct fil_space_crypt_struct : st_encryption_scheme } /** Destructor */ - ~fil_space_crypt_struct() + ~fil_space_crypt_t() { - closing = true; mutex_free(&mutex); } @@ -169,45 +158,36 @@ struct fil_space_crypt_struct : st_encryption_scheme /** Returns true if tablespace should be encrypted */ bool should_encrypt() const { - return ((encryption == FIL_SPACE_ENCRYPTION_ON) || + return ((encryption == FIL_ENCRYPTION_ON) || (srv_encrypt_tables && - encryption == FIL_SPACE_ENCRYPTION_DEFAULT)); + encryption == FIL_ENCRYPTION_DEFAULT)); } /** Return true if tablespace is encrypted. 
*/ bool is_encrypted() const { - return (encryption != FIL_SPACE_ENCRYPTION_OFF); + return (encryption != FIL_ENCRYPTION_OFF); } /** Return true if default tablespace encryption is used, */ bool is_default_encryption() const { - return (encryption == FIL_SPACE_ENCRYPTION_DEFAULT); + return (encryption == FIL_ENCRYPTION_DEFAULT); } /** Return true if tablespace is not encrypted. */ bool not_encrypted() const { - return (encryption == FIL_SPACE_ENCRYPTION_OFF); + return (encryption == FIL_ENCRYPTION_OFF); } - /** Is this tablespace closing. */ - bool is_closing(bool is_fixed) { - bool closed; - if (!is_fixed) { - mutex_enter(&mutex); - } - closed = closing; - if (!is_fixed) { - mutex_exit(&mutex); - } - return closed; - } + /** Write crypt data to a page (0) + @param[in,out] page0 Page 0 where to write + @param[in,out] mtr Minitransaction */ + void write_page0(byte* page0, mtr_t* mtr); uint min_key_version; // min key version for this space ulint page0_offset; // byte offset on page 0 for crypt data fil_encryption_t encryption; // Encryption setup ib_mutex_t mutex; // mutex protecting following variables - bool closing; // is tablespace being closed /** Return code from encryption_key_get_latest_version. 
If ENCRYPTION_KEY_VERSION_INVALID encryption plugin @@ -219,317 +199,307 @@ struct fil_space_crypt_struct : st_encryption_scheme fil_space_rotate_state_t rotate_state; }; -/* structure containing encryption specification */ -typedef struct fil_space_crypt_struct fil_space_crypt_t; +/** Status info about encryption */ +struct fil_space_crypt_status_t { + ulint space; /*!< tablespace id */ + ulint scheme; /*!< encryption scheme */ + uint min_key_version; /*!< min key version */ + uint current_key_version;/*!< current key version */ + uint keyserver_requests;/*!< no of key requests to key server */ + ulint key_id; /*!< current key_id */ + bool rotating; /*!< is key rotation ongoing */ + bool flushing; /*!< is flush at end of rotation ongoing */ + ulint rotate_next_page_number; /*!< next page if key rotating */ + ulint rotate_max_page_number; /*!< max page if key rotating */ +}; + +/** Statistics about encryption key rotation */ +struct fil_crypt_stat_t { + ulint pages_read_from_cache; + ulint pages_read_from_disk; + ulint pages_modified; + ulint pages_flushed; + ulint estimated_iops; +}; + +/** Status info about scrubbing */ +struct fil_space_scrub_status_t { + ulint space; /*!< tablespace id */ + bool compressed; /*!< is space compressed */ + time_t last_scrub_completed; /*!< when was last scrub completed */ + bool scrubbing; /*!< is scrubbing ongoing */ + time_t current_scrub_started; /*!< when started current scrubbing */ + ulint current_scrub_active_threads; /*!< current scrub active threads */ + ulint current_scrub_page_number; /*!< current scrub page no */ + ulint current_scrub_max_page_number; /*!< current scrub max page no */ +}; /********************************************************************* -Init global resources needed for tablespace encryption/decryption */ +Init space crypt */ UNIV_INTERN void fil_space_crypt_init(); /********************************************************************* -Cleanup global resources needed for tablespace 
encryption/decryption */ +Cleanup space crypt */ UNIV_INTERN void fil_space_crypt_cleanup(); -/********************************************************************* -Create crypt data, i.e data that is used for a single tablespace */ -UNIV_INTERN -fil_space_crypt_t * -fil_space_create_crypt_data( -/*========================*/ - fil_encryption_t encrypt_mode, /*!< in: encryption mode */ - uint key_id); /*!< in: encryption key id */ - -/********************************************************************* -Destroy crypt data */ -UNIV_INTERN -void -fil_space_destroy_crypt_data( -/*=========================*/ - fil_space_crypt_t **crypt_data); /*!< in/out: crypt data */ - -/********************************************************************* -Get crypt data for a space*/ -UNIV_INTERN -fil_space_crypt_t * -fil_space_get_crypt_data( -/*=====================*/ - ulint space); /*!< in: tablespace id */ +/****************************************************************** +Create a fil_space_crypt_t object +@param[in] encrypt_mode FIL_ENCRYPTION_DEFAULT or + FIL_ENCRYPTION_ON or + FIL_ENCRYPTION_OFF -/********************************************************************* -Set crypt data for a space*/ +@param[in] key_id Encryption key id +@return crypt object */ UNIV_INTERN fil_space_crypt_t* -fil_space_set_crypt_data( -/*=====================*/ - ulint space, /*!< in: tablespace id */ - fil_space_crypt_t* crypt_data); /*!< in: crypt data to set */ +fil_space_create_crypt_data( + fil_encryption_t encrypt_mode, + uint key_id) + MY_ATTRIBUTE((warn_unused_result)); -/********************************************************************* -Merge crypt data */ +/****************************************************************** +Merge fil_space_crypt_t object +@param[in,out] dst Destination crypt data +@param[in] src Source crypt data */ UNIV_INTERN void fil_space_merge_crypt_data( -/*=======================*/ - fil_space_crypt_t* dst_crypt_data, /*!< in: crypt_data */ - const
fil_space_crypt_t* src_crypt_data); /*!< in: crypt data */ + fil_space_crypt_t* dst, + const fil_space_crypt_t* src); -/********************************************************************* -Read crypt data from buffer page */ +/****************************************************************** +Read crypt data from a page (0) +@param[in] space space_id +@param[in] page Page 0 +@param[in] offset Offset to crypt data +@return crypt data from page 0 or NULL. */ UNIV_INTERN -fil_space_crypt_t * +fil_space_crypt_t* fil_space_read_crypt_data( -/*======================*/ - ulint space, /*!< in: tablespace id */ - const byte* page, /*!< in: buffer page */ - ulint offset); /*!< in: offset where crypt data is stored */ + ulint space, + const byte* page, + ulint offset) + MY_ATTRIBUTE((warn_unused_result)); -/********************************************************************* -Write crypt data to buffer page */ +/****************************************************************** +Free a crypt data object +@param[in,out] crypt_data crypt data to be freed */ UNIV_INTERN void -fil_space_write_crypt_data( -/*=======================*/ - ulint space, /*!< in: tablespace id */ - byte* page, /*!< in: buffer page */ - ulint offset, /*!< in: offset where to store data */ - ulint maxsize, /*!< in: max space available to store crypt data in */ - mtr_t * mtr); /*!< in: mini-transaction */ +fil_space_destroy_crypt_data( + fil_space_crypt_t **crypt_data); -/********************************************************************* -Clear crypt data from page 0 (used for import tablespace) */ +/****************************************************************** +Parse a MLOG_FILE_WRITE_CRYPT_DATA log entry +@param[in] ptr Log entry start +@param[in] end_ptr Log entry end +@param[in] block buffer block +@return position on log buffer */ UNIV_INTERN -void -fil_space_clear_crypt_data( -/*=======================*/ - byte* page, /*!< in: buffer page */ - ulint offset); /*!< in: offset where crypt 
data is stored */ +const byte* +fil_parse_write_crypt_data( + const byte* ptr, + const byte* end_ptr, + const buf_block_t* block) + MY_ATTRIBUTE((warn_unused_result)); -/********************************************************************* -Parse crypt data log record */ +/****************************************************************** +Encrypt a buffer +@param[in,out] crypt_data Crypt data +@param[in] space space_id +@param[in] offset Page offset +@param[in] lsn Log sequence number +@param[in] src_frame Page to encrypt +@param[in] zip_size Compressed size or 0 +@param[in,out] dst_frame Output buffer +@return encrypted buffer or NULL */ UNIV_INTERN byte* -fil_parse_write_crypt_data( -/*=======================*/ - byte* ptr, /*!< in: start of log record */ - byte* end_ptr, /*!< in: end of log record */ - buf_block_t*); /*!< in: buffer page to apply record to */ +fil_encrypt_buf( + fil_space_crypt_t* crypt_data, + ulint space, + ulint offset, + lsn_t lsn, + const byte* src_frame, + ulint zip_size, + byte* dst_frame) + MY_ATTRIBUTE((warn_unused_result)); -/********************************************************************* -Check if extra buffer shall be allocated for decrypting after read */ +/****************************************************************** +Encrypt a page + +@param[in] space Tablespace +@param[in] offset Page offset +@param[in] lsn Log sequence number +@param[in] src_frame Page to encrypt +@param[in,out] dst_frame Output buffer +@return encrypted buffer or NULL */ UNIV_INTERN -bool -fil_space_check_encryption_read( -/*============================*/ - ulint space); /*!< in: tablespace id */ +byte* +fil_space_encrypt( + const fil_space_t* space, + ulint offset, + lsn_t lsn, + byte* src_frame, + byte* dst_frame) + MY_ATTRIBUTE((warn_unused_result)); /****************************************************************** Decrypt a page -@return true if page is decrypted, false if not. 
*/ +@param[in,out] crypt_data crypt_data +@param[in] tmp_frame Temporary buffer +@param[in] page_size Page size +@param[in,out] src_frame Page to decrypt +@param[out] err DB_SUCCESS or error +@return true if page decrypted, false if not.*/ UNIV_INTERN bool fil_space_decrypt( -/*==============*/ - fil_space_crypt_t* crypt_data, /*!< in: crypt data */ - byte* tmp_frame, /*!< in: temporary buffer */ - ulint page_size, /*!< in: page size */ - byte* src_frame, /*!< in:out: page buffer */ - dberr_t* err); /*!< in: out: DB_SUCCESS or - error code */ - -/********************************************************************* -Encrypt buffer page -@return encrypted page, or original not encrypted page if encrypt -is not needed. */ -UNIV_INTERN -byte* -fil_space_encrypt( -/*==============*/ - ulint space, /*!< in: tablespace id */ - ulint offset, /*!< in: page no */ - lsn_t lsn, /*!< in: page lsn */ - byte* src_frame, /*!< in: page frame */ - ulint size, /*!< in: size of data to encrypt */ - byte* dst_frame); /*!< in: where to encrypt to */ + fil_space_crypt_t* crypt_data, + byte* tmp_frame, + ulint page_size, + byte* src_frame, + dberr_t* err); -/********************************************************************* -Decrypt buffer page -@return decrypted page, or original not encrypted page if decrypt is +/****************************************************************** +Decrypt a page +@param[in] space Tablespace +@param[in] tmp_frame Temporary buffer used for decrypting +@param[in] page_size Page size +@param[in,out] src_frame Page to decrypt +@param[out] decrypted true if page was decrypted +@return decrypted page, or original not encrypted page if decryption is not needed.*/ UNIV_INTERN byte* fil_space_decrypt( -/*==============*/ - ulint space, /*!< in: tablespace id */ - byte* src_frame, /*!< in: page frame */ - ulint page_size, /*!< in: size of data to encrypt */ - byte* dst_frame) /*!< in: where to decrypt to */ - __attribute__((warn_unused_result)); + const 
fil_space_t* space, + byte* tmp_frame, + byte* src_frame, + bool* decrypted) + MY_ATTRIBUTE((warn_unused_result)); + +/****************************************************************** +Calculate post encryption checksum +@param[in] zip_size zip_size or 0 +@param[in] dst_frame Block where checksum is calculated +@return page checksum or BUF_NO_CHECKSUM_MAGIC +not needed. */ +UNIV_INTERN +ulint +fil_crypt_calculate_checksum( + ulint zip_size, + const byte* dst_frame) + MY_ATTRIBUTE((warn_unused_result)); /********************************************************************* -fil_space_verify_crypt_checksum -NOTE: currently this function can only be run in single threaded mode -as it modifies srv_checksum_algorithm (temporarily) +Verify that post encryption checksum match calculated checksum. +This function should be called only if tablespace contains crypt_data +metadata (this is strong indication that tablespace is encrypted). +Function also verifies that traditional checksum does not match +calculated checksum as if it does page could be valid unencrypted, +encrypted, or corrupted. 
+ +@param[in] page Page to verify +@param[in] zip_size zip size +@param[in] space Tablespace +@param[in] pageno Page no @return true if page is encrypted AND OK, false otherwise */ UNIV_INTERN bool fil_space_verify_crypt_checksum( -/*============================*/ - const byte* src_frame,/*!< in: page frame */ - ulint zip_size); /*!< in: size of data to encrypt */ + byte* page, + ulint zip_size, + const fil_space_t* space, + ulint pageno) + MY_ATTRIBUTE((warn_unused_result)); /********************************************************************* -Init threads for key rotation */ +Adjust thread count for key rotation +@param[in] new_cnt Number of threads to be used */ UNIV_INTERN void -fil_crypt_threads_init(); +fil_crypt_set_thread_cnt( + uint new_cnt); /********************************************************************* -Set thread count (e.g start or stops threads) used for key rotation */ +Adjust max key age +@param[in] val New max key age */ UNIV_INTERN void -fil_crypt_set_thread_cnt( -/*=====================*/ - uint new_cnt); /*!< in: requested #threads */ +fil_crypt_set_rotate_key_age( + uint val); /********************************************************************* -Cleanup resources for threads for key rotation */ +Adjust rotation iops +@param[in] val New max rotation iops */ UNIV_INTERN void -fil_crypt_threads_cleanup(); +fil_crypt_set_rotation_iops( + uint val); /********************************************************************* -Set rotate key age */ +Adjust encrypt tables +@param[in] val New setting for innodb-encrypt-tables */ UNIV_INTERN void -fil_crypt_set_rotate_key_age( -/*=========================*/ - uint rotate_age); /*!< in: requested rotate age */ +fil_crypt_set_encrypt_tables( + uint val); /********************************************************************* -Set rotation threads iops */ +Init threads for key rotation */ UNIV_INTERN void -fil_crypt_set_rotation_iops( -/*========================*/ - uint iops); /*!< in: requested iops
*/ +fil_crypt_threads_init(); /********************************************************************* -Mark a space as closing */ +Clean up key rotation threads resources */ UNIV_INTERN void -fil_space_crypt_mark_space_closing( -/*===============================*/ - ulint space, /*!< in: tablespace id */ - fil_space_crypt_t* crypt_data); /*!< in: crypt_data or NULL */ +fil_crypt_threads_cleanup(); /********************************************************************* -Wait for crypt threads to stop accessing space */ +Wait for crypt threads to stop accessing space +@param[in] space Tablespace */ UNIV_INTERN void fil_space_crypt_close_tablespace( -/*=============================*/ - ulint space); /*!< in: tablespace id */ - -/** Struct for retreiving info about encryption */ -struct fil_space_crypt_status_t { - ulint space; /*!< tablespace id */ - ulint scheme; /*!< encryption scheme */ - uint min_key_version; /*!< min key version */ - uint current_key_version;/*!< current key version */ - uint keyserver_requests;/*!< no of key requests to key server */ - ulint key_id; /*!< current key_id */ - bool rotating; /*!< is key rotation ongoing */ - bool flushing; /*!< is flush at end of rotation ongoing */ - ulint rotate_next_page_number; /*!< next page if key rotating */ - ulint rotate_max_page_number; /*!< max page if key rotating */ -}; + const fil_space_t* space); /********************************************************************* -Get crypt status for a space -@return 0 if crypt data found */ +Get crypt status for a space (used by information_schema) +@param[in] space Tablespace +@param[out] status Crypt status +return 0 if crypt data present */ UNIV_INTERN -int +void fil_space_crypt_get_status( -/*=======================*/ - ulint id, /*!< in: space id */ - struct fil_space_crypt_status_t * status); /*!< out: status */ - -/** Struct for retreiving statistics about encryption key rotation */ -struct fil_crypt_stat_t { - ulint pages_read_from_cache; - ulint 
pages_read_from_disk; - ulint pages_modified; - ulint pages_flushed; - ulint estimated_iops; -}; + const fil_space_t* space, + struct fil_space_crypt_status_t* status); /********************************************************************* -Get crypt rotation statistics */ +Return crypt statistics +@param[out] stat Crypt statistics */ UNIV_INTERN void fil_crypt_total_stat( -/*==================*/ - fil_crypt_stat_t* stat); /*!< out: crypt stat */ - -/** Struct for retreiving info about scrubbing */ -struct fil_space_scrub_status_t { - ulint space; /*!< tablespace id */ - bool compressed; /*!< is space compressed */ - time_t last_scrub_completed; /*!< when was last scrub completed */ - bool scrubbing; /*!< is scrubbing ongoing */ - time_t current_scrub_started; /*!< when started current scrubbing */ - ulint current_scrub_active_threads; /*!< current scrub active threads */ - ulint current_scrub_page_number; /*!< current scrub page no */ - ulint current_scrub_max_page_number; /*!< current scrub max page no */ -}; + fil_crypt_stat_t *stat); /********************************************************************* -Get scrub status for a space -@return 0 if no scrub info found */ -UNIV_INTERN -int -fil_space_get_scrub_status( -/*=======================*/ - ulint id, /*!< in: space id */ - struct fil_space_scrub_status_t * status); /*!< out: status */ +Get scrub status for a space (used by information_schema) -/********************************************************************* -Adjust encrypt tables */ +@param[in] space Tablespace +@param[out] status Scrub status +return 0 if data found */ UNIV_INTERN void -fil_crypt_set_encrypt_tables( -/*=========================*/ - uint val); /*!< in: New srv_encrypt_tables setting */ - -/****************************************************************** -Encrypt a buffer */ -UNIV_INTERN -byte* -fil_encrypt_buf( -/*============*/ - fil_space_crypt_t* crypt_data, /*!< in: crypt data */ - ulint space, /*!< in: Space id */ - ulint 
offset, /*!< in: Page offset */ - lsn_t lsn, /*!< in: lsn */ - byte* src_frame, /*!< in: Source page to be encrypted */ - ulint zip_size, /*!< in: compressed size if - row_format compressed */ - byte* dst_frame); /*!< in: outbut buffer */ - -/****************************************************************** -Calculate post encryption checksum -@return page checksum or BUF_NO_CHECKSUM_MAGIC -not needed. */ -UNIV_INTERN -ulint -fil_crypt_calculate_checksum( -/*=========================*/ - ulint zip_size, /*!< in: zip_size or 0 */ - byte* dst_frame); /*!< in: page where to calculate */ +fil_space_get_scrub_status( + const fil_space_t* space, + struct fil_space_scrub_status_t* status); #ifndef UNIV_NONINL #include "fil0crypt.ic" diff --git a/storage/xtradb/include/fil0crypt.ic b/storage/xtradb/include/fil0crypt.ic index 0a1a60dfab8..cb9ba083466 100644 --- a/storage/xtradb/include/fil0crypt.ic +++ b/storage/xtradb/include/fil0crypt.ic @@ -34,35 +34,3 @@ fil_page_is_encrypted( { return(mach_read_from_4(buf+FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION) != 0); } - -/*******************************************************************//** -Find out whether the page can be decrypted. -The function for decrypting the page should already be executed before this. 
-@return 1 if key provider not available or key is not available - 0 if decryption should be possible -*/ -UNIV_INLINE -bool -fil_page_encryption_status( -/*===================*/ - const byte *buf, /*!< in: page */ - ulint space_id) /*!< in: space_id */ -{ - fil_space_crypt_t *crypt_data = fil_space_get_crypt_data(space_id); - ulint page_type = mach_read_from_2(buf+FIL_PAGE_TYPE); - - if (page_type == FIL_PAGE_TYPE_FSP_HDR) { - if (crypt_data != NULL) { - if (!encryption_key_id_exists(crypt_data->key_id)) { - /* accessing table would surely fail, because no key or no key provider available */ - return 1; - } - } - } else { - ulint key = mach_read_from_4(buf + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION); - if (!encryption_key_version_exists(crypt_data->key_id, key)) { - return 1; - } - } - return 0; -} diff --git a/storage/xtradb/include/fil0fil.h b/storage/xtradb/include/fil0fil.h index 41e38794ea9..b80df057351 100644 --- a/storage/xtradb/include/fil0fil.h +++ b/storage/xtradb/include/fil0fil.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2013, 2017, MariaDB Corporation. +Copyright (c) 2013, 2017, MariaDB Corporation. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -181,8 +181,18 @@ extern fil_addr_t fil_addr_null; #define FIL_LOG 502 /*!< redo log */ /* @} */ -/* structure containing encryption specification */ -typedef struct fil_space_crypt_struct fil_space_crypt_t; +/** Structure containing encryption specification */ +struct fil_space_crypt_t; + +/** Enum values for encryption table option */ +enum fil_encryption_t { + /** Encrypted if innodb_encrypt_tables=ON (srv_encrypt_tables) */ + FIL_ENCRYPTION_DEFAULT, + /** Encrypted */ + FIL_ENCRYPTION_ON, + /** Not encrypted */ + FIL_ENCRYPTION_OFF +}; /** The number of fsyncs done to the log */ extern ulint fil_n_log_flushes; @@ -219,7 +229,9 @@ struct fil_node_t { ibool open; /*!< TRUE if file open */ os_file_t handle; /*!< OS handle to the file, if file open */ os_event_t sync_event;/*!< Condition event to group and - serialize calls to fsync */ + serialize calls to fsync; + os_event_set() and os_event_reset() + are protected by fil_system_t::mutex */ ibool is_raw_disk;/*!< TRUE if the 'file' is actually a raw device or a raw disk partition */ ulint size; /*!< size of the file in database pages, 0 if @@ -267,8 +279,8 @@ struct fil_space_t { .ibd file of tablespace and want to stop temporarily posting of new i/o requests on the file */ - ibool stop_new_ops; - /*!< we set this TRUE when we start + bool stop_new_ops; + /*!< we set this true when we start deleting a single-table tablespace. 
When this is set following new ops are not allowed: @@ -314,13 +326,16 @@ struct fil_space_t { prio_rw_lock_t latch; /*!< latch protecting the file space storage allocation */ #endif /* !UNIV_HOTBACKUP */ + UT_LIST_NODE_T(fil_space_t) unflushed_spaces; /*!< list of spaces with at least one unflushed file we have written to */ bool is_in_unflushed_spaces; /*!< true if this space is currently in unflushed_spaces */ - ibool is_corrupt; + /** True if srv_pass_corrupt_table=true and tablespace contains + corrupted page. */ + bool is_corrupt; /*!< true if tablespace corrupted */ bool printed_compression_failure; /*!< true if we have already printed @@ -336,7 +351,22 @@ struct fil_space_t { UT_LIST_NODE_T(fil_space_t) space_list; /*!< list of all spaces */ + /*!< Protected by fil_system */ + UT_LIST_NODE_T(fil_space_t) rotation_list; + /*!< list of spaces needing + key rotation */ + + bool is_in_rotation_list; + /*!< true if this space is + currently in key rotation list */ + ulint magic_n;/*!< FIL_SPACE_MAGIC_N */ + + /** @return whether the tablespace is about to be dropped or truncated */ + bool is_stopping() const + { + return stop_new_ops; + } }; /** Value of fil_space_t::magic_n */ @@ -392,6 +422,11 @@ struct fil_system_t { request */ UT_LIST_BASE_NODE_T(fil_space_t) space_list; /*!< list of all file spaces */ + + UT_LIST_BASE_NODE_T(fil_space_t) rotation_list; + /*!< list of all file spaces needing + key rotation.*/ + ibool space_id_reuse_warned; /* !< TRUE if fil_space_create() has issued a warning about @@ -470,18 +505,24 @@ fil_space_contains_node( /*******************************************************************//** Creates a space memory object and puts it to the 'fil system' hash table. If there is an error, prints an error message to the .err log. 
+@param[in] name Space name +@param[in] id Space id +@param[in] flags Tablespace flags +@param[in] purpose FIL_TABLESPACE or FIL_LOG if log +@param[in] crypt_data Encryption information +@param[in] create_table True if this is create table +@param[in] mode Encryption mode @return TRUE if success */ UNIV_INTERN -ibool +bool fil_space_create( -/*=============*/ - const char* name, /*!< in: space name */ - ulint id, /*!< in: space id */ - ulint zip_size,/*!< in: compressed page size, or - 0 for uncompressed tablespaces */ - ulint purpose, /*!< in: FIL_TABLESPACE, or FIL_LOG if log */ - fil_space_crypt_t* crypt_data, /*!< in: crypt data */ - bool create_table); /*!< in: true if create table */ + const char* name, + ulint id, + ulint flags, + ulint purpose, + fil_space_crypt_t* crypt_data, + bool create_table, + fil_encryption_t mode = FIL_ENCRYPTION_DEFAULT); /*******************************************************************//** Assigns a new space id for a new single-table tablespace. This works simply by @@ -604,6 +645,59 @@ fil_write_flushed_lsn_to_data_files( /*================================*/ lsn_t lsn, /*!< in: lsn to write */ ulint arch_log_no); /*!< in: latest archived log file number */ + +/** Acquire a tablespace when it could be dropped concurrently. +Used by background threads that do not necessarily hold proper locks +for concurrency control. +@param[in] id tablespace ID +@return the tablespace, or NULL if missing or being deleted */ +fil_space_t* +fil_space_acquire( + ulint id) + MY_ATTRIBUTE((warn_unused_result)); + +/** Acquire a tablespace that may not exist. +Used by background threads that do not necessarily hold proper locks +for concurrency control. +@param[in] id tablespace ID +@return the tablespace, or NULL if missing or being deleted */ +fil_space_t* +fil_space_acquire_silent( + ulint id) + MY_ATTRIBUTE((warn_unused_result)); + +/** Release a tablespace acquired with fil_space_acquire(). 
+@param[in,out] space tablespace to release */ +void +fil_space_release( + fil_space_t* space); + +/** Return the next fil_space_t. +Once started, the caller must keep calling this until it returns NULL. +fil_space_acquire() and fil_space_release() are invoked here which +blocks a concurrent operation from dropping the tablespace. +@param[in,out] prev_space Pointer to the previous fil_space_t. +If NULL, use the first fil_space_t on fil_system->space_list. +@return pointer to the next fil_space_t. +@retval NULL if this was the last */ +fil_space_t* +fil_space_next( + fil_space_t* prev_space) + MY_ATTRIBUTE((warn_unused_result)); + +/** Return the next fil_space_t from key rotation list. +Once started, the caller must keep calling this until it returns NULL. +fil_space_acquire() and fil_space_release() are invoked here which +blocks a concurrent operation from dropping the tablespace. +@param[in,out] prev_space Pointer to the previous fil_space_t. +If NULL, use the first fil_space_t on fil_system->space_list. +@return pointer to the next fil_space_t. +@retval NULL if this was the last*/ +fil_space_t* +fil_space_keyrotate_next( + fil_space_t* prev_space) + MY_ATTRIBUTE((warn_unused_result)); + /*******************************************************************//** Reads the flushed lsn, arch no, and tablespace flag fields from a data file at database startup. @@ -1312,16 +1406,10 @@ fil_space_set_corrupt( /*==================*/ ulint space_id); -/****************************************************************//** -Acquire fil_system mutex */ -void -fil_system_enter(void); -/*==================*/ -/****************************************************************//** -Release fil_system mutex */ -void -fil_system_exit(void); -/*==================*/ +/** Acquire the fil_system mutex. */ +#define fil_system_enter() mutex_enter(&fil_system->mutex) +/** Release the fil_system mutex. 
*/ +#define fil_system_exit() mutex_exit(&fil_system->mutex) #ifndef UNIV_INNOCHECKSUM /*******************************************************************//** diff --git a/storage/xtradb/include/fil0fil.ic b/storage/xtradb/include/fil0fil.ic index 23614a6567a..1179eea8b8e 100644 --- a/storage/xtradb/include/fil0fil.ic +++ b/storage/xtradb/include/fil0fil.ic @@ -58,38 +58,41 @@ fil_get_page_type_name( { switch(page_type) { case FIL_PAGE_PAGE_COMPRESSED: - return (const char*)"PAGE_COMPRESSED"; + return "PAGE_COMPRESSED"; + case FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED: + return "PAGE_COMPRESSED_ENCRYPTED"; case FIL_PAGE_INDEX: - return (const char*)"INDEX"; + return "INDEX"; case FIL_PAGE_UNDO_LOG: - return (const char*)"UNDO LOG"; + return "UNDO LOG"; case FIL_PAGE_INODE: - return (const char*)"INODE"; + return "INODE"; case FIL_PAGE_IBUF_FREE_LIST: - return (const char*)"IBUF_FREE_LIST"; + return "IBUF_FREE_LIST"; case FIL_PAGE_TYPE_ALLOCATED: - return (const char*)"ALLOCATED"; + return "ALLOCATED"; case FIL_PAGE_IBUF_BITMAP: - return (const char*)"IBUF_BITMAP"; + return "IBUF_BITMAP"; case FIL_PAGE_TYPE_SYS: - return (const char*)"SYS"; + return "SYS"; case FIL_PAGE_TYPE_TRX_SYS: - return (const char*)"TRX_SYS"; + return "TRX_SYS"; case FIL_PAGE_TYPE_FSP_HDR: - return (const char*)"FSP_HDR"; + return "FSP_HDR"; case FIL_PAGE_TYPE_XDES: - return (const char*)"XDES"; + return "XDES"; case FIL_PAGE_TYPE_BLOB: - return (const char*)"BLOB"; + return "BLOB"; case FIL_PAGE_TYPE_ZBLOB: - return (const char*)"ZBLOB"; + return "ZBLOB"; case FIL_PAGE_TYPE_ZBLOB2: - return (const char*)"ZBLOB2"; + return "ZBLOB2"; case FIL_PAGE_TYPE_COMPRESSED: - return (const char*)"ORACLE PAGE COMPRESSED"; - default: - return (const char*)"PAGE TYPE CORRUPTED"; + return "ORACLE PAGE COMPRESSED"; } + + return "PAGE TYPE CORRUPTED"; + } /****************************************************************//** diff --git a/storage/xtradb/include/fsp0fsp.h b/storage/xtradb/include/fsp0fsp.h index 
93f98eb0b0b..6ed78eba6f9 100644 --- a/storage/xtradb/include/fsp0fsp.h +++ b/storage/xtradb/include/fsp0fsp.h @@ -1037,14 +1037,15 @@ fsp_flags_get_page_size( /*====================*/ ulint flags); /*!< in: tablespace flags */ -/*********************************************************************/ -/* @return offset into fsp header where crypt data is stored */ +/********************************************************************* +Compute offset after xdes where crypt data can be stored +@param[in] zip_size Compressed size or 0 +@return offset */ UNIV_INTERN ulint fsp_header_get_crypt_offset( -/*========================*/ - ulint zip_size, /*!< in: zip_size */ - ulint* max_size); /*!< out: free space after offset */ + const ulint zip_size) + MY_ATTRIBUTE((warn_unused_result)); #define fsp_page_is_free(space,page,mtr) \ fsp_page_is_free_func(space,page,mtr, __FILE__, __LINE__) diff --git a/storage/xtradb/include/fsp0types.h b/storage/xtradb/include/fsp0types.h index 509909d1cf5..7152d65054f 100644 --- a/storage/xtradb/include/fsp0types.h +++ b/storage/xtradb/include/fsp0types.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, MariaDB Corporation. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software diff --git a/storage/xtradb/include/fts0types.h b/storage/xtradb/include/fts0types.h index e495fe72a60..0dad75d8f1b 100644 --- a/storage/xtradb/include/fts0types.h +++ b/storage/xtradb/include/fts0types.h @@ -126,7 +126,9 @@ struct fts_sync_t { bool in_progress; /*!< flag whether sync is in progress.*/ bool unlock_cache; /*!< flag whether unlock cache when write fts node */ - os_event_t event; /*!< sync finish event */ + os_event_t event; /*!< sync finish event; + only os_event_set() and os_event_wait() + are used */ }; /** The cache for the FTS system. It is a memory-based inverted index diff --git a/storage/xtradb/include/ha_prototypes.h b/storage/xtradb/include/ha_prototypes.h index a35f975b13c..a161ec8c06c 100644 --- a/storage/xtradb/include/ha_prototypes.h +++ b/storage/xtradb/include/ha_prototypes.h @@ -158,6 +158,13 @@ thd_has_edited_nontrans_tables( /*===========================*/ THD* thd); /*!< in: thread handle */ +/** +Get high resolution timestamp for the current query start time. + +@retval timestamp in microseconds precision +*/ +unsigned long long thd_query_start_micro(const MYSQL_THD thd); + /*************************************************************//** Prints info of a THD object (== user session thread) to the given file. 
*/ UNIV_INTERN diff --git a/storage/xtradb/include/lock0lock.h b/storage/xtradb/include/lock0lock.h index a12ca1d85e6..923c463aa22 100644 --- a/storage/xtradb/include/lock0lock.h +++ b/storage/xtradb/include/lock0lock.h @@ -962,7 +962,12 @@ struct lock_sys_t{ srv_slot_t* waiting_threads; /*!< Array of user threads suspended while waiting for locks within InnoDB, protected - by the lock_sys->wait_mutex */ + by the lock_sys->wait_mutex; + os_event_set() and + os_event_reset() on + waiting_threads[]->event + are protected by + trx_t::mutex */ srv_slot_t* last_slot; /*!< highest slot ever used in the waiting_threads array, protected by @@ -975,10 +980,11 @@ struct lock_sys_t{ ulint n_lock_max_wait_time; /*!< Max wait time */ - os_event_t timeout_event; /*!< Set to the event that is - created in the lock wait monitor - thread. A value of 0 means the - thread is not active */ + os_event_t timeout_event; /*!< An event waited for by + lock_wait_timeout_thread. + Not protected by a mutex, + but the waits are timed. + Signaled on shutdown only. */ bool timeout_thread_active; /*!< True if the timeout thread is running */ diff --git a/storage/xtradb/include/log0log.h b/storage/xtradb/include/log0log.h index 8bcee8b1919..a55c1ea818c 100644 --- a/storage/xtradb/include/log0log.h +++ b/storage/xtradb/include/log0log.h @@ -2,7 +2,7 @@ Copyright (c) 1995, 2013, Oracle and/or its affiliates. All rights reserved. Copyright (c) 2009, Google Inc. -Copyright (c) 2017, MariaDB Corporation +Copyright (c) 2017, MariaDB Corporation. All Rights Reserved. Portions of this file contain modifications contributed and copyrighted by Google, Inc. Those modifications are gratefully acknowledged and are described @@ -934,10 +934,8 @@ struct log_t{ be 'flush_or_write'! 
*/ os_event_t no_flush_event; /*!< this event is in the reset state when a flush or a write is running; - a thread should wait for this without - owning the log mutex, but NOTE that - to set or reset this event, the - thread MUST own the log mutex! */ + os_event_set() and os_event_reset() + are protected by log_sys_t::mutex */ ibool one_flushed; /*!< during a flush, this is first FALSE and becomes TRUE when one log group has been @@ -946,11 +944,9 @@ struct log_t{ flush or write has not yet completed for any log group; e.g., this means that a transaction has been committed - when this is set; a thread should wait - for this without owning the log mutex, - but NOTE that to set or reset this - event, the thread MUST own the log - mutex! */ + when this is set; + os_event_set() and os_event_reset() + are protected by log_sys_t::mutex */ ulint n_log_ios; /*!< number of log i/os initiated thus far */ ulint n_log_ios_old; /*!< number of log i/o's at the @@ -1036,9 +1032,9 @@ struct log_t{ byte* archive_buf_ptr;/*!< unaligned archived_buf */ byte* archive_buf; /*!< log segment is written to the archive from this buffer */ - os_event_t archiving_on; /*!< if archiving has been stopped, - a thread can wait for this event to - become signaled */ + os_event_t archiving_on; /*!< if archiving has been stopped; + os_event_set() and os_event_reset() + are protected by log_sys_t::mutex */ /* @} */ #endif /* UNIV_LOG_ARCHIVE */ lsn_t tracked_lsn; /*!< log tracking has advanced to this diff --git a/storage/xtradb/include/log0online.h b/storage/xtradb/include/log0online.h index 5706f3af4b0..722336dd6b4 100644 --- a/storage/xtradb/include/log0online.h +++ b/storage/xtradb/include/log0online.h @@ -38,19 +38,25 @@ log_online_bitmap_file_range_t; /** An iterator over changed page info */ typedef struct log_bitmap_iterator_struct log_bitmap_iterator_t; -/*********************************************************************//** -Initializes the online log following subsytem. 
*/ +/** Initialize the constant part of the log tracking subsystem */ +UNIV_INTERN +void +log_online_init(void); + +/** Initialize the dynamic part of the log tracking subsystem */ UNIV_INTERN void log_online_read_init(void); -/*=======================*/ -/*********************************************************************//** -Shuts down the online log following subsystem. */ +/** Shut down the dynamic part of the log tracking subsystem */ UNIV_INTERN void log_online_read_shutdown(void); -/*===========================*/ + +/** Shut down the constant part of the log tracking subsystem */ +UNIV_INTERN +void +log_online_shutdown(void); /*********************************************************************//** Reads and parses the redo log up to last checkpoint LSN to build the changed @@ -147,6 +153,8 @@ struct log_online_bitmap_file_range_struct { /** Struct for an iterator through all bits of changed pages bitmap blocks */ struct log_bitmap_iterator_struct { + lsn_t max_lsn; /*!< End LSN of the + range */ ibool failed; /*!< Has the iteration stopped prematurely */ log_online_bitmap_file_range_t in_files; /*!< The bitmap files diff --git a/storage/xtradb/include/log0recv.h b/storage/xtradb/include/log0recv.h index e93ec2666af..e7b6a937f01 100644 --- a/storage/xtradb/include/log0recv.h +++ b/storage/xtradb/include/log0recv.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -300,20 +301,12 @@ void recv_sys_var_init(void); /*===================*/ #endif /* !UNIV_HOTBACKUP */ -/*******************************************************************//** -Empties the hash table of stored log records, applying them to appropriate -pages. 
*/ +/** Apply the hash table of stored log records to persistent data pages. +@param[in] last_batch whether the change buffer merge will be + performed as part of the operation */ UNIV_INTERN -dberr_t -recv_apply_hashed_log_recs( -/*=======================*/ - ibool allow_ibuf); /*!< in: if TRUE, also ibuf operations are - allowed during the application; if FALSE, - no ibuf operations are allowed, and after - the application all file pages are flushed to - disk and invalidated in buffer pool: this - alternative means that no new log records - can be generated during the application */ +void +recv_apply_hashed_log_recs(bool last_batch); #ifdef UNIV_HOTBACKUP /*******************************************************************//** Applies log records in the hash table to a backup. */ @@ -439,6 +432,8 @@ struct recv_sys_t{ scan find a corrupt log block, or a corrupt log record, or there is a log parsing buffer overflow */ + /** the time when progress was last reported */ + ib_time_t progress_time; #ifdef UNIV_LOG_ARCHIVE log_group_t* archive_group; /*!< in archive recovery: the log group whose @@ -451,6 +446,20 @@ struct recv_sys_t{ addresses in the hash table */ recv_dblwr_t dblwr; + + /** Determine whether redo log recovery progress should be reported. 
+ @param[in] time the current time + @return whether progress should be reported + (the last report was at least 15 seconds ago) */ + bool report(ib_time_t time) + { + if (time - progress_time < 15) { + return false; + } + + progress_time = time; + return true; + } }; /** The recovery system */ diff --git a/storage/xtradb/include/mach0data.h b/storage/xtradb/include/mach0data.h index 9859def0adc..2e16634a6c2 100644 --- a/storage/xtradb/include/mach0data.h +++ b/storage/xtradb/include/mach0data.h @@ -53,7 +53,7 @@ ulint mach_read_from_1( /*=============*/ const byte* b) /*!< in: pointer to byte */ - MY_ATTRIBUTE((nonnull, pure)); + MY_ATTRIBUTE((warn_unused_result)); /*******************************************************//** The following function is used to store data in two consecutive bytes. We store the most significant byte to the lower address. */ @@ -114,7 +114,7 @@ ulint mach_read_from_3( /*=============*/ const byte* b) /*!< in: pointer to 3 bytes */ - MY_ATTRIBUTE((nonnull, pure)); + MY_ATTRIBUTE((warn_unused_result)); /*******************************************************//** The following function is used to store data in four consecutive bytes. We store the most significant byte to the lowest address. */ @@ -133,7 +133,7 @@ ulint mach_read_from_4( /*=============*/ const byte* b) /*!< in: pointer to four bytes */ - MY_ATTRIBUTE((nonnull, pure)); + MY_ATTRIBUTE((warn_unused_result)); /*********************************************************//** Writes a ulint in a compressed form (1..5 bytes). @return stored size in bytes */ @@ -160,7 +160,7 @@ ulint mach_read_compressed( /*=================*/ const byte* b) /*!< in: pointer to memory from where to read */ - MY_ATTRIBUTE((nonnull, pure)); + MY_ATTRIBUTE((warn_unused_result)); /*******************************************************//** The following function is used to store data in 6 consecutive bytes. We store the most significant byte to the lowest address. 
*/ @@ -179,7 +179,7 @@ ib_uint64_t mach_read_from_6( /*=============*/ const byte* b) /*!< in: pointer to 6 bytes */ - MY_ATTRIBUTE((nonnull, pure)); + MY_ATTRIBUTE((warn_unused_result)); /*******************************************************//** The following function is used to store data in 7 consecutive bytes. We store the most significant byte to the lowest address. */ @@ -198,7 +198,7 @@ ib_uint64_t mach_read_from_7( /*=============*/ const byte* b) /*!< in: pointer to 7 bytes */ - MY_ATTRIBUTE((nonnull, pure)); + MY_ATTRIBUTE((warn_unused_result)); /*******************************************************//** The following function is used to store data in 8 consecutive bytes. We store the most significant byte to the lowest address. */ @@ -243,7 +243,7 @@ ib_uint64_t mach_ull_read_compressed( /*=====================*/ const byte* b) /*!< in: pointer to memory from where to read */ - MY_ATTRIBUTE((nonnull, pure)); + MY_ATTRIBUTE((warn_unused_result)); /*********************************************************//** Writes a 64-bit integer in a compressed form (1..11 bytes). @return size in bytes */ @@ -270,7 +270,7 @@ ib_uint64_t mach_ull_read_much_compressed( /*==========================*/ const byte* b) /*!< in: pointer to memory from where to read */ - MY_ATTRIBUTE((nonnull, pure)); + MY_ATTRIBUTE((warn_unused_result)); /*********************************************************//** Reads a ulint in a compressed form if the log record fully contains it. 
@return pointer to end of the stored field, NULL if not complete */ diff --git a/storage/xtradb/include/mach0data.ic b/storage/xtradb/include/mach0data.ic index bf2c735b0da..3904d96c09f 100644 --- a/storage/xtradb/include/mach0data.ic +++ b/storage/xtradb/include/mach0data.ic @@ -52,7 +52,6 @@ mach_read_from_1( /*=============*/ const byte* b) /*!< in: pointer to byte */ { - ut_ad(b); return((ulint)(b[0])); } @@ -132,7 +131,6 @@ mach_read_from_3( /*=============*/ const byte* b) /*!< in: pointer to 3 bytes */ { - ut_ad(b); return( ((ulint)(b[0]) << 16) | ((ulint)(b[1]) << 8) | (ulint)(b[2]) @@ -182,7 +180,6 @@ mach_read_from_4( /*=============*/ const byte* b) /*!< in: pointer to four bytes */ { - ut_ad(b); return( ((ulint)(b[0]) << 24) | ((ulint)(b[1]) << 16) | ((ulint)(b[2]) << 8) @@ -261,8 +258,6 @@ mach_read_compressed( { ulint flag; - ut_ad(b); - flag = mach_read_from_1(b); if (flag < 0x80UL) { @@ -339,8 +334,6 @@ mach_read_from_7( /*=============*/ const byte* b) /*!< in: pointer to 7 bytes */ { - ut_ad(b); - return(ut_ull_create(mach_read_from_3(b), mach_read_from_4(b + 3))); } @@ -370,8 +363,6 @@ mach_read_from_6( /*=============*/ const byte* b) /*!< in: pointer to 6 bytes */ { - ut_ad(b); - return(ut_ull_create(mach_read_from_2(b), mach_read_from_4(b + 2))); } @@ -419,8 +410,6 @@ mach_ull_read_compressed( ib_uint64_t n; ulint size; - ut_ad(b); - n = (ib_uint64_t) mach_read_compressed(b); size = mach_get_compressed_size((ulint) n); @@ -486,8 +475,6 @@ mach_ull_read_much_compressed( ib_uint64_t n; ulint size; - ut_ad(b); - if (*b != (byte)0xFF) { n = 0; size = 0; diff --git a/storage/xtradb/include/mtr0mtr.h b/storage/xtradb/include/mtr0mtr.h index 23992598f2e..ef6cd61719d 100644 --- a/storage/xtradb/include/mtr0mtr.h +++ b/storage/xtradb/include/mtr0mtr.h @@ -235,8 +235,7 @@ UNIV_INTERN void mtr_commit( /*=======*/ - mtr_t* mtr) /*!< in/out: mini-transaction */ - MY_ATTRIBUTE((nonnull)); + mtr_t* mtr); /*!< in/out: mini-transaction */ 
/**********************************************************//** Sets and returns a savepoint in mtr. @return savepoint */ @@ -354,7 +353,7 @@ mtr_memo_contains( mtr_t* mtr, /*!< in: mtr */ const void* object, /*!< in: object to search */ ulint type) /*!< in: type of object */ - MY_ATTRIBUTE((warn_unused_result, nonnull)); + MY_ATTRIBUTE((warn_unused_result)); /**********************************************************//** Checks if memo contains the given page. diff --git a/storage/xtradb/include/os0file.h b/storage/xtradb/include/os0file.h index 2a385c9bf58..d6f0ecfb69c 100644 --- a/storage/xtradb/include/os0file.h +++ b/storage/xtradb/include/os0file.h @@ -2,7 +2,7 @@ Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2009, Percona Inc. -Copyright (c) 2013, 2015, MariaDB Corporation. +Copyright (c) 2013, 2017, MariaDB Corporation. All Rights Reserved. Portions of this file contain modifications contributed and copyrighted by Percona Inc.. Those modifications are @@ -557,9 +557,10 @@ os_file_create_simple_no_error_handling_func( value */ __attribute__((nonnull, warn_unused_result)); /****************************************************************//** -Tries to disable OS caching on an opened file descriptor. */ +Tries to disable OS caching on an opened file descriptor. +@return true if the operation succeeded, false otherwise */ UNIV_INTERN -void +bool os_file_set_nocache( /*================*/ os_file_t fd, /*!< in: file descriptor to alter */ @@ -904,17 +905,19 @@ os_file_get_size( /*=============*/ os_file_t file) /*!< in: handle to a file */ MY_ATTRIBUTE((warn_unused_result)); -/***********************************************************************//** -Write the specified number of zeros to a newly created file. -@return TRUE if success */ +/** Set the size of a newly created file. 
+@param[in] name file name +@param[in] file file handle +@param[in] size desired file size +@param[in] is_sparse whether to create a sparse file (no preallocating) +@return whether the operation succeeded */ UNIV_INTERN -ibool +bool os_file_set_size( -/*=============*/ - const char* name, /*!< in: name of the file or path as a - null-terminated string */ - os_file_t file, /*!< in: handle to a file */ - os_offset_t size) /*!< in: file size */ + const char* name, + os_file_t file, + os_offset_t size, + bool is_sparse = false) MY_ATTRIBUTE((nonnull, warn_unused_result)); /***********************************************************************//** Truncates a file at its current position. @@ -1203,6 +1206,7 @@ UNIV_INTERN void os_aio_simulated_wake_handler_threads(void); /*=======================================*/ +#ifdef _WIN32 /**********************************************************************//** This function can be called if one wants to post a batch of reads and prefers an i/o-handler thread to handle them all at once later. You must @@ -1210,8 +1214,10 @@ call os_aio_simulated_wake_handler_threads later to ensure the threads are not left sleeping! */ UNIV_INTERN void -os_aio_simulated_put_read_threads_to_sleep(void); -/*============================================*/ +os_aio_simulated_put_read_threads_to_sleep(); +#else /* _WIN32 */ +# define os_aio_simulated_put_read_threads_to_sleep() +#endif /* _WIN32 */ #ifdef WIN_ASYNC_IO /**********************************************************************//** diff --git a/storage/xtradb/include/os0thread.h b/storage/xtradb/include/os0thread.h index 671b9b7dc3f..7865358b0f7 100644 --- a/storage/xtradb/include/os0thread.h +++ b/storage/xtradb/include/os0thread.h @@ -131,11 +131,9 @@ os_thread_create_func( os_thread_id_t* thread_id); /*!< out: id of the created thread, or NULL */ -/** -Waits until the specified thread completes and joins it. Its return value is -ignored. 
- -@param thread thread to join */ +/** Waits until the specified thread completes and joins it. +Its return value is ignored. +@param[in,out] thread thread to join */ UNIV_INTERN void os_thread_join( diff --git a/storage/xtradb/include/page0page.h b/storage/xtradb/include/page0page.h index cb43c937757..eefa0fa4c5b 100644 --- a/storage/xtradb/include/page0page.h +++ b/storage/xtradb/include/page0page.h @@ -235,8 +235,7 @@ ulint page_header_get_offs( /*=================*/ const page_t* page, /*!< in: page */ - ulint field) /*!< in: PAGE_FREE, ... */ - MY_ATTRIBUTE((nonnull, pure)); + ulint field); /*!< in: PAGE_FREE, ... */ /*************************************************************//** Returns the pointer stored in the given header field, or NULL. */ @@ -528,7 +527,7 @@ bool page_is_leaf( /*=========*/ const page_t* page) /*!< in: page */ - MY_ATTRIBUTE((nonnull, pure)); + MY_ATTRIBUTE((warn_unused_result)); /************************************************************//** Determine whether the page is empty. @return true if the page is empty (PAGE_N_RECS = 0) */ @@ -849,8 +848,7 @@ page_copy_rec_list_end( buf_block_t* block, /*!< in: index page containing rec */ rec_t* rec, /*!< in: record on page */ dict_index_t* index, /*!< in: record descriptor */ - mtr_t* mtr) /*!< in: mtr */ - MY_ATTRIBUTE((nonnull)); + mtr_t* mtr); /*!< in: mtr */ /*************************************************************//** Copies records from page to new_page, up to the given record, NOT including that record. Infimum and supremum records are not copied. @@ -871,8 +869,7 @@ page_copy_rec_list_start( buf_block_t* block, /*!< in: index page containing rec */ rec_t* rec, /*!< in: record on page */ dict_index_t* index, /*!< in: record descriptor */ - mtr_t* mtr) /*!< in: mtr */ - MY_ATTRIBUTE((nonnull)); + mtr_t* mtr); /*!< in: mtr */ /*************************************************************//** Deletes records from a page from a given record onward, including that record. 
The infimum and supremum records are not deleted. */ @@ -921,8 +918,7 @@ page_move_rec_list_end( buf_block_t* block, /*!< in: index page from where to move */ rec_t* split_rec, /*!< in: first record to move */ dict_index_t* index, /*!< in: record descriptor */ - mtr_t* mtr) /*!< in: mtr */ - MY_ATTRIBUTE((nonnull(1, 2, 4, 5))); + mtr_t* mtr); /*!< in: mtr */ /*************************************************************//** Moves record list start to another page. Moved records do not include split_rec. @@ -952,8 +948,7 @@ page_dir_split_slot( page_t* page, /*!< in: index page */ page_zip_des_t* page_zip,/*!< in/out: compressed page whose uncompressed part will be written, or NULL */ - ulint slot_no)/*!< in: the directory slot */ - MY_ATTRIBUTE((nonnull(1))); + ulint slot_no);/*!< in: the directory slot */ /*************************************************************//** Tries to balance the given directory slot with too few records with the upper neighbor, so that there are at least the minimum number @@ -965,8 +960,7 @@ page_dir_balance_slot( /*==================*/ page_t* page, /*!< in/out: index page */ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */ - ulint slot_no)/*!< in: the directory slot */ - MY_ATTRIBUTE((nonnull(1))); + ulint slot_no);/*!< in: the directory slot */ /**********************************************************//** Parses a log record of a record list end or start deletion. 
@return end of log record or NULL */ diff --git a/storage/xtradb/include/page0page.ic b/storage/xtradb/include/page0page.ic index 5cf92fd5d8d..364536b86f8 100644 --- a/storage/xtradb/include/page0page.ic +++ b/storage/xtradb/include/page0page.ic @@ -156,7 +156,6 @@ page_header_get_offs( { ulint offs; - ut_ad(page); ut_ad((field == PAGE_FREE) || (field == PAGE_LAST_INSERT) || (field == PAGE_HEAP_TOP)); diff --git a/storage/xtradb/include/page0zip.h b/storage/xtradb/include/page0zip.h index 81068e7bd29..adafaa6d8b6 100644 --- a/storage/xtradb/include/page0zip.h +++ b/storage/xtradb/include/page0zip.h @@ -132,7 +132,7 @@ page_zip_compress( dict_index_t* index, /*!< in: index of the B-tree node */ ulint level, /*!< in: compression level */ mtr_t* mtr) /*!< in: mini-transaction, or NULL */ - MY_ATTRIBUTE((nonnull(1,2,3))); + MY_ATTRIBUTE((warn_unused_result)); /**********************************************************************//** Decompress a page. This function should tolerate errors on the compressed @@ -424,8 +424,7 @@ page_zip_reorganize( out: data, n_blobs, m_start, m_end, m_nonempty */ dict_index_t* index, /*!< in: index of the B-tree node */ - mtr_t* mtr) /*!< in: mini-transaction */ - MY_ATTRIBUTE((nonnull)); + mtr_t* mtr); /*!< in: mini-transaction */ #ifndef UNIV_HOTBACKUP /**********************************************************************//** Copy the records of a page byte for byte. 
Do not copy the page header @@ -458,7 +457,7 @@ page_zip_parse_compress( byte* end_ptr,/*!< in: buffer end */ page_t* page, /*!< out: uncompressed page */ page_zip_des_t* page_zip)/*!< out: compressed page */ - MY_ATTRIBUTE((nonnull(1,2))); + MY_ATTRIBUTE((warn_unused_result)); #endif /* !UNIV_INNOCHECKSUM */ diff --git a/storage/xtradb/include/rem0rec.h b/storage/xtradb/include/rem0rec.h index d72f2760a8c..9baf0ab380a 100644 --- a/storage/xtradb/include/rem0rec.h +++ b/storage/xtradb/include/rem0rec.h @@ -747,8 +747,7 @@ rec_copy( /*=====*/ void* buf, /*!< in: buffer */ const rec_t* rec, /*!< in: physical record */ - const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ - MY_ATTRIBUTE((nonnull)); + const ulint* offsets);/*!< in: array returned by rec_get_offsets() */ #ifndef UNIV_HOTBACKUP /**********************************************************//** Determines the size of a data tuple prefix in a temporary file. diff --git a/storage/xtradb/include/row0upd.h b/storage/xtradb/include/row0upd.h index e59ec58b63c..4312fcf7339 100644 --- a/storage/xtradb/include/row0upd.h +++ b/storage/xtradb/include/row0upd.h @@ -248,9 +248,8 @@ row_upd_index_replace_new_col_vals_index_pos( /*!< in: if TRUE, limit the replacement to ordering fields of index; note that this does not work for non-clustered indexes. */ - mem_heap_t* heap) /*!< in: memory heap for allocating and + mem_heap_t* heap); /*!< in: memory heap for allocating and copying the new values */ - MY_ATTRIBUTE((nonnull)); /***********************************************************//** Replaces the new column values stored in the update vector to the index entry given. 
*/ @@ -311,7 +310,7 @@ row_upd_changes_ord_field_binary_func( compile time */ const row_ext_t*ext) /*!< NULL, or prefixes of the externally stored columns in the old row */ - MY_ATTRIBUTE((nonnull(1,2), warn_unused_result)); + MY_ATTRIBUTE((warn_unused_result)); #ifdef UNIV_DEBUG # define row_upd_changes_ord_field_binary(index,update,thr,row,ext) \ row_upd_changes_ord_field_binary_func(index,update,thr,row,ext) diff --git a/storage/xtradb/include/srv0srv.h b/storage/xtradb/include/srv0srv.h index 4eb67a84cc1..c18bc7c1fc3 100644 --- a/storage/xtradb/include/srv0srv.h +++ b/storage/xtradb/include/srv0srv.h @@ -3,7 +3,7 @@ Copyright (c) 1995, 2016, Oracle and/or its affiliates. All rights reserved. Copyright (c) 2008, 2009, Google Inc. Copyright (c) 2009, Percona Inc. -Copyright (c) 2013, 2017, MariaDB Corporation +Copyright (c) 2013, 2017, MariaDB Corporation Ab. All Rights Reserved. Portions of this file contain modifications contributed and copyrighted by Google, Inc. Those modifications are gratefully acknowledged and are described @@ -192,6 +192,9 @@ struct srv_stats_t { /** Number of encryption_get_latest_key_version calls */ ulint_ctr_64_t n_key_requests; + + /** Number of spaces in keyrotation list */ + ulint_ctr_64_t key_rotation_list_length; }; extern const char* srv_main_thread_op_info; @@ -199,13 +202,16 @@ extern const char* srv_main_thread_op_info; /** Prefix used by MySQL to indicate pre-5.1 table name encoding */ extern const char srv_mysql50_table_name_prefix[10]; -/* The monitor thread waits on this event. */ +/** Event to signal srv_monitor_thread. Not protected by a mutex. +Set after setting srv_print_innodb_monitor. */ extern os_event_t srv_monitor_event; -/* The error monitor thread waits on this event. */ +/** Event to signal the shutdown of srv_error_monitor_thread. +Not protected by a mutex. */ extern os_event_t srv_error_event; -/** The buffer pool dump/load thread waits on this event. */ +/** Event for waking up buf_dump_thread. 
Not protected by a mutex. +Set on shutdown or by buf_dump_start() or buf_load_start(). */ extern os_event_t srv_buf_dump_event; /** The buffer pool dump/load file name */ @@ -506,9 +512,6 @@ extern double srv_adaptive_flushing_lwm; extern ulong srv_flushing_avg_loops; extern ulong srv_force_recovery; -#ifndef DBUG_OFF -extern ulong srv_force_recovery_crash; -#endif /* !DBUG_OFF */ extern ulint srv_fast_shutdown; /*!< If this is 1, do not do a purge and index buffer merge. @@ -523,6 +526,7 @@ extern unsigned long long srv_stats_transient_sample_pages; extern my_bool srv_stats_persistent; extern unsigned long long srv_stats_persistent_sample_pages; extern my_bool srv_stats_auto_recalc; +extern my_bool srv_stats_include_delete_marked; extern unsigned long long srv_stats_modified_counter; extern my_bool srv_stats_sample_traditional; @@ -1070,24 +1074,17 @@ ulint srv_get_task_queue_length(void); /*===========================*/ -/*********************************************************************//** -Releases threads of the type given from suspension in the thread table. -NOTE! The server mutex has to be reserved by the caller! -@return number of threads released: this may be less than n if not -enough threads were suspended at the moment */ -UNIV_INTERN -ulint -srv_release_threads( -/*================*/ - enum srv_thread_type type, /*!< in: thread type */ - ulint n); /*!< in: number of threads to release */ +/** Ensure that a given number of threads of the type given are running +(or are already terminated). +@param[in] type thread type +@param[in] n number of threads that have to run */ +void +srv_release_threads(enum srv_thread_type type, ulint n); -/**********************************************************************//** -Wakeup the purge threads. */ +/** Wake up the purge threads. 
*/ UNIV_INTERN void -srv_purge_wakeup(void); -/*==================*/ +srv_purge_wakeup(); /** Status variables to be passed to MySQL */ struct export_var_t{ @@ -1268,6 +1265,7 @@ struct export_var_t{ ulint innodb_encryption_rotation_pages_flushed; ulint innodb_encryption_rotation_estimated_iops; ib_int64_t innodb_encryption_key_requests; + ib_int64_t innodb_key_rotation_list_length; ulint innodb_scrub_page_reorganizations; ulint innodb_scrub_page_splits; @@ -1325,4 +1323,12 @@ wsrep_srv_conc_cancel_wait( thread */ #endif /* WITH_WSREP */ +#ifndef DBUG_OFF +/** false before InnoDB monitor has been printed at least once, true +afterwards */ +extern bool srv_debug_monitor_printed; +#else +#define srv_debug_monitor_printed false +#endif + #endif diff --git a/storage/xtradb/include/trx0purge.h b/storage/xtradb/include/trx0purge.h index a862523c092..7b9b5dc49cd 100644 --- a/storage/xtradb/include/trx0purge.h +++ b/storage/xtradb/include/trx0purge.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, MariaDB Corporation. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -145,7 +146,10 @@ struct trx_purge_t{ log operation can prevent this by obtaining an s-latch here. 
It also protects state and running */ - os_event_t event; /*!< State signal event */ + os_event_t event; /*!< State signal event; + os_event_set() and os_event_reset() + are protected by trx_purge_t::latch + X-lock */ ulint n_stop; /*!< Counter to track number stops */ volatile bool running; /*!< true, if purge is active, we check this without the latch too */ diff --git a/storage/xtradb/include/trx0trx.h b/storage/xtradb/include/trx0trx.h index 239ed0b273b..e621d4226a7 100644 --- a/storage/xtradb/include/trx0trx.h +++ b/storage/xtradb/include/trx0trx.h @@ -107,7 +107,7 @@ void trx_free_prepared( /*==============*/ trx_t* trx) /*!< in, own: trx object */ - UNIV_COLD MY_ATTRIBUTE((nonnull)); + UNIV_COLD; /********************************************************************//** Frees a transaction object for MySQL. */ UNIV_INTERN @@ -881,7 +881,7 @@ struct trx_t{ time_t start_time; /*!< time the trx state last time became TRX_STATE_ACTIVE */ - clock_t start_time_micro; /*!< start time of transaction in + ib_uint64_t start_time_micro; /*!< start time of transaction in microseconds */ trx_id_t id; /*!< transaction id */ XID xid; /*!< X/Open XA transaction diff --git a/storage/xtradb/include/univ.i b/storage/xtradb/include/univ.i index ad0565a0290..1e375ba2c09 100644 --- a/storage/xtradb/include/univ.i +++ b/storage/xtradb/include/univ.i @@ -45,10 +45,10 @@ Created 1/20/1994 Heikki Tuuri #define INNODB_VERSION_MAJOR 5 #define INNODB_VERSION_MINOR 6 -#define INNODB_VERSION_BUGFIX 34 +#define INNODB_VERSION_BUGFIX 35 #ifndef PERCONA_INNODB_VERSION -#define PERCONA_INNODB_VERSION 79.1 +#define PERCONA_INNODB_VERSION 80.0 #endif /* Enable UNIV_LOG_ARCHIVE in XtraDB */ diff --git a/storage/xtradb/include/ut0ut.h b/storage/xtradb/include/ut0ut.h index c5944bb0547..ca4ce0d4ef9 100644 --- a/storage/xtradb/include/ut0ut.h +++ b/storage/xtradb/include/ut0ut.h @@ -87,9 +87,7 @@ private: # define UT_RELAX_CPU() YieldProcessor() # elif defined(__powerpc__) && defined __GLIBC__ 
#include <sys/platform/ppc.h> -# define UT_RELAX_CPU() do { \ - volatile lint volatile_var = __ppc_get_timebase(); \ - } while (0) +# define UT_RELAX_CPU() __ppc_get_timebase() # else # define UT_RELAX_CPU() ((void)0) /* avoid warning for an empty statement */ # endif diff --git a/storage/xtradb/include/ut0wqueue.h b/storage/xtradb/include/ut0wqueue.h index e6b9891aed1..d69363afe7b 100644 --- a/storage/xtradb/include/ut0wqueue.h +++ b/storage/xtradb/include/ut0wqueue.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 2006, 2009, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, MariaDB Corporation. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -116,7 +117,9 @@ ib_wqueue_len( struct ib_wqueue_t { ib_mutex_t mutex; /*!< mutex protecting everything */ ib_list_t* items; /*!< work item list */ - os_event_t event; /*!< event we use to signal additions to list */ + os_event_t event; /*!< event we use to signal additions to list; + os_event_set() and os_event_reset() are + protected by ib_wqueue_t::mutex */ }; #endif diff --git a/storage/xtradb/lock/lock0lock.cc b/storage/xtradb/lock/lock0lock.cc index af2c823af64..0d555ed2dd7 100644 --- a/storage/xtradb/lock/lock0lock.cc +++ b/storage/xtradb/lock/lock0lock.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. 
-Copyright (c) 2014, 2015, MariaDB Corporation +Copyright (c) 2014, 2017, MariaDB Corporation This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -930,8 +930,10 @@ lock_reset_lock_and_trx_wait( } ib_logf(IB_LOG_LEVEL_INFO, - "Trx id %lu is waiting a lock in statement %s" - " for this trx id %lu and statement %s wait_lock %p", + "Trx id " TRX_ID_FMT + " is waiting a lock in statement %s" + " for this trx id " TRX_ID_FMT + " and statement %s wait_lock %p", lock->trx->id, stmt ? stmt : "NULL", trx_id, @@ -2654,7 +2656,8 @@ lock_rec_add_to_queue( if (wsrep_thd_is_BF(trx->mysql_thd, FALSE)) { if (wsrep_debug) { fprintf(stderr, - "BF skipping wait: %lu\n", + "BF skipping wait: " + TRX_ID_FMT "\n", trx->id); lock_rec_print(stderr, lock); } @@ -5316,7 +5319,9 @@ lock_table_other_has_incompatible( #ifdef WITH_WSREP if(wsrep_thd_is_wsrep(trx->mysql_thd)) { if (wsrep_debug) { - fprintf(stderr, "WSREP: trx %ld table lock abort\n", + fprintf(stderr, "WSREP: trx " + TRX_ID_FMT + " table lock abort\n", trx->id); } trx_mutex_enter(lock->trx); @@ -6445,12 +6450,13 @@ loop: if (lock_get_type_low(lock) == LOCK_REC) { if (load_page_first) { - ulint space = lock->un_member.rec_lock.space; - ulint zip_size= fil_space_get_zip_size(space); + ulint space_id = lock->un_member.rec_lock.space; + /* Check if the space is exists or not. only + when the space is valid, try to get the page. */ + fil_space_t* space = fil_space_acquire(space_id); ulint page_no = lock->un_member.rec_lock.page_no; - ibool tablespace_being_deleted = FALSE; - if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) { + if (!space) { /* It is a single table tablespace and the .ibd file is missing (TRUNCATE @@ -6459,11 +6465,13 @@ loop: load the page in the buffer pool. 
*/ fprintf(file, "RECORD LOCKS on" - " non-existing space %lu\n", - (ulong) space); + " non-existing space: " ULINTPF "\n", + space_id); goto print_rec; } + const ulint zip_size = fsp_flags_get_zip_size(space->flags); + lock_mutex_exit(); mutex_exit(&trx_sys->mutex); @@ -6471,15 +6479,10 @@ loop: DEBUG_SYNC_C("innodb_monitor_before_lock_page_read"); - /* Check if the space is exists or not. only - when the space is valid, try to get the page. */ - tablespace_being_deleted - = fil_inc_pending_ops(space, false); - - if (!tablespace_being_deleted) { + if (space) { mtr_start(&mtr); - buf_page_get_gen(space, zip_size, + buf_page_get_gen(space_id, zip_size, page_no, RW_NO_LATCH, NULL, BUF_GET_POSSIBLY_FREED, @@ -6488,14 +6491,11 @@ loop: mtr_commit(&mtr); - fil_decr_pending_ops(space); - } else { - fprintf(file, "RECORD LOCKS on" - " non-existing space %lu\n", - (ulong) space); } } + fil_space_release(space); + load_page_first = FALSE; lock_mutex_enter(); @@ -6924,7 +6924,7 @@ static void lock_rec_block_validate( /*====================*/ - ulint space, + ulint space_id, ulint page_no) { /* The lock and the block that it is referring to may be freed at @@ -6937,10 +6937,11 @@ lock_rec_block_validate( /* Make sure that the tablespace is not deleted while we are trying to access the page. 
*/ - if (!fil_inc_pending_ops(space, true)) { + if (fil_space_t* space = fil_space_acquire(space_id)) { + mtr_start(&mtr); block = buf_page_get_gen( - space, fil_space_get_zip_size(space), + space_id, fsp_flags_get_zip_size(space->flags), page_no, RW_X_LATCH, NULL, BUF_GET_POSSIBLY_FREED, __FILE__, __LINE__, &mtr); @@ -6950,7 +6951,7 @@ lock_rec_block_validate( ut_ad(lock_rec_validate_page(block)); mtr_commit(&mtr); - fil_decr_pending_ops(space); + fil_space_release(space); } } diff --git a/storage/xtradb/log/log0crypt.cc b/storage/xtradb/log/log0crypt.cc index f518845b1a8..e6b5c845757 100644 --- a/storage/xtradb/log/log0crypt.cc +++ b/storage/xtradb/log/log0crypt.cc @@ -144,11 +144,13 @@ log_crypt_print_checkpoint_keys( ib_uint64_t checkpoint_no = log_block_get_checkpoint_no(log_block); if (crypt_info.size()) { - fprintf(stderr, "InnoDB: redo log checkpoint: %lu [ chk key ]: ", (ulong) checkpoint_no); + fprintf(stderr, + "InnoDB: redo log checkpoint: " UINT64PF " [ chk key ]: ", + checkpoint_no); for (size_t i = 0; i < crypt_info.size(); i++) { struct crypt_info_t* it = &crypt_info[i]; - fprintf(stderr, "[ %lu %u ] ", - (ulong) it->checkpoint_no, + fprintf(stderr, "[ " UINT64PF " %u ] ", + it->checkpoint_no, it->key_version); } fprintf(stderr, "\n"); diff --git a/storage/xtradb/log/log0log.cc b/storage/xtradb/log/log0log.cc index 9245ae160e6..25c8ed06981 100644 --- a/storage/xtradb/log/log0log.cc +++ b/storage/xtradb/log/log0log.cc @@ -2,7 +2,7 @@ Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2009, Google Inc. -Copyright (c) 2014, 2017, MariaDB Corporation. All Rights Reserved. +Copyright (c) 2014, 2017, MariaDB Corporation. Portions of this file contain modifications contributed and copyrighted by Google, Inc. 
Those modifications are gratefully acknowledged and are described @@ -48,6 +48,10 @@ Created 12/9/1995 Heikki Tuuri #endif #ifndef UNIV_HOTBACKUP +#if MYSQL_VERSION_ID < 100200 +# include <my_systemd.h> /* sd_notifyf() */ +#endif + #include "mem0mem.h" #include "buf0buf.h" #include "buf0flu.h" @@ -1588,17 +1592,7 @@ log_write_up_to( } loop: -#ifdef UNIV_DEBUG - loop_count++; - - ut_ad(loop_count < 5); - -# if 0 - if (loop_count > 2) { - fprintf(stderr, "Log loop count %lu\n", loop_count); - } -# endif -#endif + ut_ad(++loop_count < 100); mutex_enter(&(log_sys->mutex)); ut_ad(!recv_no_log_write); @@ -1886,7 +1880,7 @@ log_preflush_pool_modified_pages( and we could not make a new checkpoint on the basis of the info on the buffer pool only. */ - recv_apply_hashed_log_recs(TRUE); + recv_apply_hashed_log_recs(true); } if (!buf_page_cleaner_is_active @@ -2261,7 +2255,7 @@ log_checkpoint( ut_ad(!srv_read_only_mode); if (recv_recovery_is_on()) { - recv_apply_hashed_log_recs(TRUE); + recv_apply_hashed_log_recs(true); } if (srv_unix_file_flush_method != SRV_UNIX_NOSYNC && @@ -2635,6 +2629,13 @@ loop: start_lsn += len; buf += len; + if (recv_sys->report(ut_time())) { + ib_logf(IB_LOG_LEVEL_INFO, "Read redo log up to LSN=" LSN_PF, + start_lsn); + sd_notifyf(0, "STATUS=Read redo log up to LSN=" LSN_PF, + start_lsn); + } + if (start_lsn != end_lsn) { if (release_mutex) { @@ -3560,7 +3561,6 @@ logs_empty_and_mark_files_at_shutdown(void) lsn_t lsn; lsn_t tracked_lsn; ulint count = 0; - ulint total_trx; ulint pending_io; ibool server_busy; @@ -3570,12 +3570,6 @@ logs_empty_and_mark_files_at_shutdown(void) if (log_disable_checkpoint_active) log_enable_checkpoint(); - while (srv_fast_shutdown == 0 && trx_rollback_or_clean_is_active) { - /* we should wait until rollback after recovery end - for slow shutdown */ - os_thread_sleep(100000); - } - /* Wait until the master thread and all other operations are idle: our algorithm only works if the server is idle at shutdown */ @@ -3597,10 
+3591,9 @@ loop: shutdown, because the InnoDB layer may have committed or prepared transactions and we don't want to lose them. */ - total_trx = trx_sys_any_active_transactions(); - - if (total_trx > 0) { - + if (ulint total_trx = srv_was_started && !srv_read_only_mode + && srv_force_recovery < SRV_FORCE_NO_TRX_UNDO + ? trx_sys_any_active_transactions() : 0) { if (srv_print_verbose_log && count > 600) { ib_logf(IB_LOG_LEVEL_INFO, "Waiting for %lu active transactions to finish", @@ -3625,6 +3618,8 @@ loop: thread_name = "lock_wait_timeout_thread"; } else if (srv_buf_dump_thread_active) { thread_name = "buf_dump_thread"; + } else if (srv_fast_shutdown != 2 && trx_rollback_or_clean_is_active) { + thread_name = "rollback of recovered transactions"; } else { thread_name = NULL; } @@ -4086,6 +4081,7 @@ log_shutdown(void) if (!srv_read_only_mode && srv_scrub_log) { os_event_free(log_scrub_event); + log_scrub_event = NULL; } #ifdef UNIV_LOG_ARCHIVE diff --git a/storage/xtradb/log/log0online.cc b/storage/xtradb/log/log0online.cc index 4e6ad65a906..74f2e2360a8 100644 --- a/storage/xtradb/log/log0online.cc +++ b/storage/xtradb/log/log0online.cc @@ -77,12 +77,14 @@ struct log_bitmap_struct { both the correct type and the tree does not mind its overwrite during rbt_next() tree traversal. */ - ib_mutex_t mutex; /*!< mutex protecting all the fields.*/ }; /* The log parsing and bitmap output struct instance */ static struct log_bitmap_struct* log_bmp_sys; +/* Mutex protecting log_bmp_sys */ +static ib_mutex_t log_bmp_sys_mutex; + /** File name stem for bitmap files. 
*/ static const char* bmp_file_name_stem = "ib_modified_log_"; @@ -174,28 +176,24 @@ log_online_set_page_bit( ulint space, /*!<in: log record space id */ ulint page_no)/*!<in: log record page id */ { - ulint block_start_page; - ulint block_pos; - uint bit_pos; - ib_rbt_bound_t tree_search_pos; - byte search_page[MODIFIED_PAGE_BLOCK_SIZE]; - byte *page_ptr; - - ut_ad(mutex_own(&log_bmp_sys->mutex)); + ut_ad(mutex_own(&log_bmp_sys_mutex)); ut_a(space != ULINT_UNDEFINED); ut_a(page_no != ULINT_UNDEFINED); - block_start_page = page_no / MODIFIED_PAGE_BLOCK_ID_COUNT + ulint block_start_page = page_no / MODIFIED_PAGE_BLOCK_ID_COUNT * MODIFIED_PAGE_BLOCK_ID_COUNT; - block_pos = block_start_page ? (page_no % block_start_page / 8) + ulint block_pos = block_start_page ? (page_no % block_start_page / 8) : (page_no / 8); - bit_pos = page_no % 8; + uint bit_pos = page_no % 8; + byte search_page[MODIFIED_PAGE_BLOCK_SIZE]; mach_write_to_4(search_page + MODIFIED_PAGE_SPACE_ID, space); mach_write_to_4(search_page + MODIFIED_PAGE_1ST_PAGE_ID, block_start_page); + byte *page_ptr; + ib_rbt_bound_t tree_search_pos; if (!rbt_search(log_bmp_sys->modified_pages, &tree_search_pos, search_page)) { page_ptr = rbt_value(byte, tree_search_pos.last); @@ -594,12 +592,19 @@ log_online_is_bitmap_file( && (!strcmp(stem, bmp_file_name_stem))); } -/*********************************************************************//** -Initialize the online log following subsytem. 
*/ +/** Initialize the constant part of the log tracking subsystem */ +UNIV_INTERN +void +log_online_init(void) +{ + mutex_create(log_bmp_sys_mutex_key, &log_bmp_sys_mutex, + SYNC_LOG_ONLINE); +} + +/** Initialize the dynamic part of the log tracking subsystem */ UNIV_INTERN void log_online_read_init(void) -/*======================*/ { ibool success; lsn_t tracking_start_lsn @@ -623,9 +628,6 @@ log_online_read_init(void) log_bmp_sys->read_buf = static_cast<byte *> (ut_align(log_bmp_sys->read_buf_ptr, OS_FILE_LOG_BLOCK_SIZE)); - mutex_create(log_bmp_sys_mutex_key, &log_bmp_sys->mutex, - SYNC_LOG_ONLINE); - /* Initialize bitmap file directory from srv_data_home and add a path separator if needed. */ srv_data_home_len = strlen(srv_data_home); @@ -760,13 +762,15 @@ log_online_read_init(void) log_set_tracked_lsn(tracking_start_lsn); } -/*********************************************************************//** -Shut down the online log following subsystem. */ +/** Shut down the dynamic part of the log tracking subsystem */ UNIV_INTERN void log_online_read_shutdown(void) -/*==========================*/ { + mutex_enter(&log_bmp_sys_mutex); + + srv_track_changed_pages = FALSE; + ib_rbt_node_t *free_list_node = log_bmp_sys->page_free_list; if (log_bmp_sys->out.file != os_file_invalid) { @@ -782,10 +786,21 @@ log_online_read_shutdown(void) free_list_node = next; } - mutex_free(&log_bmp_sys->mutex); - ut_free(log_bmp_sys->read_buf_ptr); ut_free(log_bmp_sys); + log_bmp_sys = NULL; + + srv_redo_log_thread_started = false; + + mutex_exit(&log_bmp_sys_mutex); +} + +/** Shut down the constant part of the log tracking subsystem */ +UNIV_INTERN +void +log_online_shutdown(void) +{ + mutex_free(&log_bmp_sys_mutex); } /*********************************************************************//** @@ -831,13 +846,12 @@ void log_online_parse_redo_log(void) /*===========================*/ { + ut_ad(mutex_own(&log_bmp_sys_mutex)); + byte *ptr = log_bmp_sys->parse_buf; byte *end = 
log_bmp_sys->parse_buf_end; - ulint len = 0; - ut_ad(mutex_own(&log_bmp_sys->mutex)); - while (ptr != end && log_bmp_sys->next_parse_lsn < log_bmp_sys->end_lsn) { @@ -919,6 +933,8 @@ log_online_add_to_parse_buf( ulint skip_len) /*!< in: how much of log data to skip */ { + ut_ad(mutex_own(&log_bmp_sys_mutex)); + ulint start_offset = skip_len ? skip_len : LOG_BLOCK_HDR_SIZE; ulint end_offset = (data_len == OS_FILE_LOG_BLOCK_SIZE) @@ -927,8 +943,6 @@ log_online_add_to_parse_buf( ulint actual_data_len = (end_offset >= start_offset) ? end_offset - start_offset : 0; - ut_ad(mutex_own(&log_bmp_sys->mutex)); - ut_memcpy(log_bmp_sys->parse_buf_end, log_block + start_offset, actual_data_len); @@ -951,11 +965,9 @@ log_online_parse_redo_log_block( log data should be skipped as they were parsed before */ { - ulint block_data_len; - - ut_ad(mutex_own(&log_bmp_sys->mutex)); + ut_ad(mutex_own(&log_bmp_sys_mutex)); - block_data_len = log_block_get_data_len(log_block); + ulint block_data_len = log_block_get_data_len(log_block); ut_ad(block_data_len % OS_FILE_LOG_BLOCK_SIZE == 0 || block_data_len < OS_FILE_LOG_BLOCK_SIZE); @@ -975,14 +987,14 @@ log_online_follow_log_seg( lsn_t block_start_lsn, /*!< in: the LSN to read from */ lsn_t block_end_lsn) /*!< in: the LSN to read to */ { + ut_ad(mutex_own(&log_bmp_sys_mutex)); + /* Pointer to the current OS_FILE_LOG_BLOCK-sized chunk of the read log data to parse */ byte* log_block = log_bmp_sys->read_buf; byte* log_block_end = log_bmp_sys->read_buf + (block_end_lsn - block_start_lsn); - ut_ad(mutex_own(&log_bmp_sys->mutex)); - mutex_enter(&log_sys->mutex); log_group_read_log_seg(LOG_RECOVER, log_bmp_sys->read_buf, group, block_start_lsn, block_end_lsn, TRUE); @@ -1042,11 +1054,11 @@ log_online_follow_log_group( lsn_t contiguous_lsn) /*!< in: the LSN of log block start containing the log_parse_start_lsn */ { + ut_ad(mutex_own(&log_bmp_sys_mutex)); + lsn_t block_start_lsn = contiguous_lsn; lsn_t block_end_lsn; - 
ut_ad(mutex_own(&log_bmp_sys->mutex)); - log_bmp_sys->next_parse_lsn = log_bmp_sys->start_lsn; log_bmp_sys->parse_buf_end = log_bmp_sys->parse_buf; @@ -1083,21 +1095,29 @@ log_online_write_bitmap_page( /*=========================*/ const byte *block) /*!< in: block to write */ { - ibool success; - - ut_ad(srv_track_changed_pages); - ut_ad(mutex_own(&log_bmp_sys->mutex)); + ut_ad(mutex_own(&log_bmp_sys_mutex)); /* Simulate a write error */ DBUG_EXECUTE_IF("bitmap_page_write_error", - ib_logf(IB_LOG_LEVEL_ERROR, - "simulating bitmap write error in " - "log_online_write_bitmap_page"); - return FALSE;); - - success = os_file_write(log_bmp_sys->out.name, log_bmp_sys->out.file, - block, log_bmp_sys->out.offset, - MODIFIED_PAGE_BLOCK_SIZE); + { + ulint space_id + = mach_read_from_4(block + + MODIFIED_PAGE_SPACE_ID); + if (space_id > 0) { + ib_logf(IB_LOG_LEVEL_ERROR, + "simulating bitmap write " + "error in " + "log_online_write_bitmap_page " + "for space ID %lu", + space_id); + return FALSE; + } + }); + + ibool success = os_file_write(log_bmp_sys->out.name, + log_bmp_sys->out.file, block, + log_bmp_sys->out.offset, + MODIFIED_PAGE_BLOCK_SIZE); if (UNIV_UNLIKELY(!success)) { /* The following call prints an error message */ @@ -1136,11 +1156,7 @@ ibool log_online_write_bitmap(void) /*=========================*/ { - ib_rbt_node_t *bmp_tree_node; - const ib_rbt_node_t *last_bmp_tree_node; - ibool success = TRUE; - - ut_ad(mutex_own(&log_bmp_sys->mutex)); + ut_ad(mutex_own(&log_bmp_sys_mutex)); if (log_bmp_sys->out.offset >= srv_max_bitmap_file_size) { if (!log_online_rotate_bitmap_file(log_bmp_sys->start_lsn)) { @@ -1148,9 +1164,12 @@ log_online_write_bitmap(void) } } - bmp_tree_node = (ib_rbt_node_t *) - rbt_first(log_bmp_sys->modified_pages); - last_bmp_tree_node = rbt_last(log_bmp_sys->modified_pages); + ib_rbt_node_t *bmp_tree_node + = (ib_rbt_node_t *)rbt_first(log_bmp_sys->modified_pages); + const ib_rbt_node_t * const last_bmp_tree_node + = 
rbt_last(log_bmp_sys->modified_pages); + + ibool success = TRUE; while (bmp_tree_node) { @@ -1183,9 +1202,11 @@ log_online_write_bitmap(void) rbt_next(log_bmp_sys->modified_pages, bmp_tree_node); DBUG_EXECUTE_IF("bitmap_page_2_write_error", - ut_ad(bmp_tree_node); /* 2nd page must exist */ - DBUG_SET("+d,bitmap_page_write_error"); - DBUG_SET("-d,bitmap_page_2_write_error");); + if (bmp_tree_node) + { + DBUG_SET("+d,bitmap_page_write_error"); + DBUG_SET("-d,bitmap_page_2_write_error"); + }); } rbt_reset(log_bmp_sys->modified_pages); @@ -1206,10 +1227,19 @@ log_online_follow_redo_log(void) log_group_t* group; ibool result; - ut_ad(srv_track_changed_pages); ut_ad(!srv_read_only_mode); - mutex_enter(&log_bmp_sys->mutex); + if (!srv_track_changed_pages) + return TRUE; + + DEBUG_SYNC_C("log_online_follow_redo_log"); + + mutex_enter(&log_bmp_sys_mutex); + + if (!srv_track_changed_pages) { + mutex_exit(&log_bmp_sys_mutex); + return TRUE; + } /* Grab the LSN of the last checkpoint, we will parse up to it */ mutex_enter(&(log_sys->mutex)); @@ -1217,7 +1247,7 @@ log_online_follow_redo_log(void) mutex_exit(&(log_sys->mutex)); if (log_bmp_sys->end_lsn == log_bmp_sys->start_lsn) { - mutex_exit(&log_bmp_sys->mutex); + mutex_exit(&log_bmp_sys_mutex); return TRUE; } @@ -1240,7 +1270,7 @@ log_online_follow_redo_log(void) log_bmp_sys->start_lsn = log_bmp_sys->end_lsn; log_set_tracked_lsn(log_bmp_sys->start_lsn); - mutex_exit(&log_bmp_sys->mutex); + mutex_exit(&log_bmp_sys_mutex); return result; } @@ -1587,6 +1617,8 @@ log_online_bitmap_iterator_init( { ut_a(i); + i->max_lsn = max_lsn; + if (UNIV_UNLIKELY(min_lsn > max_lsn)) { /* Empty range */ @@ -1695,6 +1727,9 @@ log_online_bitmap_iterator_next( return TRUE; } + if (i->end_lsn >= i->max_lsn && i->last_page_in_run) + return FALSE; + while (!checksum_ok) { while (i->in.size < MODIFIED_PAGE_BLOCK_SIZE @@ -1790,15 +1825,21 @@ log_online_purge_changed_page_bitmaps( lsn = LSN_MAX; } + bool log_bmp_sys_inited = false; if 
(srv_redo_log_thread_started) { /* User requests might happen with both enabled and disabled tracking */ - mutex_enter(&log_bmp_sys->mutex); + log_bmp_sys_inited = true; + mutex_enter(&log_bmp_sys_mutex); + if (!srv_redo_log_thread_started) { + log_bmp_sys_inited = false; + mutex_exit(&log_bmp_sys_mutex); + } } if (!log_online_setup_bitmap_file_range(&bitmap_files, 0, LSN_MAX)) { - if (srv_redo_log_thread_started) { - mutex_exit(&log_bmp_sys->mutex); + if (log_bmp_sys_inited) { + mutex_exit(&log_bmp_sys_mutex); } return TRUE; } @@ -1836,7 +1877,7 @@ log_online_purge_changed_page_bitmaps( } } - if (srv_redo_log_thread_started) { + if (log_bmp_sys_inited) { if (lsn > log_bmp_sys->end_lsn) { lsn_t new_file_lsn; if (lsn == LSN_MAX) { @@ -1852,7 +1893,7 @@ log_online_purge_changed_page_bitmaps( } } - mutex_exit(&log_bmp_sys->mutex); + mutex_exit(&log_bmp_sys_mutex); } free(bitmap_files.files); diff --git a/storage/xtradb/log/log0recv.cc b/storage/xtradb/log/log0recv.cc index 11c643afff1..6b7c8d77824 100644 --- a/storage/xtradb/log/log0recv.cc +++ b/storage/xtradb/log/log0recv.cc @@ -2,7 +2,7 @@ Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. -Copyright (c) 2013, 2017, MariaDB Corporation. All Rights Reserved. +Copyright (c) 2013, 2017, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -85,7 +85,7 @@ this must be less than UNIV_PAGE_SIZE as it is stored in the buffer pool */ #define RECV_READ_AHEAD_AREA 32 /** The recovery system */ -UNIV_INTERN recv_sys_t* recv_sys = NULL; +UNIV_INTERN recv_sys_t* recv_sys; /** TRUE when applying redo log records during crash recovery; FALSE otherwise. Note that this is FALSE while a background thread is rolling back incomplete transactions. 
*/ @@ -137,9 +137,6 @@ UNIV_INTERN ibool recv_is_making_a_backup = FALSE; UNIV_INTERN ibool recv_is_from_backup = FALSE; # define buf_pool_get_curr_size() (5 * 1024 * 1024) #endif /* !UNIV_HOTBACKUP */ -/** The following counter is used to decide when to print info on -log scan */ -static ulint recv_scan_print_counter; /** The type of the previous parsed redo log record */ static ulint recv_previous_parsed_rec_type; @@ -182,7 +179,7 @@ UNIV_INTERN mysql_pfs_key_t recv_writer_mutex_key; # endif /* UNIV_PFS_MUTEX */ /** Flag indicating if recv_writer thread is active. */ -UNIV_INTERN bool recv_writer_thread_active = false; +static volatile bool recv_writer_thread_active; UNIV_INTERN os_thread_t recv_writer_thread_handle = 0; #endif /* !UNIV_HOTBACKUP */ @@ -310,8 +307,6 @@ recv_sys_var_init(void) recv_no_ibuf_operations = FALSE; - recv_scan_print_counter = 0; - recv_previous_parsed_rec_type = 999999; recv_previous_parsed_rec_offset = 0; @@ -348,8 +343,6 @@ DECLARE_THREAD(recv_writer_thread)( os_thread_pf(os_thread_get_curr_id())); #endif /* UNIV_DEBUG_THREAD_CREATION */ - recv_writer_thread_active = true; - while (srv_shutdown_state == SRV_SHUTDOWN_NONE) { os_thread_sleep(100000); @@ -424,6 +417,7 @@ recv_sys_init( recv_sys->last_block_buf_start, OS_FILE_LOG_BLOCK_SIZE)); recv_sys->found_corrupt_log = FALSE; + recv_sys->progress_time = ut_time(); recv_max_page_lsn = 0; @@ -433,33 +427,18 @@ recv_sys_init( mutex_exit(&(recv_sys->mutex)); } -/********************************************************//** -Empties the hash table when it has been fully processed. -@return DB_SUCCESS when successfull or DB_ERROR when fails. */ +/** Empty a fully processed hash table. 
*/ static -dberr_t -recv_sys_empty_hash(void) -/*=====================*/ +void +recv_sys_empty_hash() { ut_ad(mutex_own(&(recv_sys->mutex))); - - if (recv_sys->n_addrs != 0) { - fprintf(stderr, - "InnoDB: Error: %lu pages with log records" - " were left unprocessed!\n" - "InnoDB: Maximum page number with" - " log records on it %lu\n", - (ulong) recv_sys->n_addrs, - (ulong) recv_max_parsed_page_no); - return DB_ERROR; - } + ut_a(recv_sys->n_addrs == 0); hash_table_free(recv_sys->addr_hash); mem_heap_empty(recv_sys->heap); recv_sys->addr_hash = hash_create(buf_pool_get_curr_size() / 512); - - return DB_SUCCESS; } #ifndef UNIV_HOTBACKUP @@ -1404,7 +1383,7 @@ recv_parse_or_apply_log_rec_body( } break; case MLOG_FILE_WRITE_CRYPT_DATA: - ptr = fil_parse_write_crypt_data(ptr, end_ptr, block); + ptr = const_cast<byte*>(fil_parse_write_crypt_data(ptr, end_ptr, block)); break; default: ptr = NULL; @@ -1806,6 +1785,8 @@ recv_recover_page_func( mtr_commit(&mtr); + ib_time_t time = ut_time(); + mutex_enter(&(recv_sys->mutex)); if (recv_max_page_lsn < page_lsn) { @@ -1814,11 +1795,17 @@ recv_recover_page_func( recv_addr->state = RECV_PROCESSED; - ut_a(recv_sys->n_addrs); - recv_sys->n_addrs--; - - mutex_exit(&(recv_sys->mutex)); + ut_a(recv_sys->n_addrs > 0); + if (ulint n = --recv_sys->n_addrs) { + if (recv_sys->report(time)) { + ib_logf(IB_LOG_LEVEL_INFO, + "To recover: " ULINTPF " pages from log", n); + sd_notifyf(0, "STATUS=To recover: " ULINTPF + " pages from log", n); + } + } + mutex_exit(&recv_sys->mutex); } #ifndef UNIV_HOTBACKUP @@ -1864,62 +1851,50 @@ recv_read_in_area( } buf_read_recv_pages(FALSE, space, zip_size, page_nos, n); - /* - fprintf(stderr, "Recv pages at %lu n %lu\n", page_nos[0], n); - */ return(n); } -/*******************************************************************//** -Empties the hash table of stored log records, applying them to appropriate -pages. -@return DB_SUCCESS when successfull or DB_ERROR when fails. 
*/ +/** Apply the hash table of stored log records to persistent data pages. +@param[in] last_batch whether the change buffer merge will be + performed as part of the operation */ UNIV_INTERN -dberr_t -recv_apply_hashed_log_recs( -/*=======================*/ - ibool allow_ibuf) /*!< in: if TRUE, also ibuf operations are - allowed during the application; if FALSE, - no ibuf operations are allowed, and after - the application all file pages are flushed to - disk and invalidated in buffer pool: this - alternative means that no new log records - can be generated during the application; - the caller must in this case own the log - mutex */ +void +recv_apply_hashed_log_recs(bool last_batch) { - recv_addr_t* recv_addr; - ulint i; - ibool has_printed = FALSE; - ulong progress; - mtr_t mtr; - dberr_t err = DB_SUCCESS; -loop: - mutex_enter(&(recv_sys->mutex)); - - if (recv_sys->apply_batch_on) { + for (;;) { + mutex_enter(&recv_sys->mutex); - mutex_exit(&(recv_sys->mutex)); + if (!recv_sys->apply_batch_on) { + break; + } + mutex_exit(&recv_sys->mutex); os_thread_sleep(500000); - - goto loop; } - ut_ad((allow_ibuf == 0) == (mutex_own(&log_sys->mutex) != 0)); + ut_ad(!last_batch == mutex_own(&log_sys->mutex)); - if (!allow_ibuf) { + if (!last_batch) { recv_no_ibuf_operations = TRUE; } + if (ulint n = recv_sys->n_addrs) { + const char* msg = last_batch + ? 
"Starting final batch to recover " + : "Starting a batch to recover "; + ib_logf(IB_LOG_LEVEL_INFO, + "%s" ULINTPF " pages from redo log", msg, n); + sd_notifyf(0, "STATUS=%s" ULINTPF " pages from redo log", + msg, n); + } + recv_sys->apply_log_recs = TRUE; recv_sys->apply_batch_on = TRUE; - for (i = 0; i < hash_get_n_cells(recv_sys->addr_hash); i++) { - - for (recv_addr = static_cast<recv_addr_t*>( - HASH_GET_FIRST(recv_sys->addr_hash, i)); - recv_addr != 0; + for (ulint i = 0; i < hash_get_n_cells(recv_sys->addr_hash); i++) { + for (recv_addr_t* recv_addr = static_cast<recv_addr_t*>( + HASH_GET_FIRST(recv_sys->addr_hash, i)); + recv_addr; recv_addr = static_cast<recv_addr_t*>( HASH_GET_NEXT(addr_hash, recv_addr))) { @@ -1928,24 +1903,12 @@ loop: ulint page_no = recv_addr->page_no; if (recv_addr->state == RECV_NOT_PROCESSED) { - if (!has_printed) { - ib_logf(IB_LOG_LEVEL_INFO, - "Starting an apply batch" - " of log records" - " to the database..."); - fputs("InnoDB: Progress in percent: ", - stderr); - has_printed = TRUE; - } - - mutex_exit(&(recv_sys->mutex)); + mutex_exit(&recv_sys->mutex); if (buf_page_peek(space, page_no)) { - buf_block_t* block; - + mtr_t mtr; mtr_start(&mtr); - - block = buf_page_get( + buf_block_t* block = buf_page_get( space, zip_size, page_no, RW_X_LATCH, &mtr); buf_block_dbg_add_level( @@ -1958,21 +1921,9 @@ loop: page_no); } - mutex_enter(&(recv_sys->mutex)); + mutex_enter(&recv_sys->mutex); } } - - progress = (ulong) (i * 100) - / hash_get_n_cells(recv_sys->addr_hash); - if (has_printed - && progress - != ((i + 1) * 100) - / hash_get_n_cells(recv_sys->addr_hash)) { - - fprintf(stderr, "%lu ", progress); - sd_notifyf(0, "STATUS=Applying batch of log records for" - " InnoDB: Progress %lu", progress); - } } /* Wait until all the pages have been processed */ @@ -1986,12 +1937,7 @@ loop: mutex_enter(&(recv_sys->mutex)); } - if (has_printed) { - - fprintf(stderr, "\n"); - } - - if (!allow_ibuf) { + if (!last_batch) { bool success; /* Flush 
all the file pages to disk and invalidate them in @@ -2029,16 +1975,9 @@ loop: recv_sys->apply_log_recs = FALSE; recv_sys->apply_batch_on = FALSE; - err = recv_sys_empty_hash(); - - if (has_printed) { - fprintf(stderr, "InnoDB: Apply batch completed\n"); - sd_notify(0, "STATUS=InnoDB: Apply batch completed"); - } - - mutex_exit(&(recv_sys->mutex)); + recv_sys_empty_hash(); - return err; + mutex_exit(&recv_sys->mutex); } #else /* !UNIV_HOTBACKUP */ /*******************************************************************//** @@ -2061,11 +2000,6 @@ recv_apply_log_recs_for_backup(void) block = back_block1; - ib_logf(IB_LOG_LEVEL_INFO, - "Starting an apply batch of log records to the database..."); - - fputs("InnoDB: Progress in percent: ", stderr); - n_hash_cells = hash_get_n_cells(recv_sys->addr_hash); for (i = 0; i < n_hash_cells; i++) { @@ -2179,16 +2113,6 @@ recv_apply_log_recs_for_backup(void) skip_this_recv_addr: recv_addr = HASH_GET_NEXT(addr_hash, recv_addr); } - - if ((100 * i) / n_hash_cells - != (100 * (i + 1)) / n_hash_cells) { - fprintf(stderr, "%lu ", - (ulong) ((100 * i) / n_hash_cells)); - fflush(stderr); - sd_notifyf(0, "STATUS=Applying batch of log records for" - " backup InnoDB: Progress %lu", - (ulong) (100 * i) / n_hash_cells); - } } sd_notify(0, "STATUS=InnoDB: Apply batch for backup completed"); @@ -2891,11 +2815,10 @@ recv_scan_log_recs( #ifndef UNIV_HOTBACKUP if (recv_log_scan_is_startup_type && !recv_needed_recovery) { - if (!srv_read_only_mode) { ib_logf(IB_LOG_LEVEL_INFO, - "Log scan progressed past the " - "checkpoint lsn " LSN_PF "", + "Starting crash recovery from " + "checkpoint LSN=" LSN_PF, recv_sys->scanned_lsn); recv_init_crash_recovery(); @@ -2955,19 +2878,6 @@ recv_scan_log_recs( *group_scanned_lsn = scanned_lsn; - if (recv_needed_recovery - || (recv_is_from_backup && !recv_is_making_a_backup)) { - recv_scan_print_counter++; - - if (finished || (recv_scan_print_counter % 80 == 0)) { - - fprintf(stderr, - "InnoDB: Doing recovery: 
scanned up to" - " log sequence number " LSN_PF "\n", - *group_scanned_lsn); - } - } - if (more_data && !recv_sys->found_corrupt_log) { /* Try to parse more log records */ @@ -2987,12 +2897,7 @@ recv_scan_log_recs( log yet: they would be produced by ibuf operations */ - *err = recv_apply_hashed_log_recs(FALSE); - - if (*err != DB_SUCCESS) { - /* Finish processing because of error */ - return (TRUE); - } + recv_apply_hashed_log_recs(false); } #endif /* !UNIV_HOTBACKUP */ @@ -3076,11 +2981,6 @@ recv_init_crash_recovery(void) recv_needed_recovery = TRUE; - ib_logf(IB_LOG_LEVEL_INFO, "Database was not shutdown normally!"); - ib_logf(IB_LOG_LEVEL_INFO, "Starting crash recovery."); - ib_logf(IB_LOG_LEVEL_INFO, - "Reading tablespace information from the .ibd files..."); - fil_load_single_table_tablespaces(); /* If we are using the doublewrite method, we will @@ -3091,15 +2991,14 @@ recv_init_crash_recovery(void) if (srv_force_recovery < SRV_FORCE_NO_LOG_REDO) { ib_logf(IB_LOG_LEVEL_INFO, - "Restoring possible half-written data pages "); - - ib_logf(IB_LOG_LEVEL_INFO, + "Restoring possible half-written data pages " "from the doublewrite buffer..."); buf_dblwr_process(); /* Spawn the background thread to flush dirty pages from the buffer pools. */ + recv_writer_thread_active = true; recv_writer_thread_handle = os_thread_create( recv_writer_thread, 0, 0); } diff --git a/storage/xtradb/mach/mach0data.cc b/storage/xtradb/mach/mach0data.cc index 206434dc5ab..feeedb01609 100644 --- a/storage/xtradb/mach/mach0data.cc +++ b/storage/xtradb/mach/mach0data.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -55,7 +55,6 @@ mach_parse_compressed( if (flag < 0x80UL) { *val = flag; return(ptr + 1); - } /* Workaround GCC bug @@ -64,7 +63,11 @@ mach_parse_compressed( function, causing and out-of-bounds read if we are reading a short integer close to the end of buffer. */ #if defined(__GNUC__) && (__GNUC__ >= 5) && !defined(__clang__) - asm volatile("": : :"memory"); +#define DEPLOY_FENCE +#endif + +#ifdef DEPLOY_FENCE + __atomic_thread_fence(__ATOMIC_ACQUIRE); #endif if (flag < 0xC0UL) { @@ -75,8 +78,13 @@ mach_parse_compressed( *val = mach_read_from_2(ptr) & 0x7FFFUL; return(ptr + 2); + } - } else if (flag < 0xE0UL) { +#ifdef DEPLOY_FENCE + __atomic_thread_fence(__ATOMIC_ACQUIRE); +#endif + + if (flag < 0xE0UL) { if (end_ptr < ptr + 3) { return(NULL); } @@ -84,7 +92,13 @@ mach_parse_compressed( *val = mach_read_from_3(ptr) & 0x3FFFFFUL; return(ptr + 3); - } else if (flag < 0xF0UL) { + } + +#ifdef DEPLOY_FENCE + __atomic_thread_fence(__ATOMIC_ACQUIRE); +#endif + + if (flag < 0xF0UL) { if (end_ptr < ptr + 4) { return(NULL); } @@ -92,14 +106,20 @@ mach_parse_compressed( *val = mach_read_from_4(ptr) & 0x1FFFFFFFUL; return(ptr + 4); - } else { - ut_ad(flag == 0xF0UL); + } - if (end_ptr < ptr + 5) { - return(NULL); - } +#ifdef DEPLOY_FENCE + __atomic_thread_fence(__ATOMIC_ACQUIRE); +#endif - *val = mach_read_from_4(ptr + 1); - return(ptr + 5); +#undef DEPLOY_FENCE + + ut_ad(flag == 0xF0UL); + + if (end_ptr < ptr + 5) { + return(NULL); } + + *val = mach_read_from_4(ptr + 1); + return(ptr + 5); } diff --git a/storage/xtradb/mtr/mtr0mtr.cc b/storage/xtradb/mtr/mtr0mtr.cc index a1d7261e43c..e564b270d00 100644 --- a/storage/xtradb/mtr/mtr0mtr.cc +++ b/storage/xtradb/mtr/mtr0mtr.cc @@ -312,7 +312,6 @@ mtr_commit( /*=======*/ mtr_t* mtr) /*!< in: mini-transaction */ { - ut_ad(mtr); ut_ad(mtr->magic_n == MTR_MAGIC_N); 
ut_ad(mtr->state == MTR_ACTIVE); ut_ad(!mtr->inside_ibuf); diff --git a/storage/xtradb/os/os0file.cc b/storage/xtradb/os/os0file.cc index caf2becae72..ed84834e6ea 100644 --- a/storage/xtradb/os/os0file.cc +++ b/storage/xtradb/os/os0file.cc @@ -2,7 +2,7 @@ Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2009, Percona Inc. -Copyright (c) 2013, 2016, MariaDB Corporation. +Copyright (c) 2013, 2017, MariaDB Corporation. All Rights Reserved. Portions of this file contain modifications contributed and copyrighted by Percona Inc.. Those modifications are @@ -258,11 +258,15 @@ struct os_aio_array_t{ os_event_t not_full; /*!< The event which is set to the signaled state when there is space in - the aio outside the ibuf segment */ + the aio outside the ibuf segment; + os_event_set() and os_event_reset() + are protected by os_aio_array_t::mutex */ os_event_t is_empty; /*!< The event which is set to the signaled state when there are no - pending i/os in this array */ + pending i/os in this array; + os_event_set() and os_event_reset() + are protected by os_aio_array_t::mutex */ ulint n_slots;/*!< Total number of slots in the aio array. This must be divisible by n_threads. */ @@ -304,8 +308,8 @@ struct os_aio_array_t{ #define OS_AIO_IO_SETUP_RETRY_ATTEMPTS 5 #endif -/** Array of events used in simulated aio */ -static os_event_t* os_aio_segment_wait_events = NULL; +/** Array of events used in simulated aio. */ +static os_event_t* os_aio_segment_wait_events; /** The aio arrays for non-ibuf i/o and ibuf i/o, as well as sync aio. These are NULL when the module has not yet been initialized. 
@{ */ @@ -342,16 +346,17 @@ static os_ib_mutex_t os_file_count_mutex; #endif /* !UNIV_HOTBACKUP && (!HAVE_ATOMIC_BUILTINS || UNIV_WORD_SIZE < 8) */ /** Number of pending os_file_pread() operations */ -UNIV_INTERN ulint os_file_n_pending_preads = 0; +UNIV_INTERN ulint os_file_n_pending_preads; /** Number of pending os_file_pwrite() operations */ -UNIV_INTERN ulint os_file_n_pending_pwrites = 0; +UNIV_INTERN ulint os_file_n_pending_pwrites; /** Number of pending write operations */ -UNIV_INTERN ulint os_n_pending_writes = 0; +UNIV_INTERN ulint os_n_pending_writes; /** Number of pending read operations */ -UNIV_INTERN ulint os_n_pending_reads = 0; +UNIV_INTERN ulint os_n_pending_reads; +#if defined(WIN_ASYNC_IO) || defined(LINUX_NATIVE_AIO) /** After first fallocate failure we will disable os_file_trim */ -UNIV_INTERN ibool os_fallocate_failed = FALSE; +static bool os_fallocate_failed; /**********************************************************************//** Directly manipulate the allocated disk space by deallocating for the file referred to @@ -360,11 +365,12 @@ Within the specified range, partial file system blocks are zeroed, and whole file system blocks are removed from the file. After a successful call, subsequent reads from this range will return zeroes. @return true if success, false if error */ -UNIV_INTERN +static ibool os_file_trim( /*=========*/ os_aio_slot_t* slot); /*!< in: slot structure */ +#endif /* WIN_ASYNC_IO || LINUX_NATIVE_AIO */ /****************************************************************//** Does error handling when a file operation fails. 
@@ -1230,50 +1236,15 @@ next_file: char* full_path; int ret; struct stat statinfo; -#ifdef HAVE_READDIR_R - char dirent_buf[sizeof(struct dirent) - + _POSIX_PATH_MAX + 100]; - /* In /mysys/my_lib.c, _POSIX_PATH_MAX + 1 is used as - the max file name len; but in most standards, the - length is NAME_MAX; we add 100 to be even safer */ -#endif next_file: -#ifdef HAVE_READDIR_R - ret = readdir_r(dir, (struct dirent*) dirent_buf, &ent); - - if (ret != 0 -#ifdef UNIV_AIX - /* On AIX, only if we got non-NULL 'ent' (result) value and - a non-zero 'ret' (return) value, it indicates a failed - readdir_r() call. An NULL 'ent' with an non-zero 'ret' - would indicate the "end of the directory" is reached. */ - && ent != NULL -#endif - ) { - fprintf(stderr, - "InnoDB: cannot read directory %s, error %lu\n", - dirname, (ulong) ret); - - return(-1); - } - - if (ent == NULL) { - /* End of directory */ - - return(1); - } - - ut_a(strlen(ent->d_name) < _POSIX_PATH_MAX + 100 - 1); -#else ent = readdir(dir); if (ent == NULL) { return(1); } -#endif ut_a(strlen(ent->d_name) < OS_FILE_MAX_PATH); if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) { @@ -1601,9 +1572,13 @@ os_file_set_nocache_if_needed(os_file_t file, const char* name, if (srv_unix_file_flush_method == SRV_UNIX_ALL_O_DIRECT || (type == OS_DATA_FILE && (srv_unix_file_flush_method == SRV_UNIX_O_DIRECT - || (srv_unix_file_flush_method == SRV_UNIX_O_DIRECT_NO_FSYNC)))) { - os_file_set_nocache(file, name, mode_str); - } + || (srv_unix_file_flush_method + == SRV_UNIX_O_DIRECT_NO_FSYNC)))) + /* Do fsync() on log files when setting O_DIRECT fails. 
+ See log_io_complete() */ + if (!os_file_set_nocache(file, name, mode_str) + && srv_unix_file_flush_method == SRV_UNIX_ALL_O_DIRECT) + srv_unix_file_flush_method = SRV_UNIX_O_DIRECT; } /****************************************************************//** @@ -1811,9 +1786,10 @@ os_file_create_simple_no_error_handling_func( } /****************************************************************//** -Tries to disable OS caching on an opened file descriptor. */ +Tries to disable OS caching on an opened file descriptor. +@return TRUE if operation is success and FALSE otherwise */ UNIV_INTERN -void +bool os_file_set_nocache( /*================*/ os_file_t fd /*!< in: file descriptor to alter */ @@ -1834,6 +1810,7 @@ os_file_set_nocache( "Failed to set DIRECTIO_ON on file %s: %s: %s, " "continuing anyway.", file_name, operation_name, strerror(errno_save)); + return false; } #elif defined(O_DIRECT) if (fcntl(fd, F_SETFL, O_DIRECT) == -1) { @@ -1864,8 +1841,10 @@ short_warning: "continuing anyway.", file_name, operation_name, strerror(errno_save)); } + return false; } #endif /* defined(UNIV_SOLARIS) && defined(DIRECTIO_ON) */ + return true; } @@ -2005,10 +1984,6 @@ os_file_create_func( attributes |= FILE_FLAG_NO_BUFFERING; #else if (purpose == OS_FILE_AIO) { - - bool encrypt_later; /*!< should the page be encrypted - before write */ - #ifdef WIN_ASYNC_IO /* If specified, use asynchronous (overlapped) io and no buffering of writes in the OS */ @@ -2537,60 +2512,80 @@ os_file_get_size( #endif /* __WIN__ */ } -/***********************************************************************//** -Write the specified number of zeros to a newly created file. -@return TRUE if success */ +/** Set the size of a newly created file. 
+@param[in] name file name +@param[in] file file handle +@param[in] size desired file size +@param[in] sparse whether to create a sparse file (no preallocating) +@return whether the operation succeeded */ UNIV_INTERN -ibool +bool os_file_set_size( -/*=============*/ - const char* name, /*!< in: name of the file or path as a - null-terminated string */ - os_file_t file, /*!< in: handle to a file */ - os_offset_t size) /*!< in: file size */ + const char* name, + os_file_t file, + os_offset_t size, + bool is_sparse) { - os_offset_t current_size; - ibool ret; - byte* buf; - byte* buf2; - ulint buf_size; - - current_size = 0; +#ifdef _WIN32 + FILE_END_OF_FILE_INFO feof; + feof.EndOfFile.QuadPart = size; + bool success = SetFileInformationByHandle(file, + FileEndOfFileInfo, + &feof, sizeof feof); + if (!success) { + ib_logf(IB_LOG_LEVEL_ERROR, "os_file_set_size() of file %s" + " to " INT64PF " bytes failed with %u", + name, size, GetLastError()); + } + return(success); +#else + if (is_sparse) { + bool success = !ftruncate(file, size); + if (!success) { + ib_logf(IB_LOG_LEVEL_ERROR, "ftruncate of file %s" + " to " INT64PF " bytes failed with error %d", + name, size, errno); + } + return(success); + } -#ifdef HAVE_POSIX_FALLOCATE +# ifdef HAVE_POSIX_FALLOCATE if (srv_use_posix_fallocate) { + int err; + do { + err = posix_fallocate(file, 0, size); + } while (err == EINTR + && srv_shutdown_state == SRV_SHUTDOWN_NONE); - if (posix_fallocate(file, current_size, size) == -1) { - - ib_logf(IB_LOG_LEVEL_ERROR, "preallocating file " - "space for file \'%s\' failed. Current size " - INT64PF ", desired size " INT64PF, - name, current_size, size); - os_file_handle_error_no_exit (name, "posix_fallocate", - FALSE, __FILE__, __LINE__); - return(FALSE); + if (err) { + ib_logf(IB_LOG_LEVEL_ERROR, + "preallocating " INT64PF " bytes for" + "file %s failed with error %d", + size, name, err); } - return(TRUE); + return(!err); } -#endif +# endif /* Write up to 1 megabyte at a time. 
*/ - buf_size = ut_min(64, (ulint) (size / UNIV_PAGE_SIZE)) + ulint buf_size = ut_min(64, (ulint) (size / UNIV_PAGE_SIZE)) * UNIV_PAGE_SIZE; - buf2 = static_cast<byte*>(ut_malloc(buf_size + UNIV_PAGE_SIZE)); - - /* Align the buffer for possible raw i/o */ - buf = static_cast<byte*>(ut_align(buf2, UNIV_PAGE_SIZE)); + os_offset_t current_size = 0; - /* Write buffer full of zeros */ - memset(buf, 0, buf_size); + byte* buf2 = static_cast<byte*>(calloc(1, buf_size + UNIV_PAGE_SIZE)); - if (size >= (os_offset_t) 100 << 20) { - - fprintf(stderr, "InnoDB: Progress in MB:"); + if (!buf2) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Cannot allocate " ULINTPF " bytes to extend file\n", + buf_size + UNIV_PAGE_SIZE); + return(false); } - while (current_size < size) { + /* Align the buffer for possible raw i/o */ + byte* buf = static_cast<byte*>(ut_align(buf2, UNIV_PAGE_SIZE)); + bool ret; + + do { ulint n_bytes; if (size - current_size < (os_offset_t) buf_size) { @@ -2602,37 +2597,16 @@ os_file_set_size( ret = os_file_write(name, file, buf, current_size, n_bytes); if (!ret) { - ut_free(buf2); - goto error_handling; - } - - /* Print about progress for each 100 MB written */ - if ((current_size + n_bytes) / (100 << 20) - != current_size / (100 << 20)) { - - fprintf(stderr, " %lu00", - (ulong) ((current_size + n_bytes) - / (100 << 20))); + break; } current_size += n_bytes; - } - - if (size >= (os_offset_t) 100 << 20) { - - fprintf(stderr, "\n"); - } - - ut_free(buf2); + } while (current_size < size); - ret = os_file_flush(file); + free(buf2); - if (ret) { - return(TRUE); - } - -error_handling: - return(FALSE); + return(ret && os_file_flush(file)); +#endif } /***********************************************************************//** @@ -4435,13 +4409,6 @@ os_aio_init( os_aio_validate(); - os_aio_segment_wait_events = static_cast<os_event_t*>( - ut_malloc(n_segments * sizeof *os_aio_segment_wait_events)); - - for (ulint i = 0; i < n_segments; ++i) { - os_aio_segment_wait_events[i] = 
os_event_create(); - } - os_last_printout = ut_time(); #ifdef _WIN32 @@ -4451,8 +4418,18 @@ os_aio_init( ut_a(completion_port && read_completion_port); #endif - return(TRUE); + if (srv_use_native_aio) { + return(TRUE); + } + os_aio_segment_wait_events = static_cast<os_event_t*>( + ut_malloc(n_segments * sizeof *os_aio_segment_wait_events)); + + for (ulint i = 0; i < n_segments; ++i) { + os_aio_segment_wait_events[i] = os_event_create(); + } + + return(TRUE); } /*********************************************************************** @@ -4480,8 +4457,10 @@ os_aio_free(void) os_aio_array_free(os_aio_read_array); - for (ulint i = 0; i < os_aio_n_segments; i++) { - os_event_free(os_aio_segment_wait_events[i]); + if (!srv_use_native_aio) { + for (ulint i = 0; i < os_aio_n_segments; i++) { + os_event_free(os_aio_segment_wait_events[i]); + } } #if !defined(HAVE_ATOMIC_BUILTINS) || UNIV_WORD_SIZE < 8 @@ -4541,22 +4520,17 @@ os_aio_wake_all_threads_at_shutdown(void) if (os_aio_log_array != 0) { os_aio_array_wake_win_aio_at_shutdown(os_aio_log_array); } - #elif defined(LINUX_NATIVE_AIO) - /* When using native AIO interface the io helper threads wait on io_getevents with a timeout value of 500ms. At each wake up these threads check the server status. No need to do anything to wake them up. */ +#endif /* !WIN_ASYNC_AIO */ if (srv_use_native_aio) { return; } - /* Fall through to simulated AIO handler wakeup if we are - not using native AIO. */ -#endif /* !WIN_ASYNC_AIO */ - /* This loop wakes up all simulated ai/o threads */ for (ulint i = 0; i < os_aio_n_segments; i++) { @@ -4939,6 +4913,7 @@ os_aio_simulated_wake_handler_threads(void) } } +#ifdef _WIN32 /**********************************************************************//** This function can be called if one wants to post a batch of reads and prefers an i/o-handler thread to handle them all at once later. 
You must @@ -4946,15 +4921,14 @@ call os_aio_simulated_wake_handler_threads later to ensure the threads are not left sleeping! */ UNIV_INTERN void -os_aio_simulated_put_read_threads_to_sleep(void) -/*============================================*/ +os_aio_simulated_put_read_threads_to_sleep() { /* The idea of putting background IO threads to sleep is only for Windows when using simulated AIO. Windows XP seems to schedule background threads too eagerly to allow for coalescing during readahead requests. */ -#ifdef __WIN__ + os_aio_array_t* array; if (srv_use_native_aio) { @@ -4973,8 +4947,8 @@ readahead requests. */ os_event_reset(os_aio_segment_wait_events[i]); } } -#endif /* __WIN__ */ } +#endif /* _WIN32 */ #if defined(LINUX_NATIVE_AIO) /*******************************************************************//** @@ -5364,7 +5338,7 @@ os_aio_windows_handle( } if (slot->type == OS_FILE_WRITE) { - if (!slot->is_log && srv_use_trim && os_fallocate_failed == FALSE) { + if (!slot->is_log && srv_use_trim && !os_fallocate_failed) { // Deallocate unused blocks from file system os_file_trim(slot); } @@ -5460,7 +5434,8 @@ retry: ut_a(slot->pos < end_pos); if (slot->type == OS_FILE_WRITE) { - if (!slot->is_log && srv_use_trim && os_fallocate_failed == FALSE) { + if (!slot->is_log && srv_use_trim + && !os_fallocate_failed) { // Deallocate unused blocks from file system os_file_trim(slot); } @@ -6178,11 +6153,12 @@ os_aio_print( srv_io_thread_op_info[i], srv_io_thread_function[i]); -#ifndef __WIN__ - if (os_aio_segment_wait_events[i]->is_set()) { +#ifndef _WIN32 + if (!srv_use_native_aio + && os_aio_segment_wait_events[i]->is_set()) { fprintf(file, " ev set"); } -#endif /* __WIN__ */ +#endif /* _WIN32 */ fprintf(file, "\n"); } @@ -6361,6 +6337,7 @@ typedef struct _FILE_LEVEL_TRIM { #endif #endif +#if defined(WIN_ASYNC_IO) || defined(LINUX_NATIVE_AIO) /**********************************************************************//** Directly manipulate the allocated disk space by 
deallocating for the file referred to by fd for the byte range starting at offset and continuing for len bytes. @@ -6368,7 +6345,7 @@ Within the specified range, partial file system blocks are zeroed, and whole file system blocks are removed from the file. After a successful call, subsequent reads from this range will return zeroes. @return true if success, false if error */ -UNIV_INTERN +static ibool os_file_trim( /*=========*/ @@ -6413,13 +6390,13 @@ os_file_trim( if (ret) { /* After first failure do not try to trim again */ - os_fallocate_failed = TRUE; + os_fallocate_failed = true; srv_use_trim = FALSE; - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: Warning: fallocate call failed with error code %d.\n" - " InnoDB: start: %lu len: %lu payload: %lu\n" - " InnoDB: Disabling fallocate for now.\n", errno, (ulong) off, (ulong) trim_len, (ulong) len); + ib_logf(IB_LOG_LEVEL_WARN, + "fallocate() failed with error %d." + " start: " UINT64PF " len: " ULINTPF " payload: " ULINTPF "." + " Disabling fallocate for now.", + errno, off, ulint(trim_len), ulint(len)); os_file_handle_error_no_exit(slot->name, " fallocate(FALLOC_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE) ", @@ -6440,7 +6417,7 @@ os_file_trim( fprintf(stderr, " InnoDB: Warning: fallocate not supported on this installation." " InnoDB: Disabling fallocate for now."); - os_fallocate_failed = TRUE; + os_fallocate_failed = true; srv_use_trim = FALSE; if (slot->write_size) { *slot->write_size = 0; @@ -6460,7 +6437,7 @@ os_file_trim( if (!ret) { /* After first failure do not try to trim again */ - os_fallocate_failed = TRUE; + os_fallocate_failed = true; srv_use_trim = FALSE; ut_print_timestamp(stderr); fprintf(stderr, @@ -6514,6 +6491,7 @@ os_file_trim( return (TRUE); } +#endif /* WIN_ASYNC_IO || LINUX_NATIVE_AIO */ /***********************************************************************//** Try to get number of bytes per sector from file system. 
diff --git a/storage/xtradb/os/os0thread.cc b/storage/xtradb/os/os0thread.cc index 5ddc40b0eeb..8baf06b9bb7 100644 --- a/storage/xtradb/os/os0thread.cc +++ b/storage/xtradb/os/os0thread.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -206,29 +206,32 @@ os_thread_create_func( #endif } -/** -Waits until the specified thread completes and joins it. Its return value is -ignored. - -@param thread thread to join */ +/** Waits until the specified thread completes and joins it. +Its return value is ignored. +@param[in,out] thread thread to join */ UNIV_INTERN void os_thread_join( os_thread_t thread) { - /*This function is currently only used to workaround glibc bug + /* This function is currently only used to workaround glibc bug described in http://bugs.mysql.com/bug.php?id=82886 On Windows, no workarounds are necessary, all threads are "detached" upon thread exit (handle is closed), so we do nothing. */ -#ifndef _WIN32 - int ret MY_ATTRIBUTE((unused)) = pthread_join(thread, NULL); +#ifdef __WIN__ + /* Do nothing. */ +#else +#ifdef UNIV_DEBUG + const int ret MY_ATTRIBUTE((unused)) = +#endif /* UNIV_DEBUG */ + pthread_join(thread, NULL); - /* Waiting on already-quit threads is allowed */ + /* Waiting on already-quit threads is allowed. 
*/ ut_ad(ret == 0 || ret == ESRCH); -#endif +#endif /* __WIN__ */ } /*****************************************************************//** @@ -257,8 +260,9 @@ os_thread_exit( #ifdef __WIN__ ExitThread((DWORD) exit_value); #else - if (detach) + if (detach) { pthread_detach(pthread_self()); + } pthread_exit(exit_value); #endif } diff --git a/storage/xtradb/page/page0page.cc b/storage/xtradb/page/page0page.cc index a6fba4074ef..3f8e47adafd 100644 --- a/storage/xtradb/page/page0page.cc +++ b/storage/xtradb/page/page0page.cc @@ -1455,7 +1455,6 @@ page_dir_split_slot( ulint i; ulint n_owned; - ut_ad(page); ut_ad(!page_zip || page_is_comp(page)); ut_ad(slot_no > 0); @@ -1517,7 +1516,6 @@ page_dir_balance_slot( rec_t* old_rec; rec_t* new_rec; - ut_ad(page); ut_ad(!page_zip || page_is_comp(page)); ut_ad(slot_no > 0); diff --git a/storage/xtradb/page/page0zip.cc b/storage/xtradb/page/page0zip.cc index 04340c0f3d2..32e76fb44e6 100644 --- a/storage/xtradb/page/page0zip.cc +++ b/storage/xtradb/page/page0zip.cc @@ -4810,8 +4810,6 @@ page_zip_parse_compress( ulint size; ulint trailer_size; - ut_ad(ptr != NULL); - ut_ad(end_ptr != NULL); ut_ad(!page == !page_zip); if (UNIV_UNLIKELY(ptr + (2 + 2) > end_ptr)) { diff --git a/storage/xtradb/pars/pars0pars.cc b/storage/xtradb/pars/pars0pars.cc index e6af3d25e86..ce61d6e1e3b 100644 --- a/storage/xtradb/pars/pars0pars.cc +++ b/storage/xtradb/pars/pars0pars.cc @@ -2020,7 +2020,7 @@ pars_create_table( } node = tab_create_graph_create(table, pars_sym_tab_global->heap, true, - FIL_SPACE_ENCRYPTION_DEFAULT, FIL_DEFAULT_ENCRYPTION_KEY); + FIL_ENCRYPTION_DEFAULT, FIL_DEFAULT_ENCRYPTION_KEY); table_sym->resolved = TRUE; table_sym->token_type = SYM_TABLE; diff --git a/storage/xtradb/rem/rem0rec.cc b/storage/xtradb/rem/rem0rec.cc index b9496b7f620..6770748c38b 100644 --- a/storage/xtradb/rem/rem0rec.cc +++ b/storage/xtradb/rem/rem0rec.cc @@ -789,7 +789,7 @@ rec_get_nth_field_offs_old( /**********************************************************//** 
Determines the size of a data tuple prefix in ROW_FORMAT=COMPACT. @return total size */ -UNIV_INLINE MY_ATTRIBUTE((warn_unused_result, nonnull(1,2))) +UNIV_INLINE MY_ATTRIBUTE((warn_unused_result)) ulint rec_get_converted_size_comp_prefix_low( /*===================================*/ diff --git a/storage/xtradb/row/row0ftsort.cc b/storage/xtradb/row/row0ftsort.cc index fb78808ae80..29ddffd2587 100644 --- a/storage/xtradb/row/row0ftsort.cc +++ b/storage/xtradb/row/row0ftsort.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 2010, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2015, 2016, MariaDB Corporation. +Copyright (c) 2015, 2017, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -224,7 +224,14 @@ row_fts_psort_info_init( common_info->sort_event = os_event_create(); common_info->merge_event = os_event_create(); common_info->opt_doc_id_size = opt_doc_id_size; - crypt_data = fil_space_get_crypt_data(new_table->space); + + /* Theoretically the tablespace can be dropped straight away. + In practice, the DDL completion will wait for this thread to + finish. */ + if (fil_space_t* space = fil_space_acquire(new_table->space)) { + crypt_data = space->crypt_data; + fil_space_release(space); + } if (crypt_data && crypt_data->should_encrypt()) { common_info->crypt_data = crypt_data; diff --git a/storage/xtradb/row/row0import.cc b/storage/xtradb/row/row0import.cc index 6170eb66195..6dc01907710 100644 --- a/storage/xtradb/row/row0import.cc +++ b/storage/xtradb/row/row0import.cc @@ -40,6 +40,7 @@ Created 2012-02-08 by Sunny Bains. 
#include "row0mysql.h" #include "srv0start.h" #include "row0quiesce.h" +#include "buf0buf.h" #include <vector> @@ -1873,10 +1874,10 @@ PageConverter::update_index_page( if (index == 0) { ib_logf(IB_LOG_LEVEL_ERROR, - "Page for tablespace %lu is " - " index page with id %lu but that" + "Page for tablespace " ULINTPF " is " + " index page with id " IB_ID_FMT " but that" " index is not found from configuration file." - " Current index name %s and id %lu.", + " Current index name %s and id " IB_ID_FMT ".", m_space, id, m_index->m_name, @@ -2036,12 +2037,15 @@ PageConverter::validate( buf_block_t* block) UNIV_NOTHROW { buf_frame_t* page = get_frame(block); + ulint space_id = mach_read_from_4( + page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); + fil_space_t* space = fil_space_found_by_id(space_id); /* Check that the page number corresponds to the offset in the file. Flag as corrupt if it doesn't. Disable the check for LSN in buf_page_is_corrupted() */ - if (buf_page_is_corrupted(false, page, get_zip_size()) + if (buf_page_is_corrupted(false, page, get_zip_size(), space) || (page_get_page_no(page) != offset / m_page_size && page_get_page_no(page) != 0)) { diff --git a/storage/xtradb/row/row0merge.cc b/storage/xtradb/row/row0merge.cc index ed8ab600611..72be305d481 100644 --- a/storage/xtradb/row/row0merge.cc +++ b/storage/xtradb/row/row0merge.cc @@ -112,9 +112,8 @@ row_merge_encrypt_buf( "Unable to encrypt data-block " " src: %p srclen: %lu buf: %p buflen: %u." " return-code: %d. Can't continue!\n", - input_buf, (ulong) srv_sort_buf_size, + input_buf, srv_sort_buf_size, crypted_buf, dstlen, rc); - ut_error; } } @@ -154,9 +153,8 @@ row_merge_decrypt_buf( "Unable to encrypt data-block " " src: %p srclen: %lu buf: %p buflen: %d." " return-code: %d. 
Can't continue!\n", - input_buf, (ulong) srv_sort_buf_size, + input_buf, srv_sort_buf_size, crypted_buf, dstlen, rc); - ut_error; } return (true); @@ -1073,14 +1071,8 @@ row_merge_read_rec( ulint data_size; ulint avail_size; - ut_ad(block); - ut_ad(buf); ut_ad(b >= &block[0]); ut_ad(b < &block[srv_sort_buf_size]); - ut_ad(index); - ut_ad(foffs); - ut_ad(mrec); - ut_ad(offsets); ut_ad(*offsets == 1 + REC_OFFS_HEADER_SIZE + dict_index_get_n_fields(index)); @@ -3959,7 +3951,7 @@ row_merge_build_indexes( { merge_file_t* merge_files; row_merge_block_t* block; - row_merge_block_t* crypt_block; + row_merge_block_t* crypt_block = NULL; ulint block_size; ulint i; ulint j; @@ -3995,9 +3987,15 @@ row_merge_build_indexes( DBUG_RETURN(DB_OUT_OF_MEMORY); } - /* Get crypt data from tablespace if present. */ - crypt_data = fil_space_get_crypt_data(new_table->space); - crypt_block = NULL; + /* Get crypt data from tablespace if present. We should be protected + from concurrent DDL (e.g. drop table) by MDL-locks. */ + fil_space_t* space = fil_space_acquire(new_table->space); + + if (space) { + crypt_data = space->crypt_data; + } else { + DBUG_RETURN(DB_TABLESPACE_NOT_FOUND); + } /* If tablespace is encrypted, allocate additional buffer for encryption/decryption. 
*/ @@ -4172,8 +4170,8 @@ wait_again: for (j = 0; j < FTS_NUM_AUX_INDEX; j++) { - os_thread_join(merge_info[j] - .thread_hdl); + os_thread_join(merge_info[j] + .thread_hdl); } } } else { @@ -4361,5 +4359,9 @@ func_exit: } } + if (space) { + fil_space_release(space); + } + DBUG_RETURN(error); } diff --git a/storage/xtradb/row/row0mysql.cc b/storage/xtradb/row/row0mysql.cc index c81b10b93f1..7c8636d354f 100644 --- a/storage/xtradb/row/row0mysql.cc +++ b/storage/xtradb/row/row0mysql.cc @@ -1373,6 +1373,8 @@ run_again: row_ins_step(thr); + DEBUG_SYNC_C("ib_after_row_insert_step"); + err = trx->error_state; if (err != DB_SUCCESS) { @@ -3308,21 +3310,17 @@ void fil_wait_crypt_bg_threads( dict_table_t* table) { - uint start = time(0); - uint last = start; - - if (table->space != 0) { - fil_space_crypt_mark_space_closing(table->space, table->crypt_data); - } + time_t start = time(0); + time_t last = start; while (table->n_ref_count > 0) { dict_mutex_exit_for_mysql(); os_thread_sleep(20000); dict_mutex_enter_for_mysql(); - uint now = time(0); + time_t now = time(0); if (now >= last + 30) { fprintf(stderr, - "WARNING: waited %u seconds " + "WARNING: waited %ld seconds " "for ref-count on table: %s space: %u\n", now - start, table->name, table->space); last = now; @@ -3330,7 +3328,7 @@ fil_wait_crypt_bg_threads( if (now >= start + 300) { fprintf(stderr, - "WARNING: after %u seconds, gave up waiting " + "WARNING: after %ld seconds, gave up waiting " "for ref-count on table: %s space: %u\n", now - start, table->name, table->space); break; @@ -3526,35 +3524,40 @@ row_truncate_table_for_mysql( if (table->space && !DICT_TF2_FLAG_IS_SET(table, DICT_TF2_TEMPORARY)) { /* Discard and create the single-table tablespace. 
*/ - fil_space_crypt_t* crypt_data; - ulint space = table->space; - ulint flags = fil_space_get_flags(space); + ulint space_id = table->space; + ulint flags = ULINT_UNDEFINED; ulint key_id = FIL_DEFAULT_ENCRYPTION_KEY; - fil_encryption_t mode = FIL_SPACE_ENCRYPTION_DEFAULT; + fil_encryption_t mode = FIL_ENCRYPTION_DEFAULT; dict_get_and_save_data_dir_path(table, true); - crypt_data = fil_space_get_crypt_data(space); - if (crypt_data) { - key_id = crypt_data->key_id; - mode = crypt_data->encryption; + if (fil_space_t* space = fil_space_acquire(space_id)) { + fil_space_crypt_t* crypt_data = space->crypt_data; + + if (crypt_data) { + key_id = crypt_data->key_id; + mode = crypt_data->encryption; + } + + flags = space->flags; + fil_space_release(space); } if (flags != ULINT_UNDEFINED - && fil_discard_tablespace(space) == DB_SUCCESS) { + && fil_discard_tablespace(space_id) == DB_SUCCESS) { dict_index_t* index; - dict_hdr_get_new_id(NULL, NULL, &space); + dict_hdr_get_new_id(NULL, NULL, &space_id); /* Lock all index trees for this table. We must do so after dict_hdr_get_new_id() to preserve the latch order */ dict_table_x_lock_indexes(table); - if (space == ULINT_UNDEFINED + if (space_id == ULINT_UNDEFINED || fil_create_new_single_table_tablespace( - space, table->name, + space_id, table->name, table->data_dir_path, flags, table->flags2, FIL_IBD_FILE_INITIAL_SIZE, @@ -3572,21 +3575,21 @@ row_truncate_table_for_mysql( goto funct_exit; } - recreate_space = space; + recreate_space = space_id; /* Replace the space_id in the data dictionary cache. The persisent data dictionary (SYS_TABLES.SPACE and SYS_INDEXES.SPACE) are updated later in this function. 
*/ - table->space = space; + table->space = space_id; index = dict_table_get_first_index(table); do { - index->space = space; + index->space = space_id; index = dict_table_get_next_index(index); } while (index); mtr_start_trx(&mtr, trx); - fsp_header_init(space, + fsp_header_init(space_id, FIL_IBD_FILE_INITIAL_SIZE, &mtr); mtr_commit(&mtr); } @@ -4260,7 +4263,13 @@ row_drop_table_for_mysql( /* If table has not yet have crypt_data, try to read it to make freeing the table easier. */ if (!table->crypt_data) { - table->crypt_data = fil_space_get_crypt_data(table->space); + + if (fil_space_t* space = fil_space_acquire_silent(table->space)) { + /* We use crypt data in dict_table_t in ha_innodb.cc + to push warnings to user thread. */ + table->crypt_data = space->crypt_data; + fil_space_release(space); + } } /* We use the private SQL parser of Innobase to generate the diff --git a/storage/xtradb/row/row0purge.cc b/storage/xtradb/row/row0purge.cc index bc2e0b0e1cb..35b3520749b 100644 --- a/storage/xtradb/row/row0purge.cc +++ b/storage/xtradb/row/row0purge.cc @@ -897,7 +897,7 @@ row_purge_record_func( Fetches an undo log record and does the purge for the recorded operation. If none left, or the current purge completed, returns the control to the parent node, which is always a query thread node. 
*/ -static MY_ATTRIBUTE((nonnull)) +static void row_purge( /*======*/ diff --git a/storage/xtradb/row/row0upd.cc b/storage/xtradb/row/row0upd.cc index 69206efd530..1156cbe4b4c 100644 --- a/storage/xtradb/row/row0upd.cc +++ b/storage/xtradb/row/row0upd.cc @@ -1285,8 +1285,6 @@ row_upd_index_replace_new_col_vals_index_pos( ulint n_fields; const ulint zip_size = dict_table_zip_size(index->table); - ut_ad(index); - dtuple_set_info_bits(entry, update->info_bits); if (order_only) { @@ -1471,8 +1469,6 @@ row_upd_changes_ord_field_binary_func( ulint i; const dict_index_t* clust_index; - ut_ad(index); - ut_ad(update); ut_ad(thr); ut_ad(thr->graph); ut_ad(thr->graph->trx); diff --git a/storage/xtradb/srv/srv0conc.cc b/storage/xtradb/srv/srv0conc.cc index a8e7e2ab1aa..e90f744cfa4 100644 --- a/storage/xtradb/srv/srv0conc.cc +++ b/storage/xtradb/srv/srv0conc.cc @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 2011, 2012, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, MariaDB Corporation. All Rights Reserved. Portions of this file contain modifications contributed and copyrighted by Google, Inc. Those modifications are gratefully acknowledged and are described @@ -79,7 +80,9 @@ typedef UT_LIST_NODE_T(struct srv_conc_slot_t) srv_conc_node_t; /** Slot for a thread waiting in the concurrency control queue. 
*/ struct srv_conc_slot_t{ - os_event_t event; /*!< event to wait */ + os_event_t event; /*!< event to wait for; + os_event_set() and os_event_reset() + are protected by srv_conc_mutex */ ibool reserved; /*!< TRUE if slot reserved */ ibool wait_ended; /*!< TRUE when another thread has @@ -378,11 +381,11 @@ srv_conc_exit_innodb_without_atomics( } } - os_fast_mutex_unlock(&srv_conc_mutex); - if (slot != NULL) { os_event_set(slot->event); } + + os_fast_mutex_unlock(&srv_conc_mutex); } /*********************************************************************//** diff --git a/storage/xtradb/srv/srv0srv.cc b/storage/xtradb/srv/srv0srv.cc index 42d667b111c..f8c8c330f0c 100644 --- a/storage/xtradb/srv/srv0srv.cc +++ b/storage/xtradb/srv/srv0srv.cc @@ -79,12 +79,6 @@ Created 10/8/1995 Heikki Tuuri #include <my_rdtsc.h> #include "btr0scrub.h" -/* prototypes of new functions added to ha_innodb.cc for kill_idle_transaction */ -ibool innobase_thd_is_idle(const void* thd); -ib_int64_t innobase_thd_get_start_time(const void* thd); -void innobase_thd_kill(ulong thd_id); -ulong innobase_thd_get_thread_id(const void* thd); - /* prototypes for new functions added to ha_innodb.cc */ ibool innobase_get_slow_log(); @@ -474,11 +468,6 @@ starting from SRV_FORCE_IGNORE_CORRUPT, so that data can be recovered by SELECT or mysqldump. When this is nonzero, we do not allow any user modifications to the data. */ UNIV_INTERN ulong srv_force_recovery; -#ifndef DBUG_OFF -/** Inject a crash at different steps of the recovery process. -This is for testing and debugging only. 
*/ -UNIV_INTERN ulong srv_force_recovery_crash; -#endif /* !DBUG_OFF */ /** Print all user-level transactions deadlocks to mysqld stderr */ @@ -511,6 +500,7 @@ this many index pages, there are 2 ways to calculate statistics: table/index are not found in the innodb database */ UNIV_INTERN unsigned long long srv_stats_transient_sample_pages = 8; UNIV_INTERN my_bool srv_stats_persistent = TRUE; +UNIV_INTERN my_bool srv_stats_include_delete_marked = FALSE; UNIV_INTERN unsigned long long srv_stats_persistent_sample_pages = 20; UNIV_INTERN my_bool srv_stats_auto_recalc = TRUE; @@ -609,7 +599,7 @@ UNIV_INTERN const char* srv_io_thread_function[SRV_MAX_N_IO_THREADS]; UNIV_INTERN time_t srv_last_monitor_time; -UNIV_INTERN ib_mutex_t srv_innodb_monitor_mutex; +static ib_mutex_t srv_innodb_monitor_mutex; /* Mutex for locking srv_monitor_file. Not created if srv_read_only_mode */ UNIV_INTERN ib_mutex_t srv_monitor_file_mutex; @@ -795,7 +785,11 @@ struct srv_sys_t{ ulint n_sys_threads; /*!< size of the sys_threads array */ - srv_slot_t* sys_threads; /*!< server thread table */ + srv_slot_t* sys_threads; /*!< server thread table; + os_event_set() and + os_event_reset() on + sys_threads[]->event are + covered by srv_sys_t::mutex */ ulint n_threads_active[SRV_MASTER + 1]; /*!< number of threads active @@ -817,13 +811,16 @@ UNIV_INTERN ib_mutex_t server_mutex; static srv_sys_t* srv_sys = NULL; -/** Event to signal the monitor thread. */ +/** Event to signal srv_monitor_thread. Not protected by a mutex. +Set after setting srv_print_innodb_monitor. */ UNIV_INTERN os_event_t srv_monitor_event; -/** Event to signal the error thread */ +/** Event to signal the shutdown of srv_error_monitor_thread. +Not protected by a mutex. */ UNIV_INTERN os_event_t srv_error_event; -/** Event to signal the buffer pool dump/load thread */ +/** Event for waking up buf_dump_thread. Not protected by a mutex. +Set on shutdown or by buf_dump_start() or buf_load_start(). 
*/ UNIV_INTERN os_event_t srv_buf_dump_event; /** The buffer pool dump/load file name */ @@ -993,7 +990,6 @@ srv_suspend_thread_low( /*===================*/ srv_slot_t* slot) /*!< in/out: thread slot */ { - ut_ad(!srv_read_only_mode); ut_ad(srv_sys_mutex_own()); @@ -1051,34 +1047,71 @@ srv_suspend_thread( return(sig_count); } -/*********************************************************************//** -Releases threads of the type given from suspension in the thread table. -NOTE! The server mutex has to be reserved by the caller! -@return number of threads released: this may be less than n if not - enough threads were suspended at the moment. */ -UNIV_INTERN -ulint -srv_release_threads( -/*================*/ - srv_thread_type type, /*!< in: thread type */ - ulint n) /*!< in: number of threads to release */ +/** Resume the calling thread. +@param[in,out] slot thread slot +@param[in] sig_count signal count (if wait) +@param[in] wait whether to wait for the event +@param[in] timeout_usec timeout in microseconds (0=infinite) +@return whether the wait timed out */ +static +bool +srv_resume_thread(srv_slot_t* slot, ib_int64_t sig_count = 0, bool wait = true, + ulint timeout_usec = 0) +{ + bool timeout; + + ut_ad(!srv_read_only_mode); + ut_ad(slot->in_use); + ut_ad(slot->suspended); + + if (!wait) { + timeout = false; + } else if (timeout_usec) { + timeout = OS_SYNC_TIME_EXCEEDED == os_event_wait_time_low( + slot->event, timeout_usec, sig_count); + } else { + timeout = false; + os_event_wait_low(slot->event, sig_count); + } + + srv_sys_mutex_enter(); + ut_ad(slot->in_use); + ut_ad(slot->suspended); + + slot->suspended = FALSE; + ++srv_sys->n_threads_active[slot->type]; + srv_sys_mutex_exit(); + return(timeout); +} + +/** Ensure that a given number of threads of the type given are running +(or are already terminated). 
+@param[in] type thread type +@param[in] n number of threads that have to run */ +void +srv_release_threads(enum srv_thread_type type, ulint n) { - ulint i; - ulint count = 0; + ulint running; ut_ad(srv_thread_type_validate(type)); ut_ad(n > 0); - srv_sys_mutex_enter(); + do { + running = 0; - for (i = 0; i < srv_sys->n_sys_threads; i++) { - srv_slot_t* slot; + srv_sys_mutex_enter(); - slot = &srv_sys->sys_threads[i]; + for (ulint i = 0; i < srv_sys->n_sys_threads; i++) { + srv_slot_t* slot = &srv_sys->sys_threads[i]; - if (slot->in_use - && srv_slot_get_type(slot) == type - && slot->suspended) { + if (!slot->in_use || srv_slot_get_type(slot) != type) { + continue; + } else if (!slot->suspended) { + if (++running >= n) { + break; + } + continue; + } switch (type) { case SRV_NONE: @@ -1108,21 +1141,11 @@ srv_release_threads( break; } - slot->suspended = FALSE; - - ++srv_sys->n_threads_active[type]; - os_event_set(slot->event); - - if (++count == n) { - break; - } } - } - srv_sys_mutex_exit(); - - return(count); + srv_sys_mutex_exit(); + } while (running && running < n); } /*********************************************************************//** @@ -1135,11 +1158,8 @@ srv_free_slot( { srv_sys_mutex_enter(); - if (!slot->suspended) { - /* Mark the thread as inactive. */ - srv_suspend_thread_low(slot); - } - + /* Mark the thread as inactive. */ + srv_suspend_thread_low(slot); /* Free the slot for reuse. 
*/ ut_ad(slot->in_use); slot->in_use = FALSE; @@ -1259,16 +1279,22 @@ srv_free(void) os_event_free(srv_sys->sys_threads[i].event); os_event_free(srv_error_event); + srv_error_event = NULL; os_event_free(srv_monitor_event); + srv_monitor_event = NULL; os_event_free(srv_buf_dump_event); + srv_buf_dump_event = NULL; os_event_free(srv_checkpoint_completed_event); + srv_checkpoint_completed_event = NULL; os_event_free(srv_redo_log_tracked_event); + srv_redo_log_tracked_event = NULL; mutex_free(&srv_sys->mutex); mutex_free(&srv_sys->tasks_mutex); } #ifdef WITH_INNODB_DISALLOW_WRITES os_event_free(srv_allow_writes_event); + srv_allow_writes_event = NULL; #endif /* WITH_INNODB_DISALLOW_WRITES */ #ifndef HAVE_ATOMIC_BUILTINS @@ -1450,22 +1476,26 @@ srv_printf_innodb_monitor( low level 135. Therefore we can reserve the latter mutex here without a danger of a deadlock of threads. */ - mutex_enter(&dict_foreign_err_mutex); + if (!recv_recovery_on) { - if (!srv_read_only_mode && ftell(dict_foreign_err_file) != 0L) { - fputs("------------------------\n" - "LATEST FOREIGN KEY ERROR\n" - "------------------------\n", file); - ut_copy_file(file, dict_foreign_err_file); - } + mutex_enter(&dict_foreign_err_mutex); - mutex_exit(&dict_foreign_err_mutex); + if (!srv_read_only_mode + && ftell(dict_foreign_err_file) != 0L) { + fputs("------------------------\n" + "LATEST FOREIGN KEY ERROR\n" + "------------------------\n", file); + ut_copy_file(file, dict_foreign_err_file); + } + + mutex_exit(&dict_foreign_err_mutex); + } /* Only if lock_print_info_summary proceeds correctly, before we call the lock_print_info_all_transactions to print all the lock information. IMPORTANT NOTE: This function acquires the lock mutex on success. */ - ret = lock_print_info_summary(file, nowait); + ret = recv_recovery_on ? 
FALSE : lock_print_info_summary(file, nowait); if (ret) { if (trx_start_pos) { @@ -1498,10 +1528,13 @@ srv_printf_innodb_monitor( "--------\n", file); os_aio_print(file); - fputs("-------------------------------------\n" - "INSERT BUFFER AND ADAPTIVE HASH INDEX\n" - "-------------------------------------\n", file); - ibuf_print(file); + if (!recv_recovery_on) { + + fputs("-------------------------------------\n" + "INSERT BUFFER AND ADAPTIVE HASH INDEX\n" + "-------------------------------------\n", file); + ibuf_print(file); + } fprintf(file, @@ -1513,10 +1546,13 @@ srv_printf_innodb_monitor( btr_cur_n_sea_old = btr_cur_n_sea; btr_cur_n_non_sea_old = btr_cur_n_non_sea; - fputs("---\n" - "LOG\n" - "---\n", file); - log_print(file); + if (!recv_recovery_on) { + + fputs("---\n" + "LOG\n" + "---\n", file); + log_print(file); + } fputs("----------------------\n" "BUFFER POOL AND MEMORY\n" @@ -1611,8 +1647,9 @@ srv_printf_innodb_monitor( ? (recv_sys->addr_hash->n_cells * sizeof(hash_cell_t)) : 0), recv_sys_subtotal); + fprintf(file, "Dictionary memory allocated " ULINTPF "\n", - dict_sys->size); + dict_sys ? 
dict_sys->size : 0); buf_print_io(file); @@ -1720,6 +1757,10 @@ srv_printf_innodb_monitor( mutex_exit(&srv_innodb_monitor_mutex); fflush(file); +#ifndef DBUG_OFF + srv_debug_monitor_printed = true; +#endif + return(ret); } @@ -2071,6 +2112,8 @@ srv_export_innodb_status(void) crypt_stat.estimated_iops; export_vars.innodb_encryption_key_requests = srv_stats.n_key_requests; + export_vars.innodb_key_rotation_list_length = + srv_stats.key_rotation_list_length; export_vars.innodb_scrub_page_reorganizations = scrub_stat.page_reorganizations; @@ -2088,6 +2131,12 @@ srv_export_innodb_status(void) mutex_exit(&srv_innodb_monitor_mutex); } +#ifndef DBUG_OFF +/** false before InnoDB monitor has been printed at least once, true +afterwards */ +bool srv_debug_monitor_printed = false; +#endif + /*********************************************************************//** A thread which prints the info output by various InnoDB monitors. @return a dummy parameter */ @@ -2361,36 +2410,6 @@ loop: old_sema = sema; } - if (srv_kill_idle_transaction && trx_sys) { - trx_t* trx; - time_t now; -rescan_idle: - now = time(NULL); - mutex_enter(&trx_sys->mutex); - trx = UT_LIST_GET_FIRST(trx_sys->mysql_trx_list); - while (trx) { - if (trx->state == TRX_STATE_ACTIVE - && trx->mysql_thd - && innobase_thd_is_idle(trx->mysql_thd)) { - ib_int64_t start_time = innobase_thd_get_start_time(trx->mysql_thd); - ulong thd_id = innobase_thd_get_thread_id(trx->mysql_thd); - - if (trx->last_stmt_start != start_time) { - trx->idle_start = now; - trx->last_stmt_start = start_time; - } else if (difftime(now, trx->idle_start) - > srv_kill_idle_transaction) { - /* kill the session */ - mutex_exit(&trx_sys->mutex); - innobase_thd_kill(thd_id); - goto rescan_idle; - } - } - trx = UT_LIST_GET_NEXT(mysql_trx_list, trx); - } - mutex_exit(&trx_sys->mutex); - } - /* Flush stderr so that a database user gets the output to possible MySQL error file */ @@ -2512,10 +2531,8 @@ DECLARE_THREAD(srv_redo_log_follow_thread)( } while 
(srv_shutdown_state < SRV_SHUTDOWN_LAST_PHASE); - srv_track_changed_pages = FALSE; log_online_read_shutdown(); os_event_set(srv_redo_log_tracked_event); - srv_redo_log_thread_started = false; /* Defensive, not required */ my_thread_end(); os_thread_exit(NULL); @@ -2681,15 +2698,7 @@ srv_active_wake_master_thread(void) if (slot->in_use) { ut_a(srv_slot_get_type(slot) == SRV_MASTER); - - if (slot->suspended) { - - slot->suspended = FALSE; - - ++srv_sys->n_threads_active[SRV_MASTER]; - - os_event_set(slot->event); - } + os_event_set(slot->event); } srv_sys_mutex_exit(); @@ -3216,7 +3225,7 @@ suspend_thread: manual also mentions this string in several places. */ srv_main_thread_op_info = "waiting for server activity"; - os_event_wait(slot->event); + srv_resume_thread(slot); if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) { os_thread_exit(NULL); @@ -3338,8 +3347,7 @@ DECLARE_THREAD(srv_worker_thread)( do { srv_suspend_thread(slot); - - os_event_wait(slot->event); + srv_resume_thread(slot); srv_current_thread_priority = srv_purge_thread_priority; @@ -3479,8 +3487,6 @@ srv_purge_coordinator_suspend( ib_int64_t sig_count = srv_suspend_thread(slot); do { - ulint ret; - rw_lock_x_lock(&purge_sys->latch); purge_sys->running = false; @@ -3489,32 +3495,11 @@ srv_purge_coordinator_suspend( /* We don't wait right away on the the non-timed wait because we want to signal the thread that wants to suspend purge. */ - - if (stop) { - os_event_wait_low(slot->event, sig_count); - ret = 0; - } else if (rseg_history_len <= trx_sys->rseg_history_len) { - ret = os_event_wait_time_low( - slot->event, SRV_PURGE_MAX_TIMEOUT, sig_count); - } else { - /* We don't want to waste time waiting, if the - history list increased by the time we got here, - unless purge has been stopped. */ - ret = 0; - } - - srv_sys_mutex_enter(); - - /* The thread can be in state !suspended after the timeout - but before this check if another thread sent a wakeup signal. 
*/ - - if (slot->suspended) { - slot->suspended = FALSE; - ++srv_sys->n_threads_active[slot->type]; - ut_a(srv_sys->n_threads_active[slot->type] == 1); - } - - srv_sys_mutex_exit(); + const bool wait = stop + || rseg_history_len <= trx_sys->rseg_history_len; + const bool timeout = srv_resume_thread( + slot, sig_count, wait, + stop ? 0 : SRV_PURGE_MAX_TIMEOUT); sig_count = srv_suspend_thread(slot); @@ -3526,6 +3511,19 @@ srv_purge_coordinator_suspend( if (!stop) { ut_a(purge_sys->n_stop == 0); purge_sys->running = true; + + if (timeout + && rseg_history_len == trx_sys->rseg_history_len + && trx_sys->rseg_history_len < 5000) { + /* No new records were added since the + wait started. Simply wait for new + records. The magic number 5000 is an + approximation for the case where we + have cached UNDO log records which + prevent truncate of the UNDO + segments. */ + stop = true; + } } else { ut_a(purge_sys->n_stop > 0); @@ -3534,33 +3532,9 @@ srv_purge_coordinator_suspend( } rw_lock_x_unlock(&purge_sys->latch); - - if (ret == OS_SYNC_TIME_EXCEEDED) { - - /* No new records added since wait started then simply - wait for new records. The magic number 5000 is an - approximation for the case where we have cached UNDO - log records which prevent truncate of the UNDO - segments. 
*/ - - if (rseg_history_len == trx_sys->rseg_history_len - && trx_sys->rseg_history_len < 5000) { - - stop = true; - } - } - } while (stop); - srv_sys_mutex_enter(); - - if (slot->suspended) { - slot->suspended = FALSE; - ++srv_sys->n_threads_active[slot->type]; - ut_a(srv_sys->n_threads_active[slot->type] == 1); - } - - srv_sys_mutex_exit(); + srv_resume_thread(slot, 0, false); } /*********************************************************************//** @@ -3616,8 +3590,9 @@ DECLARE_THREAD(srv_purge_coordinator_thread)( srv_purge_coordinator_suspend(slot, rseg_history_len); } + ut_ad(!slot->suspended); + if (srv_purge_should_exit(n_total_purged)) { - ut_a(!slot->suspended); break; } @@ -3732,12 +3707,10 @@ srv_get_task_queue_length(void) return(n_tasks); } -/**********************************************************************//** -Wakeup the purge threads. */ +/** Wake up the purge threads. */ UNIV_INTERN void -srv_purge_wakeup(void) -/*==================*/ +srv_purge_wakeup() { ut_ad(!srv_read_only_mode); @@ -3752,4 +3725,3 @@ srv_purge_wakeup(void) } } } - diff --git a/storage/xtradb/srv/srv0start.cc b/storage/xtradb/srv/srv0start.cc index 4dd31ad43f6..5255a7454ea 100644 --- a/storage/xtradb/srv/srv0start.cc +++ b/storage/xtradb/srv/srv0start.cc @@ -623,19 +623,6 @@ create_log_file( /** Initial number of the first redo log file */ #define INIT_LOG_FILE0 (SRV_N_LOG_FILES_MAX + 1) -#ifdef DBUG_OFF -# define RECOVERY_CRASH(x) do {} while(0) -#else -# define RECOVERY_CRASH(x) do { \ - if (srv_force_recovery_crash == x) { \ - fprintf(stderr, "innodb_force_recovery_crash=%lu\n", \ - srv_force_recovery_crash); \ - fflush(stderr); \ - exit(3); \ - } \ -} while (0) -#endif - /*********************************************************************//** Creates all log files. @return DB_SUCCESS or error code */ @@ -676,13 +663,14 @@ create_log_files( file should be recoverable. The buffer pool was clean, and we can simply create all log files from the scratch. 
*/ - RECOVERY_CRASH(6); + DBUG_EXECUTE_IF("innodb_log_abort_6", + return(DB_ERROR);); } } ut_ad(!buf_pool_check_no_pending_io()); - RECOVERY_CRASH(7); + DBUG_EXECUTE_IF("innodb_log_abort_7", return(DB_ERROR);); for (unsigned i = 0; i < srv_n_log_files; i++) { sprintf(logfilename + dirnamelen, @@ -695,7 +683,7 @@ create_log_files( } } - RECOVERY_CRASH(8); + DBUG_EXECUTE_IF("innodb_log_abort_8", return(DB_ERROR);); /* We did not create the first log file initially as ib_logfile0, so that crash recovery cannot find it until it @@ -707,6 +695,7 @@ create_log_files( FIL_LOG, NULL /* no encryption yet */, true /* this is create */); + ut_a(fil_validate()); logfile0 = fil_node_create( @@ -751,10 +740,16 @@ create_log_files( return(DB_SUCCESS); } -/*********************************************************************//** -Renames the first log file. */ +/** Rename the first redo log file. +@param[in,out] logfilename buffer for the log file name +@param[in] dirnamelen length of the directory path +@param[in] lsn FIL_PAGE_FILE_FLUSH_LSN value +@param[in,out] logfile0 name of the first log file +@return error code +@retval DB_SUCCESS on successful operation */ +MY_ATTRIBUTE((warn_unused_result, nonnull)) static -void +dberr_t create_log_files_rename( /*====================*/ char* logfilename, /*!< in/out: buffer for log file name */ @@ -765,6 +760,9 @@ create_log_files_rename( /* If innodb_flush_method=O_DSYNC, we need to explicitly flush the log buffers. */ fil_flush(SRV_LOG_SPACE_FIRST_ID); + + DBUG_EXECUTE_IF("innodb_log_abort_9", return(DB_ERROR);); + /* Close the log files, so that we can rename the first one. */ fil_close_log_files(false); @@ -773,26 +771,28 @@ create_log_files_rename( checkpoint has been created. 
*/ sprintf(logfilename + dirnamelen, "ib_logfile%u", 0); - RECOVERY_CRASH(9); - ib_logf(IB_LOG_LEVEL_INFO, "Renaming log file %s to %s", logfile0, logfilename); mutex_enter(&log_sys->mutex); ut_ad(strlen(logfile0) == 2 + strlen(logfilename)); - ibool success = os_file_rename( - innodb_file_log_key, logfile0, logfilename); - ut_a(success); - - RECOVERY_CRASH(10); + dberr_t err = os_file_rename( + innodb_file_log_key, logfile0, logfilename) + ? DB_SUCCESS : DB_ERROR; /* Replace the first file with ib_logfile0. */ strcpy(logfile0, logfilename); mutex_exit(&log_sys->mutex); - fil_open_log_and_system_tablespace_files(); + DBUG_EXECUTE_IF("innodb_log_abort_10", err = DB_ERROR;); - ib_logf(IB_LOG_LEVEL_WARN, "New log files created, LSN=" LSN_PF, lsn); + if (err == DB_SUCCESS) { + fil_open_log_and_system_tablespace_files(); + ib_logf(IB_LOG_LEVEL_WARN, + "New log files created, LSN=" LSN_PF, lsn); + } + + return(err); } /*********************************************************************//** @@ -1163,14 +1163,13 @@ check_first_page: (ulong) (srv_data_file_sizes[i] >> (20 - UNIV_PAGE_SIZE_SHIFT))); - ib_logf(IB_LOG_LEVEL_INFO, - "Database physically writes the" - " file full: wait..."); - ret = os_file_set_size( name, files[i], (os_offset_t) srv_data_file_sizes[i] - << UNIV_PAGE_SIZE_SHIFT); + << UNIV_PAGE_SIZE_SHIFT + /* TODO: enable page_compression on the + system tablespace and add + , FSP_FLAGS_HAS_PAGE_COMPRESSION(flags)*/); if (!ret) { ib_logf(IB_LOG_LEVEL_ERROR, @@ -1189,13 +1188,14 @@ check_first_page: if (i == 0) { if (!crypt_data) { - crypt_data = fil_space_create_crypt_data(FIL_SPACE_ENCRYPTION_DEFAULT, FIL_DEFAULT_ENCRYPTION_KEY); + crypt_data = fil_space_create_crypt_data(FIL_ENCRYPTION_DEFAULT, + FIL_DEFAULT_ENCRYPTION_KEY); } flags = FSP_FLAGS_PAGE_SSIZE(); fil_space_create(name, 0, flags, FIL_TABLESPACE, - crypt_data, (*create_new_db) == true); + crypt_data, (*create_new_db) == true); } ut_a(fil_validate()); @@ -1267,10 +1267,11 @@ 
srv_undo_tablespace_create( "Setting file %s size to %lu MB", name, size >> (20 - UNIV_PAGE_SIZE_SHIFT)); - ib_logf(IB_LOG_LEVEL_INFO, - "Database physically writes the file full: wait..."); - - ret = os_file_set_size(name, fh, size << UNIV_PAGE_SIZE_SHIFT); + ret = os_file_set_size(name, fh, size << UNIV_PAGE_SIZE_SHIFT + /* TODO: enable page_compression on the + system tablespace and add + FSP_FLAGS_HAS_PAGE_COMPRESSION(flags) + */); if (!ret) { ib_logf(IB_LOG_LEVEL_INFO, @@ -2090,6 +2091,7 @@ innobase_start_or_create_for_mysql(void) fsp_init(); log_init(); + log_online_init(); lock_sys_create(srv_lock_table_size); @@ -2268,14 +2270,18 @@ innobase_start_or_create_for_mysql(void) dirnamelen, max_flushed_lsn, logfile0); + if (err == DB_SUCCESS) { + err = create_log_files_rename( + logfilename, + dirnamelen, + max_flushed_lsn, + logfile0); + } + if (err != DB_SUCCESS) { return(err); } - create_log_files_rename( - logfilename, dirnamelen, - max_flushed_lsn, logfile0); - /* Suppress the message about crash recovery. */ max_flushed_lsn = min_flushed_lsn @@ -2445,8 +2451,12 @@ files_checked: fil_flush_file_spaces(FIL_TABLESPACE); - create_log_files_rename(logfilename, dirnamelen, - max_flushed_lsn, logfile0); + err = create_log_files_rename(logfilename, dirnamelen, + max_flushed_lsn, logfile0); + + if (err != DB_SUCCESS) { + return(err); + } } else { /* Check if we support the max format that is stamped @@ -2475,6 +2485,23 @@ files_checked: and there must be no page in the buf_flush list. */ buf_pool_invalidate(); + /* Start monitor thread early enough so that e.g. crash + recovery failing to find free pages in the buffer pool is + diagnosed. 
*/ + if (!srv_read_only_mode) + { + /* Create the thread which prints InnoDB monitor + info */ + srv_monitor_active = true; + thread_handles[4 + SRV_MAX_N_IO_THREADS] = + os_thread_create( + srv_monitor_thread, + NULL, + thread_ids + 4 + SRV_MAX_N_IO_THREADS); + + thread_started[4 + SRV_MAX_N_IO_THREADS] = true; + } + /* We always try to do a recovery, even if the database had been shut down normally: this is the normal startup path */ @@ -2495,7 +2522,7 @@ files_checked: return(err); } - /* This must precede recv_apply_hashed_log_recs(TRUE). */ + /* This must precede recv_apply_hashed_log_recs(true). */ ib_bh = trx_sys_init_at_db_start(); if (srv_force_recovery < SRV_FORCE_NO_LOG_REDO) { @@ -2503,12 +2530,8 @@ files_checked: respective file pages, for the last batch of recv_group_scan_log_recs(). */ - err = recv_apply_hashed_log_recs(TRUE); + recv_apply_hashed_log_recs(true); DBUG_PRINT("ib_log", ("apply completed")); - - if (err != DB_SUCCESS) { - return(err); - } } if (!srv_read_only_mode) { @@ -2648,7 +2671,8 @@ files_checked: ULINT_MAX, LSN_MAX, NULL); ut_a(success); - RECOVERY_CRASH(1); + DBUG_EXECUTE_IF("innodb_log_abort_1", + return(DB_ERROR);); min_flushed_lsn = max_flushed_lsn = log_get_lsn(); @@ -2663,8 +2687,6 @@ files_checked: buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST); - RECOVERY_CRASH(2); - /* Flush the old log files. */ log_buffer_flush_to_disk(); /* If innodb_flush_method=O_DSYNC, @@ -2679,21 +2701,27 @@ files_checked: ut_d(recv_no_log_write = TRUE); ut_ad(!buf_pool_check_no_pending_io()); - RECOVERY_CRASH(3); + DBUG_EXECUTE_IF("innodb_log_abort_3", + return(DB_ERROR);); /* Stamp the LSN to the data files. */ fil_write_flushed_lsn_to_data_files( max_flushed_lsn, 0); - fil_flush_file_spaces(FIL_TABLESPACE); + DBUG_EXECUTE_IF("innodb_log_abort_4", err = DB_ERROR;); - RECOVERY_CRASH(4); + if (err != DB_SUCCESS) { + return(err); + } + + fil_flush_file_spaces(FIL_TABLESPACE); /* Close and free the redo log files, so that we can replace them. 
*/ fil_close_log_files(true); - RECOVERY_CRASH(5); + DBUG_EXECUTE_IF("innodb_log_abort_5", + return(DB_ERROR);); /* Free the old log file space. */ log_group_close_all(); @@ -2717,8 +2745,11 @@ files_checked: fil_write_flushed_lsn_to_data_files(min_flushed_lsn, 0); fil_flush_file_spaces(FIL_TABLESPACE); - create_log_files_rename(logfilename, dirnamelen, - log_get_lsn(), logfile0); + err = create_log_files_rename(logfilename, dirnamelen, + log_get_lsn(), logfile0); + if (err != DB_SUCCESS) { + return(err); + } } srv_startup_is_before_trx_rollback_phase = FALSE; @@ -2814,11 +2845,14 @@ files_checked: thread_started[3 + SRV_MAX_N_IO_THREADS] = true; /* Create the thread which prints InnoDB monitor info */ - srv_monitor_active = true; - thread_handles[4 + SRV_MAX_N_IO_THREADS] = os_thread_create( - srv_monitor_thread, - NULL, thread_ids + 4 + SRV_MAX_N_IO_THREADS); - thread_started[4 + SRV_MAX_N_IO_THREADS] = true; + if (!thread_started[4 + SRV_MAX_N_IO_THREADS]) { + /* srv_monitor_thread not yet started */ + srv_monitor_active = true; + thread_handles[4 + SRV_MAX_N_IO_THREADS] = os_thread_create( + srv_monitor_thread, + NULL, thread_ids + 4 + SRV_MAX_N_IO_THREADS); + thread_started[4 + SRV_MAX_N_IO_THREADS] = true; + } } /* Create the SYS_FOREIGN and SYS_FOREIGN_COLS system tables */ @@ -3189,6 +3223,7 @@ innobase_shutdown_for_mysql(void) btr_search_disable(); ibuf_close(); + log_online_shutdown(); log_shutdown(); trx_sys_file_format_close(); trx_sys_close(); diff --git a/storage/xtradb/sync/sync0sync.cc b/storage/xtradb/sync/sync0sync.cc index 5c4b45eb3c0..6692eef9fb0 100644 --- a/storage/xtradb/sync/sync0sync.cc +++ b/storage/xtradb/sync/sync0sync.cc @@ -2,6 +2,7 @@ Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, Google Inc. +Copyright (c) 2017, MariaDB Corporation. Portions of this file contain modifications contributed and copyrighted by Google, Inc. 
Those modifications are gratefully acknowledged and are described @@ -1363,7 +1364,7 @@ sync_thread_add_level( case SYNC_TRX_UNDO_PAGE: /* Purge is allowed to read in as many UNDO pages as it likes, there was a bogus rule here earlier that forced the caller to - acquire the purge_sys_t::mutex. The purge mutex did not really + acquire the trx_purge_t::mutex. The purge mutex did not really protect anything because it was only ever acquired by the single purge thread. The purge thread can read the UNDO pages without any covering mutex. */ diff --git a/storage/xtradb/trx/trx0purge.cc b/storage/xtradb/trx/trx0purge.cc index 57338a73450..7d35bb12093 100644 --- a/storage/xtradb/trx/trx0purge.cc +++ b/storage/xtradb/trx/trx0purge.cc @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, MariaDB Corporation. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -172,13 +173,9 @@ trx_purge_sys_close(void) sess_close(purge_sys->sess); - purge_sys->sess = NULL; - read_view_free(purge_sys->prebuilt_view); read_view_free(purge_sys->prebuilt_clone); - purge_sys->view = NULL; - rw_lock_free(&purge_sys->latch); mutex_free(&purge_sys->bh_mutex); @@ -187,9 +184,6 @@ trx_purge_sys_close(void) ib_bh_free(purge_sys->ib_bh); os_event_free(purge_sys->event); - - purge_sys->event = NULL; - mem_free(purge_sys); purge_sys = NULL; @@ -1301,20 +1295,16 @@ void trx_purge_stop(void) /*================*/ { - purge_state_t state; - ib_int64_t sig_count = os_event_reset(purge_sys->event); - ut_a(srv_n_purge_threads > 0); rw_lock_x_lock(&purge_sys->latch); - ut_a(purge_sys->state != PURGE_STATE_INIT); - ut_a(purge_sys->state != PURGE_STATE_EXIT); - ut_a(purge_sys->state != PURGE_STATE_DISABLED); + const ib_int64_t sig_count = 
os_event_reset(purge_sys->event); + const purge_state_t state = purge_sys->state; - ++purge_sys->n_stop; + ut_a(state == PURGE_STATE_RUN || state == PURGE_STATE_STOP); - state = purge_sys->state; + ++purge_sys->n_stop; if (state == PURGE_STATE_RUN) { ib_logf(IB_LOG_LEVEL_INFO, "Stopping purge"); diff --git a/storage/xtradb/trx/trx0roll.cc b/storage/xtradb/trx/trx0roll.cc index ee1497b209f..d228743d300 100644 --- a/storage/xtradb/trx/trx0roll.cc +++ b/storage/xtradb/trx/trx0roll.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2016, MariaDB Corporation. All Rights Reserved. +Copyright (c) 2016, 2017, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -753,9 +753,9 @@ trx_rollback_or_clean_recovered( } if (all) { - fprintf(stderr, - "InnoDB: Starting in background the rollback" - " of uncommitted transactions\n"); + ib_logf(IB_LOG_LEVEL_INFO, + "Starting in background the rollback" + " of recovered transactions"); } /* Note: For XA recovered transactions, we rely on MySQL to @@ -775,6 +775,12 @@ trx_rollback_or_clean_recovered( assert_trx_in_rw_list(trx); + if (srv_shutdown_state != SRV_SHUTDOWN_NONE + && srv_fast_shutdown != 0) { + all = FALSE; + break; + } + /* If this function does a cleanup or rollback then it will release the trx_sys->mutex, therefore we need to reacquire it before retrying the loop. 
*/ @@ -792,10 +798,8 @@ trx_rollback_or_clean_recovered( } while (trx != NULL); if (all) { - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: Rollback of non-prepared" - " transactions completed\n"); + ib_logf(IB_LOG_LEVEL_INFO, + "Rollback of non-prepared transactions completed"); } } diff --git a/storage/xtradb/trx/trx0sys.cc b/storage/xtradb/trx/trx0sys.cc index 182bdc5d74c..1c4fb19430e 100644 --- a/storage/xtradb/trx/trx0sys.cc +++ b/storage/xtradb/trx/trx0sys.cc @@ -1344,7 +1344,9 @@ trx_sys_close(void) ut_a(UT_LIST_GET_LEN(trx_sys->ro_trx_list) == 0); /* Only prepared transactions may be left in the system. Free them. */ - ut_a(UT_LIST_GET_LEN(trx_sys->rw_trx_list) == trx_sys->n_prepared_trx); + ut_a(UT_LIST_GET_LEN(trx_sys->rw_trx_list) == trx_sys->n_prepared_trx + || srv_read_only_mode + || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO); while ((trx = UT_LIST_GET_FIRST(trx_sys->rw_trx_list)) != NULL) { trx_free_prepared(trx); @@ -1390,6 +1392,33 @@ trx_sys_close(void) trx_sys = NULL; } +/** @brief Convert an undo log to TRX_UNDO_PREPARED state on shutdown. + +If any prepared ACTIVE transactions exist, and their rollback was +prevented by innodb_force_recovery, we convert these transactions to +XA PREPARE state in the main-memory data structures, so that shutdown +will proceed normally. These transactions will again recover as ACTIVE +on the next restart, and they will be rolled back unless +innodb_force_recovery prevents it again. 
+ +@param[in] trx transaction +@param[in,out] undo undo log to convert to TRX_UNDO_PREPARED */ +static +void +trx_undo_fake_prepared( + const trx_t* trx, + trx_undo_t* undo) +{ + ut_ad(srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO); + ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE)); + ut_ad(trx->is_recovered); + + if (undo != NULL) { + ut_ad(undo->state == TRX_UNDO_ACTIVE); + undo->state = TRX_UNDO_PREPARED; + } +} + /********************************************************************* Check if there are any active (non-prepared) transactions. @return total number of active transactions or 0 if none */ @@ -1398,15 +1427,42 @@ ulint trx_sys_any_active_transactions(void) /*=================================*/ { - ulint total_trx = 0; - mutex_enter(&trx_sys->mutex); - total_trx = UT_LIST_GET_LEN(trx_sys->rw_trx_list) - + UT_LIST_GET_LEN(trx_sys->mysql_trx_list); + ulint total_trx = UT_LIST_GET_LEN(trx_sys->mysql_trx_list); + + if (total_trx == 0) { + total_trx = UT_LIST_GET_LEN(trx_sys->rw_trx_list); + ut_a(total_trx >= trx_sys->n_prepared_trx); + + if (total_trx > trx_sys->n_prepared_trx + && srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO) { + for (trx_t* trx = UT_LIST_GET_FIRST( + trx_sys->rw_trx_list); + trx != NULL; + trx = UT_LIST_GET_NEXT(trx_list, trx)) { + if (!trx_state_eq(trx, TRX_STATE_ACTIVE) + || !trx->is_recovered) { + continue; + } + /* This was a recovered transaction + whose rollback was disabled by + the innodb_force_recovery setting. + Pretend that it is in XA PREPARE + state so that shutdown will work. 
*/ + trx_undo_fake_prepared( + trx, trx->insert_undo); + trx_undo_fake_prepared( + trx, trx->update_undo); + trx->state = TRX_STATE_PREPARED; + trx_sys->n_prepared_trx++; + trx_sys->n_prepared_recovered_trx++; + } + } - ut_a(total_trx >= trx_sys->n_prepared_trx); - total_trx -= trx_sys->n_prepared_trx; + ut_a(total_trx >= trx_sys->n_prepared_trx); + total_trx -= trx_sys->n_prepared_trx; + } mutex_exit(&trx_sys->mutex); diff --git a/storage/xtradb/trx/trx0trx.cc b/storage/xtradb/trx/trx0trx.cc index bdf407ff7fb..439897a5b96 100644 --- a/storage/xtradb/trx/trx0trx.cc +++ b/storage/xtradb/trx/trx0trx.cc @@ -478,7 +478,11 @@ trx_free_prepared( /*==============*/ trx_t* trx) /*!< in, own: trx object */ { - ut_a(trx_state_eq(trx, TRX_STATE_PREPARED)); + ut_a(trx_state_eq(trx, TRX_STATE_PREPARED) + || (trx_state_eq(trx, TRX_STATE_ACTIVE) + && trx->is_recovered + && (srv_read_only_mode + || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO))); ut_a(trx->magic_n == TRX_MAGIC_N); lock_trx_release_locks(trx); @@ -1117,7 +1121,8 @@ trx_start_low( trx->start_time = ut_time(); - trx->start_time_micro = clock(); + trx->start_time_micro = + trx->mysql_thd ? thd_query_start_micro(trx->mysql_thd) : 0; MONITOR_INC(MONITOR_TRX_ACTIVE); } diff --git a/storage/xtradb/trx/trx0undo.cc b/storage/xtradb/trx/trx0undo.cc index cdd23726f2e..220589dd9ff 100644 --- a/storage/xtradb/trx/trx0undo.cc +++ b/storage/xtradb/trx/trx0undo.cc @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2014, 2017, MariaDB Corporation. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -2011,13 +2012,37 @@ trx_undo_free_prepared( ut_ad(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS); if (trx->update_undo) { - ut_a(trx->update_undo->state == TRX_UNDO_PREPARED); + switch (trx->update_undo->state) { + case TRX_UNDO_PREPARED: + break; + case TRX_UNDO_ACTIVE: + /* lock_trx_release_locks() assigns + trx->is_recovered=false */ + ut_a(srv_read_only_mode + || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO); + break; + default: + ut_error; + } + UT_LIST_REMOVE(undo_list, trx->rseg->update_undo_list, trx->update_undo); trx_undo_mem_free(trx->update_undo); } if (trx->insert_undo) { - ut_a(trx->insert_undo->state == TRX_UNDO_PREPARED); + switch (trx->insert_undo->state) { + case TRX_UNDO_PREPARED: + break; + case TRX_UNDO_ACTIVE: + /* lock_trx_release_locks() assigns + trx->is_recovered=false */ + ut_a(srv_read_only_mode + || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO); + break; + default: + ut_error; + } + UT_LIST_REMOVE(undo_list, trx->rseg->insert_undo_list, trx->insert_undo); trx_undo_mem_free(trx->insert_undo); |