diff options
author | Marko Mäkelä <marko.makela@mariadb.com> | 2021-09-22 14:15:00 +0300 |
---|---|---|
committer | Marko Mäkelä <marko.makela@mariadb.com> | 2021-09-22 14:15:00 +0300 |
commit | 1cb218c37cc3fe01a1ff2fe9b1cbfb591e90d5ce (patch) | |
tree | f5be99c4e6675ac4fd8b4123de07d0058d303b55 | |
parent | 21d19ed45b1b71c2815559a8ad68888a4bfe902f (diff) | |
download | mariadb-git-1cb218c37cc3fe01a1ff2fe9b1cbfb591e90d5ce.tar.gz |
MDEV-26450: Corruption due to innodb_undo_log_truncate (branch: bb-10.2-MDEV-26450)
At least since commit 055a3334adc004bd3a897990c2f93178e6bb5f90
(MDEV-13564) the undo log truncation in InnoDB did not work correctly.
The main issue is that during the execution of
trx_purge_truncate_history() some pages of the newly truncated
undo tablespace could be discarded.
fsp_try_extend_data_file(): Apply the peculiar rounding of
fil_space_t::size_in_header only to the system tablespace,
whose size can be expressed in megabytes in a configuration parameter.
Other files may freely grow by a number of pages.
fseg_alloc_free_page_low(): Do allow the extension of undo tablespaces,
and mention the file name in the error message.
mtr_t::commit_shrink(): Implement crash-safe shrinking of a tablespace
file. First, durably write the log, then shrink the file, and finally
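release the page latches of the rebuilt tablespace. Refactored from
trx_purge_truncate_history(); in this branch the replaced code is in
trx_purge_initiate_truncate(), as the trx0purge.cc hunk shows.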
log_write_and_flush_prepare(), log_write_and_flush(): New macro and
function to durably write the log during mtr_t::commit_shrink().
-rw-r--r-- | mysql-test/suite/innodb/r/undo_truncate.result | 22 | ||||
-rw-r--r-- | mysql-test/suite/innodb/t/undo_truncate.test | 64 | ||||
-rw-r--r-- | storage/innobase/fsp/fsp0fsp.cc | 20 | ||||
-rw-r--r-- | storage/innobase/include/log0log.h | 14 | ||||
-rw-r--r-- | storage/innobase/include/mtr0mtr.h | 6 | ||||
-rw-r--r-- | storage/innobase/include/mtr0mtr.ic | 6 | ||||
-rw-r--r-- | storage/innobase/log/log0log.cc | 103 | ||||
-rw-r--r-- | storage/innobase/mtr/mtr0mtr.cc | 86 | ||||
-rw-r--r-- | storage/innobase/trx/trx0purge.cc | 21 |
9 files changed, 219 insertions, 123 deletions
diff --git a/mysql-test/suite/innodb/r/undo_truncate.result b/mysql-test/suite/innodb/r/undo_truncate.result index 89171d36d0f..67a587b3c1c 100644 --- a/mysql-test/suite/innodb/r/undo_truncate.result +++ b/mysql-test/suite/innodb/r/undo_truncate.result @@ -10,28 +10,12 @@ SET @trunc_start= WHERE variable_name = 'innodb_undo_truncations'); create table t1(keyc int primary key, c char(100)) engine = innodb; create table t2(keyc int primary key, c char(100)) engine = innodb; -CREATE PROCEDURE populate_t1() -BEGIN -DECLARE i INT DEFAULT 1; -while (i <= 20000) DO -insert into t1 values (i, 'a'); -SET i = i + 1; -END WHILE; -END | -CREATE PROCEDURE populate_t2() -BEGIN -DECLARE i INT DEFAULT 1; -while (i <= 20000) DO -insert into t2 values (i, 'a'); -SET i = i + 1; -END WHILE; -END | connect con1,localhost,root,,; begin; -call populate_t1(); +insert into t1 select seq,'a' from seq_1_to_20000; connect con2,localhost,root,,; begin; -call populate_t2(); +insert into t2 select seq,'a' from seq_1_to_20000; connection con1; update t1 set c = 'mysql'; connection con2; @@ -53,8 +37,6 @@ commit; disconnect con2; connection default; drop table t1, t2; -drop PROCEDURE populate_t1; -drop PROCEDURE populate_t2; InnoDB 0 transactions not purged SET GLOBAL innodb_undo_logs = @save_undo_logs; SET GLOBAL innodb_purge_rseg_truncate_frequency = @save_frequency; diff --git a/mysql-test/suite/innodb/t/undo_truncate.test b/mysql-test/suite/innodb/t/undo_truncate.test index 4f350e380ee..9feee84491a 100644 --- a/mysql-test/suite/innodb/t/undo_truncate.test +++ b/mysql-test/suite/innodb/t/undo_truncate.test @@ -5,6 +5,7 @@ # --source include/innodb_page_size.inc --source include/innodb_page_size_small.inc --source include/have_undo_tablespaces.inc +--source include/have_sequence.inc call mtr.add_suppression("InnoDB: The transaction log size is too large"); @@ -27,37 +28,14 @@ WHERE variable_name = 'innodb_undo_truncations'); create table t1(keyc int primary key, c char(100)) engine = innodb; 
create table t2(keyc int primary key, c char(100)) engine = innodb; # -delimiter |; -CREATE PROCEDURE populate_t1() -BEGIN - DECLARE i INT DEFAULT 1; - while (i <= 20000) DO - insert into t1 values (i, 'a'); - SET i = i + 1; - END WHILE; -END | -delimiter ;| -# -delimiter |; -CREATE PROCEDURE populate_t2() -BEGIN - DECLARE i INT DEFAULT 1; - while (i <= 20000) DO - insert into t2 values (i, 'a'); - SET i = i + 1; - END WHILE; -END | -delimiter ;| -# -# let DATADIR = `select @@datadir`; connect (con1,localhost,root,,); begin; -send call populate_t1(); +send insert into t1 select seq,'a' from seq_1_to_20000; connect (con2,localhost,root,,); begin; -send call populate_t2(); +send insert into t2 select seq,'a' from seq_1_to_20000; connection con1; reap; send update t1 set c = 'mysql'; connection con2; reap; send update t2 set c = 'mysql'; @@ -67,25 +45,12 @@ connection con1; reap; send delete from t1; connection con2; reap; delete from t2; connection con1; reap; -let CHECKFILE = $MYSQL_TMP_DIR/check.txt; -perl; -($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size1) - = stat("$ENV{DATADIR}/undo001"); -($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size2) - = stat("$ENV{DATADIR}/undo002"); -open(OUT, ">$ENV{CHECKFILE}") || die; -print OUT "let \$size1='$size1,$size2';\n"; -close(OUT); -EOF - SET GLOBAL innodb_undo_log_truncate = 1; commit; disconnect con1; connection con2; commit; disconnect con2; connection default; drop table t1, t2; -drop PROCEDURE populate_t1; -drop PROCEDURE populate_t2; --source include/wait_all_purged.inc @@ -100,29 +65,6 @@ if (`select @@innodb_page_size IN (4096,8192,16384)`) source include/wait_condition.inc; } ---source $CHECKFILE -perl; -($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size1) - = stat("$ENV{DATADIR}/undo001"); -($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size2) - = stat("$ENV{DATADIR}/undo002"); -open(OUT, ">$ENV{CHECKFILE}") || die; -print OUT "let \$size2='$size1,$size2';\n"; -close(OUT); -EOF - ---source $CHECKFILE ---remove_file $CHECKFILE - 
-if ($size1 == $size2) -{ - # This fails for innodb_page_size=64k, occasionally also for 32k. - if (`select @@innodb_page_size IN (4096,8192,16384)`) - { - echo Truncation did not happen: $size1; - } -} - SET GLOBAL innodb_undo_logs = @save_undo_logs; SET GLOBAL innodb_purge_rseg_truncate_frequency = @save_frequency; SET GLOBAL innodb_undo_log_truncate = @save_truncate; diff --git a/storage/innobase/fsp/fsp0fsp.cc b/storage/innobase/fsp/fsp0fsp.cc index 43989d57db8..064eb133ebc 100644 --- a/storage/innobase/fsp/fsp0fsp.cc +++ b/storage/innobase/fsp/fsp0fsp.cc @@ -956,11 +956,13 @@ fsp_try_extend_data_file(fil_space_t* space, fsp_header_t* header, mtr_t* mtr) return(0); } - /* We ignore any fragments of a full megabyte when storing the size - to the space header */ + /* For the system tablespace, we ignore any fragments of a + full megabyte when storing the size to the space header */ - space->size_in_header = ut_2pow_round( - space->size, (1024 * 1024) / page_size.physical()); + space->size_in_header = space->id + ? 
space->size + : ut_2pow_round(space->size, + (1024 * 1024) / page_size.physical()); mlog_write_ulint( header + FSP_SIZE, space->size_in_header, MLOG_4BYTES, mtr); @@ -1392,7 +1394,7 @@ fsp_alloc_free_page( /* It must be that we are extending a single-table tablespace whose size is still < 64 pages */ - ut_a(!is_system_tablespace(space_id)); + ut_a(!is_predefined_tablespace(space_id)); if (page_no >= FSP_EXTENT_SIZE) { ib::error() << "Trying to extend a single-table" " tablespace " << space->name << " , by single" @@ -2514,14 +2516,14 @@ take_hinted_page: return(NULL); } - if (space->size <= ret_page && !is_system_tablespace(space_id)) { + if (space->size <= ret_page && !is_predefined_tablespace(space_id)) { /* It must be that we are extending a single-table tablespace whose size is still < 64 pages */ if (ret_page >= FSP_EXTENT_SIZE) { - ib::error() << "Error (2): trying to extend" - " a single-table tablespace " << space_id - << " by single page(s) though the" + ib::error() << "Trying to extend '" + << space->chain.start->name + << "' by single page(s) though the" << " space size " << space->size << ". Page no " << ret_page << "."; ut_ad(!has_done_reservation); diff --git a/storage/innobase/include/log0log.h b/storage/innobase/include/log0log.h index 612a27976e7..c0b92fb7497 100644 --- a/storage/innobase/include/log0log.h +++ b/storage/innobase/include/log0log.h @@ -2,7 +2,7 @@ Copyright (c) 1995, 2017, Oracle and/or its affiliates. All rights reserved. Copyright (c) 2009, Google Inc. -Copyright (c) 2017, 2020, MariaDB Corporation. +Copyright (c) 2017, 2021, MariaDB Corporation. Portions of this file contain modifications contributed and copyrighted by Google, Inc. Those modifications are gratefully acknowledged and are described @@ -189,9 +189,15 @@ log_write_up_to( /** write to the log file up to the last log entry. @param[in] sync whether we want the written log also to be flushed to disk. 
*/ -void -log_buffer_flush_to_disk( - bool sync = true); +void log_buffer_flush_to_disk(bool sync= true); + + +/** Prepare to invoke log_write_and_flush(), before acquiring log_sys.mutex. */ +#define log_write_and_flush_prepare() log_write_mutex_enter() + +/** Durably write the log up to log_sys.lsn and release log_sys.mutex. */ +ATTRIBUTE_COLD void log_write_and_flush(); + /****************************************************************//** This functions writes the log buffer to the log file and if 'flush' is set it forces a flush of the log file as well. This is meant to be diff --git a/storage/innobase/include/mtr0mtr.h b/storage/innobase/include/mtr0mtr.h index b57a38f8eab..6d729beab12 100644 --- a/storage/innobase/include/mtr0mtr.h +++ b/storage/innobase/include/mtr0mtr.h @@ -2,7 +2,7 @@ Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. -Copyright (c) 2013, 2020, MariaDB Corporation. +Copyright (c) 2013, 2021, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -161,6 +161,10 @@ struct mtr_t { /** Commit the mini-transaction. */ void commit(); + /** Commit a mini-transaction that is shrinking a tablespace. + @param space tablespace that is being shrunk */ + ATTRIBUTE_COLD void commit_shrink(fil_space_t &space); + /** Commit a mini-transaction that did not modify any pages, but generated some redo log on a higher level, such as MLOG_FILE_NAME records and a MLOG_CHECKPOINT marker. diff --git a/storage/innobase/include/mtr0mtr.ic b/storage/innobase/include/mtr0mtr.ic index a45d088d5d7..24477689db8 100644 --- a/storage/innobase/include/mtr0mtr.ic +++ b/storage/innobase/include/mtr0mtr.ic @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved. 
-Copyright (c) 2017, 2020, MariaDB Corporation. +Copyright (c) 2017, 2021, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -53,8 +53,8 @@ mtr_t::memo_push(void* object, mtr_memo_type_t type) /* If this mtr has x-fixed a clean page then we set the made_dirty flag. This tells us if we need to - grab log_flush_order_mutex at mtr_commit so that we - can insert the dirtied page to the flush list. */ + grab log_sys.flush_order_mutex at mtr_t::commit() so that we + can insert the dirtied page into the flush list. */ if ((type == MTR_MEMO_PAGE_X_FIX || type == MTR_MEMO_PAGE_SX_FIX) && !m_made_dirty) { diff --git a/storage/innobase/log/log0log.cc b/storage/innobase/log/log0log.cc index 4c68f3743e9..945c97daf4a 100644 --- a/storage/innobase/log/log0log.cc +++ b/storage/innobase/log/log0log.cc @@ -2,7 +2,7 @@ Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2009, Google Inc. -Copyright (c) 2014, 2020, MariaDB Corporation. +Copyright (c) 2014, 2021, MariaDB Corporation. Portions of this file contain modifications contributed and copyrighted by Google, Inc. Those modifications are gratefully acknowledged and are described @@ -1247,12 +1247,103 @@ loop: /** write to the log file up to the last log entry. @param[in] sync whether we want the written log also to be flushed to disk. */ -void -log_buffer_flush_to_disk( - bool sync) +void log_buffer_flush_to_disk(bool sync) { - ut_ad(!srv_read_only_mode); - log_write_up_to(log_get_lsn(), sync); + ut_ad(!srv_read_only_mode); + log_write_up_to(log_get_lsn(), sync); +} + + +/** Durably write the log and release log_sys.mutex */ +ATTRIBUTE_COLD void log_write_and_flush() +{ + ut_ad(!srv_read_only_mode); + ut_ad(!recv_no_log_write); + ut_ad(!recv_recovery_is_on()); + + /* The following code is adapted from log_write_up_to(). 
*/ + DBUG_PRINT("ib_log", ("write " LSN_PF " to " LSN_PF, + log_sys->write_lsn, log_sys->lsn)); + log_sys->n_pending_flushes++; + log_sys->current_flush_lsn= log_sys->lsn; + os_event_reset(log_sys->flush_event); + ut_ad(log_sys->buf_free != log_sys->buf_next_to_write); + ulint start_offset= log_sys->buf_next_to_write; + ulint end_offset= log_sys->buf_free; + ulint area_start= ut_2pow_round(start_offset, ulint(OS_FILE_LOG_BLOCK_SIZE)); + ulint area_end= ut_calc_align(end_offset, ulint(OS_FILE_LOG_BLOCK_SIZE)); + ulong write_ahead_size= srv_log_write_ahead_size; + + log_block_set_flush_bit(log_sys->buf + area_start, TRUE); + log_block_set_checkpoint_no(log_sys->buf + area_end - OS_FILE_LOG_BLOCK_SIZE, + log_sys->next_checkpoint_no); + lsn_t write_lsn= log_sys->lsn; + byte *write_buf= log_sys->buf; + + ut_ad(area_end - area_start > 0); + + log_buffer_switch(); + + log_group_set_fields(&log_sys->log, log_sys->write_lsn); + + /* Erase the end of the last log block. */ + memset(write_buf + end_offset, 0, + ~end_offset & (OS_FILE_LOG_BLOCK_SIZE - 1)); + /* Calculate pad_size if needed. */ + ulint pad_size= 0; + if (write_ahead_size > OS_FILE_LOG_BLOCK_SIZE) + { + lsn_t end_offset= + log_group_calc_lsn_offset(ut_uint64_align_up(write_lsn, + OS_FILE_LOG_BLOCK_SIZE), + &log_sys->log); + ulint end_offset_in_unit= (ulint) (end_offset % write_ahead_size); + + if (end_offset_in_unit && (area_end - area_start) > end_offset_in_unit) + { + /* The first block in the unit was initialized after the last + writing. Needs to be written padded data once. 
*/ + pad_size= std::min(ulint(write_ahead_size) - end_offset_in_unit, + log_sys->buf_size - area_end); + memset(write_buf + area_end, 0, pad_size); + } + } + + if (log_sys->is_encrypted()) + log_crypt(write_buf + area_start, log_sys->write_lsn, + area_end - area_start); + + /* Do the write to the log files */ + log_group_write_buf(&log_sys->log, write_buf + area_start, + area_end - area_start + pad_size, +#ifdef UNIV_DEBUG + pad_size, +#endif /* UNIV_DEBUG */ + ut_uint64_align_down(log_sys->write_lsn, + OS_FILE_LOG_BLOCK_SIZE), + start_offset - area_start); + srv_stats.log_padded.add(pad_size); + log_sys->write_lsn= write_lsn; + + log_write_mutex_exit(); + + /* Code adapted from log_write_flush_to_disk_low() */ + + ut_a(log_sys->n_pending_flushes == 1); /* No other threads here */ + + if (srv_file_flush_method != SRV_O_DSYNC) + fil_flush(SRV_LOG_SPACE_FIRST_ID); + + log_sys->flushed_to_disk_lsn= log_sys->current_flush_lsn; + + log_sys->n_pending_flushes--; + + os_event_set(log_sys->flush_event); + + lsn_t wrote_lsn= log_sys->write_lsn, flush_lsn= log_sys->flushed_to_disk_lsn; + log_mutex_exit(); + + innobase_mysql_log_notify(wrote_lsn, flush_lsn); } /****************************************************************//** diff --git a/storage/innobase/mtr/mtr0mtr.cc b/storage/innobase/mtr/mtr0mtr.cc index fefc0687ddb..4bd90e7842d 100644 --- a/storage/innobase/mtr/mtr0mtr.cc +++ b/storage/innobase/mtr/mtr0mtr.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2020, MariaDB Corporation. +Copyright (c) 2017, 2021, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -465,6 +465,90 @@ mtr_t::commit() release_resources(); } +#ifdef UNIV_DEBUG +/** Check that all pages belong to a shrunk tablespace. 
*/ +struct Shrink +{ + const fil_space_t &space; + Shrink(const fil_space_t &space) : space(space) {} + + bool operator()(const mtr_memo_slot_t *slot) const + { + if (!slot->object) + return true; + switch (slot->type) { + default: + ut_ad("invalid type" == 0); + return false; + case MTR_MEMO_MODIFY: + break; + case MTR_MEMO_SPACE_X_LOCK: + ut_ad(&space == slot->object); + return true; + case MTR_MEMO_PAGE_X_FIX: + case MTR_MEMO_PAGE_SX_FIX: + const buf_page_t &bpage= static_cast<buf_block_t*>(slot->object)->page; + const page_id_t &id= bpage.id; + if (id.space() == 0 && id.page_no() == TRX_SYS_PAGE_NO) + { + ut_ad(srv_is_undo_tablespace(space.id)); + break; + } + ut_ad(id.space() == space.id); + ut_ad(id.page_no() < space.size); + ut_ad(bpage.state == BUF_BLOCK_FILE_PAGE); + ut_ad(!bpage.oldest_modification); + break; + } + return true; + } +}; +#endif + +/** Commit a mini-transaction that is shrinking a tablespace. +@param space tablespace that is being shrunk */ +void mtr_t::commit_shrink(fil_space_t &space) +{ + ut_ad(is_active()); + ut_ad(!is_inside_ibuf()); + ut_ad(!high_level_read_only); + ut_ad(m_modifications); + ut_ad(m_made_dirty); + ut_ad(!recv_recovery_is_on()); + ut_ad(m_log_mode == MTR_LOG_ALL); + ut_ad(UT_LIST_GET_LEN(space.chain) == 1); + + log_write_and_flush_prepare(); + + const lsn_t start_lsn= finish_write(prepare_write()); + + log_flush_order_mutex_enter(); + /* Durably write the reduced FSP_SIZE before truncating the data file. 
*/ + log_write_and_flush(); + + os_file_truncate(space.chain.start->name, space.chain.start->handle, + os_offset_t(space.size) << srv_page_size_shift, true); + + ut_d(m_memo.for_each_block_in_reverse(CIterate<Shrink>(space))); + + m_memo.for_each_block_in_reverse(CIterate<const ReleaseBlocks> + (ReleaseBlocks(start_lsn, m_commit_lsn, + m_flush_observer))); + log_flush_order_mutex_exit(); + + mutex_enter(&fil_system->mutex); + ut_ad(space.is_being_truncated); + ut_ad(space.stop_new_ops); + space.stop_new_ops= false; + space.is_being_truncated= false; + mutex_exit(&fil_system->mutex); + + m_memo.for_each_block_in_reverse(CIterate<ReleaseLatches>()); + srv_stats.log_write_requests.inc(); + + release_resources(); +} + /** Commit a mini-transaction that did not modify any pages, but generated some redo log on a higher level, such as MLOG_FILE_NAME records and a MLOG_CHECKPOINT marker. diff --git a/storage/innobase/trx/trx0purge.cc b/storage/innobase/trx/trx0purge.cc index 2025ac70beb..a9593807d53 100644 --- a/storage/innobase/trx/trx0purge.cc +++ b/storage/innobase/trx/trx0purge.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2020, MariaDB Corporation. +Copyright (c) 2017, 2021, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -1062,26 +1062,11 @@ trx_purge_initiate_truncate( rseg->last_del_marks = FALSE; } - mtr.commit(); - /* Write-ahead the redo log record. */ - log_write_up_to(mtr.commit_lsn(), true); + mtr.commit_shrink(*space); - /* Trim the file size. */ - os_file_truncate(file->name, file->handle, - os_offset_t(size) << srv_page_size_shift, true); - - /* This is only executed by the srv_purge_coordinator_thread. */ + /* No mutex; this is only updated by the purge coordinator. 
*/ export_vars.innodb_undo_truncations++; - /* TODO: PUNCH_HOLE the garbage (with write-ahead logging) */ - - mutex_enter(&fil_system->mutex); - ut_ad(space->stop_new_ops); - ut_ad(space->is_being_truncated); - space->stop_new_ops = false; - space->is_being_truncated = false; - mutex_exit(&fil_system->mutex); - if (purge_sys->rseg != NULL && purge_sys->rseg->last_page_no == FIL_NULL) { /* If purge_sys->rseg is pointing to rseg that was recently |