summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJan Lindström <jan.lindstrom@mariadb.com>2017-07-26 13:36:05 +0300
committerJan Lindström <jan.lindstrom@mariadb.com>2017-07-26 13:36:05 +0300
commitf3a9a45a70c351cab678c0ac7b95af5a04ba6679 (patch)
tree99c3b2518e228addcb3b27b7d6c1408de93c14a6
parent2a1035b004dfabcf3a7113be632b0030721a44d6 (diff)
downloadmariadb-git-bb-10.2-MDEV-11125.tar.gz
MDEV-11125: Introduce a reduced doublewrite mode, handling error detection onlybb-10.2-MDEV-11125
Merged implementation from: https://github.com/webscalesql/webscalesql-5.6/commit/3676902ccd6df75ddb51da9ef1b04c93d7f3da5c Introduce a new double write mode where only the page ids and space ids are written to the double write buffer but not the entire page. We can use these page numbers and space ids to determine whether any pages that were written to the disk (or flash) were torn. If there are torn pages, crash recovery fails and the server must be recovered from some other source. The implementation makes innodb_doublewrite a dynamic sysvar that may be set to 0, 1, or 2. * innodb_doublewrite=0 and innodb_doublewrite=1 work the same way as before. The former disables the doublewrite buffer while the latter writes each page twice (plus extra overhead for compressed pages). * When innodb_doublewrite=2, only the space ids and the page numbers of flushed pages are written to the doublewrite buffer, in a single page. * We disallow changing innodb_doublewrite between nonzero values and zero dynamically. In order to turn the doublewrite buffer on or off, the user must do a clean shutdown and then change the variable, just as before. This is done to avoid changing innodb_doublewrite to zero by accident. * When innodb_doublewrite is changed from 1 to 2 or 2 to 1, a clean version of the doublewrite buffer will be written to the disk before the next batch doublewrite flush. This is done to make sure that the stale pages do not overwrite a corrupt page on disk in case the user changes innodb_doublewrite from 1 to 2. Note that the implementation doesn't really change anything about the in-memory representation of the doublewrite buffer (buf_dblwr) aside from adding a new field called 'header'. This field is used to serially write the space_ids and page_nos of the pages that are flushed. 
Because we read the value of innodb_doublewrite only in one place (during batch flushing), we do not need to do anything special if the user changes the value of innodb_doublewrite while the server is running. It will be consistent through the course of each write/flush. During recovery, we determine which mode of doublewrite was used based on the page type of the first doublewrite page. If the first page seems to be a doublewrite header page then this page includes the space_ids and page_nos of the recently flushed pages. In this case we simply validate those pages based on their checksums. 5.6 introduced a new type of flush operation which flushes single pages from the doublewrite buffer as opposed to flushing all of them. Single-flushing seems to be rare so we still do double-writing in case of a single-flush. This means that when mysqld starts, doublewrite buffer may contain pages from both batch-flushes and single-flushes. For this reason, we always scan the entire doublewrite buffer even when we detect that the first doublewrite page is a doublewrite header page.
-rw-r--r--mysql-test/suite/innodb/include/innodb_doublewrite.inc257
-rw-r--r--mysql-test/suite/innodb/r/doublewrite.result2
-rw-r--r--mysql-test/suite/innodb/r/innodb_doublewrite.result111
-rw-r--r--mysql-test/suite/innodb/r/innodb_doublewrite_odirect.result111
-rw-r--r--mysql-test/suite/innodb/t/innodb_doublewrite-master.opt7
-rw-r--r--mysql-test/suite/innodb/t/innodb_doublewrite.test1
-rw-r--r--mysql-test/suite/innodb/t/innodb_doublewrite_odirect-master.opt6
-rw-r--r--mysql-test/suite/innodb/t/innodb_doublewrite_odirect.test1
-rw-r--r--mysql-test/suite/sys_vars/r/innodb_doublewrite_basic.result33
-rw-r--r--mysql-test/suite/sys_vars/r/sysvars_innodb.result20
-rw-r--r--mysql-test/suite/sys_vars/t/innodb_doublewrite_basic.test31
-rw-r--r--storage/innobase/buf/buf0buf.cc12
-rw-r--r--storage/innobase/buf/buf0dblwr.cc254
-rw-r--r--storage/innobase/fsp/fsp0sysspace.cc2
-rw-r--r--storage/innobase/handler/ha_innodb.cc46
-rw-r--r--storage/innobase/handler/i_s.cc1
-rw-r--r--storage/innobase/include/buf0dblwr.h11
-rw-r--r--storage/innobase/include/fil0fil.h4
-rw-r--r--storage/innobase/include/fil0fil.ic3
-rw-r--r--storage/innobase/include/log0recv.h21
-rw-r--r--storage/innobase/include/srv0srv.h3
-rw-r--r--storage/innobase/log/log0recv.cc10
-rw-r--r--storage/innobase/srv/srv0srv.cc4
23 files changed, 867 insertions, 84 deletions
diff --git a/mysql-test/suite/innodb/include/innodb_doublewrite.inc b/mysql-test/suite/innodb/include/innodb_doublewrite.inc
new file mode 100644
index 00000000000..d537280d178
--- /dev/null
+++ b/mysql-test/suite/innodb/include/innodb_doublewrite.inc
@@ -0,0 +1,257 @@
+# Embedded server does not support crashing
+--source include/not_embedded.inc
+--source include/have_innodb.inc
+--source include/innodb_page_size.inc
+--source include/have_debug.inc
+
+call mtr.add_suppression("InnoDB: Failed to set O_DIRECT on file.*");
+call mtr.add_suppression("InnoDB: Cannot recover page \\[page id: space=[1-9][0-9]*, page number=[1-9][0-9]*\\] from the doublewrite buffer because it was written in reduced-doublewrite mode");
+call mtr.add_suppression("InnoDB: Database page corruption on disk or a failed file read of tablespace .*");
+call mtr.add_suppression("InnoDB: Failed to read file .* at offset .*: Page read from tablespace is corrupted.");
+call mtr.add_suppression("InnoDB: Table .* is corrupted. Please drop the table and recreate.");
+
+# Slow shutdown and restart to make sure ibuf merge is finished
+SET GLOBAL innodb_fast_shutdown = 0;
+
+let INNODB_PAGE_SIZE=`select @@innodb_page_size`;
+let MYSQLD_DATADIR=`select @@datadir`;
+
+SET GLOBAL innodb_doublewrite=2;
+
+show variables like 'innodb_doublewrite';
+show variables like 'innodb_fil_make_page_dirty_debug';
+show variables like 'innodb_saved_page_number_debug';
+
+CREATE TABLE t1(a INT PRIMARY KEY AUTO_INCREMENT, b char(255) default '') ENGINE=innodb;
+start transaction;
+INSERT INTO t1(b) VALUES(repeat('#',200));
+INSERT INTO t1(b) VALUES(repeat('+',200));
+INSERT INTO t1(b) VALUES(repeat('/',200));
+INSERT INTO t1(b) VALUES(repeat('|',200));
+INSERT INTO t1(b) VALUES(repeat('\\',200));
+INSERT INTO t1(b) VALUES(repeat('-',200));
+INSERT INTO t1(b) VALUES(repeat('&',200));
+INSERT INTO t1(b) VALUES(repeat('%',200));
+INSERT INTO t1(b) VALUES(repeat('@',200));
+INSERT INTO t1(b) VALUES(repeat('?',200));
+INSERT INTO t1(b) SELECT b FROM t1;
+INSERT INTO t1(b) SELECT b FROM t1;
+INSERT INTO t1(b) SELECT b FROM t1;
+INSERT INTO t1(b) SELECT b FROM t1;
+INSERT INTO t1(b) SELECT b FROM t1;
+INSERT INTO t1(b) SELECT b FROM t1;
+INSERT INTO t1(b) SELECT b FROM t1;
+INSERT INTO t1(b) SELECT b FROM t1;
+commit work;
+
+select space from information_schema.innodb_sys_tables where name = 'test/t1' into @space_id;
+let SPACE_ID=`select space from information_schema.innodb_sys_tables where name = 'test/t1'`;
+
+--echo # Ensure that dirty pages of table t1 is flushed.
+flush tables t1 for export;
+unlock tables;
+
+begin;
+insert into t1(b) values (repeat('_', 42));
+
+--source ../include/no_checkpoint_start.inc
+
+--echo # Make the first page dirty for table t1
+set global innodb_saved_page_number_debug = 0;
+set global innodb_fil_make_page_dirty_debug = @space_id;
+
+--echo # Ensure that dirty pages of table t1 are flushed.
+set global innodb_buf_flush_list_now = 1;
+
+--let CLEANUP_IF_CHECKPOINT=drop table t1;
+--source ../include/no_checkpoint_end.inc
+
+--echo # Backup table and system tablespace before corrupting
+--copy_file $MYSQLD_DATADIR/test/t1.ibd $MYSQLD_DATADIR/test/t1.ibd.backup
+--copy_file $MYSQLD_DATADIR/ibdata1 $MYSQLD_DATADIR/ibdata1.backup
+
+#
+# Corrupt page 5 from table t1 and write page no 5 to first doublewrite
+# buffer
+#
+perl;
+use IO::Handle;
+use Data::HexDump;
+my $fname= "$ENV{'MYSQLD_DATADIR'}test/t1.ibd";
+my $page_size = $ENV{INNODB_PAGE_SIZE};
+my $tspace_id = $ENV{SPACE_ID};
+my $page;
+
+open(FILE, "+<", $fname) or die "Unable to open $fname\n";;
+binmode FILE;
+sysseek(FILE, 5 * $page_size + 48, 0)||die "Unable to seek $fname\n";
+print FILE pack("H*", "deadbeefdeadbeefdeadbeefdeadbeef");
+close FILE or die "Unable to close $fname\n";
+
+open(FILE, "+<", "$ENV{MYSQLD_DATADIR}ibdata1")||die "cannot open ibdata1\n";
+sysseek(FILE, 6 * $page_size - 190, 0)||die "Unable to seek ibdata1\n";
+sysread(FILE, $_, 12) == 12||die "Unable to read TRX_SYS\n";
+my($magic,$d1,$d2)=unpack "NNN", $_;
+die "magic=$magic, $d1, $d2\n" unless $magic == 536853855 && $d2 >= $d1 + 64;
+my($offset)=$d1*$page_size;
+# Find and read the page type from first page in the doublewrite buffer
+sysseek(FILE, $offset, 0)||die "Unable to seek ibdata1\n";
+sysread(FILE, $_, 4096)== 4096||die "Cannot read doublewrite\n";
+sysseek(FILE, $offset + 24, 0)||die "Unable to seek ibdata1\n";
+sysread(FILE, $_, 2) == 2||die "Unable to read FIL_PAGE_TYPE\n";
+my($pagetype)=unpack "n", $_;
+die "Not reduced doublewrite page page_type=$pagetype\n" unless $pagetype == 32124;
+# Find and read the space_id + page_no from first doublewrite page
+sysseek(FILE, $offset + 38, 0)||die "Unable to seek ibdata1\n";
+sysread(FILE, $_, 10) == 10||die "Unable to read doublewrite buf\n";
+my($first_free,$space_id,$page_no)=unpack ("nNN", $_);
+# Write space_id + page_no = 5 to first doublewrite page
+sysseek(FILE, $offset, 0)||die "Unable to seek ibdata1\n";
+sysread(FILE, $_, $page_size)==$page_size||die "Cannot read doublewrite\n";
+sysseek(FILE, $offset, 0)||die "Unable to seek ibdata1\n";
+substr ($_, 38, 10) = pack("nNN", 1, $tspace_id, 5);
+# Replace the innodb_checksum_algorithm=none checksum
+substr ($_, 0, 4) = pack("N", 0xdeadbeef);
+substr ($_, $page_size - 8, 4) = pack("N", 0xdeadbeef);
+syswrite(FILE, $_, $page_size)==$page_size||die;
+close(FILE);
+exit 0;
+EOF
+
+--enable_reconnect
+# Write file to make mysql-test-run.pl start up the server again
+--exec echo "restart" > $MYSQLTEST_VARDIR/tmp/mysqld.1.expect
+--error 1
+--source include/wait_until_connected_again.inc
+
+--error 1932
+SELECT * FROM t1;
+
+--source include/shutdown_mysqld.inc
+
+--let SEARCH_RANGE = 10000000
+--let SEARCH_FILE= $MYSQLTEST_VARDIR/log/mysqld.1.err
+--let SEARCH_PATTERN=buffer because it was written in reduced-doublewrite mode
+--replace_regex /FOUND [1-9][0-9]*/ FOUND 1/
+--source include/search_pattern_in_file.inc
+--let SEARCH_PATTERN=Database page corruption on disk or a failed file read of tables
+--replace_regex /FOUND [1-9][0-9]*/ FOUND 1/
+--source include/search_pattern_in_file.inc
+
+--echo # Backup table and system tablespace BACK
+--move_file $MYSQLD_DATADIR/test/t1.ibd.backup $MYSQLD_DATADIR/test/t1.ibd
+--move_file $MYSQLD_DATADIR/ibdata1.backup $MYSQLD_DATADIR/ibdata1
+
+--source include/start_mysqld.inc
+
+CHECK TABLE t1;
+SELECT COUNT(*) FROM t1;
+
+SET GLOBAL innodb_doublewrite=1;
+CREATE TABLE t2(a INT PRIMARY KEY AUTO_INCREMENT, b char(255) default '') ENGINE=innodb;
+start transaction;
+INSERT INTO t2(b) VALUES(repeat('#',200));
+INSERT INTO t2(b) VALUES(repeat('+',200));
+INSERT INTO t2(b) VALUES(repeat('/',200));
+INSERT INTO t2(b) VALUES(repeat('|',200));
+INSERT INTO t2(b) VALUES(repeat('\\',200));
+INSERT INTO t2(b) VALUES(repeat('-',200));
+INSERT INTO t2(b) VALUES(repeat('&',200));
+INSERT INTO t2(b) VALUES(repeat('%',200));
+INSERT INTO t2(b) VALUES(repeat('@',200));
+INSERT INTO t2(b) VALUES(repeat('?',200));
+INSERT INTO t2(b) SELECT b FROM t2;
+INSERT INTO t2(b) SELECT b FROM t2;
+INSERT INTO t2(b) SELECT b FROM t2;
+INSERT INTO t2(b) SELECT b FROM t2;
+INSERT INTO t2(b) SELECT b FROM t2;
+INSERT INTO t2(b) SELECT b FROM t2;
+INSERT INTO t2(b) SELECT b FROM t2;
+INSERT INTO t2(b) SELECT b FROM t2;
+commit work;
+
+select space from information_schema.innodb_sys_tables where name = 'test/t2' into @space_id;
+let SPACE_ID=`select space from information_schema.innodb_sys_tables where name = 'test/t2'`;
+
+--echo # Ensure that dirty pages of table t2 is flushed.
+flush tables t2 for export;
+unlock tables;
+
+begin;
+insert into t2(b) values (repeat('_', 42));
+
+--source ../include/no_checkpoint_start.inc
+
+--echo # Make the first page dirty for table t2
+set global innodb_saved_page_number_debug = 0;
+set global innodb_fil_make_page_dirty_debug = @space_id;
+
+--echo # Ensure that dirty pages of table t2 are flushed.
+set global innodb_buf_flush_list_now = 1;
+
+--let CLEANUP_IF_CHECKPOINT=drop table t2;
+--source ../include/no_checkpoint_end.inc
+
+#
+# Write page 5 from t2.ibd to first doublewrite buffer page
+# and then corrupt the page 5 from t2.ibd.
+#
+perl;
+use IO::Handle;
+my $fname= "$ENV{'MYSQLD_DATADIR'}test/t2.ibd";
+my $page_size = $ENV{INNODB_PAGE_SIZE};
+my $tspace_id = $ENV{SPACE_ID};
+my $page;
+my $page2;
+
+open(FILE, "+<", "$ENV{MYSQLD_DATADIR}ibdata1")||die "cannot open ibdata1\n";
+sysseek(FILE, 6 * $page_size - 190, 0)||die "Unable to seek ibdata1\n";
+sysread(FILE, $_, 12) == 12||die "Unable to read TRX_SYS\n";
+my($magic,$d1,$d2)=unpack "NNN", $_;
+die "magic=$magic, $d1, $d2\n" unless $magic == 536853855 && $d2 >= $d1 + 64;
+my($offset)=$d1*$page_size;
+# Find and read the page type from first page in the doublewrite buffer
+sysseek(FILE, $offset, 0)||die "Unable to seek ibdata1\n";
+sysread(FILE, $_, 4096)== 4096||die "Cannot read doublewrite\n";
+sysseek(FILE, $offset + 24, 0)||die "Unable to seek ibdata1\n";
+sysread(FILE, $_, 2) == 2||die "Unable to read FIL_PAGE_TYPE\n";
+my($pagetype)=unpack "n", $_;
+die "Not full doublewrite page page_type=$pagetype\n" unless $pagetype != 32124;
+sysseek(FILE, $offset, 0)||die "Unable to seek ibdata1\n";
+# Read page 5 from t2.ibd
+open(FILE2, "+<", $fname) or die "Unable to open $fname\n";;
+binmode FILE2;
+sysseek(FILE2, 5 * $page_size, 0)||die "Unable to seek $fname\n";
+sysread(FILE2, $page2, $page_size)==$page_size||die "Cannot read page from $fname\n";
+# Write page 5 from t2.ibd to first doublewrite buffer page
+syswrite(FILE, $page2, $page_size)==$page_size||die "Cannot write doublewrite page to ibdata1\n";
+close(FILE);
+close(FILE2);
+# Corrupt page 5 from t2.ibd
+open(FILE, "+<", $fname) or die "Unable to open $fname\n";;
+binmode FILE;
+sysseek(FILE, 5 * $page_size + 50, 0)||die "Unable to seek $fname\n";
+print FILE pack("H*", "deadbeefdeadbeefdeadbeefdddddddddddffffffffffffeeeeeeeeeeeebbbbbbbbbbbb");
+close(FILE);
+exit 0;
+EOF
+
+--source include/start_mysqld.inc
+
+CHECK TABLE t1;
+CHECK TABLE t2;
+
+SELECT COUNT(*) FROM t1;
+SELECT COUNT(*) FROM t2;
+
+DROP TABLE t1;
+DROP TABLE t2;
+
+--let SEARCH_RANGE = 10000000
+--let SEARCH_FILE= $MYSQLTEST_VARDIR/log/mysqld.1.err
+--let SEARCH_PATTERN=Trying to recover page
+--replace_regex /FOUND [1-9][0-9]*/ FOUND 1/
+--source include/search_pattern_in_file.inc
+--let SEARCH_PATTERN=Recovered page
+--replace_regex /FOUND [1-9][0-9]*/ FOUND 1/
+--source include/search_pattern_in_file.inc
diff --git a/mysql-test/suite/innodb/r/doublewrite.result b/mysql-test/suite/innodb/r/doublewrite.result
index 61c81ee9dff..62421ddd312 100644
--- a/mysql-test/suite/innodb/r/doublewrite.result
+++ b/mysql-test/suite/innodb/r/doublewrite.result
@@ -6,7 +6,7 @@
SET GLOBAL innodb_fast_shutdown = 0;
show variables like 'innodb_doublewrite';
Variable_name Value
-innodb_doublewrite ON
+innodb_doublewrite 1
show variables like 'innodb_fil_make_page_dirty_debug';
Variable_name Value
innodb_fil_make_page_dirty_debug 0
diff --git a/mysql-test/suite/innodb/r/innodb_doublewrite.result b/mysql-test/suite/innodb/r/innodb_doublewrite.result
new file mode 100644
index 00000000000..8bd112f9dcd
--- /dev/null
+++ b/mysql-test/suite/innodb/r/innodb_doublewrite.result
@@ -0,0 +1,111 @@
+call mtr.add_suppression("InnoDB: Failed to set O_DIRECT on file.*");
+call mtr.add_suppression("InnoDB: Cannot recover page \\[page id: space=[1-9][0-9]*, page number=[1-9][0-9]*\\] from the doublewrite buffer because it was written in reduced-doublewrite mode");
+call mtr.add_suppression("InnoDB: Database page corruption on disk or a failed file read of tablespace .*");
+call mtr.add_suppression("InnoDB: Failed to read file .* at offset .*: Page read from tablespace is corrupted.");
+call mtr.add_suppression("InnoDB: Table .* is corrupted. Please drop the table and recreate.");
+SET GLOBAL innodb_fast_shutdown = 0;
+SET GLOBAL innodb_doublewrite=2;
+show variables like 'innodb_doublewrite';
+Variable_name Value
+innodb_doublewrite 2
+show variables like 'innodb_fil_make_page_dirty_debug';
+Variable_name Value
+innodb_fil_make_page_dirty_debug 0
+show variables like 'innodb_saved_page_number_debug';
+Variable_name Value
+innodb_saved_page_number_debug 0
+CREATE TABLE t1(a INT PRIMARY KEY AUTO_INCREMENT, b char(255) default '') ENGINE=innodb;
+start transaction;
+INSERT INTO t1(b) VALUES(repeat('#',200));
+INSERT INTO t1(b) VALUES(repeat('+',200));
+INSERT INTO t1(b) VALUES(repeat('/',200));
+INSERT INTO t1(b) VALUES(repeat('|',200));
+INSERT INTO t1(b) VALUES(repeat('\\',200));
+INSERT INTO t1(b) VALUES(repeat('-',200));
+INSERT INTO t1(b) VALUES(repeat('&',200));
+INSERT INTO t1(b) VALUES(repeat('%',200));
+INSERT INTO t1(b) VALUES(repeat('@',200));
+INSERT INTO t1(b) VALUES(repeat('?',200));
+INSERT INTO t1(b) SELECT b FROM t1;
+INSERT INTO t1(b) SELECT b FROM t1;
+INSERT INTO t1(b) SELECT b FROM t1;
+INSERT INTO t1(b) SELECT b FROM t1;
+INSERT INTO t1(b) SELECT b FROM t1;
+INSERT INTO t1(b) SELECT b FROM t1;
+INSERT INTO t1(b) SELECT b FROM t1;
+INSERT INTO t1(b) SELECT b FROM t1;
+commit work;
+select space from information_schema.innodb_sys_tables where name = 'test/t1' into @space_id;
+# Ensure that dirty pages of table t1 is flushed.
+flush tables t1 for export;
+unlock tables;
+begin;
+insert into t1(b) values (repeat('_', 42));
+# Make the first page dirty for table t1
+set global innodb_saved_page_number_debug = 0;
+set global innodb_fil_make_page_dirty_debug = @space_id;
+# Ensure that dirty pages of table t1 are flushed.
+set global innodb_buf_flush_list_now = 1;
+# Kill the server
+# Backup table and system tablespace before corrupting
+SELECT * FROM t1;
+ERROR 42S02: Table 'test.t1' doesn't exist in engine
+ FOUND 1 /buffer because it was written in reduced-doublewrite mode/ in mysqld.1.err
+ FOUND 1 /Database page corruption on disk or a failed file read of tables/ in mysqld.1.err
+# Backup table and system tablespace BACK
+CHECK TABLE t1;
+Table Op Msg_type Msg_text
+test.t1 check status OK
+SELECT COUNT(*) FROM t1;
+COUNT(*)
+2560
+SET GLOBAL innodb_doublewrite=1;
+CREATE TABLE t2(a INT PRIMARY KEY AUTO_INCREMENT, b char(255) default '') ENGINE=innodb;
+start transaction;
+INSERT INTO t2(b) VALUES(repeat('#',200));
+INSERT INTO t2(b) VALUES(repeat('+',200));
+INSERT INTO t2(b) VALUES(repeat('/',200));
+INSERT INTO t2(b) VALUES(repeat('|',200));
+INSERT INTO t2(b) VALUES(repeat('\\',200));
+INSERT INTO t2(b) VALUES(repeat('-',200));
+INSERT INTO t2(b) VALUES(repeat('&',200));
+INSERT INTO t2(b) VALUES(repeat('%',200));
+INSERT INTO t2(b) VALUES(repeat('@',200));
+INSERT INTO t2(b) VALUES(repeat('?',200));
+INSERT INTO t2(b) SELECT b FROM t2;
+INSERT INTO t2(b) SELECT b FROM t2;
+INSERT INTO t2(b) SELECT b FROM t2;
+INSERT INTO t2(b) SELECT b FROM t2;
+INSERT INTO t2(b) SELECT b FROM t2;
+INSERT INTO t2(b) SELECT b FROM t2;
+INSERT INTO t2(b) SELECT b FROM t2;
+INSERT INTO t2(b) SELECT b FROM t2;
+commit work;
+select space from information_schema.innodb_sys_tables where name = 'test/t2' into @space_id;
+# Ensure that dirty pages of table t2 is flushed.
+flush tables t2 for export;
+unlock tables;
+begin;
+insert into t2(b) values (repeat('_', 42));
+# Make the first page dirty for table t2
+set global innodb_saved_page_number_debug = 0;
+set global innodb_fil_make_page_dirty_debug = @space_id;
+# Ensure that dirty pages of table t2 are flushed.
+set global innodb_buf_flush_list_now = 1;
+# Kill the server
+CHECK TABLE t1;
+Table Op Msg_type Msg_text
+test.t1 check status OK
+CHECK TABLE t2;
+Table Op Msg_type Msg_text
+test.t2 check status OK
+SELECT COUNT(*) FROM t1;
+COUNT(*)
+2560
+SELECT COUNT(*) FROM t2;
+COUNT(*)
+2560
+DROP TABLE t1;
+DROP TABLE t2;
+ FOUND 1 /Trying to recover page/ in mysqld.1.err
+ FOUND 1 /Recovered page/ in mysqld.1.err
diff --git a/mysql-test/suite/innodb/r/innodb_doublewrite_odirect.result b/mysql-test/suite/innodb/r/innodb_doublewrite_odirect.result
new file mode 100644
index 00000000000..8bd112f9dcd
--- /dev/null
+++ b/mysql-test/suite/innodb/r/innodb_doublewrite_odirect.result
@@ -0,0 +1,111 @@
+call mtr.add_suppression("InnoDB: Failed to set O_DIRECT on file.*");
+call mtr.add_suppression("InnoDB: Cannot recover page \\[page id: space=[1-9][0-9]*, page number=[1-9][0-9]*\\] from the doublewrite buffer because it was written in reduced-doublewrite mode");
+call mtr.add_suppression("InnoDB: Database page corruption on disk or a failed file read of tablespace .*");
+call mtr.add_suppression("InnoDB: Failed to read file .* at offset .*: Page read from tablespace is corrupted.");
+call mtr.add_suppression("InnoDB: Table .* is corrupted. Please drop the table and recreate.");
+SET GLOBAL innodb_fast_shutdown = 0;
+SET GLOBAL innodb_doublewrite=2;
+show variables like 'innodb_doublewrite';
+Variable_name Value
+innodb_doublewrite 2
+show variables like 'innodb_fil_make_page_dirty_debug';
+Variable_name Value
+innodb_fil_make_page_dirty_debug 0
+show variables like 'innodb_saved_page_number_debug';
+Variable_name Value
+innodb_saved_page_number_debug 0
+CREATE TABLE t1(a INT PRIMARY KEY AUTO_INCREMENT, b char(255) default '') ENGINE=innodb;
+start transaction;
+INSERT INTO t1(b) VALUES(repeat('#',200));
+INSERT INTO t1(b) VALUES(repeat('+',200));
+INSERT INTO t1(b) VALUES(repeat('/',200));
+INSERT INTO t1(b) VALUES(repeat('|',200));
+INSERT INTO t1(b) VALUES(repeat('\\',200));
+INSERT INTO t1(b) VALUES(repeat('-',200));
+INSERT INTO t1(b) VALUES(repeat('&',200));
+INSERT INTO t1(b) VALUES(repeat('%',200));
+INSERT INTO t1(b) VALUES(repeat('@',200));
+INSERT INTO t1(b) VALUES(repeat('?',200));
+INSERT INTO t1(b) SELECT b FROM t1;
+INSERT INTO t1(b) SELECT b FROM t1;
+INSERT INTO t1(b) SELECT b FROM t1;
+INSERT INTO t1(b) SELECT b FROM t1;
+INSERT INTO t1(b) SELECT b FROM t1;
+INSERT INTO t1(b) SELECT b FROM t1;
+INSERT INTO t1(b) SELECT b FROM t1;
+INSERT INTO t1(b) SELECT b FROM t1;
+commit work;
+select space from information_schema.innodb_sys_tables where name = 'test/t1' into @space_id;
+# Ensure that dirty pages of table t1 is flushed.
+flush tables t1 for export;
+unlock tables;
+begin;
+insert into t1(b) values (repeat('_', 42));
+# Make the first page dirty for table t1
+set global innodb_saved_page_number_debug = 0;
+set global innodb_fil_make_page_dirty_debug = @space_id;
+# Ensure that dirty pages of table t1 are flushed.
+set global innodb_buf_flush_list_now = 1;
+# Kill the server
+# Backup table and system tablespace before corrupting
+SELECT * FROM t1;
+ERROR 42S02: Table 'test.t1' doesn't exist in engine
+ FOUND 1 /buffer because it was written in reduced-doublewrite mode/ in mysqld.1.err
+ FOUND 1 /Database page corruption on disk or a failed file read of tables/ in mysqld.1.err
+# Backup table and system tablespace BACK
+CHECK TABLE t1;
+Table Op Msg_type Msg_text
+test.t1 check status OK
+SELECT COUNT(*) FROM t1;
+COUNT(*)
+2560
+SET GLOBAL innodb_doublewrite=1;
+CREATE TABLE t2(a INT PRIMARY KEY AUTO_INCREMENT, b char(255) default '') ENGINE=innodb;
+start transaction;
+INSERT INTO t2(b) VALUES(repeat('#',200));
+INSERT INTO t2(b) VALUES(repeat('+',200));
+INSERT INTO t2(b) VALUES(repeat('/',200));
+INSERT INTO t2(b) VALUES(repeat('|',200));
+INSERT INTO t2(b) VALUES(repeat('\\',200));
+INSERT INTO t2(b) VALUES(repeat('-',200));
+INSERT INTO t2(b) VALUES(repeat('&',200));
+INSERT INTO t2(b) VALUES(repeat('%',200));
+INSERT INTO t2(b) VALUES(repeat('@',200));
+INSERT INTO t2(b) VALUES(repeat('?',200));
+INSERT INTO t2(b) SELECT b FROM t2;
+INSERT INTO t2(b) SELECT b FROM t2;
+INSERT INTO t2(b) SELECT b FROM t2;
+INSERT INTO t2(b) SELECT b FROM t2;
+INSERT INTO t2(b) SELECT b FROM t2;
+INSERT INTO t2(b) SELECT b FROM t2;
+INSERT INTO t2(b) SELECT b FROM t2;
+INSERT INTO t2(b) SELECT b FROM t2;
+commit work;
+select space from information_schema.innodb_sys_tables where name = 'test/t2' into @space_id;
+# Ensure that dirty pages of table t2 is flushed.
+flush tables t2 for export;
+unlock tables;
+begin;
+insert into t2(b) values (repeat('_', 42));
+# Make the first page dirty for table t2
+set global innodb_saved_page_number_debug = 0;
+set global innodb_fil_make_page_dirty_debug = @space_id;
+# Ensure that dirty pages of table t2 are flushed.
+set global innodb_buf_flush_list_now = 1;
+# Kill the server
+CHECK TABLE t1;
+Table Op Msg_type Msg_text
+test.t1 check status OK
+CHECK TABLE t2;
+Table Op Msg_type Msg_text
+test.t2 check status OK
+SELECT COUNT(*) FROM t1;
+COUNT(*)
+2560
+SELECT COUNT(*) FROM t2;
+COUNT(*)
+2560
+DROP TABLE t1;
+DROP TABLE t2;
+ FOUND 1 /Trying to recover page/ in mysqld.1.err
+ FOUND 1 /Recovered page/ in mysqld.1.err
diff --git a/mysql-test/suite/innodb/t/innodb_doublewrite-master.opt b/mysql-test/suite/innodb/t/innodb_doublewrite-master.opt
new file mode 100644
index 00000000000..82df307e376
--- /dev/null
+++ b/mysql-test/suite/innodb/t/innodb_doublewrite-master.opt
@@ -0,0 +1,7 @@
+--innodb-fast-shutdown=2
+--innodb-file-per-table
+--innodb_file_format='Barracuda'
+--innodb_flush_log_at_trx_commit=1
+--innodb_buffer_pool_load_at_startup=OFF
+
+
diff --git a/mysql-test/suite/innodb/t/innodb_doublewrite.test b/mysql-test/suite/innodb/t/innodb_doublewrite.test
new file mode 100644
index 00000000000..ef6191b2449
--- /dev/null
+++ b/mysql-test/suite/innodb/t/innodb_doublewrite.test
@@ -0,0 +1 @@
+--source suite/innodb/include/innodb_doublewrite.inc
diff --git a/mysql-test/suite/innodb/t/innodb_doublewrite_odirect-master.opt b/mysql-test/suite/innodb/t/innodb_doublewrite_odirect-master.opt
new file mode 100644
index 00000000000..ee9cd08c8be
--- /dev/null
+++ b/mysql-test/suite/innodb/t/innodb_doublewrite_odirect-master.opt
@@ -0,0 +1,6 @@
+--innodb-fast-shutdown=2
+--innodb-file-per-table
+--innodb_file_format='Barracuda'
+--innodb_flush_log_at_trx_commit=1
+--innodb-flush-method=O_DIRECT
+--innodb_buffer_pool_load_at_startup=OFF
diff --git a/mysql-test/suite/innodb/t/innodb_doublewrite_odirect.test b/mysql-test/suite/innodb/t/innodb_doublewrite_odirect.test
new file mode 100644
index 00000000000..ef6191b2449
--- /dev/null
+++ b/mysql-test/suite/innodb/t/innodb_doublewrite_odirect.test
@@ -0,0 +1 @@
+--source suite/innodb/include/innodb_doublewrite.inc
diff --git a/mysql-test/suite/sys_vars/r/innodb_doublewrite_basic.result b/mysql-test/suite/sys_vars/r/innodb_doublewrite_basic.result
index 4a5baf0aeda..641d8cf5cc8 100644
--- a/mysql-test/suite/sys_vars/r/innodb_doublewrite_basic.result
+++ b/mysql-test/suite/sys_vars/r/innodb_doublewrite_basic.result
@@ -4,18 +4,31 @@ COUNT(@@GLOBAL.innodb_doublewrite)
1
1 Expected
'#---------------------BS_STVARS_026_02----------------------#'
-SET @@GLOBAL.innodb_doublewrite=1;
-ERROR HY000: Variable 'innodb_doublewrite' is a read only variable
-Expected error 'Read only variable'
+SET @global_start_value = @@global.innodb_doublewrite;
+SELECT @global_start_value;
+@global_start_value
+1
+SET @@GLOBAL.innodb_doublewrite = 1 ;
SELECT COUNT(@@GLOBAL.innodb_doublewrite);
COUNT(@@GLOBAL.innodb_doublewrite)
1
1 Expected
+SET @@GLOBAL.innodb_doublewrite = 2;
+SELECT @@GLOBAL.innodb_doublewrite;
+@@GLOBAL.innodb_doublewrite
+2
+SET @@GLOBAL.innodb_doublewrite = 0;
+Warnings:
+Warning 1210 innodb_doublewrite can not be dynamically changed to or from 0. Do a clean shutdown if you want to change it from or to 0.
+SELECT @@GLOBAL.innodb_doublewrite;
+@@GLOBAL.innodb_doublewrite
+2
+2 Expected
'#---------------------BS_STVARS_026_03----------------------#'
-SELECT IF(@@GLOBAL.innodb_doublewrite, "ON", "OFF") = VARIABLE_VALUE
+SELECT @@GLOBAL.innodb_doublewrite = VARIABLE_VALUE
FROM INFORMATION_SCHEMA.GLOBAL_VARIABLES
WHERE VARIABLE_NAME='innodb_doublewrite';
-IF(@@GLOBAL.innodb_doublewrite, "ON", "OFF") = VARIABLE_VALUE
+@@GLOBAL.innodb_doublewrite = VARIABLE_VALUE
1
1 Expected
SELECT COUNT(@@GLOBAL.innodb_doublewrite);
@@ -41,13 +54,11 @@ COUNT(@@innodb_doublewrite)
SELECT COUNT(@@local.innodb_doublewrite);
ERROR HY000: Variable 'innodb_doublewrite' is a GLOBAL variable
Expected error 'Variable is a GLOBAL variable'
-SELECT COUNT(@@SESSION.innodb_doublewrite);
-ERROR HY000: Variable 'innodb_doublewrite' is a GLOBAL variable
-Expected error 'Variable is a GLOBAL variable'
SELECT COUNT(@@GLOBAL.innodb_doublewrite);
COUNT(@@GLOBAL.innodb_doublewrite)
1
1 Expected
-SELECT innodb_doublewrite = @@SESSION.innodb_doublewrite;
-ERROR 42S22: Unknown column 'innodb_doublewrite' in 'field list'
-Expected error 'Readonly variable'
+SET @@global.innodb_doublewrite = @global_start_value;
+SELECT @@global.innodb_doublewrite;
+@@global.innodb_doublewrite
+1
diff --git a/mysql-test/suite/sys_vars/r/sysvars_innodb.result b/mysql-test/suite/sys_vars/r/sysvars_innodb.result
index 7402b84dc96..516975f1675 100644
--- a/mysql-test/suite/sys_vars/r/sysvars_innodb.result
+++ b/mysql-test/suite/sys_vars/r/sysvars_innodb.result
@@ -918,18 +918,18 @@ READ_ONLY NO
COMMAND_LINE_ARGUMENT OPTIONAL
VARIABLE_NAME INNODB_DOUBLEWRITE
SESSION_VALUE NULL
-GLOBAL_VALUE ON
+GLOBAL_VALUE 1
GLOBAL_VALUE_ORIGIN COMPILE-TIME
-DEFAULT_VALUE ON
+DEFAULT_VALUE 1
VARIABLE_SCOPE GLOBAL
-VARIABLE_TYPE BOOLEAN
-VARIABLE_COMMENT Enable InnoDB doublewrite buffer (enabled by default). Disable with --skip-innodb-doublewrite.
-NUMERIC_MIN_VALUE NULL
-NUMERIC_MAX_VALUE NULL
-NUMERIC_BLOCK_SIZE NULL
-ENUM_VALUE_LIST OFF,ON
-READ_ONLY YES
-COMMAND_LINE_ARGUMENT NONE
+VARIABLE_TYPE BIGINT UNSIGNED
+VARIABLE_COMMENT 0=Disable InnoDB doublewrite buffer.1=Enable full doublewrite mode (default).2=Enable reduced doublewrite mode.
+NUMERIC_MIN_VALUE 0
+NUMERIC_MAX_VALUE 2
+NUMERIC_BLOCK_SIZE 0
+ENUM_VALUE_LIST NULL
+READ_ONLY NO
+COMMAND_LINE_ARGUMENT OPTIONAL
VARIABLE_NAME INNODB_DOUBLEWRITE_BATCH_SIZE
SESSION_VALUE NULL
GLOBAL_VALUE 120
diff --git a/mysql-test/suite/sys_vars/t/innodb_doublewrite_basic.test b/mysql-test/suite/sys_vars/t/innodb_doublewrite_basic.test
index 1ae10d0f7cf..4827aafccde 100644
--- a/mysql-test/suite/sys_vars/t/innodb_doublewrite_basic.test
+++ b/mysql-test/suite/sys_vars/t/innodb_doublewrite_basic.test
@@ -37,36 +37,40 @@ SELECT COUNT(@@GLOBAL.innodb_doublewrite);
# Check if Value can set #
####################################################################
---error ER_INCORRECT_GLOBAL_LOCAL_VAR
-SET @@GLOBAL.innodb_doublewrite=1;
---echo Expected error 'Read only variable'
+SET @global_start_value = @@global.innodb_doublewrite;
+SELECT @global_start_value;
+
+SET @@GLOBAL.innodb_doublewrite = 1 ;
+
SELECT COUNT(@@GLOBAL.innodb_doublewrite);
--echo 1 Expected
+SET @@GLOBAL.innodb_doublewrite = 2;
+SELECT @@GLOBAL.innodb_doublewrite;
+
+SET @@GLOBAL.innodb_doublewrite = 0;
+SELECT @@GLOBAL.innodb_doublewrite;
+--echo 2 Expected
--echo '#---------------------BS_STVARS_026_03----------------------#'
#################################################################
# Check if the value in GLOBAL Table matches value in variable #
#################################################################
---disable_warnings
-SELECT IF(@@GLOBAL.innodb_doublewrite, "ON", "OFF") = VARIABLE_VALUE
+SELECT @@GLOBAL.innodb_doublewrite = VARIABLE_VALUE
FROM INFORMATION_SCHEMA.GLOBAL_VARIABLES
WHERE VARIABLE_NAME='innodb_doublewrite';
---enable_warnings
--echo 1 Expected
SELECT COUNT(@@GLOBAL.innodb_doublewrite);
--echo 1 Expected
---disable_warnings
SELECT COUNT(VARIABLE_VALUE)
FROM INFORMATION_SCHEMA.GLOBAL_VARIABLES
WHERE VARIABLE_NAME='innodb_doublewrite';
---enable_warnings
--echo 1 Expected
@@ -92,15 +96,8 @@ SELECT COUNT(@@innodb_doublewrite);
SELECT COUNT(@@local.innodb_doublewrite);
--echo Expected error 'Variable is a GLOBAL variable'
---Error ER_INCORRECT_GLOBAL_LOCAL_VAR
-SELECT COUNT(@@SESSION.innodb_doublewrite);
---echo Expected error 'Variable is a GLOBAL variable'
-
SELECT COUNT(@@GLOBAL.innodb_doublewrite);
--echo 1 Expected
---Error ER_BAD_FIELD_ERROR
-SELECT innodb_doublewrite = @@SESSION.innodb_doublewrite;
---echo Expected error 'Readonly variable'
-
-
+SET @@global.innodb_doublewrite = @global_start_value;
+SELECT @@global.innodb_doublewrite;
diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc
index 61006c8d89d..706011eebfb 100644
--- a/storage/innobase/buf/buf0buf.cc
+++ b/storage/innobase/buf/buf0buf.cc
@@ -5994,6 +5994,18 @@ database_corrupted:
" You can use CHECK TABLE to scan"
" your table for corruption. "
<< FORCE_RECOVERY_MSG;
+
+ /* Remove the page that is corrupted when
+ recovering. */
+ if (recv_recovery_on) {
+ ib::info() << "Removing the corrupted page "
+ << bpage->id << " in tablepace "
+ << space->name << " from recovered pages.";
+ mutex_enter(&recv_sys->mutex);
+ ut_ad(recv_sys->n_addrs > 0);
+ recv_sys->n_addrs--;
+ mutex_exit(&recv_sys->mutex);
+ }
}
if (srv_force_recovery < SRV_FORCE_IGNORE_CORRUPT) {
diff --git a/storage/innobase/buf/buf0dblwr.cc b/storage/innobase/buf/buf0dblwr.cc
index 17b2229f1da..61151e4bf4b 100644
--- a/storage/innobase/buf/buf0dblwr.cc
+++ b/storage/innobase/buf/buf0dblwr.cc
@@ -152,14 +152,28 @@ buf_dblwr_init(
ut_zalloc_nokey(buf_size * sizeof(bool)));
buf_dblwr->write_buf_unaligned = static_cast<byte*>(
- ut_malloc_nokey((1 + buf_size) * UNIV_PAGE_SIZE));
+ ut_zalloc_nokey((1 + buf_size) * UNIV_PAGE_SIZE));
buf_dblwr->write_buf = static_cast<byte*>(
ut_align(buf_dblwr->write_buf_unaligned,
UNIV_PAGE_SIZE));
+ buf_dblwr->header_unaligned = static_cast<byte*>(
+ ut_zalloc_nokey(2 * UNIV_PAGE_SIZE));
+
+ buf_dblwr->header = static_cast<byte*>(
+ ut_align(buf_dblwr->header_unaligned,
+ UNIV_PAGE_SIZE));
+
buf_dblwr->buf_block_arr = static_cast<buf_page_t**>(
ut_zalloc_nokey(buf_size * sizeof(void*)));
+
+ /* Write the page number and the page type to the doublewrite
+ * header in case it gets used. */
+ mach_write_to_4(buf_dblwr->header + FIL_PAGE_OFFSET,
+ buf_dblwr->block1);
+ mach_write_to_2(buf_dblwr->header + FIL_PAGE_TYPE,
+ FIL_PAGE_TYPE_DBLWR_HEADER);
}
/** Create the doublewrite buffer if the doublewrite buffer header
@@ -342,6 +356,73 @@ too_small:
goto start_again;
}
+/***************************************************************//**
+Overwrites both blocks of the doublewrite buffer on disk with empty pages,
+so that stale full-page copies cannot be picked up by crash recovery.
+This function must be called before the first double-write batch flush after
+the doublewrite mode is changed by the user.
+This function is only called from buf_dblwr_flush_buffered_writes(), while
+it is holding buf_dblwr->mutex, so this function need not be thread-safe. */
+static
+void
+buf_dblwr_reset(
+	ulint	doublewrite_mode)	/*!< in: nonzero mode; 2 = reduced */
+{
+	ulint	i = 0;
+	void*	page_unaligned = ut_zalloc_nokey(
+		(TRX_SYS_DOUBLEWRITE_BLOCK_SIZE + 1) * UNIV_PAGE_SIZE);
+	byte* const	first_page = page_align(
+		(byte*) page_unaligned + UNIV_PAGE_SIZE);
+	byte*	page = first_page;
+
+	ut_a(doublewrite_mode);
+
+	/* Set only while the doublewrite area is being rewritten; it is
+	cleared again before returning. */
+	buf_dblwr_being_created = TRUE;
+
+	/* Reset the first half of the doublewrite buffer on disk. The first
+	page is handled separately because its page type tells recovery
+	whether the last flush was done in the reduced-doublewrite mode. */
+	if (doublewrite_mode == 2) {
+		/* Write an empty header page */
+		i = 1;
+		memcpy(page, buf_dblwr->header, FIL_PAGE_DATA);
+		buf_flush_init_for_writing(NULL, page, NULL, 0);
+		ut_ad(!buf_page_is_corrupted(FALSE, page, univ_page_size, NULL));
+		page += UNIV_PAGE_SIZE;
+	}
+	for (; i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE; ++i) {
+		mach_write_to_4(page + FIL_PAGE_OFFSET, buf_dblwr->block1 + i);
+		buf_flush_init_for_writing(NULL, page, NULL, 0);
+		page += UNIV_PAGE_SIZE;
+	}
+
+	IORequest	write_request(IORequest::WRITE);
+	const page_id_t	page_id(TRX_SYS_SPACE, buf_dblwr->block1);
+
+	fil_io(write_request, true, page_id, univ_page_size, 0,
+	       TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE,
+	       (void*) first_page, NULL);
+
+	/* Reset the second half of doublewrite buffer on disk, reusing the
+	same memory with the page numbers of the second block. */
+	page = first_page;
+	for (i = 0; i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE; ++i) {
+		mach_write_to_4(page + FIL_PAGE_OFFSET, buf_dblwr->block2 + i);
+		buf_flush_init_for_writing(NULL, page, NULL, 0);
+		page += UNIV_PAGE_SIZE;
+	}
+	const page_id_t	page_id2(TRX_SYS_SPACE, buf_dblwr->block2);
+
+	fil_io(write_request, true, page_id2, univ_page_size, 0,
+	       TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE,
+	       (void*) first_page, NULL);
+
+	buf_dblwr_being_created = FALSE;
+	ut_free(page_unaligned);
+}
+
/**
At database startup initializes the doublewrite buffer memory structure if
we already have a doublewrite buffer created in the data files. If we are
@@ -354,7 +435,8 @@ recovery, this function loads the pages from double write buffer into memory.
dberr_t
buf_dblwr_init_or_load_pages(
pfs_os_file_t file,
- const char* path)
+ const char* path,
+ bool load_corrupt_pages)
{
byte* buf;
byte* page;
@@ -366,6 +448,7 @@ buf_dblwr_init_or_load_pages(
byte* unaligned_read_buf;
ibool reset_space_ids = FALSE;
recv_dblwr_t& recv_dblwr = recv_sys->dblwr;
+ bool header_found = false;
/* We do the file i/o past the buffer pool */
@@ -469,9 +552,57 @@ buf_dblwr_init_or_load_pages(
page = buf;
- for (ulint i = 0; i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 2; i++) {
+ /* First check if the first page is of type FIL_PAGE_TYPE_DBLWR_HEADER.
+ * If so, this means that the last time the doublewrite was used in
+ * reduced doublewrite mode (innodb_doublewrite=2).
+ */
+ if (fil_page_get_type(page) == FIL_PAGE_TYPE_DBLWR_HEADER) {
+ header_found = TRUE;
+ }
+
+ if (header_found && load_corrupt_pages) {
+ byte* ptr = page + FIL_PAGE_DATA;
+ ulint num_pages;
+ const page_size_t& page_size = page_size_t(BUF_DBLWR_HEADER_SIZE, BUF_DBLWR_HEADER_SIZE, true);
+
+ ut_a(!reset_space_ids);
+
+ if (buf_page_is_corrupted(
+ FALSE, page, page_size, NULL)) {
+ ib::error()
+ << "InnoDB: The first block of the doublewrite "
+ << "buffer is corrupt.";
+ buf_page_print(
+ page,
+ page_size,
+ BUF_PAGE_PRINT_NO_CRASH);
+ ut_error;
+ }
+
+ num_pages = mach_read_from_2(ptr);
+ ptr += 2;
+
+ for (ulint i = 0; i < num_pages; ++i) {
+ ulint space_id = mach_read_from_4(ptr);
+ ptr += 4;
+ ulint page_no = mach_read_from_4(ptr);
+ ptr += 4;
+ recv_dblwr.add(NULL, space_id, page_no);
+ }
+ }
+
+ if (header_found) {
+ page += univ_page_size.physical();
+ }
+
+ /* We go through all of the pages in the doublewrite buffer even if
+ * we found the header page.
+ */
+ for (ulint i = (header_found ? 1 : 0);
+ i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 2; i++) {
if (reset_space_ids) {
ulint source_page_no;
+ ut_a(!header_found);
space_id = 0;
mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
@@ -493,6 +624,7 @@ buf_dblwr_init_or_load_pages(
write_request, path, file, page,
source_page_no * UNIV_PAGE_SIZE,
UNIV_PAGE_SIZE);
+
if (err != DB_SUCCESS) {
ib::error()
@@ -504,10 +636,15 @@ buf_dblwr_init_or_load_pages(
return(err);
}
- } else if (memcmp(field_ref_zero, page + FIL_PAGE_LSN, 8)) {
+ } else if (load_corrupt_pages
+ && memcmp(field_ref_zero, page + FIL_PAGE_LSN, 8)) {
/* Each valid page header must contain
a nonzero FIL_PAGE_LSN field. */
- recv_dblwr.add(page);
+ ulint space_id =
+ mach_read_from_4(page + FIL_PAGE_SPACE_ID);
+ ulint page_no =
+ mach_read_from_4(page + FIL_PAGE_OFFSET);
+ recv_dblwr.add(page, space_id, page_no);
}
page += univ_page_size.physical();
@@ -529,7 +666,7 @@ buf_dblwr_process()
ulint page_no_dblwr = 0;
byte* read_buf;
byte* unaligned_read_buf;
- recv_dblwr_t& recv_dblwr = recv_sys->dblwr;
+ std::list<recv_dblwr_item_t, ut_allocator<recv_dblwr_item_t> >& dblwr_pages = recv_sys->dblwr.pages;
if (!buf_dblwr) {
return;
@@ -541,12 +678,11 @@ buf_dblwr_process()
read_buf = static_cast<byte*>(
ut_align(unaligned_read_buf, UNIV_PAGE_SIZE));
- for (recv_dblwr_t::list::iterator i = recv_dblwr.pages.begin();
- i != recv_dblwr.pages.end();
- ++i, ++page_no_dblwr) {
- byte* page = *i;
- ulint space_id = page_get_space_id(page);
- fil_space_t* space = fil_space_get(space_id);
+ for (std::list<recv_dblwr_item_t>::iterator i = dblwr_pages.begin();
+ i != dblwr_pages.end(); ++i, ++page_no_dblwr ) {
+ byte* page = const_cast<byte*>(i->page);
+ fil_space_t* space = fil_space_get(i->space_id);
+ ulint space_id = i->space_id;
if (space == NULL) {
/* Maybe we have dropped the tablespace
@@ -556,7 +692,7 @@ buf_dblwr_process()
fil_space_open_if_needed(space);
- const ulint page_no = page_get_page_no(page);
+ const ulint page_no = i->page_no;
const page_id_t page_id(space_id, page_no);
if (page_no >= space->size) {
@@ -575,7 +711,6 @@ buf_dblwr_process()
}
const page_size_t page_size(space->flags);
- ut_ad(!buf_page_is_zeroes(page, page_size));
/* We want to ensure that for partial reads the
unread portion of the page is NUL. */
@@ -632,6 +767,21 @@ buf_dblwr_process()
<< " from the doublewrite buffer.";
}
+ if (!page) {
+ /* Theoretically we could have another good
+ copy for this page in the doublewrite
+ buffer. If not, we will report a fatal error
+ for a corrupted page somewhere else if that
+ page was truly needed. */
+
+ ib::warn() << "Cannot recover page " << page_id
+ << " from the doublewrite buffer"
+ << " because it was written in reduced-doublewrite mode";
+ continue;
+ }
+
+ ut_ad(!buf_page_is_zeroes(page, page_size));
+
/* Next, validate the doublewrite page. */
if (fil_page_is_compressed_encrypted(page) ||
fil_page_is_compressed(page)) {
@@ -684,7 +834,7 @@ buf_dblwr_process()
<< " from the doublewrite buffer.";
}
- recv_dblwr.pages.clear();
+ dblwr_pages.clear();
fil_flush_file_spaces(FIL_TYPE_TABLESPACE);
ut_free(unaligned_read_buf);
@@ -704,6 +854,9 @@ buf_dblwr_free()
os_event_destroy(buf_dblwr->s_event);
ut_free(buf_dblwr->write_buf_unaligned);
buf_dblwr->write_buf_unaligned = NULL;
+ ut_free(buf_dblwr->header_unaligned);
+ buf_dblwr->header_unaligned = NULL;
+ buf_dblwr->header = NULL;
ut_free(buf_dblwr->buf_block_arr);
buf_dblwr->buf_block_arr = NULL;
@@ -953,8 +1106,10 @@ buf_dblwr_flush_buffered_writes()
byte* write_buf;
ulint first_free;
ulint len;
+ byte* header_ptr;
+ ulong use_doublewrite_buf = srv_use_doublewrite_buf;
- if (!srv_use_doublewrite_buf || buf_dblwr == NULL) {
+ if (!use_doublewrite_buf || buf_dblwr == NULL) {
/* Sync the writes to the disk. */
buf_dblwr_sync_datafiles();
return;
@@ -999,6 +1154,31 @@ try_again:
start another batch of flushing. */
buf_dblwr->batch_running = true;
first_free = buf_dblwr->first_free;
+ /* Reset the doublewrite buffer if srv_doublewrite_reset is set.
+ * This protects against the following scenario:
+ * 1- server starts with full(=1) doublewrite mode and writes a bunch
+ * of pages to the doublewrite buffer.
+ * 2- user changes doublewrite mode from full(=1) to reduced(=2).
+ * 3- server runs for a long time in the reduced doublewrite mode so
+ * that the copies that were written to the doublewrite buffer in step
+ * 1 become stale.
+ * 4- some of the non-doublewrite pages on disk whose copies in the
+ * doublewrite buffer became stale get corrupted because of a hardware
+ * or a software failure.
+ * 5- server crashes. During recovery InnoDB processes pages both
+ * in the doublewrite header and the following full pages.
+ * 6- The stale copies in the doublewrite buffer are used to restore
+ * corrupt non-doublewrite pages on disk. Now the stale data will be
+ * served when these pages are accessed.
+ * This is a rare case because it needs the corruption to happen to one
+ * of the pages written to the doublewrite buffer in full mode. We
+ * nevertheless protect against this case by resetting the doublewrite
+ * buffer on disk, when the doublewrite mode changes.
+ */
+ if (srv_doublewrite_reset) {
+ buf_dblwr_reset(use_doublewrite_buf);
+ srv_doublewrite_reset = FALSE;
+ }
/* Now safe to release the mutex. Note that though no other
thread is allowed to post to the doublewrite batch flushing
@@ -1007,14 +1187,23 @@ try_again:
mutex_exit(&buf_dblwr->mutex);
write_buf = buf_dblwr->write_buf;
+ header_ptr = buf_dblwr->header + FIL_PAGE_DATA;
+ memset(header_ptr, 0, BUF_DBLWR_HEADER_SIZE - FIL_PAGE_DATA);
+ mach_write_to_2(header_ptr, buf_dblwr->first_free);
+ header_ptr += 2;
for (ulint len2 = 0, i = 0;
i < buf_dblwr->first_free;
len2 += UNIV_PAGE_SIZE, i++) {
- const buf_block_t* block;
-
+ const buf_block_t* block;
block = (buf_block_t*) buf_dblwr->buf_block_arr[i];
+ const buf_page_t* page = &block->page;
+
+ mach_write_to_4(header_ptr, page->id.space());
+ header_ptr += 4;
+ mach_write_to_4(header_ptr, page->id.page_no());
+ header_ptr += 4;
if (buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE
|| block->page.zip.data) {
@@ -1033,14 +1222,30 @@ try_again:
}
/* Write out the first block of the doublewrite buffer */
- len = ut_min(TRX_SYS_DOUBLEWRITE_BLOCK_SIZE,
- buf_dblwr->first_free) * UNIV_PAGE_SIZE;
+ if (use_doublewrite_buf == 2) {
+ ib_uint32_t checksum = page_zip_calc_checksum(
+ buf_dblwr->header, BUF_DBLWR_HEADER_SIZE,
+ static_cast<srv_checksum_algorithm_t>(
+ srv_checksum_algorithm));
+
+ const page_size_t& page_size = page_size_t(BUF_DBLWR_HEADER_SIZE, BUF_DBLWR_HEADER_SIZE, true);
+ mach_write_to_4(buf_dblwr->header + FIL_PAGE_SPACE_OR_CHKSUM,
+ checksum);
+ len = BUF_DBLWR_HEADER_SIZE;
+ write_buf = buf_dblwr->header;
+ ut_ad(!buf_page_is_corrupted(FALSE, write_buf, page_size, NULL));
+ } else {
+ len = ut_min(TRX_SYS_DOUBLEWRITE_BLOCK_SIZE,
+ buf_dblwr->first_free) * UNIV_PAGE_SIZE;
+ }
fil_io(IORequestWrite, true,
page_id_t(TRX_SYS_SPACE, buf_dblwr->block1), univ_page_size,
0, len, (void*) write_buf, NULL);
- if (buf_dblwr->first_free <= TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+
+ if (buf_dblwr->first_free <= TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
+ || use_doublewrite_buf == 2) {
/* No unwritten pages in the second block. */
goto flush;
}
@@ -1058,7 +1263,12 @@ try_again:
flush:
/* increment the doublewrite flushed pages counter */
- srv_stats.dblwr_pages_written.add(buf_dblwr->first_free);
+ if (use_doublewrite_buf == 1) {
+ srv_stats.dblwr_pages_written.add(buf_dblwr->first_free);
+ } else {
+ srv_stats.dblwr_pages_written.inc();
+ }
+
srv_stats.dblwr_writes.inc();
/* Now flush the doublewrite buffer data to disk */
diff --git a/storage/innobase/fsp/fsp0sysspace.cc b/storage/innobase/fsp/fsp0sysspace.cc
index e4bb11c9a22..aca84b9cfd4 100644
--- a/storage/innobase/fsp/fsp0sysspace.cc
+++ b/storage/innobase/fsp/fsp0sysspace.cc
@@ -571,7 +571,7 @@ SysTablespace::read_lsn_and_check_flags(lsn_t* flushed_lsn)
ut_a(it->order() == 0);
- buf_dblwr_init_or_load_pages(it->handle(), it->filepath());
+ buf_dblwr_init_or_load_pages(it->handle(), it->filepath(), true);
/* Check the contents of the first page of the
first datafile. */
diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc
index 68b69e94681..fc42d016698 100644
--- a/storage/innobase/handler/ha_innodb.cc
+++ b/storage/innobase/handler/ha_innodb.cc
@@ -254,7 +254,6 @@ values */
static my_bool innobase_file_format_check;
static my_bool innobase_use_atomic_writes;
static my_bool innobase_use_fallocate;
-static my_bool innobase_use_doublewrite;
static my_bool innobase_use_checksums;
static my_bool innobase_locks_unsafe_for_binlog;
static my_bool innobase_rollback_on_timeout;
@@ -4279,8 +4278,6 @@ innobase_change_buffering_inited_ok:
srv_n_read_io_threads = (ulint) innobase_read_io_threads;
srv_n_write_io_threads = (ulint) innobase_write_io_threads;
- srv_use_doublewrite_buf = (ibool) innobase_use_doublewrite;
-
if (!innobase_use_checksums) {
ib::warn() << "Setting innodb_checksums to OFF is DEPRECATED."
" This option may be removed in future releases. You"
@@ -18086,6 +18083,38 @@ innodb_io_capacity_max_update(
}
/****************************************************************//**
+Callback registered with MySQL for updating innodb_doublewrite from the
+"saved" value. Switches between modes 1 and 2 and raises
+srv_doublewrite_reset; a change to or from 0 only produces a warning. */
+static
+void
+innodb_doublewrite_update(
+/*======================*/
+	THD*				thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: system variable */
+	void*				var_ptr,/*!< out: not used */
+	const void*			save)	/*!< in: checked value */
+{
+	const ulong	requested = *static_cast<const ulong*>(save);
+	if (requested == 0 || srv_use_doublewrite_buf == 0) {
+		push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+				    ER_WRONG_ARGUMENTS,
+				    "innodb_doublewrite can not be "
+				    "dynamically changed to or from 0. "
+				    "Do a clean shutdown if you want to "
+				    "change it from or to 0.");
+		return;
+	}
+
+	ut_a(requested == 1 || requested == 2);
+	if (requested == srv_use_doublewrite_buf) {
+		return;
+	}
+	srv_use_doublewrite_buf = requested;
+	srv_doublewrite_reset = 1;
+}
+
+/****************************************************************//**
Update the system variable innodb_io_capacity using the "saved"
value. This function is registered as a callback with MySQL. */
static
@@ -20530,11 +20559,12 @@ static MYSQL_SYSVAR_STR(data_home_dir, innobase_data_home_dir,
"The common part for InnoDB table spaces.",
NULL, NULL, NULL);
-static MYSQL_SYSVAR_BOOL(doublewrite, innobase_use_doublewrite,
- PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
- "Enable InnoDB doublewrite buffer (enabled by default)."
- " Disable with --skip-innodb-doublewrite.",
- NULL, NULL, TRUE);
+static MYSQL_SYSVAR_ULONG(doublewrite, srv_use_doublewrite_buf,
+ PLUGIN_VAR_OPCMDARG,
+ "0=Disable InnoDB doublewrite buffer."
+ "1=Enable full doublewrite mode (default)."
+ "2=Enable reduced doublewrite mode.",
+ NULL, innodb_doublewrite_update, 1, 0, 2, 0);
static MYSQL_SYSVAR_BOOL(use_atomic_writes, innobase_use_atomic_writes,
PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
diff --git a/storage/innobase/handler/i_s.cc b/storage/innobase/handler/i_s.cc
index 8f04f9d15ee..87fa470f897 100644
--- a/storage/innobase/handler/i_s.cc
+++ b/storage/innobase/handler/i_s.cc
@@ -112,6 +112,7 @@ static buf_page_desc_t i_s_page_type[] = {
{"IBUF_INDEX", I_S_PAGE_TYPE_IBUF},
{"PAGE COMPRESSED", FIL_PAGE_PAGE_COMPRESSED},
{"PAGE COMPRESSED AND ENCRYPTED", FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED},
+ {"DOUBLEWRITE HEADER", FIL_PAGE_TYPE_DBLWR_HEADER},
};
/** This structure defines information we will fetch from pages
diff --git a/storage/innobase/include/buf0dblwr.h b/storage/innobase/include/buf0dblwr.h
index 598609e2be4..f560429feb9 100644
--- a/storage/innobase/include/buf0dblwr.h
+++ b/storage/innobase/include/buf0dblwr.h
@@ -37,6 +37,9 @@ Created 2011/12/19 Inaam Rana
extern buf_dblwr_t* buf_dblwr;
/** Set to TRUE when the doublewrite buffer is being created */
extern ibool buf_dblwr_being_created;
+/** The size of the doublewrite header page when the reduced-doublewrite mode
+is used. */
+#define BUF_DBLWR_HEADER_SIZE 4096
/** Create the doublewrite buffer if the doublewrite buffer header
is not present in the TRX_SYS page.
@@ -59,7 +62,8 @@ recovery, this function loads the pages from double write buffer into memory.
dberr_t
buf_dblwr_init_or_load_pages(
pfs_os_file_t file,
- const char* path);
+ const char* path,
+ bool load_corrupt_pages);
/** Process and remove the double write buffer pages for all tablespaces. */
void
@@ -157,6 +161,11 @@ struct buf_dblwr_t{
buf_page_t** buf_block_arr;/*!< array to store pointers to
the buffer blocks which have been
cached to write_buf */
+ byte* header;/*!< write buffer used for writing out the
+ doublewrite header for reduced doublewrite
+ mode (innodb_doublewrite=2) */
+ byte* header_unaligned;/*!< pointer to header,
+ but unaligned */
};
#endif
diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h
index bf231565657..87f94c0234e 100644
--- a/storage/innobase/include/fil0fil.h
+++ b/storage/innobase/include/fil0fil.h
@@ -392,6 +392,10 @@ extern fil_addr_t fil_addr_null;
#define FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED 37401 /*!< Page is compressed and
then encrypted */
#define FIL_PAGE_PAGE_COMPRESSED 34354 /*!< page compressed page */
+/** In reduced doublewrite mode, the first page of the doublewrite
+buffer holds the space ids and the page numbers of the most recently
+flushed pages. */
+#define FIL_PAGE_TYPE_DBLWR_HEADER 32124 /*!< Doublewrite header */
#define FIL_PAGE_INDEX 17855 /*!< B-tree node */
#define FIL_PAGE_RTREE 17854 /*!< B-tree node */
#define FIL_PAGE_UNDO_LOG 2 /*!< Undo log page */
diff --git a/storage/innobase/include/fil0fil.ic b/storage/innobase/include/fil0fil.ic
index 9505cc0bd69..d3f26334f3b 100644
--- a/storage/innobase/include/fil0fil.ic
+++ b/storage/innobase/include/fil0fil.ic
@@ -35,6 +35,8 @@ fil_get_page_type_name(
ulint page_type) /*!< in: FIL_PAGE_TYPE */
{
switch(page_type) {
+ case FIL_PAGE_TYPE_DBLWR_HEADER:
+ return "PAGE_TYPE_DBLWR_HEADER";
case FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED:
return "PAGE_COMPRESSED_ENRYPTED";
case FIL_PAGE_PAGE_COMPRESSED:
@@ -88,6 +90,7 @@ fil_page_type_validate(
/* Validate page type */
if (!((page_type == FIL_PAGE_PAGE_COMPRESSED ||
page_type == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED ||
+ page_type == FIL_PAGE_TYPE_DBLWR_HEADER ||
page_type == FIL_PAGE_INDEX ||
page_type == FIL_PAGE_RTREE ||
page_type == FIL_PAGE_UNDO_LOG ||
diff --git a/storage/innobase/include/log0recv.h b/storage/innobase/include/log0recv.h
index 24ad9ae2a30..29b268fcff7 100644
--- a/storage/innobase/include/log0recv.h
+++ b/storage/innobase/include/log0recv.h
@@ -173,10 +173,20 @@ struct recv_addr_t{
hash_node_t addr_hash;/*!< hash node in the hash bucket chain */
};
+struct recv_dblwr_item_t { /* one doublewrite buffer entry collected for recovery */
+ const byte* page; /*!< page frame, or NULL when only the ids were recorded */
+ ulint space_id; /*!< tablespace id of the page */
+ ulint page_no; /*!< page number of the page */
+};
+
struct recv_dblwr_t {
/** Add a page frame to the doublewrite recovery buffer. */
- void add(byte* page) {
- pages.push_back(page);
+ void add(const byte* page, ulint space_id, ulint page_no) {
+ recv_dblwr_item_t item;
+ item.page = page;
+ item.space_id = space_id;
+ item.page_no = page_no;
+ pages.push_back(item);
}
/** Find a doublewrite copy of a page.
@@ -186,10 +196,11 @@ struct recv_dblwr_t {
@retval NULL if no page was found */
const byte* find_page(ulint space_id, ulint page_no);
- typedef std::list<byte*, ut_allocator<byte*> > list;
+ std::list<recv_dblwr_item_t, ut_allocator<recv_dblwr_item_t> > pages;
- /** Recovered doublewrite buffer page frames */
- list pages;
+ void operator() () {
+ pages.clear();
+ }
};
/** Recovery system data structure */
diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h
index 891f25f68f1..f88669bdd7d 100644
--- a/storage/innobase/include/srv0srv.h
+++ b/storage/innobase/include/srv0srv.h
@@ -486,7 +486,8 @@ extern my_bool srv_stats_include_delete_marked;
extern unsigned long long srv_stats_modified_counter;
extern my_bool srv_stats_sample_traditional;
-extern ibool srv_use_doublewrite_buf;
+extern ulong srv_use_doublewrite_buf;
+extern my_bool srv_doublewrite_reset;
extern ulong srv_doublewrite_batch_size;
extern ulong srv_checksum_algorithm;
diff --git a/storage/innobase/log/log0recv.cc b/storage/innobase/log/log0recv.cc
index dc8977e49c8..814d4e4804a 100644
--- a/storage/innobase/log/log0recv.cc
+++ b/storage/innobase/log/log0recv.cc
@@ -3500,14 +3500,14 @@ recv_dblwr_t::find_page(ulint space_id, ulint page_no)
{
typedef std::vector<const byte*, ut_allocator<const byte*> >
matches_t;
-
matches_t matches;
const byte* result = 0;
- for (list::iterator i = pages.begin(); i != pages.end(); ++i) {
- if (page_get_space_id(*i) == space_id
- && page_get_page_no(*i) == page_no) {
- matches.push_back(*i);
+ for (std::list<recv_dblwr_item_t>::iterator i = pages.begin(); i != pages.end(); ++i) {
+ if (i->page
+ && (page_get_space_id(i->page) == space_id)
+ && (page_get_page_no(i->page) == page_no)) {
+ matches.push_back(i->page);
}
}
diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc
index 2894be6b12c..3999f7bc90a 100644
--- a/storage/innobase/srv/srv0srv.cc
+++ b/storage/innobase/srv/srv0srv.cc
@@ -392,8 +392,8 @@ unsigned long long srv_stats_modified_counter;
based on number of configured pages */
my_bool srv_stats_sample_traditional;
-/** copy of innodb_doublewrite */
-ibool srv_use_doublewrite_buf;
+ulong srv_use_doublewrite_buf = 1;
+my_bool srv_doublewrite_reset = FALSE;
/** innodb_doublewrite_batch_size (a debug parameter) specifies the
number of pages to use in LRU and flush_list batch flushing.