From 905baa646df0d5d9204ccd84370a5033f89a2cbf Mon Sep 17 00:00:00 2001 From: Brandon Nesterenko Date: Tue, 22 Feb 2022 08:54:54 -0700 Subject: MDEV-27850: MTR tests can hang due to DEBUG_SYNC race condition DEBUG_SYNC signals can get lost in certain tests due to later DEBUG_SYNC commands overwriting them. This patch addresses these issues in three tests: main.query_cache_debug, main.partition_debug_sync, and rpl.rpl_dump_request_retry_warning. Additionally, main.partition_debug_sync needed changes to the result file (the others did not). The synchronization happened between two commands, one based on ALTER, the other on DROP. A new thread/connection was needed to synchronize the DEBUG_SYNC actions between these commands, thereby changing the result file. Additional comments were added for clarification. Reviewed By: ============ Andrei Elkin --- mysql-test/r/partition_debug_sync.result | 7 ++++++- .../suite/rpl/t/rpl_dump_request_retry_warning.test | 6 ++++++ mysql-test/t/partition_debug_sync.test | 19 +++++++++++++++++-- mysql-test/t/query_cache_debug.test | 11 +++++++++++ 4 files changed, 40 insertions(+), 3 deletions(-) diff --git a/mysql-test/r/partition_debug_sync.result b/mysql-test/r/partition_debug_sync.result index 7006491fcaa..635f7735e6c 100644 --- a/mysql-test/r/partition_debug_sync.result +++ b/mysql-test/r/partition_debug_sync.result @@ -53,9 +53,14 @@ SET DEBUG_SYNC= 'alter_table_before_rename_result_table WAIT_FOR delete_done'; ALTER TABLE t2 REMOVE PARTITIONING; connection default; SET DEBUG_SYNC= 'now WAIT_FOR removing_partitions'; -SET DEBUG_SYNC= 'rm_table_no_locks_before_delete_table SIGNAL waiting_for_alter'; +SET DEBUG_SYNC= 'rm_table_no_locks_before_delete_table SIGNAL waiting_for_alter WAIT_FOR mtr_continue'; SET DEBUG_SYNC= 'rm_table_no_locks_before_binlog SIGNAL delete_done'; DROP TABLE IF EXISTS t2; +# Create a new connection thread to orchestrate the ALTER and DROP +# threads' DEBUG_SYNC points to avoid lost signals +connect con2,localhost,root,,; +SET DEBUG_SYNC= 'now SIGNAL mtr_continue'; +connection default; connection con1; ERROR 42S02: Table 'test.t2' doesn't exist SET DEBUG_SYNC= 'RESET'; diff --git a/mysql-test/suite/rpl/t/rpl_dump_request_retry_warning.test b/mysql-test/suite/rpl/t/rpl_dump_request_retry_warning.test index d750d44ae71..1ee043623ae 100644 --- a/mysql-test/suite/rpl/t/rpl_dump_request_retry_warning.test +++ b/mysql-test/suite/rpl/t/rpl_dump_request_retry_warning.test @@ -43,6 +43,12 @@ START SLAVE; SET DEBUG_SYNC= 'now WAIT_FOR parked'; SET @@GLOBAL.debug_dbug = @saved_dbug; SET DEBUG_SYNC= 'now SIGNAL continue'; + +# Ensure the last DEBUG_SYNC signal was received by the target thread before +# reset; otherwise, the reset can drop the last signal before it gets +# acknowledged +let $wait_condition= select count(*)=0 from information_schema.processlist where state like "%debug%"; +source include/wait_condition.inc; SET DEBUG_SYNC= 'RESET'; --source include/wait_for_slave_io_to_start.inc diff --git a/mysql-test/t/partition_debug_sync.test b/mysql-test/t/partition_debug_sync.test index 09ebba0b0e2..bc49153f803 100644 --- a/mysql-test/t/partition_debug_sync.test +++ b/mysql-test/t/partition_debug_sync.test @@ -66,12 +66,27 @@ SET DEBUG_SYNC= 'alter_table_before_rename_result_table WAIT_FOR delete_done'; --send ALTER TABLE t2 REMOVE PARTITIONING connection default; SET DEBUG_SYNC= 'now WAIT_FOR removing_partitions'; -SET DEBUG_SYNC= 'rm_table_no_locks_before_delete_table SIGNAL waiting_for_alter'; + +SET DEBUG_SYNC= 'rm_table_no_locks_before_delete_table SIGNAL waiting_for_alter WAIT_FOR mtr_continue'; SET DEBUG_SYNC= 'rm_table_no_locks_before_binlog SIGNAL delete_done'; -DROP TABLE IF EXISTS t2; +--send DROP TABLE IF EXISTS t2 + +--echo # Create a new connection thread to orchestrate the ALTER and DROP +--echo # threads' DEBUG_SYNC points to avoid lost signals +connect(con2,localhost,root,,); +let $wait_condition= select count(*)=0 from information_schema.processlist where state like "debug sync point: alter_table_before_open_tables"; +source include/wait_condition.inc; +SET DEBUG_SYNC= 'now SIGNAL mtr_continue'; + +# Return execution for DROP TABLE +connection default; +--reap + +# Return execution for ALTER TABLE connection con1; --error ER_NO_SUCH_TABLE --reap + SET DEBUG_SYNC= 'RESET'; disconnect con1; --source include/wait_until_disconnected.inc diff --git a/mysql-test/t/query_cache_debug.test b/mysql-test/t/query_cache_debug.test index 90b76077cac..af82927245b 100644 --- a/mysql-test/t/query_cache_debug.test +++ b/mysql-test/t/query_cache_debug.test @@ -197,8 +197,17 @@ SET DEBUG_SYNC="now WAIT_FOR parked1_2"; --echo ** and finally release the mutex. The threads will continue to wait --echo ** until a broadcast signal reaches them causing both threads to --echo ** come alive and check the condition. + +# Before sending signals back-to-back, we have to ensure the previous signal +# was received +let $wait_condition= select count(*)=3 from information_schema.processlist where state like "%debug%"; +source include/wait_condition.inc; SET DEBUG_SYNC="now SIGNAL go2"; +let $wait_condition= select count(*)=2 from information_schema.processlist where state like "%debug%"; +source include/wait_condition.inc; SET DEBUG_SYNC="now SIGNAL go3"; +let $wait_condition= select count(*)=1 from information_schema.processlist where state like "%debug%"; +source include/wait_condition.inc; --echo ** --echo ** Finally signal the DELETE statement on THD1 one last time. @@ -208,6 +217,8 @@ SET DEBUG_SYNC="now SIGNAL go3"; --echo ** invalidations and a broadcast signal will be sent to the thread --echo ** group holding result set writers. SET DEBUG_SYNC="now SIGNAL go1_2"; +let $wait_condition= select count(*)=0 from information_schema.processlist where state like "%debug%"; +source include/wait_condition.inc; --echo ** --echo ************************************************************************* -- cgit v1.2.1 From 0635088deb7a8440d918d5f988c7042f9deb59e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Mon, 28 Feb 2022 12:12:12 +0200 Subject: MDEV-27800: Avoid garbage TRX_UNDO_TRX_NO on TRX_UNDO_CACHED pages In commit c7d0448797a2013079c3e0b7ca97f7bdc1138696 (MDEV-15132) MariaDB Server 10.3 stopped writing the latest transaction identifier to the TRX_SYS page. Instead, the transaction identifier will be recovered from undo log pages. Unfortunately, before commit 3926673ce7149aa223103126b6aeac819b10fab5 and mysql/mysql-server@dc29792ff2996aefbb6e64bb2f0bc3aa8fc879e9 (MySQL 5.1.48 or MariaDB 5.1.48) InnoDB did not always initialize all data fields, but some garbage could be left behind in unused parts of data pages. In undo log pages that are essentially free, but added to a list for reuse (TRX_UNDO_CACHED) the TRX_UNDO_TRX_NO fields could contain garbage, instead of 0. As long as such undo pages are being reused and never marked completely free, the garbage contents may remain forever. In fact, the function trx_undo_header_create() and the record MLOG_UNDO_HDR_CREATE will only initialize TRX_UNDO_TRX_ID, but leave TRX_UNDO_TRX_NO uninitialized. trx_undo_mem_create_at_db_start(): Only read the TRX_UNDO_TRX_NO fields of TRX_UNDO_CACHED pages if the TRX_UNDO_PAGE_TYPE is 0, that is, the page was updated by MariaDB Server 10.3. Earlier versions would always write the TRX_UNDO_PAGE_TYPE as 1 or 2. trx_undo_header_create(): Zero out the TRX_UNDO_TRX_NO field. Strictly speaking, this will change the semantics of the MLOG_UNDO_HDR_CREATE record, but it should not do any harm to overwrite a potentially garbage field with zeroes. Note: This fix will only help future upgrades straight from MariaDB Server 10.2 or MySQL 5.6 or earlier. If such an upgrade has already been made, then an earlier server startup could have fast-forwarded the transaction ID sequence to a large value. If this large value cannot be represented in 48 bits (the size of the DB_TRX_ID column in clustered index records), then various strange things can happen. --- storage/innobase/trx/trx0undo.cc | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/storage/innobase/trx/trx0undo.cc b/storage/innobase/trx/trx0undo.cc index 928411b1163..6edc16e48be 100644 --- a/storage/innobase/trx/trx0undo.cc +++ b/storage/innobase/trx/trx0undo.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2014, 2021, MariaDB Corporation. +Copyright (c) 2014, 2022, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -639,6 +639,7 @@ trx_undo_header_create( mach_write_to_2(log_hdr + TRX_UNDO_NEEDS_PURGE, 1); mach_write_to_8(log_hdr + TRX_UNDO_TRX_ID, trx_id); + memset(log_hdr + TRX_UNDO_TRX_NO, 0, 8); mach_write_to_2(log_hdr + TRX_UNDO_LOG_START, new_free); mach_write_to_1(log_hdr + TRX_UNDO_XID_EXISTS, FALSE); @@ -1112,12 +1113,19 @@ corrupted: sql_print_error("InnoDB: unsupported undo header state %u", state); goto corrupted; + case TRX_UNDO_CACHED: + if (UNIV_UNLIKELY(type != 0)) { + /* This undo page was not updated by MariaDB + 10.3 or later. The TRX_UNDO_TRX_NO field may + contain garbage. */ + break; + } + goto read_trx_no; case TRX_UNDO_TO_PURGE: if (UNIV_UNLIKELY(type == 1)) { goto corrupted_type; } - /* fall through */ - case TRX_UNDO_CACHED: + read_trx_no: trx_id_t id = mach_read_from_8(TRX_UNDO_TRX_NO + undo_header); if (id >> 48) { sql_print_error("InnoDB: corrupted TRX_NO %llx", id); -- cgit v1.2.1