1 files changed, 473 insertions, 1 deletions
diff --git a/mysql-test/suite/rpl/t/rpl_parallel.test b/mysql-test/suite/rpl/t/rpl_parallel.test
index d4b99d4b0f7..feac32b1454 100644
--- a/mysql-test/suite/rpl/t/rpl_parallel.test
+++ b/mysql-test/suite/rpl/t/rpl_parallel.test
@@ -12,9 +12,20 @@ SET @old_parallel_threads=@@GLOBAL.slave_parallel_threads;
 SET GLOBAL slave_parallel_threads=10;
 --source include/stop_slave.inc
 SET GLOBAL slave_parallel_threads=10;
+
+# Check that we do not spawn any worker threads when no slave is running.
+SELECT IF(COUNT(*) < 10, "OK", CONCAT("Found too many system user processes: ", COUNT(*))) FROM information_schema.processlist WHERE user = "system user";
+
 CHANGE MASTER TO master_use_gtid=slave_pos;
 --source include/start_slave.inc
 
+# Check that worker threads get spawned when slave starts.
+SELECT IF(COUNT(*) >= 10, "OK", CONCAT("Found too few system user processes: ", COUNT(*))) FROM information_schema.processlist WHERE user = "system user";
+# ... and that worker threads get removed when slave stops.
+--source include/stop_slave.inc
+SELECT IF(COUNT(*) < 10, "OK", CONCAT("Found too many system user processes: ", COUNT(*))) FROM information_schema.processlist WHERE user = "system user";
+--source include/start_slave.inc
+SELECT IF(COUNT(*) >= 10, "OK", CONCAT("Found too few system user processes: ", COUNT(*))) FROM information_schema.processlist WHERE user = "system user";
 
 --echo *** Test long-running query in domain 1 can run in parallel with short queries in domain 0 ***
 
@@ -1843,6 +1854,467 @@ SET GLOBAL slave_parallel_threads=10;
 --source include/start_slave.inc
 
 
+--echo *** MDEV-7335: Potential parallel slave deadlock with specific binlog corruption ***
+
+--connection server_2
+--source include/stop_slave.inc
+SET GLOBAL slave_parallel_threads=1;
+SET @old_dbug= @@GLOBAL.debug_dbug;
+SET GLOBAL debug_dbug="+d,slave_discard_xid_for_gtid_0_x_1000";
+
+--connection server_1
+INSERT INTO t2 VALUES (101);
+INSERT INTO t2 VALUES (102);
+INSERT INTO t2 VALUES (103);
+INSERT INTO t2 VALUES (104);
+INSERT INTO t2 VALUES (105);
+# Inject a partial event group (missing XID at the end). The bug was that such
+# partial group was not handled appropriately, leading to server deadlock.
+SET gtid_seq_no=1000;
+INSERT INTO t2 VALUES (106);
+INSERT INTO t2 VALUES (107);
+INSERT INTO t2 VALUES (108);
+INSERT INTO t2 VALUES (109);
+INSERT INTO t2 VALUES (110);
+INSERT INTO t2 VALUES (111);
+INSERT INTO t2 VALUES (112);
+INSERT INTO t2 VALUES (113);
+INSERT INTO t2 VALUES (114);
+INSERT INTO t2 VALUES (115);
+INSERT INTO t2 VALUES (116);
+INSERT INTO t2 VALUES (117);
+INSERT INTO t2 VALUES (118);
+INSERT INTO t2 VALUES (119);
+INSERT INTO t2 VALUES (120);
+INSERT INTO t2 VALUES (121);
+INSERT INTO t2 VALUES (122);
+INSERT INTO t2 VALUES (123);
+INSERT INTO t2 VALUES (124);
+INSERT INTO t2 VALUES (125);
+INSERT INTO t2 VALUES (126);
+INSERT INTO t2 VALUES (127);
+INSERT INTO t2 VALUES (128);
+INSERT INTO t2 VALUES (129);
+INSERT INTO t2 VALUES (130);
+--source include/save_master_gtid.inc
+
+--connection server_2
+--source include/start_slave.inc
+--source include/sync_with_master_gtid.inc
+# The partial event group (a=106) should be rolled back and thus missing.
+SELECT * FROM t2 WHERE a >= 100 ORDER BY a;
+
+--source include/stop_slave.inc
+SET GLOBAL debug_dbug=@old_dbug;
+SET GLOBAL slave_parallel_threads=10;
+--source include/start_slave.inc
+
+
+--echo *** MDEV-7847: "Slave worker thread retried transaction 10 time(s) in vain, giving up", followed by replication hanging ***
+--echo *** MDEV-7882: Excessive transaction retry in parallel replication ***
+
+--connection server_1
+CREATE TABLE t7 (a int PRIMARY KEY, b INT) ENGINE=InnoDB;
+CREATE TABLE t8 (a int PRIMARY KEY, b INT) ENGINE=InnoDB;
+--save_master_pos
+
+--connection server_2
+--sync_with_master
+--source include/stop_slave.inc
+SET GLOBAL slave_parallel_threads=40;
+SELECT @old_retries:=@@GLOBAL.slave_transaction_retries;
+SET GLOBAL slave_transaction_retries= 5;
+
+
+# Using dbug error injection, we artificially create event groups with a lot of
+# conflicting transactions in each event group. The bugs were originally seen
+# "in the wild" with transactions that did not conflict on the master, and only
+# conflicted very rarely on the slave (maybe some edge case with InnoDB btree
+# page splits or something like that). The event groups here loosely reflect
+# the structure of the original failure's group commits.
+
+
+--connection server_1
+INSERT INTO t7 VALUES (1,1), (2,2), (3,3), (4,4), (5,5);
+SET @old_dbug= @@SESSION.debug_dbug;
+SET @commit_id= 42;
+SET SESSION debug_dbug="+d,binlog_force_commit_id";
+INSERT INTO t8 VALUES (1,1);
+INSERT INTO t8 VALUES (2,2);
+INSERT INTO t8 VALUES (3,3);
+INSERT INTO t8 VALUES (4,4);
+INSERT INTO t8 VALUES (5,5);
+INSERT INTO t8 VALUES (6,6);
+INSERT INTO t8 VALUES (7,7);
+INSERT INTO t8 VALUES (8,8);
+
+UPDATE t7 SET b=9 WHERE a=3;
+UPDATE t7 SET b=10 WHERE a=3;
+UPDATE t7 SET b=11 WHERE a=3;
+
+INSERT INTO t8 VALUES (12,12);
+INSERT INTO t8 VALUES (13,13);
+
+UPDATE t7 SET b=14 WHERE a=3;
+UPDATE t7 SET b=15 WHERE a=3;
+
+INSERT INTO t8 VALUES (16,16);
+
+UPDATE t7 SET b=17 WHERE a=3;
+
+INSERT INTO t8 VALUES (18,18);
+INSERT INTO t8 VALUES (19,19);
+
+UPDATE t7 SET b=20 WHERE a=3;
+
+INSERT INTO t8 VALUES (21,21);
+
+UPDATE t7 SET b=22 WHERE a=3;
+
+INSERT INTO t8 VALUES (23,24);
+INSERT INTO t8 VALUES (24,24);
+
+UPDATE t7 SET b=25 WHERE a=3;
+
+INSERT INTO t8 VALUES (26,26);
+
+UPDATE t7 SET b=27 WHERE a=3;
+
+BEGIN;
+INSERT INTO t8 VALUES (28,28);
+INSERT INTO t8 VALUES (29,28), (30,28);
+INSERT INTO t8 VALUES (31,28);
+INSERT INTO t8 VALUES (32,28);
+INSERT INTO t8 VALUES (33,28);
+INSERT INTO t8 VALUES (34,28);
+INSERT INTO t8 VALUES (35,28);
+INSERT INTO t8 VALUES (36,28);
+INSERT INTO t8 VALUES (37,28);
+INSERT INTO t8 VALUES (38,28);
+INSERT INTO t8 VALUES (39,28);
+INSERT INTO t8 VALUES (40,28);
+INSERT INTO t8 VALUES (41,28);
+INSERT INTO t8 VALUES (42,28);
+COMMIT;
+
+
+SET @commit_id=43;
+INSERT INTO t8 VALUES (43,43);
+INSERT INTO t8 VALUES (44,44);
+
+UPDATE t7 SET b=45 WHERE a=3;
+
+INSERT INTO t8 VALUES (46,46);
+INSERT INTO t8 VALUES (47,47);
+
+UPDATE t7 SET b=48 WHERE a=3;
+
+INSERT INTO t8 VALUES (49,49);
+INSERT INTO t8 VALUES (50,50);
+
+
+SET @commit_id=44;
+INSERT INTO t8 VALUES (51,51);
+INSERT INTO t8 VALUES (52,52);
+
+UPDATE t7 SET b=53 WHERE a=3;
+
+INSERT INTO t8 VALUES (54,54);
+INSERT INTO t8 VALUES (55,55);
+
+UPDATE t7 SET b=56 WHERE a=3;
+
+INSERT INTO t8 VALUES (57,57);
+
+UPDATE t7 SET b=58 WHERE a=3;
+
+INSERT INTO t8 VALUES (58,58);
+INSERT INTO t8 VALUES (59,59);
+INSERT INTO t8 VALUES (60,60);
+INSERT INTO t8 VALUES (61,61);
+
+UPDATE t7 SET b=62 WHERE a=3;
+
+INSERT INTO t8 VALUES (63,63);
+INSERT INTO t8 VALUES (64,64);
+INSERT INTO t8 VALUES (65,65);
+INSERT INTO t8 VALUES (66,66);
+
+UPDATE t7 SET b=67 WHERE a=3;
+
+INSERT INTO t8 VALUES (68,68);
+
+UPDATE t7 SET b=69 WHERE a=3;
+UPDATE t7 SET b=70 WHERE a=3;
+UPDATE t7 SET b=71 WHERE a=3;
+
+INSERT INTO t8 VALUES (72,72);
+
+UPDATE t7 SET b=73 WHERE a=3;
+UPDATE t7 SET b=74 WHERE a=3;
+UPDATE t7 SET b=75 WHERE a=3;
+UPDATE t7 SET b=76 WHERE a=3;
+
+INSERT INTO t8 VALUES (77,77);
+
+UPDATE t7 SET b=78 WHERE a=3;
+
+INSERT INTO t8 VALUES (79,79);
+
+UPDATE t7 SET b=80 WHERE a=3;
+
+INSERT INTO t8 VALUES (81,81);
+
+UPDATE t7 SET b=82 WHERE a=3;
+
+INSERT INTO t8 VALUES (83,83);
+
+UPDATE t7 SET b=84 WHERE a=3;
+
+
+SET @commit_id=45;
+INSERT INTO t8 VALUES (85,85);
+UPDATE t7 SET b=86 WHERE a=3;
+INSERT INTO t8 VALUES (87,87);
+
+
+SET @commit_id=46;
+INSERT INTO t8 VALUES (88,88);
+INSERT INTO t8 VALUES (89,89);
+INSERT INTO t8 VALUES (90,90);
+
+SET SESSION debug_dbug=@old_dbug;
+
+INSERT INTO t8 VALUES (91,91);
+INSERT INTO t8 VALUES (92,92);
+INSERT INTO t8 VALUES (93,93);
+INSERT INTO t8 VALUES (94,94);
+INSERT INTO t8 VALUES (95,95);
+INSERT INTO t8 VALUES (96,96);
+INSERT INTO t8 VALUES (97,97);
+INSERT INTO t8 VALUES (98,98);
+INSERT INTO t8 VALUES (99,99);
+
+
+SELECT * FROM t7 ORDER BY a;
+SELECT * FROM t8 ORDER BY a;
+--source include/save_master_gtid.inc
+
+
+--connection server_2
+--source include/start_slave.inc
+--source include/sync_with_master_gtid.inc
+SELECT * FROM t7 ORDER BY a;
+SELECT * FROM t8 ORDER BY a;
+
+--source include/stop_slave.inc
+SET GLOBAL slave_transaction_retries= @old_retries;
+SET GLOBAL slave_parallel_threads=10;
+--source include/start_slave.inc
+
+
+--echo *** MDEV-7888: ANALYZE TABLE does wakeup_subsequent_commits(), causing wrong binlog order and parallel replication hang ***
+
+--connection server_2
+--source include/stop_slave.inc
+SET @old_dbug= @@GLOBAL.debug_dbug;
+SET GLOBAL debug_dbug= '+d,inject_analyze_table_sleep';
+
+--connection server_1
+# Inject two group commits. The bug was that ANALYZE TABLE would call
+# wakeup_subsequent_commits() too early, allowing the following transaction
+# in the same group to run ahead and binlog and free the GCO. Then we get
+# wrong binlog order and later access freed GCO, which causes lost wakeup
+# of following GCO and thus replication hang.
+# We injected a small sleep in ANALYZE to make the race easier to hit (this
+# can only cause false negatives in versions with the bug, not false positives,
+# so sleep is ok here. And it's in general not possible to trigger reliably
+# the race with debug_sync, since the bugfix makes the race impossible).
+
+SET @old_dbug= @@SESSION.debug_dbug;
+SET SESSION debug_dbug="+d,binlog_force_commit_id";
+
+# Group commit with cid=10000, two event groups.
+SET @commit_id= 10000;
+ANALYZE TABLE t2;
+INSERT INTO t3 VALUES (120, 0);
+
+# Group commit with cid=10001, one event group.
+SET @commit_id= 10001;
+INSERT INTO t3 VALUES (121, 0);
+
+SET SESSION debug_dbug=@old_dbug;
+
+SELECT * FROM t3 WHERE a >= 120 ORDER BY a;
+--source include/save_master_gtid.inc
+
+--connection server_2
+--source include/start_slave.inc
+--source include/sync_with_master_gtid.inc
+
+SELECT * FROM t3 WHERE a >= 120 ORDER BY a;
+
+--source include/stop_slave.inc
+SET GLOBAL debug_dbug= @old_dbug;
+--source include/start_slave.inc
+
+
+--echo *** MDEV-7929: record_gtid() for non-transactional event group calls wakeup_subsequent_commits() too early, causing slave hang. ***
+
+--connection server_2
+--source include/stop_slave.inc
+SET @old_dbug= @@GLOBAL.debug_dbug;
+SET GLOBAL debug_dbug= '+d,inject_record_gtid_serverid_100_sleep';
+
+--connection server_1
+# Inject two group commits. The bug was that record_gtid for a
+# non-transactional event group would commit its own transaction, which would
+# cause ha_commit_trans() to call wakeup_subsequent_commits() too early. This
+# in turn lead to access to freed group_commit_orderer object, losing a wakeup
+# and causing slave threads to hang.
+# We inject a small sleep in the corresponding record_gtid() to make the race
+# easier to hit.
+
+SET @old_dbug= @@SESSION.debug_dbug;
+SET SESSION debug_dbug="+d,binlog_force_commit_id";
+
+# Group commit with cid=10010, two event groups.
+SET @old_server_id= @@SESSION.server_id;
+SET SESSION server_id= 100;
+SET @commit_id= 10010;
+ALTER TABLE t1 COMMENT "Hulubulu!";
+SET SESSION server_id= @old_server_id;
+INSERT INTO t3 VALUES (130, 0);
+
+# Group commit with cid=10011, one event group.
+SET @commit_id= 10011;
+INSERT INTO t3 VALUES (131, 0);
+
+SET SESSION debug_dbug=@old_dbug;
+
+SELECT * FROM t3 WHERE a >= 130 ORDER BY a;
+--source include/save_master_gtid.inc
+
+--connection server_2
+--source include/start_slave.inc
+--source include/sync_with_master_gtid.inc
+
+SELECT * FROM t3 WHERE a >= 130 ORDER BY a;
+
+--source include/stop_slave.inc
+SET GLOBAL debug_dbug= @old_dbug;
+--source include/start_slave.inc
+
+
+--echo *** MDEV-8031: Parallel replication stops on "connection killed" error (probably incorrectly handled deadlock kill) ***
+
+--connection server_1
+INSERT INTO t3 VALUES (201,0), (202,0);
+--source include/save_master_gtid.inc
+
+--connection server_2
+--source include/sync_with_master_gtid.inc
+--source include/stop_slave.inc
+SET @old_dbug= @@GLOBAL.debug_dbug;
+SET GLOBAL debug_dbug= '+d,inject_mdev8031';
+
+--connection server_1
+# We artificially create a situation that hopefully resembles the original
+# bug which was only seen "in the wild", and only once.
+# Setup a fake group commit with lots of conflicts that will lead to deadloc
+# kill. The slave DBUG injection causes the slave to be deadlock killed at
+# a particular point during the retry, and then later do a small sleep at
+# another critical point where the prior transaction then has a chance to
+# complete. Finally an extra KILL check catches an unhandled, lingering
+# deadlock kill. So rather artificial, but at least it exercises the
+# relevant code paths.
+SET @old_dbug= @@SESSION.debug_dbug;
+SET SESSION debug_dbug="+d,binlog_force_commit_id";
+
+SET @commit_id= 10200;
+INSERT INTO t3 VALUES (203, 1);
+INSERT INTO t3 VALUES (204, 1);
+INSERT INTO t3 VALUES (205, 1);
+UPDATE t3 SET b=b+1 WHERE a=201;
+UPDATE t3 SET b=b+1 WHERE a=201;
+UPDATE t3 SET b=b+1 WHERE a=201;
+UPDATE t3 SET b=b+1 WHERE a=202;
+UPDATE t3 SET b=b+1 WHERE a=202;
+UPDATE t3 SET b=b+1 WHERE a=202;
+UPDATE t3 SET b=b+1 WHERE a=202;
+UPDATE t3 SET b=b+1 WHERE a=203;
+UPDATE t3 SET b=b+1 WHERE a=203;
+UPDATE t3 SET b=b+1 WHERE a=204;
+UPDATE t3 SET b=b+1 WHERE a=204;
+UPDATE t3 SET b=b+1 WHERE a=204;
+UPDATE t3 SET b=b+1 WHERE a=203;
+UPDATE t3 SET b=b+1 WHERE a=205;
+UPDATE t3 SET b=b+1 WHERE a=205;
+SET SESSION debug_dbug=@old_dbug;
+
+SELECT * FROM t3 WHERE a>=200 ORDER BY a;
+--source include/save_master_gtid.inc
+
+--connection server_2
+--source include/start_slave.inc
+--source include/sync_with_master_gtid.inc
+
+SELECT * FROM t3 WHERE a>=200 ORDER BY a;
+--source include/stop_slave.inc
+SET GLOBAL debug_dbug= @old_dbug;
+--source include/start_slave.inc
+
+
+--echo *** Check getting deadlock killed inside open_binlog() during retry. ***
+
+--connection server_2
+--source include/stop_slave.inc
+SET @old_dbug= @@GLOBAL.debug_dbug;
+SET GLOBAL debug_dbug= '+d,inject_retry_event_group_open_binlog_kill';
+SET @old_max= @@GLOBAL.max_relay_log_size;
+SET GLOBAL max_relay_log_size= 4096;
+
+--connection server_1
+SET @old_dbug= @@SESSION.debug_dbug;
+SET SESSION debug_dbug="+d,binlog_force_commit_id";
+
+--let $large= `SELECT REPEAT("*", 8192)`
+SET @commit_id= 10210;
+--echo Omit long queries that cause relaylog rotations and transaction retries...
+--disable_query_log
+eval UPDATE t3 SET b=b+1 WHERE a=201 /* $large */;
+eval UPDATE t3 SET b=b+1 WHERE a=201 /* $large */;
+eval UPDATE t3 SET b=b+1 WHERE a=201 /* $large */;
+eval UPDATE t3 SET b=b+1 WHERE a=202 /* $large */;
+eval UPDATE t3 SET b=b+1 WHERE a=202 /* $large */;
+eval UPDATE t3 SET b=b+1 WHERE a=202 /* $large */;
+eval UPDATE t3 SET b=b+1 WHERE a=202 /* $large */;
+eval UPDATE t3 SET b=b+1 WHERE a=203 /* $large */;
+eval UPDATE t3 SET b=b+1 WHERE a=203 /* $large */;
+eval UPDATE t3 SET b=b+1 WHERE a=204 /* $large */;
+eval UPDATE t3 SET b=b+1 WHERE a=204 /* $large */;
+eval UPDATE t3 SET b=b+1 WHERE a=204 /* $large */;
+eval UPDATE t3 SET b=b+1 WHERE a=203 /* $large */;
+eval UPDATE t3 SET b=b+1 WHERE a=205 /* $large */;
+eval UPDATE t3 SET b=b+1 WHERE a=205 /* $large */;
+--enable_query_log
+SET SESSION debug_dbug=@old_dbug;
+
+SELECT * FROM t3 WHERE a>=200 ORDER BY a;
+--source include/save_master_gtid.inc
+
+--connection server_2
+--source include/start_slave.inc
+--source include/sync_with_master_gtid.inc
+
+SELECT * FROM t3 WHERE a>=200 ORDER BY a;
+--source include/stop_slave.inc
+SET GLOBAL debug_dbug= @old_debg;
+SET GLOBAL max_relay_log_size= @old_max;
+--source include/start_slave.inc
+
+
+
 # Clean up.
 --connection server_2
 --source include/stop_slave.inc
@@ -1852,7 +2324,7 @@ SET DEBUG_SYNC= 'RESET';
 
 --connection server_1
 DROP function foo;
-DROP TABLE t1,t2,t3,t4,t5,t6;
+DROP TABLE t1,t2,t3,t4,t5,t6,t7,t8;
 SET DEBUG_SYNC= 'RESET';
 
 --source include/rpl_end.inc