src/test/recovery/t/031_recovery_conflict.pl


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382

# Copyright (c) 2021-2023, PostgreSQL Global Development Group

# Test that connections to a hot standby are correctly canceled when a
# recovery conflict is detected Also, test that statistics in
# pg_stat_database_conflicts are populated correctly

use strict;
use warnings;
use PostgreSQL::Test::Cluster;
use PostgreSQL::Test::Utils;
use Test::More;


# Set up nodes
my $node_primary = PostgreSQL::Test::Cluster->new('primary');
$node_primary->init(allows_streaming => 1);

my $tablespace1 = "test_recovery_conflict_tblspc";

$node_primary->append_conf(
	'postgresql.conf', qq[
allow_in_place_tablespaces = on
log_temp_files = 0

# for deadlock test
max_prepared_transactions = 10

# wait some to test the wait paths as well, but not long for obvious reasons
max_standby_streaming_delay = 50ms

temp_tablespaces = $tablespace1
# Some of the recovery conflict logging code only gets exercised after
# deadlock_timeout. The test doesn't rely on that additional output, but it's
# nice to get some minimal coverage of that code.
log_recovery_conflict_waits = on
deadlock_timeout = 10ms
]);
$node_primary->start;

my $backup_name = 'my_backup';

$node_primary->safe_psql('postgres',
	qq[CREATE TABLESPACE $tablespace1 LOCATION '']);

$node_primary->backup($backup_name);
my $node_standby = PostgreSQL::Test::Cluster->new('standby');
$node_standby->init_from_backup($node_primary, $backup_name,
	has_streaming => 1);

$node_standby->start;

my $test_db = "test_db";

# use a new database, to trigger database recovery conflict
$node_primary->safe_psql('postgres', "CREATE DATABASE $test_db");

# test schema / data
my $table1 = "test_recovery_conflict_table1";
my $table2 = "test_recovery_conflict_table2";
$node_primary->safe_psql(
	$test_db, qq[
CREATE TABLE ${table1}(a int, b int);
INSERT INTO $table1 SELECT i % 3, 0 FROM generate_series(1,20) i;
CREATE TABLE ${table2}(a int, b int);
]);
my $primary_lsn = $node_primary->lsn('flush');
$node_primary->wait_for_catchup($node_standby, 'replay', $primary_lsn);


# a longrunning psql that we can use to trigger conflicts
my $psql_timeout = IPC::Run::timer($PostgreSQL::Test::Utils::timeout_default);
my %psql_standby = ('stdin' => '', 'stdout' => '');
$psql_standby{run} =
  $node_standby->background_psql($test_db, \$psql_standby{stdin},
	\$psql_standby{stdout},
	$psql_timeout);
$psql_standby{stdout} = '';

my $expected_conflicts = 0;


## RECOVERY CONFLICT 1: Buffer pin conflict
my $sect = "buffer pin conflict";
$expected_conflicts++;

# Aborted INSERT on primary that will be cleaned up by vacuum. Has to be old
# enough so that there's not a snapshot conflict before the buffer pin
# conflict.

$node_primary->safe_psql(
	$test_db,
	qq[
	BEGIN;
	INSERT INTO $table1 VALUES (1,0);
	ROLLBACK;
	-- ensure flush, rollback doesn't do so
	BEGIN; LOCK $table1; COMMIT;
	]);

$primary_lsn = $node_primary->lsn('flush');
$node_primary->wait_for_catchup($node_standby, 'replay', $primary_lsn);

my $cursor1 = "test_recovery_conflict_cursor";

# DECLARE and use a cursor on standby, causing buffer with the only block of
# the relation to be pinned on the standby
$psql_standby{stdin} .= qq[
        BEGIN;
        DECLARE $cursor1 CURSOR FOR SELECT b FROM $table1;
        FETCH FORWARD FROM $cursor1;
        ];
# FETCH FORWARD should have returned a 0 since all values of b in the table
# are 0
ok(pump_until_standby(qr/^0$/m),
	"$sect: cursor with conflicting pin established");

# to check the log starting now for recovery conflict messages
my $log_location = -s $node_standby->logfile;

# VACUUM on the primary
$node_primary->safe_psql($test_db, qq[VACUUM $table1;]);

# Wait for catchup. Existing connection will be terminated before replay is
# finished, so waiting for catchup ensures that there is no race between
# encountering the recovery conflict which causes the disconnect and checking
# the logfile for the terminated connection.
$primary_lsn = $node_primary->lsn('flush');
$node_primary->wait_for_catchup($node_standby, 'replay', $primary_lsn);

check_conflict_log("User was holding shared buffer pin for too long");
reconnect_and_clear();
check_conflict_stat("bufferpin");


## RECOVERY CONFLICT 2: Snapshot conflict
$sect = "snapshot conflict";
$expected_conflicts++;

$node_primary->safe_psql($test_db,
	qq[INSERT INTO $table1 SELECT i, 0 FROM generate_series(1,20) i]);
$primary_lsn = $node_primary->lsn('flush');
$node_primary->wait_for_catchup($node_standby, 'replay', $primary_lsn);

# DECLARE and FETCH from cursor on the standby
$psql_standby{stdin} .= qq[
        BEGIN;
        DECLARE $cursor1 CURSOR FOR SELECT b FROM $table1;
        FETCH FORWARD FROM $cursor1;
        ];
ok( pump_until(
		$psql_standby{run},     $psql_timeout,
		\$psql_standby{stdout}, qr/^0$/m,),
	"$sect: cursor with conflicting snapshot established");

# Do some HOT updates
$node_primary->safe_psql($test_db,
	qq[UPDATE $table1 SET a = a + 1 WHERE a > 2;]);

# VACUUM, pruning those dead tuples
$node_primary->safe_psql($test_db, qq[VACUUM $table1;]);

# Wait for attempted replay of PRUNE records
$primary_lsn = $node_primary->lsn('flush');
$node_primary->wait_for_catchup($node_standby, 'replay', $primary_lsn);

check_conflict_log(
	"User query might have needed to see row versions that must be removed");
reconnect_and_clear();
check_conflict_stat("snapshot");


## RECOVERY CONFLICT 3: Lock conflict
$sect = "lock conflict";
$expected_conflicts++;

# acquire lock to conflict with
$psql_standby{stdin} .= qq[
        BEGIN;
        LOCK TABLE $table1 IN ACCESS SHARE MODE;
        SELECT 1;
        ];
ok(pump_until_standby(qr/^1$/m), "$sect: conflicting lock acquired");

# DROP TABLE containing block which standby has in a pinned buffer
$node_primary->safe_psql($test_db, qq[DROP TABLE $table1;]);

$primary_lsn = $node_primary->lsn('flush');
$node_primary->wait_for_catchup($node_standby, 'replay', $primary_lsn);

check_conflict_log("User was holding a relation lock for too long");
reconnect_and_clear();
check_conflict_stat("lock");


## RECOVERY CONFLICT 4: Tablespace conflict
$sect = "tablespace conflict";
$expected_conflicts++;

# DECLARE a cursor for a query which, with sufficiently low work_mem, will
# spill tuples into temp files in the temporary tablespace created during
# setup.
$psql_standby{stdin} .= qq[
        BEGIN;
        SET work_mem = '64kB';
        DECLARE $cursor1 CURSOR FOR
          SELECT count(*) FROM generate_series(1,6000);
        FETCH FORWARD FROM $cursor1;
        ];
ok(pump_until_standby(qr/^6000$/m),
	"$sect: cursor with conflicting temp file established");

# Drop the tablespace currently containing spill files for the query on the
# standby
$node_primary->safe_psql($test_db, qq[DROP TABLESPACE $tablespace1;]);

$primary_lsn = $node_primary->lsn('flush');
$node_primary->wait_for_catchup($node_standby, 'replay', $primary_lsn);

check_conflict_log(
	"User was or might have been using tablespace that must be dropped");
reconnect_and_clear();
check_conflict_stat("tablespace");


## RECOVERY CONFLICT 5: Deadlock
$sect = "startup deadlock";
$expected_conflicts++;

# Want to test recovery deadlock conflicts, not buffer pin conflicts. Without
# changing max_standby_streaming_delay it'd be timing dependent what we hit
# first
$node_standby->adjust_conf(
	'postgresql.conf',
	'max_standby_streaming_delay',
	"${PostgreSQL::Test::Utils::timeout_default}s");
$node_standby->restart();
reconnect_and_clear();

# Generate a few dead rows, to later be cleaned up by vacuum. Then acquire a
# lock on another relation in a prepared xact, so it's held continuously by
# the startup process. The standby psql will block acquiring that lock while
# holding a pin that vacuum needs, triggering the deadlock.
$node_primary->safe_psql(
	$test_db,
	qq[
CREATE TABLE $table1(a int, b int);
INSERT INTO $table1 VALUES (1);
BEGIN;
INSERT INTO $table1(a) SELECT generate_series(1, 100) i;
ROLLBACK;
BEGIN;
LOCK TABLE $table2;
PREPARE TRANSACTION 'lock';
INSERT INTO $table1(a) VALUES (170);
SELECT txid_current();
]);

$primary_lsn = $node_primary->lsn('flush');
$node_primary->wait_for_catchup($node_standby, 'replay', $primary_lsn);

$psql_standby{stdin} .= qq[
    BEGIN;
    -- hold pin
    DECLARE $cursor1 CURSOR FOR SELECT a FROM $table1;
    FETCH FORWARD FROM $cursor1;
    -- wait for lock held by prepared transaction
	SELECT * FROM $table2;
    ];
ok( pump_until(
		$psql_standby{run},     $psql_timeout,
		\$psql_standby{stdout}, qr/^1$/m,),
	"$sect: cursor holding conflicting pin, also waiting for lock, established"
);

# just to make sure we're waiting for lock already
ok( $node_standby->poll_query_until(
		'postgres', qq[
SELECT 'waiting' FROM pg_locks WHERE locktype = 'relation' AND NOT granted;
], 'waiting'),
	"$sect: lock acquisition is waiting");

# VACUUM will prune away rows, causing a buffer pin conflict, while standby
# psql is waiting on lock
$node_primary->safe_psql($test_db, qq[VACUUM $table1;]);
$primary_lsn = $node_primary->lsn('flush');
$node_primary->wait_for_catchup($node_standby, 'replay', $primary_lsn);

check_conflict_log("User transaction caused buffer deadlock with recovery.");
reconnect_and_clear();
check_conflict_stat("deadlock");

# clean up for next tests
$node_primary->safe_psql($test_db, qq[ROLLBACK PREPARED 'lock';]);
$node_standby->adjust_conf('postgresql.conf', 'max_standby_streaming_delay',
	'50ms');
$node_standby->restart();
reconnect_and_clear();


# Check that expected number of conflicts show in pg_stat_database. Needs to
# be tested before database is dropped, for obvious reasons.
is( $node_standby->safe_psql(
		$test_db,
		qq[SELECT conflicts FROM pg_stat_database WHERE datname='$test_db';]),
	$expected_conflicts,
	qq[$expected_conflicts recovery conflicts shown in pg_stat_database]);


## RECOVERY CONFLICT 6: Database conflict
$sect = "database conflict";

$node_primary->safe_psql('postgres', qq[DROP DATABASE $test_db;]);

$primary_lsn = $node_primary->lsn('flush');
$node_primary->wait_for_catchup($node_standby, 'replay', $primary_lsn);

check_conflict_log("User was connected to a database that must be dropped");


# explicitly shut down psql instances gracefully - to avoid hangs or worse on
# windows
$psql_standby{stdin} .= "\\q\n";
$psql_standby{run}->finish;

$node_standby->stop();
$node_primary->stop();


done_testing();


sub pump_until_standby
{
	my $match = shift;

	return pump_until($psql_standby{run}, $psql_timeout,
		\$psql_standby{stdout}, $match);
}

sub reconnect_and_clear
{
	# If psql isn't dead already, tell it to quit as \q, when already dead,
	# causes IPC::Run to unhelpfully error out with "ack Broken pipe:".
	$psql_standby{run}->pump_nb();
	if ($psql_standby{run}->pumpable())
	{
		$psql_standby{stdin} .= "\\q\n";
	}
	$psql_standby{run}->finish;

	# restart
	$psql_standby{run}->run();
	$psql_standby{stdin}  = '';
	$psql_standby{stdout} = '';

	# Run query to ensure connection has finished re-establishing
	$psql_standby{stdin} .= qq[SELECT 1;\n];
	die unless pump_until_standby(qr/^1$/m);
	$psql_standby{stdout} = '';
}

sub check_conflict_log
{
	my $message          = shift;
	my $old_log_location = $log_location;

	$log_location = $node_standby->wait_for_log(qr/$message/, $log_location);

	cmp_ok($log_location, '>', $old_log_location,
		"$sect: logfile contains terminated connection due to recovery conflict"
	);
}

sub check_conflict_stat
{
	my $conflict_type = shift;
	my $count         = $node_standby->safe_psql($test_db,
		qq[SELECT confl_$conflict_type FROM pg_stat_database_conflicts WHERE datname='$test_db';]
	);

	is($count, 1, "$sect: stats show conflict on standby");
}