1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
|
/*-------------------------------------------------------------------------
*
* storage.c
* code to create and destroy physical storage for relations
*
* Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* src/backend/catalog/storage.c
*
* NOTES
* Some of this code used to be in storage/smgr/smgr.c, and the
* function names still reflect that.
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/visibilitymap.h"
#include "access/xact.h"
#include "access/xlog.h"
#include "access/xloginsert.h"
#include "access/xlogutils.h"
#include "catalog/storage.h"
#include "catalog/storage_xlog.h"
#include "miscadmin.h"
#include "storage/freespace.h"
#include "storage/smgr.h"
#include "utils/memutils.h"
#include "utils/rel.h"
/*
 * We keep a list of all relations (represented as RelFileNode values)
 * that have been created or deleted in the current transaction.  When
 * a relation is created, we create the physical file immediately, but
 * remember it so that we can delete the file again if the current
 * transaction is aborted.  Conversely, a deletion request is NOT
 * executed immediately, but is just entered in the list.  When and if
 * the transaction commits, we can delete the physical file.
 *
 * To handle subtransactions, every entry is marked with its transaction
 * nesting level.  At subtransaction commit, we reassign the subtransaction's
 * entries to the parent nesting level.  At subtransaction abort, we can
 * immediately execute the abort-time actions for all entries of the current
 * nesting level.
 *
 * NOTE: the list entries are allocated in TopMemoryContext to be sure they
 * won't disappear prematurely.  It'd probably be OK to keep them in
 * TopTransactionContext, but I'm being paranoid.  As a consequence, every
 * entry has to be pfree'd explicitly when it is unlinked from the list.
 */
typedef struct PendingRelDelete
{
RelFileNode relnode; /* physical relation that may need to be deleted */
BackendId backend; /* owning backend; InvalidBackendId if not a temp rel */
bool atCommit; /* true = delete at commit; false = delete at abort */
int nestLevel; /* xact nesting level at which the request was made */
struct PendingRelDelete *next; /* next entry in the singly-linked list, or NULL */
} PendingRelDelete;
/* New entries are pushed at the head, so the list is newest-first. */
static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */
/*
 * RelationCreateStorage
 *		Create the on-disk main fork for a relation.
 *
 * Only the main fork is created here; any other forks are created lazily
 * by whichever module needs them.
 *
 * rnode identifies the physical relation.  relpersistence must be one of
 * RELPERSISTENCE_TEMP, RELPERSISTENCE_UNLOGGED or RELPERSISTENCE_PERMANENT;
 * any other value raises an ERROR.
 *
 * The operation is transactional: for permanent relations the creation is
 * WAL-logged, and in every case the new storage is remembered so that it
 * gets destroyed again if the transaction later aborts.
 *
 * Returns the smgr handle that was opened for the new storage.
 */
SMgrRelation
RelationCreateStorage(RelFileNode rnode, char relpersistence)
{
    SMgrRelation reln;
    PendingRelDelete *entry;
    BackendId   owner = InvalidBackendId;   /* only temp rels have an owner */
    bool        wal_logged = false;         /* only permanent rels are logged */

    /* Work out who owns the storage and whether its creation is logged. */
    switch (relpersistence)
    {
        case RELPERSISTENCE_PERMANENT:
            wal_logged = true;
            break;
        case RELPERSISTENCE_UNLOGGED:
            /* the defaults above already describe this case */
            break;
        case RELPERSISTENCE_TEMP:
            owner = BackendIdForTempRelations();
            break;
        default:
            elog(ERROR, "invalid relpersistence: %c", relpersistence);
            return NULL;        /* placate compiler */
    }

    /* Create the file right away ... */
    reln = smgropen(rnode, owner);
    smgrcreate(reln, MAIN_FORKNUM, false);

    if (wal_logged)
        log_smgrcreate(&reln->smgr_rnode.node, MAIN_FORKNUM);

    /* ... and push a delete-at-abort request onto the pending list. */
    entry = (PendingRelDelete *)
        MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
    entry->relnode = rnode;
    entry->backend = owner;
    entry->atCommit = false;    /* i.e. delete only if we abort */
    entry->nestLevel = GetCurrentTransactionNestLevel();
    entry->next = pendingDeletes;
    pendingDeletes = entry;

    return reln;
}
/*
 * log_smgrcreate
 *		Insert an XLOG_SMGR_CREATE record into WAL.
 *
 * rnode points at the RelFileNode of the relation and forkNum names the
 * fork that was created; both are copied into the record payload.
 */
void
log_smgrcreate(const RelFileNode *rnode, ForkNumber forkNum)
{
    xl_smgr_create rec;

    /* Describe the newly created file in the record payload. */
    rec.forkNum = forkNum;
    rec.rnode = *rnode;

    /* Assemble and insert the record; its LSN is not needed here. */
    XLogBeginInsert();
    XLogRegisterData((char *) &rec, sizeof(rec));
    XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE | XLR_SPECIAL_REL_UPDATE);
}
/*
 * RelationDropStorage
 *		Arrange for a relation's physical storage to be unlinked when the
 *		transaction commits.
 *
 * Nothing is removed here; a delete-at-commit request is merely queued.
 *
 * If the relation was also created in this transaction, it ends up on the
 * pending-delete list twice: once with atCommit true (from here) and once
 * with atCommit false (from its creation).  The storage is therefore
 * removed at end of transaction whichever way it ends, and
 * smgrDoPendingDeletes simply skips the entry that does not apply, so no
 * error results.  Removing the older entry and unlinking the file at once
 * would also work, but the simple approach is kept for now.
 */
void
RelationDropStorage(Relation rel)
{
    PendingRelDelete *entry;

    /* Build the delete-at-commit request ... */
    entry = (PendingRelDelete *)
        MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
    entry->nestLevel = GetCurrentTransactionNestLevel();
    entry->atCommit = true;     /* i.e. delete only if we commit */
    entry->backend = rel->rd_backend;
    entry->relnode = rel->rd_node;

    /* ... and push it onto the head of the pending list. */
    entry->next = pendingDeletes;
    pendingDeletes = entry;

    /*
     * Finally drop the relation's smgr reference.
     * NOTE(review): presumably so that no handle to the doomed storage
     * lingers in the relcache entry -- confirm against RelationCloseSmgr.
     */
    RelationCloseSmgr(rel);
}
/*
 * RelationPreserveStorage
 *		Withdraw a previously queued deletion request for a relation.
 *
 * This exists because relation mapping changes are committed separately
 * from the commit of the whole transaction, so the transaction can still
 * abort after the mapping update is done.  A new physical relation that
 * has been installed in the map would still be queued for delete-on-abort,
 * and deleting it would get us into trouble.  The relation mapper avoids
 * that by asking us, as part of its own commit, not to delete such
 * relations after all.
 *
 * The same mechanism is used to reuse an old build of an index during
 * ALTER TABLE; in that case it is the delete-at-commit entry that gets
 * removed.
 *
 * Every list entry matching both rnode and atCommit is removed.  If there
 * is no such entry, nothing happens.
 */
void
RelationPreserveStorage(RelFileNode rnode, bool atCommit)
{
    /* "link" always addresses the pointer that leads to the current entry */
    PendingRelDelete **link = &pendingDeletes;

    while (*link != NULL)
    {
        PendingRelDelete *entry = *link;

        if (entry->atCommit != atCommit ||
            !RelFileNodeEquals(rnode, entry->relnode))
        {
            /* unrelated entry: leave it alone and step past it */
            link = &entry->next;
            continue;
        }

        /* splice the entry out of the list, then release it */
        *link = entry->next;
        pfree(entry);
        /* "link" stays put: it now leads to the successor */
    }
}
/*
 * RelationTruncate
 *		Physically truncate a relation to the specified number of blocks.
 *
 * rel is the relation to truncate and nblocks is the new length of its
 * main fork, in blocks.  The free space map and visibility map forks, if
 * they exist, are truncated to match.
 *
 * This includes getting rid of any buffers for the blocks that are to be
 * dropped.
 *
 * The sequence below is deliberate: collect the forks to truncate, write
 * (and possibly flush) the WAL record, and only then truncate the files.
 */
void
RelationTruncate(Relation rel, BlockNumber nblocks)
{
bool fsm;
bool vm;
bool need_fsm_vacuum = false;
/*
 * Parallel arrays: forks[i] is to be truncated to blocks[i] blocks.  At
 * most three entries (main, FSM, VM) are stored below.
 * NOTE(review): assumes MAX_FORKNUM >= 3 -- confirm against ForkNumber.
 */
ForkNumber forks[MAX_FORKNUM];
BlockNumber blocks[MAX_FORKNUM];
int nforks = 0;
/* Open it at the smgr level if not already done */
RelationOpenSmgr(rel);
/*
 * Make sure smgr_targblock etc aren't pointing somewhere past new end
 */
rel->rd_smgr->smgr_targblock = InvalidBlockNumber;
rel->rd_smgr->smgr_fsm_nblocks = InvalidBlockNumber;
rel->rd_smgr->smgr_vm_nblocks = InvalidBlockNumber;
/* Prepare for truncation of MAIN fork of the relation */
forks[nforks] = MAIN_FORKNUM;
blocks[nforks] = nblocks;
nforks++;
/*
 * Prepare for truncation of the FSM if it exists.  The fork is only added
 * to the list when the prepare step hands back a valid block number.
 */
fsm = smgrexists(rel->rd_smgr, FSM_FORKNUM);
if (fsm)
{
blocks[nforks] = FreeSpaceMapPrepareTruncateRel(rel, nblocks);
if (BlockNumberIsValid(blocks[nforks]))
{
forks[nforks] = FSM_FORKNUM;
nforks++;
need_fsm_vacuum = true;
}
}
/*
 * Prepare for truncation of the visibility map too if it exists, again
 * only listing the fork when a valid block number comes back.
 */
vm = smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM);
if (vm)
{
blocks[nforks] = visibilitymap_prepare_truncate(rel, nblocks);
if (BlockNumberIsValid(blocks[nforks]))
{
forks[nforks] = VISIBILITYMAP_FORKNUM;
nforks++;
}
}
/*
 * We WAL-log the truncation before actually truncating, which means
 * trouble if the truncation fails.  If we then crash, the WAL replay
 * likely isn't going to succeed in the truncation either, and cause a
 * PANIC.  It's tempting to put a critical section here, but that cure
 * would be worse than the disease.  It would turn a usually harmless
 * failure to truncate, that might spell trouble at WAL replay, into a
 * certain PANIC.
 */
if (RelationNeedsWAL(rel))
{
/*
 * Make an XLOG entry reporting the file truncation.  The record always
 * carries SMGR_TRUNCATE_ALL, whichever forks were found above.
 */
XLogRecPtr lsn;
xl_smgr_truncate xlrec;
xlrec.blkno = nblocks;
xlrec.rnode = rel->rd_node;
xlrec.flags = SMGR_TRUNCATE_ALL;
XLogBeginInsert();
XLogRegisterData((char *) &xlrec, sizeof(xlrec));
lsn = XLogInsert(RM_SMGR_ID,
XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE);
/*
 * Flush, because otherwise the truncation of the main relation might
 * hit the disk before the WAL record, and the truncation of the FSM
 * or visibility map.  If we crashed during that window, we'd be left
 * with a truncated heap, but the FSM or visibility map would still
 * contain entries for the non-existent heap pages.
 *
 * The flush is skipped when neither auxiliary fork exists.
 */
if (fsm || vm)
XLogFlush(lsn);
}
/* Do the real work to truncate relation forks */
smgrtruncate(rel->rd_smgr, forks, nforks, blocks);
/*
 * Update upper-level FSM pages to account for the truncation.
 * This is important because the just-truncated pages were likely
 * marked as all-free, and would be preferentially selected.
 *
 * Only done when the FSM fork was actually included in the truncation.
 */
if (need_fsm_vacuum)
FreeSpaceMapVacuumRange(rel, nblocks, InvalidBlockNumber);
}
/*
 * RelationCopyStorage
 *		Copy one fork of a relation, block by block.
 *
 * Every block of fork forkNum is read from src, verified, and appended to
 * dst.  relpersistence is the persistence code of the relation and
 * controls WAL-logging and the final fsync.
 *
 * The copy bypasses shared buffers, so there must be no dirty data for the
 * source in shared buffers.  If there might be, the caller has to flush it
 * first, e.g. with FlushRelationBuffers(rel).
 *
 * An ERROR is raised if a source page fails verification.
 */
void
RelationCopyStorage(SMgrRelation src, SMgrRelation dst,
                    ForkNumber forkNum, char relpersistence)
{
    PGAlignedBlock block;
    Page        pg = (Page) block.data;
    bool        is_initfork;
    bool        must_persist;
    bool        wal_log;
    BlockNumber total;
    BlockNumber cur;

    /*
     * In many respects the init fork of an unlogged relation has to be
     * handled like a normal relation: changes to it must be WAL-logged and
     * it must be synced to disk.
     */
    is_initfork = (forkNum == INIT_FORKNUM &&
                   relpersistence == RELPERSISTENCE_UNLOGGED);

    /* Data that has to survive a crash: permanent rels and init forks. */
    must_persist = (relpersistence == RELPERSISTENCE_PERMANENT || is_initfork);

    /*
     * The copied pages go into WAL iff WAL archiving/streaming is enabled
     * and the data has to survive a crash.
     */
    wal_log = XLogIsNeeded() && must_persist;

    total = smgrnblocks(src, forkNum);

    cur = 0;
    while (cur < total)
    {
        /* Bail out promptly if a cancel arrived while copying. */
        CHECK_FOR_INTERRUPTS();

        smgrread(src, forkNum, cur, block.data);

        if (!PageIsVerified(pg, cur))
            ereport(ERROR,
                    (errcode(ERRCODE_DATA_CORRUPTED),
                     errmsg("invalid page in block %u of relation %s",
                            cur,
                            relpathbackend(src->smgr_rnode.node,
                                           src->smgr_rnode.backend,
                                           forkNum))));

        /*
         * Log the copied page.  Since the kind of page is unknown here, the
         * whole page has to be logged, unused space included.
         */
        if (wal_log)
            log_newpage(&dst->smgr_rnode.node, forkNum, cur, pg, false);

        /* Checksum is set after WAL-logging, right before the write. */
        PageSetChecksumInplace(pg, cur);

        /*
         * Write the page out.  skipFsync = true is passed because smgr need
         * not schedule an fsync for this write; one is issued below.
         */
        smgrextend(dst, forkNum, cur, block.data, true);

        cur++;
    }

    /*
     * Data that must survive a crash has to be fsync'd before commit.  (For
     * a temp or unlogged rel it doesn't matter, as the data is gone after a
     * crash anyway.)
     *
     * That is obviously required when the copy was not WAL-logged.  Less
     * obviously, it is required even when it was: because the copy happens
     * outside shared buffers, a CHECKPOINT taking place during the copy
     * cannot flush the already-written data (it doesn't even know the new
     * rel exists).  A later crash would replay WAL from that checkpoint and
     * so would skip our earlier records; unless the pages are fsync'd here,
     * they might not be on disk when the crash occurs.
     */
    if (must_persist)
        smgrimmedsync(dst, forkNum);
}
/*
 * smgrDoPendingDeletes() -- Carry out pending relation deletes at end of
 * a transaction.
 *
 * isCommit says whether the transaction is committing (true) or aborting
 * (false).  Every list entry belonging to the current nesting level or a
 * deeper one is removed from the list; those whose atCommit flag equals
 * isCommit additionally have their storage unlinked.  Entries of outer
 * nesting levels are left untouched.
 *
 * This is also run when a subtransaction aborts, so that a failed
 * subtransaction is cleaned up immediately.
 *
 * Note: we may be asked to remove a relation that has no physical storage
 * in any fork.  In particular, we may be cleaning up an old temporary
 * relation whose physical storage RemovePgTempFiles has already recovered.
 */
void
smgrDoPendingDeletes(bool isCommit)
{
    int         curLevel = GetCurrentTransactionNestLevel();
    PendingRelDelete **link = &pendingDeletes;
    SMgrRelation *victims = NULL;   /* growable array of rels to unlink */
    int         nvictims = 0;
    int         capacity = 0;
    int         idx;

    while (*link != NULL)
    {
        PendingRelDelete *entry = *link;

        if (entry->nestLevel < curLevel)
        {
            /* belongs to an outer level; not to be processed yet */
            link = &entry->next;
            continue;
        }

        /* take the entry off the list first, so a failure won't retry it */
        *link = entry->next;

        /* collect the relation if this entry applies to the outcome */
        if (entry->atCommit == isCommit)
        {
            SMgrRelation reln = smgropen(entry->relnode, entry->backend);

            /* create the array on first use, double it when full */
            if (nvictims >= capacity)
            {
                if (victims == NULL)
                {
                    capacity = 8;
                    victims = palloc(sizeof(SMgrRelation) * capacity);
                }
                else
                {
                    capacity *= 2;
                    victims = repalloc(victims,
                                       sizeof(SMgrRelation) * capacity);
                }
            }

            victims[nvictims++] = reln;
        }

        /* the entry lives in TopMemoryContext, so free it by hand */
        pfree(entry);
        /* "link" stays put: it now leads to the successor */
    }

    if (nvictims == 0)
        return;

    /* unlink everything collected in one call, then close the handles */
    smgrdounlinkall(victims, nvictims, false);

    for (idx = 0; idx < nvictims; idx++)
        smgrclose(victims[idx]);

    pfree(victims);
}
/*
 * smgrGetPendingDeletes() -- Get a list of non-temp relations to be deleted.
 *
 * forCommit selects which requests are wanted: delete-at-commit ones (true)
 * or delete-at-abort ones (false).
 *
 * The return value is the number of relations scheduled for deletion.
 * *ptr is set to a freshly palloc'd array holding their RelFileNodes, or
 * to NULL if there are none.
 *
 * Temporary relations are left out.  That is fine because the list is only
 * used where they don't matter: either we are writing the two-phase state
 * file (and a transaction that touched temp tables cannot be prepared), or
 * we are writing to xlog (and all temporary files get zapped on restart
 * anyway, so redo has no need to do it as well).
 *
 * Requests made by upper-level transactions are not included either.
 */
int
smgrGetPendingDeletes(bool forCommit, RelFileNode **ptr)
{
    int         curLevel = GetCurrentTransactionNestLevel();
    int         count = 0;
    int         filled = 0;
    RelFileNode *out;
    PendingRelDelete *entry;

    /* First pass: count the qualifying entries. */
    for (entry = pendingDeletes; entry != NULL; entry = entry->next)
    {
        if (entry->nestLevel < curLevel)
            continue;           /* request of an upper-level transaction */
        if (entry->atCommit != forCommit)
            continue;           /* wrong kind of request */
        if (entry->backend != InvalidBackendId)
            continue;           /* temporary relation */
        count++;
    }

    if (count == 0)
    {
        *ptr = NULL;
        return 0;
    }

    out = (RelFileNode *) palloc(count * sizeof(RelFileNode));
    *ptr = out;

    /* Second pass: copy out the same entries, in list order. */
    for (entry = pendingDeletes; entry != NULL; entry = entry->next)
    {
        if (entry->nestLevel < curLevel)
            continue;
        if (entry->atCommit != forCommit)
            continue;
        if (entry->backend != InvalidBackendId)
            continue;
        out[filled++] = entry->relnode;
    }

    return count;
}
/*
 * PostPrepare_smgr -- Clean up after a successful PREPARE
 *
 * All that needs doing is to discard the in-memory state about pending
 * relation deletes.  It has all been recorded in the 2PC state file, so it
 * is no longer smgr's job to worry about it.  No storage is touched.
 */
void
PostPrepare_smgr(void)
{
    /* Pop entries off the head until the list is empty. */
    while (pendingDeletes != NULL)
    {
        PendingRelDelete *head = pendingDeletes;

        pendingDeletes = head->next;

        /* the entry lives in TopMemoryContext, so free it by hand */
        pfree(head);
    }
}
/*
 * AtSubCommit_smgr() --- Take care of subtransaction commit.
 *
 * Every pending-delete entry that belongs to the committing subtransaction
 * (or to anything nested deeper) is handed over to the parent transaction
 * by lowering its nesting level.  Nothing is deleted here.
 */
void
AtSubCommit_smgr(void)
{
    int         subLevel = GetCurrentTransactionNestLevel();
    int         parentLevel = subLevel - 1;
    PendingRelDelete *entry = pendingDeletes;

    while (entry != NULL)
    {
        if (entry->nestLevel >= subLevel)
            entry->nestLevel = parentLevel;

        entry = entry->next;
    }
}
/*
 * AtSubAbort_smgr() --- Take care of subtransaction abort.
 *
 * Delete created relations and forget about deleted relations.
 * We can execute these operations immediately because we know this
 * subtransaction will not commit.
 *
 * All of the work is delegated to smgrDoPendingDeletes, which only looks at
 * entries of the current nesting level or deeper.
 */
void
AtSubAbort_smgr(void)
{
/* isCommit = false: act on the delete-at-abort entries, drop the others */
smgrDoPendingDeletes(false);
}
/*
 * smgr_redo -- replay one smgr WAL record.
 *
 * Two record types are handled: XLOG_SMGR_CREATE (create a relation fork)
 * and XLOG_SMGR_TRUNCATE (truncate the forks selected by the record's
 * flags).  Any other op code is reported with a PANIC.
 */
void
smgr_redo(XLogReaderState *record)
{
/* taken from the record's EndRecPtr; used for the XLogFlush call below */
XLogRecPtr lsn = record->EndRecPtr;
/* op code of the record, with the XLR_INFO_MASK bits cleared */
uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
/* Backup blocks are not used in smgr records */
Assert(!XLogRecHasAnyBlockRefs(record));
if (info == XLOG_SMGR_CREATE)
{
xl_smgr_create *xlrec = (xl_smgr_create *) XLogRecGetData(record);
SMgrRelation reln;
/*
 * Create the fork named in the record.
 * NOTE(review): the final "true" is presumably smgrcreate's is-redo
 * flag -- confirm against smgr.h.
 */
reln = smgropen(xlrec->rnode, InvalidBackendId);
smgrcreate(reln, xlrec->forkNum, true);
}
else if (info == XLOG_SMGR_TRUNCATE)
{
xl_smgr_truncate *xlrec = (xl_smgr_truncate *) XLogRecGetData(record);
SMgrRelation reln;
Relation rel;
/*
 * Parallel arrays: forks[i] is to be truncated to blocks[i] blocks.  At
 * most three entries (main, FSM, VM) are stored below.
 * NOTE(review): assumes MAX_FORKNUM >= 3 -- confirm against ForkNumber.
 */
ForkNumber forks[MAX_FORKNUM];
BlockNumber blocks[MAX_FORKNUM];
int nforks = 0;
bool need_fsm_vacuum = false;
reln = smgropen(xlrec->rnode, InvalidBackendId);
/*
 * Forcibly create relation if it doesn't exist (which suggests that
 * it was dropped somewhere later in the WAL sequence).  As in
 * XLogReadBufferForRedo, we prefer to recreate the rel and replay the
 * log as best we can until the drop is seen.
 */
smgrcreate(reln, MAIN_FORKNUM, true);
/*
 * Before we perform the truncation, update minimum recovery point to
 * cover this WAL record.  Once the relation is truncated, there's no
 * going back.  The buffer manager enforces the WAL-first rule for
 * normal updates to relation files, so that the minimum recovery
 * point is always updated before the corresponding change in the data
 * file is flushed to disk.  We have to do the same manually here.
 *
 * Doing this before the truncation means that if the truncation fails
 * for some reason, you cannot start up the system even after restart,
 * until you fix the underlying situation so that the truncation will
 * succeed.  Alternatively, we could update the minimum recovery point
 * after truncation, but that would leave a small window where the
 * WAL-first rule could be violated.
 */
XLogFlush(lsn);
/* Prepare for truncation of MAIN fork, if the record asks for it */
if ((xlrec->flags & SMGR_TRUNCATE_HEAP) != 0)
{
forks[nforks] = MAIN_FORKNUM;
blocks[nforks] = xlrec->blkno;
nforks++;
/* Also tell xlogutils.c about it */
XLogTruncateRelation(xlrec->rnode, MAIN_FORKNUM, xlrec->blkno);
}
/*
 * Prepare for truncation of FSM and VM too.  The prepare and vacuum
 * routines take a Relation, so a fake relcache entry is built for the
 * duration; it is freed again at the end of this branch.
 */
rel = CreateFakeRelcacheEntry(xlrec->rnode);
/* FSM: only if requested by the flags and the fork actually exists */
if ((xlrec->flags & SMGR_TRUNCATE_FSM) != 0 &&
smgrexists(reln, FSM_FORKNUM))
{
blocks[nforks] = FreeSpaceMapPrepareTruncateRel(rel, xlrec->blkno);
if (BlockNumberIsValid(blocks[nforks]))
{
forks[nforks] = FSM_FORKNUM;
nforks++;
need_fsm_vacuum = true;
}
}
/* VM: likewise, only if requested and the fork actually exists */
if ((xlrec->flags & SMGR_TRUNCATE_VM) != 0 &&
smgrexists(reln, VISIBILITYMAP_FORKNUM))
{
blocks[nforks] = visibilitymap_prepare_truncate(rel, xlrec->blkno);
if (BlockNumberIsValid(blocks[nforks]))
{
forks[nforks] = VISIBILITYMAP_FORKNUM;
nforks++;
}
}
/*
 * Do the real work to truncate relation forks.  Depending on the flags
 * and on which forks exist, there may be nothing to truncate at all.
 */
if (nforks > 0)
smgrtruncate(reln, forks, nforks, blocks);
/*
 * Update upper-level FSM pages to account for the truncation.
 * This is important because the just-truncated pages were likely
 * marked as all-free, and would be preferentially selected.
 */
if (need_fsm_vacuum)
FreeSpaceMapVacuumRange(rel, xlrec->blkno,
InvalidBlockNumber);
FreeFakeRelcacheEntry(rel);
}
else
elog(PANIC, "smgr_redo: unknown op code %u", info);
}
|