1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
|
/*-------------------------------------------------------------------------
*
* tableam.h
* POSTGRES table access method definitions.
*
*
* Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/access/tableam.h
*
* NOTES
* See tableam.sgml for higher level documentation.
*
*-------------------------------------------------------------------------
*/
#ifndef TABLEAM_H
#define TABLEAM_H
#include "access/relscan.h"
#include "access/sdir.h"
#include "utils/guc.h"
#include "utils/rel.h"
#include "utils/snapshot.h"
/* String literal naming the default table access method ("heap"). */
#define DEFAULT_TABLE_ACCESS_METHOD "heap"
/*
 * GUCs
 *
 * Both variables are only declared here; they are defined elsewhere.
 * NOTE(review): default_table_access_method presumably starts out as
 * DEFAULT_TABLE_ACCESS_METHOD - confirm against the GUC definitions.
 */
extern char *default_table_access_method;
extern bool synchronize_seqscans;
/*
 * Forward declarations of struct tags, so the callback prototypes below can
 * take pointers to these structs without this header having to include the
 * headers that define them.
 *
 * NOTE(review): IndexBuildCallback is defined further down as a
 * function-pointer typedef, and no `struct IndexBuildCallback` is used in the
 * visible part of this header, so that tag declaration looks unused - confirm
 * before removing it.
 */
struct BulkInsertStateData;
struct IndexInfo;
struct IndexBuildCallback;
struct SampleScanState;
struct TBMIterateResult;
struct VacuumParams;
struct ValidateIndexState;
/*
 * ScanOptions -- bit flags making up the `flags` argument of the scan_begin
 * callback.
 *
 * Every member occupies its own bit, so members can be OR'ed together.
 */
typedef enum ScanOptions
{
    /* Scan type: one of the SO_TYPE_* bits may be specified. */
    SO_TYPE_SEQSCAN = 0x01,
    SO_TYPE_BITMAPSCAN = 0x02,
    SO_TYPE_SAMPLESCAN = 0x04,
    SO_TYPE_ANALYZE = 0x08,

    /* Scan behaviour: several of the SO_ALLOW_* bits may be specified. */

    /* allow (set) or disallow (clear) use of an access strategy */
    SO_ALLOW_STRAT = 0x10,
    /* report the scan location to the syncscan logic? */
    SO_ALLOW_SYNC = 0x20,
    /* verify visibility a page at a time? */
    SO_ALLOW_PAGEMODE = 0x40,

    /* unregister the snapshot when the scan ends? */
    SO_TEMP_SNAPSHOT = 0x80
} ScanOptions;
/*
 * TM_Result -- outcome codes of table_{update,delete,lock_tuple}, also used
 * by the visibility routines inside table AMs.
 */
typedef enum TM_Result
{
    /*
     * The action succeeded, i.e. the update/delete was performed or the lock
     * was acquired.
     */
    TM_Ok = 0,

    /* The affected tuple wasn't visible to the relevant snapshot. */
    TM_Invisible = 1,

    /* The affected tuple was already modified by the calling backend. */
    TM_SelfModified = 2,

    /*
     * Another transaction updated the affected tuple. This also covers the
     * case where the tuple was moved to another partition.
     */
    TM_Updated = 3,

    /* Another transaction deleted the affected tuple. */
    TM_Deleted = 4,

    /*
     * Another session is currently modifying the affected tuple. Only
     * returned when table_(update/delete/lock_tuple) are instructed not to
     * wait.
     */
    TM_BeingModified = 5,

    /*
     * The lock couldn't be acquired and the action was skipped. Only used by
     * lock_tuple.
     */
    TM_WouldBlock = 6
} TM_Result;
/*
 * When table_update, table_delete, or table_lock_tuple fail because the target
 * tuple is already outdated, they fill in this struct to provide information
 * to the caller about what happened.
 *
 * ctid is the target's ctid link: it is the same as the target's TID if the
 * target was deleted, or the location of the replacement tuple if the target
 * was updated.
 *
 * xmax is the outdating transaction's XID. If the caller wants to visit the
 * replacement tuple, it must check that this matches before believing the
 * replacement is really a match.
 *
 * cmax is the outdating command's CID, but only when the failure code is
 * TM_SelfModified (i.e., something in the current transaction outdated the
 * tuple); otherwise cmax is zero. (We make this restriction because
 * HeapTupleHeaderGetCmax doesn't work for tuples outdated in other
 * transactions.)
 *
 * NOTE(review): traversed is not described anywhere in this header.
 * Presumably it reports whether an update chain was followed while locking
 * (cf. TUPLE_LOCK_FLAG_FIND_LAST_VERSION) - confirm against the AM
 * implementations and document it here.
 */
typedef struct TM_FailureData
{
/* ctid link of the target tuple, see above */
ItemPointerData ctid;
/* XID of the outdating transaction */
TransactionId xmax;
/* CID of the outdating command; only meaningful for TM_SelfModified */
CommandId cmax;
/* undocumented, see NOTE(review) above */
bool traversed;
} TM_FailureData;
/*
 * Bits accepted in the "options" argument of table_insert. Each option has
 * its own bit, so options can be OR'ed together.
 */
#define TABLE_INSERT_SKIP_WAL (1 << 0)
#define TABLE_INSERT_SKIP_FSM (1 << 1)
#define TABLE_INSERT_FROZEN (1 << 2)
#define TABLE_INSERT_NO_LOGICAL (1 << 3)

/*
 * Bits accepted in the "flags" argument of table_lock_tuple.
 *
 * TUPLE_LOCK_FLAG_LOCK_UPDATE_IN_PROGRESS: follow tuples whose update is in
 * progress if lock modes don't conflict.
 *
 * TUPLE_LOCK_FLAG_FIND_LAST_VERSION: follow the update chain and lock the
 * latest version of the tuple.
 */
#define TUPLE_LOCK_FLAG_LOCK_UPDATE_IN_PROGRESS (1 << 0)
#define TUPLE_LOCK_FLAG_FIND_LAST_VERSION (1 << 1)
/*
 * Typedef for callback function for table_index_build_scan.
 *
 * The same type is taken by the index_build_range_scan callback of
 * TableAmRoutine, together with a separate `void *callback_state` argument.
 *
 * NOTE(review): parameter meanings are not spelled out in this header.
 * Presumably `values`/`isnull` carry the index column values computed for
 * `htup`, `tupleIsAlive` tells whether that tuple is considered live, and
 * `state` is the caller's callback_state passed through unchanged - confirm
 * against the callers.
 */
typedef void (*IndexBuildCallback) (Relation index,
HeapTuple htup,
Datum *values,
bool *isnull,
bool tupleIsAlive,
void *state);
/*
 * API struct for a table AM. Note this must be allocated in a
 * server-lifetime manner, typically as a static const struct, which then gets
 * returned by FormData_pg_am.amhandler.
 *
 * In most cases it's not appropriate to call the callbacks directly, use the
 * table_* wrapper functions instead.
 *
 * GetTableAmRoutine() asserts that required callbacks are filled in, remember
 * to update when adding a callback.
 */
typedef struct TableAmRoutine
{
    /* this must be set to T_TableAmRoutine */
    NodeTag     type;


    /* ------------------------------------------------------------------------
     * Slot related callbacks.
     * ------------------------------------------------------------------------
     */

    /*
     * Return slot implementation suitable for storing a tuple of this AM.
     */
    const TupleTableSlotOps *(*slot_callbacks) (Relation rel);


    /* ------------------------------------------------------------------------
     * Table scan callbacks.
     * ------------------------------------------------------------------------
     */

    /*
     * Start a scan of `rel`. The callback has to return a TableScanDesc,
     * which will typically be embedded in a larger, AM specific, struct.
     *
     * If nkeys != 0, the results need to be filtered by those scan keys.
     *
     * pscan, if not NULL, will have already been initialized with
     * parallelscan_initialize(), and has to be for the same relation. Will
     * only be set coming from table_beginscan_parallel().
     *
     * `flags` is a bitmask indicating the type of scan (ScanOptions's
     * SO_TYPE_*, currently only one may be specified), options controlling
     * the scan's behaviour (ScanOptions's SO_ALLOW_*, several may be
     * specified, an AM may ignore unsupported ones) and whether the snapshot
     * needs to be deallocated at scan_end (ScanOptions's SO_TEMP_SNAPSHOT).
     */
    TableScanDesc (*scan_begin) (Relation rel,
                                 Snapshot snapshot,
                                 int nkeys, struct ScanKeyData *key,
                                 ParallelTableScanDesc pscan,
                                 uint32 flags);

    /*
     * Release resources and deallocate scan. If TableScanDesc.temp_snap,
     * TableScanDesc.rs_snapshot needs to be unregistered.
     *
     * NOTE(review): scan_begin's `flags` describe this condition as
     * SO_TEMP_SNAPSHOT; the `temp_snap` field named above is not visible in
     * this header and may be stale - confirm against the TableScanDesc
     * definition.
     */
    void        (*scan_end) (TableScanDesc scan);

    /*
     * Restart relation scan. If set_params is set to true, allow_{strat,
     * sync, pagemode} (see scan_begin) changes should be taken into account.
     */
    void        (*scan_rescan) (TableScanDesc scan, struct ScanKeyData *key,
                                bool set_params, bool allow_strat,
                                bool allow_sync, bool allow_pagemode);

    /*
     * Return next tuple from `scan`, store in slot.
     */
    bool        (*scan_getnextslot) (TableScanDesc scan,
                                     ScanDirection direction,
                                     TupleTableSlot *slot);


    /* ------------------------------------------------------------------------
     * Parallel table scan related functions.
     * ------------------------------------------------------------------------
     */

    /*
     * Estimate the size of shared memory needed for a parallel scan of this
     * relation. The snapshot does not need to be accounted for.
     */
    Size        (*parallelscan_estimate) (Relation rel);

    /*
     * Initialize ParallelTableScanDesc for a parallel scan of this relation.
     * `pscan` will be sized according to parallelscan_estimate() for the same
     * relation.
     *
     * NOTE(review): the meaning of the returned Size is not stated here -
     * confirm against the callers and document it.
     */
    Size        (*parallelscan_initialize) (Relation rel,
                                            ParallelTableScanDesc pscan);

    /*
     * Reinitialize `pscan` for a new scan. `rel` will be the same relation as
     * when `pscan` was initialized by parallelscan_initialize.
     */
    void        (*parallelscan_reinitialize) (Relation rel,
                                              ParallelTableScanDesc pscan);


    /* ------------------------------------------------------------------------
     * Index Scan Callbacks
     * ------------------------------------------------------------------------
     */

    /*
     * Prepare to fetch tuples from the relation, as needed when fetching
     * tuples for an index scan. The callback has to return an
     * IndexFetchTableData, which the AM will typically embed in a larger
     * structure with additional information.
     *
     * Tuples for an index scan can then be fetched via index_fetch_tuple.
     */
    struct IndexFetchTableData *(*index_fetch_begin) (Relation rel);

    /*
     * Reset index fetch. Typically this will release cross index fetch
     * resources held in IndexFetchTableData.
     */
    void        (*index_fetch_reset) (struct IndexFetchTableData *data);

    /*
     * Release resources and deallocate index fetch.
     */
    void        (*index_fetch_end) (struct IndexFetchTableData *data);

    /*
     * Fetch tuple at `tid` into `slot`, after doing a visibility test
     * according to `snapshot`. If a tuple was found and passed the visibility
     * test, return true, false otherwise.
     *
     * Note that AMs that do not necessarily update indexes when indexed
     * columns do not change, need to return the current/correct version of
     * the tuple that is visible to the snapshot, even if the tid points to an
     * older version of the tuple.
     *
     * *call_again is false on the first call to index_fetch_tuple for a tid.
     * If there potentially is another tuple matching the tid, *call_again
     * needs to be set to true by index_fetch_tuple, signalling to the caller
     * that index_fetch_tuple should be called again for the same tid.
     *
     * *all_dead, if all_dead is not NULL, should be set to true by
     * index_fetch_tuple iff it is guaranteed that no backend needs to see
     * that tuple. Index AMs can use that to avoid returning that tid in
     * future searches.
     */
    bool        (*index_fetch_tuple) (struct IndexFetchTableData *scan,
                                      ItemPointer tid,
                                      Snapshot snapshot,
                                      TupleTableSlot *slot,
                                      bool *call_again, bool *all_dead);


    /* ------------------------------------------------------------------------
     * Callbacks for non-modifying operations on individual tuples
     * ------------------------------------------------------------------------
     */

    /*
     * Fetch tuple at `tid` into `slot`, after doing a visibility test
     * according to `snapshot`. If a tuple was found and passed the visibility
     * test, returns true, false otherwise.
     */
    bool        (*tuple_fetch_row_version) (Relation rel,
                                            ItemPointer tid,
                                            Snapshot snapshot,
                                            TupleTableSlot *slot);

    /*
     * Is tid valid for a scan of this relation?
     */
    bool        (*tuple_tid_valid) (TableScanDesc scan,
                                    ItemPointer tid);

    /*
     * Return the latest version of the tuple at `tid`, by updating `tid` to
     * point at the newest version.
     */
    void        (*tuple_get_latest_tid) (TableScanDesc scan,
                                         ItemPointer tid);

    /*
     * Does the tuple in `slot` satisfy `snapshot`? The slot needs to be of
     * the appropriate type for the AM.
     */
    bool        (*tuple_satisfies_snapshot) (Relation rel,
                                             TupleTableSlot *slot,
                                             Snapshot snapshot);

    /* see table_compute_xid_horizon_for_tuples() */
    TransactionId (*compute_xid_horizon_for_tuples) (Relation rel,
                                                     ItemPointerData *items,
                                                     int nitems);


    /* ------------------------------------------------------------------------
     * Manipulations of physical tuples.
     * ------------------------------------------------------------------------
     */

    /* see table_insert() for reference about parameters */
    void        (*tuple_insert) (Relation rel, TupleTableSlot *slot,
                                 CommandId cid, int options,
                                 struct BulkInsertStateData *bistate);

    /* see table_insert_speculative() for reference about parameters */
    void        (*tuple_insert_speculative) (Relation rel,
                                             TupleTableSlot *slot,
                                             CommandId cid,
                                             int options,
                                             struct BulkInsertStateData *bistate,
                                             uint32 specToken);

    /* see table_complete_speculative() for reference about parameters */
    void        (*tuple_complete_speculative) (Relation rel,
                                               TupleTableSlot *slot,
                                               uint32 specToken,
                                               bool succeeded);

    /* see table_multi_insert() for reference about parameters */
    void        (*multi_insert) (Relation rel, TupleTableSlot **slots, int nslots,
                                 CommandId cid, int options, struct BulkInsertStateData *bistate);

    /* see table_delete() for reference about parameters */
    TM_Result   (*tuple_delete) (Relation rel,
                                 ItemPointer tid,
                                 CommandId cid,
                                 Snapshot snapshot,
                                 Snapshot crosscheck,
                                 bool wait,
                                 TM_FailureData *tmfd,
                                 bool changingPart);

    /* see table_update() for reference about parameters */
    TM_Result   (*tuple_update) (Relation rel,
                                 ItemPointer otid,
                                 TupleTableSlot *slot,
                                 CommandId cid,
                                 Snapshot snapshot,
                                 Snapshot crosscheck,
                                 bool wait,
                                 TM_FailureData *tmfd,
                                 LockTupleMode *lockmode,
                                 bool *update_indexes);

    /* see table_lock_tuple() for reference about parameters */
    TM_Result   (*tuple_lock) (Relation rel,
                               ItemPointer tid,
                               Snapshot snapshot,
                               TupleTableSlot *slot,
                               CommandId cid,
                               LockTupleMode mode,
                               LockWaitPolicy wait_policy,
                               uint8 flags,
                               TM_FailureData *tmfd);

    /*
     * Perform operations necessary to complete insertions made via
     * tuple_insert and multi_insert with a BulkInsertState specified. This
     * may for example be used to flush the relation, when the
     * TABLE_INSERT_SKIP_WAL option was used.
     *
     * Typically callers of tuple_insert and multi_insert will just pass all
     * the flags that apply to them, and each AM has to decide which of them
     * make sense for it, and then only take actions in finish_bulk_insert for
     * those flags, and ignore others.
     *
     * Optional callback.
     */
    void        (*finish_bulk_insert) (Relation rel, int options);


    /* ------------------------------------------------------------------------
     * DDL related functionality.
     * ------------------------------------------------------------------------
     */

    /*
     * This callback needs to create a new relation filenode for `rel`, with
     * appropriate durability behaviour for `persistence`.
     *
     * Note that only the subset of the relcache filled by
     * RelationBuildLocalRelation() can be relied upon and that the relation's
     * catalog entries will either not yet exist (new relation), or will
     * still reference the old relfilenode.
     *
     * As output *freezeXid, *minmulti must be set to the values appropriate
     * for pg_class.{relfrozenxid, relminmxid}. For AMs that don't need those
     * fields to be filled they can be set to InvalidTransactionId and
     * InvalidMultiXactId, respectively.
     *
     * See also table_relation_set_new_filenode().
     */
    void        (*relation_set_new_filenode) (Relation rel,
                                              const RelFileNode *newrnode,
                                              char persistence,
                                              TransactionId *freezeXid,
                                              MultiXactId *minmulti);

    /*
     * This callback needs to remove all contents from `rel`'s current
     * relfilenode. No provisions for transactional behaviour need to be made.
     * Often this can be implemented by truncating the underlying storage to
     * its minimal size.
     *
     * See also table_relation_nontransactional_truncate().
     */
    void        (*relation_nontransactional_truncate) (Relation rel);

    /*
     * See table_relation_copy_data().
     *
     * This can typically be implemented by directly copying the underlying
     * storage, unless it contains references to the tablespace internally.
     */
    void        (*relation_copy_data) (Relation rel,
                                       const RelFileNode *newrnode);

    /* See table_relation_copy_for_cluster() */
    void        (*relation_copy_for_cluster) (Relation NewHeap,
                                              Relation OldHeap,
                                              Relation OldIndex,
                                              bool use_sort,
                                              TransactionId OldestXmin,
                                              TransactionId *xid_cutoff,
                                              MultiXactId *multi_cutoff,
                                              double *num_tuples,
                                              double *tups_vacuumed,
                                              double *tups_recently_dead);

    /*
     * React to VACUUM command on the relation. The VACUUM might be user
     * triggered or by autovacuum. The specific actions performed by the AM
     * will depend heavily on the individual AM.
     *
     * On entry a transaction is already established, and the relation is
     * locked with a ShareUpdateExclusive lock.
     *
     * Note that neither VACUUM FULL (and CLUSTER), nor ANALYZE go through
     * this routine, even if (for ANALYZE) it is part of the same VACUUM
     * command.
     *
     * There probably, in the future, needs to be a separate callback to
     * integrate with autovacuum's scheduling.
     */
    void        (*relation_vacuum) (Relation onerel,
                                    struct VacuumParams *params,
                                    BufferAccessStrategy bstrategy);

    /*
     * Prepare to analyze block `blockno` of `scan`. The scan has been started
     * with table_beginscan_analyze(). See also
     * table_scan_analyze_next_block().
     *
     * The callback may acquire resources like locks that are held until
     * table_scan_analyze_next_tuple() returns false. It e.g. can make sense
     * to hold a lock until all tuples on a block have been analyzed by
     * scan_analyze_next_tuple.
     *
     * The callback can return false if the block is not suitable for
     * sampling, e.g. because it's a metapage that could never contain tuples.
     *
     * XXX: This obviously is primarily suited for block-based AMs. It's not
     * clear what a good interface for non block based AMs would be, so there
     * isn't one yet.
     */
    bool        (*scan_analyze_next_block) (TableScanDesc scan,
                                            BlockNumber blockno,
                                            BufferAccessStrategy bstrategy);

    /*
     * See table_scan_analyze_next_tuple().
     *
     * Not every AM might have a meaningful concept of dead rows, in which
     * case it's OK to not increment *deadrows - but note that this may
     * influence autovacuum scheduling (see comment for relation_vacuum
     * callback).
     */
    bool        (*scan_analyze_next_tuple) (TableScanDesc scan,
                                            TransactionId OldestXmin,
                                            double *liverows,
                                            double *deadrows,
                                            TupleTableSlot *slot);

    /* see table_index_build_range_scan for reference about parameters */
    double      (*index_build_range_scan) (Relation heap_rel,
                                           Relation index_rel,
                                           struct IndexInfo *index_info,
                                           bool allow_sync,
                                           bool anyvisible,
                                           bool progress,
                                           BlockNumber start_blockno,
                                           BlockNumber end_blockno,
                                           IndexBuildCallback callback,
                                           void *callback_state,
                                           TableScanDesc scan);

    /* see table_index_validate_scan for reference about parameters */
    void        (*index_validate_scan) (Relation heap_rel,
                                        Relation index_rel,
                                        struct IndexInfo *index_info,
                                        Snapshot snapshot,
                                        struct ValidateIndexState *state);


    /* ------------------------------------------------------------------------
     * Miscellaneous functions.
     * ------------------------------------------------------------------------
     */

    /*
     * See table_relation_size().
     *
     * Note that currently a few callers use the MAIN_FORKNUM size to figure
     * out the range of potentially interesting blocks (brin, analyze). It's
     * probable that we'll need to revise the interface for those at some
     * point.
     */
    uint64      (*relation_size) (Relation rel, ForkNumber forkNumber);

    /*
     * This callback should return true if the relation requires a TOAST table
     * and false if it does not. It may wish to examine the relation's
     * tuple descriptor before making a decision, but if it uses some other
     * method of storing large values (or if it does not support them) it can
     * simply return false.
     */
    bool        (*relation_needs_toast_table) (Relation rel);


    /* ------------------------------------------------------------------------
     * Planner related functions.
     * ------------------------------------------------------------------------
     */

    /*
     * See table_relation_estimate_size().
     *
     * While block oriented, it shouldn't be too hard for an AM that doesn't
     * internally use blocks to convert into a usable representation.
     *
     * This differs from the relation_size callback by returning size
     * estimates (both relation size and tuple count) for planning purposes,
     * rather than returning a currently correct estimate.
     */
    void        (*relation_estimate_size) (Relation rel, int32 *attr_widths,
                                           BlockNumber *pages, double *tuples,
                                           double *allvisfrac);


    /* ------------------------------------------------------------------------
     * Executor related functions.
     * ------------------------------------------------------------------------
     */

    /*
     * Prepare to fetch / check / return tuples from `tbmres->blockno` as part
     * of a bitmap table scan. `scan` was started via table_beginscan_bm().
     * Return false if there are no tuples to be found on the page, true
     * otherwise.
     *
     * This will typically read and pin the target block, and do the necessary
     * work to allow scan_bitmap_next_tuple() to return tuples (e.g. it might
     * make sense to perform tuple visibility checks at this time). For some
     * AMs it will make more sense to do all the work referencing `tbmres`
     * contents here, for others it might be better to defer more work to
     * scan_bitmap_next_tuple.
     *
     * If `tbmres->blockno` is -1, this is a lossy scan and all visible tuples
     * on the page have to be returned, otherwise the tuples at offsets in
     * `tbmres->offsets` need to be returned.
     *
     * NOTE(review): the first paragraph uses `tbmres->blockno` as the block
     * to read, so -1 in that same field as the lossy marker looks
     * suspicious. struct TBMIterateResult is declared elsewhere - confirm
     * which of its fields signals a lossy page and correct the sentence
     * above if needed.
     *
     * XXX: Currently this may only be implemented if the AM uses md.c as its
     * storage manager, and uses ItemPointer->ip_blkid in a manner that maps
     * blockids directly to the underlying storage. nodeBitmapHeapscan.c
     * performs prefetching directly using that interface. This probably
     * needs to be rectified at a later point.
     *
     * XXX: Currently this may only be implemented if the AM uses the
     * visibilitymap, as nodeBitmapHeapscan.c unconditionally accesses it to
     * perform prefetching. This probably needs to be rectified at a later
     * point.
     *
     * Optional callback, but either both scan_bitmap_next_block and
     * scan_bitmap_next_tuple need to exist, or neither.
     */
    bool        (*scan_bitmap_next_block) (TableScanDesc scan,
                                           struct TBMIterateResult *tbmres);

    /*
     * Fetch the next tuple of a bitmap table scan into `slot` and return true
     * if a visible tuple was found, false otherwise.
     *
     * For some AMs it will make more sense to do all the work referencing
     * `tbmres` contents in scan_bitmap_next_block, for others it might be
     * better to defer more work to this callback.
     *
     * Optional callback, but either both scan_bitmap_next_block and
     * scan_bitmap_next_tuple need to exist, or neither.
     */
    bool        (*scan_bitmap_next_tuple) (TableScanDesc scan,
                                           struct TBMIterateResult *tbmres,
                                           TupleTableSlot *slot);

    /*
     * Prepare to fetch tuples from the next block in a sample scan. Return
     * false if the sample scan is finished, true otherwise. `scan` was
     * started via table_beginscan_sampling().
     *
     * Typically this will first determine the target block by calling the
     * TsmRoutine's NextSampleBlock() callback if not NULL, or alternatively
     * perform a sequential scan over all blocks. The determined block is
     * then typically read and pinned.
     *
     * As the TsmRoutine interface is block based, a block needs to be passed
     * to NextSampleBlock(). If that's not appropriate for an AM, it
     * internally needs to perform mapping between the internal and a block
     * based representation.
     *
     * Note that it's not acceptable to hold deadlock prone resources such as
     * lwlocks until scan_sample_next_tuple() has exhausted the tuples on the
     * block - the tuple is likely to be returned to an upper query node, and
     * the next call could be off a long while. Holding buffer pins and such
     * is obviously OK.
     *
     * Currently it is required to implement this interface, as there's no
     * alternative way (contrary e.g. to bitmap scans) to implement sample
     * scans. If infeasible to implement the AM may raise an error.
     */
    bool        (*scan_sample_next_block) (TableScanDesc scan,
                                           struct SampleScanState *scanstate);

    /*
     * This callback, only called after scan_sample_next_block has returned
     * true, should determine the next tuple to be returned from the selected
     * block using the TsmRoutine's NextSampleTuple() callback.
     *
     * The callback needs to perform visibility checks, and only return
     * visible tuples. That obviously can mean calling NextSampleTuple()
     * multiple times.
     *
     * The TsmRoutine interface assumes that there's a maximum offset on a
     * given page, so if that doesn't apply to an AM, it needs to emulate that
     * assumption somehow.
     */
    bool        (*scan_sample_next_tuple) (TableScanDesc scan,
                                           struct SampleScanState *scanstate,
                                           TupleTableSlot *slot);

} TableAmRoutine;
/* ----------------------------------------------------------------------------
* Slot functions.
* ----------------------------------------------------------------------------
*/
/*
 * Returns slot callbacks suitable for holding tuples of the appropriate type
 * for the relation. Works for tables, views, foreign tables and partitioned
 * tables.
 *
 * Only declared here; the definition is out of line. The result points to
 * const TupleTableSlotOps, so callers must not modify the callbacks through
 * it.
 */
extern const TupleTableSlotOps *table_slot_callbacks(Relation rel);
/*
 * Returns slot using the callbacks returned by table_slot_callbacks(), and
 * registers it on *reglist.
 *
 * Only declared here; the definition is out of line.
 * NOTE(review): whether `reglist` may be NULL (i.e. registration skipped) is
 * not visible from this declaration - confirm against the definition.
 */
extern TupleTableSlot *table_slot_create(Relation rel, List **reglist);
/* ----------------------------------------------------------------------------
* Table scan functions.
* ----------------------------------------------------------------------------
*/
/*
 * Begin a sequential scan of `rel`.
 *
 * Tuples returned by the scan pass the visibility test of `snapshot`; when
 * nkeys != 0 they are additionally filtered by the scan keys in `key`.
 *
 * The scan is requested with use of an access strategy, syncscan reporting
 * and page-at-a-time visibility checks all allowed, and without a parallel
 * scan descriptor.
 */
static inline TableScanDesc
table_beginscan(Relation rel, Snapshot snapshot,
                int nkeys, struct ScanKeyData *key)
{
    const uint32 scan_flags = SO_TYPE_SEQSCAN
        | SO_ALLOW_STRAT
        | SO_ALLOW_SYNC
        | SO_ALLOW_PAGEMODE;

    /* NULL: this is not a parallel scan */
    return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key,
                                       NULL, scan_flags);
}
/*
 * Like table_beginscan(), but for scanning catalog. It'll automatically use a
 * snapshot appropriate for scanning catalog relations.
 *
 * Unlike table_beginscan() no snapshot argument is taken; `nkeys` and `key`
 * describe the scan keys as they do there. Only declared here; the
 * definition is out of line.
 */
extern TableScanDesc table_beginscan_catalog(Relation rel, int nkeys,
struct ScanKeyData *key);
/*
* Like table_beginscan(), but table_beginscan_strat() offers an extended API
* that lets the caller control whether a nondefault buffer access strategy
* can be used, and whether syncscan can be chosen (possibly resulting in the
* scan not starting from block zero). Both of these default to true with
* plain table_beginscan.
*/
static inline TableScanDesc
table_beginscan_strat(Relation rel, Snapshot snapshot,
					  int nkeys, struct ScanKeyData *key,
					  bool allow_strat, bool allow_sync)
{
	/* Page mode is always allowed; strategy and syncscan are caller-chosen. */
	uint32		flags = SO_TYPE_SEQSCAN | SO_ALLOW_PAGEMODE;

	flags |= allow_strat ? SO_ALLOW_STRAT : 0;
	flags |= allow_sync ? SO_ALLOW_SYNC : 0;

	return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL, flags);
}
/*
* table_beginscan_bm is an alternative entry point for setting up a
* TableScanDesc for a bitmap heap scan. Although that scan technology is
* really quite unlike a standard seqscan, there is just enough commonality to
* make it worth using the same data structure.
*/
static inline TableScanDesc
table_beginscan_bm(Relation rel, Snapshot snapshot,
				   int nkeys, struct ScanKeyData *key)
{
	/* Bitmap scans always allow page mode; no strategy/syncscan flags. */
	return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL,
									   SO_TYPE_BITMAPSCAN | SO_ALLOW_PAGEMODE);
}
/*
* table_beginscan_sampling is an alternative entry point for setting up a
* TableScanDesc for a TABLESAMPLE scan. As with bitmap scans, it's worth
* using the same data structure although the behavior is rather different.
* In addition to the options offered by table_beginscan_strat, this call
* also allows control of whether page-mode visibility checking is used.
*/
static inline TableScanDesc
table_beginscan_sampling(Relation rel, Snapshot snapshot,
						 int nkeys, struct ScanKeyData *key,
						 bool allow_strat, bool allow_sync,
						 bool allow_pagemode)
{
	/* Each optional behaviour is enabled only if the caller asked for it. */
	uint32		flags = SO_TYPE_SAMPLESCAN
		| (allow_strat ? SO_ALLOW_STRAT : 0)
		| (allow_sync ? SO_ALLOW_SYNC : 0)
		| (allow_pagemode ? SO_ALLOW_PAGEMODE : 0);

	return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL, flags);
}
/*
* table_beginscan_analyze is an alternative entry point for setting up a
* TableScanDesc for an ANALYZE scan. As with bitmap scans, it's worth using
* the same data structure although the behavior is rather different.
*/
static inline TableScanDesc
table_beginscan_analyze(Relation rel)
{
	TableScanDesc scan;

	/* ANALYZE scans are started without a snapshot and without scan keys. */
	scan = rel->rd_tableam->scan_begin(rel, NULL, 0, NULL, NULL,
									   SO_TYPE_ANALYZE);
	return scan;
}
/*
* End relation scan.
*/
static inline void
table_endscan(TableScanDesc scan)
{
/* Hand the scan to the scan_end callback of the AM of scan->rs_rd. */
scan->rs_rd->rd_tableam->scan_end(scan);
}
/*
* Restart a relation scan.
*/
static inline void
table_rescan(TableScanDesc scan,
struct ScanKeyData *key)
{
/*
* The four trailing arguments are all false. table_rescan_set_params()
* passes true followed by allow_strat, allow_sync and allow_pagemode in
* those positions, so this call presumably leaves the scan's existing
* options untouched -- confirm against the AM's scan_rescan.
*/
scan->rs_rd->rd_tableam->scan_rescan(scan, key, false, false, false, false);
}
/*
* Restart a relation scan after changing params.
*
* This call allows changing the buffer strategy, syncscan, and pagemode
* options before starting a fresh scan. Note that although the actual use of
* syncscan might change (effectively, enabling or disabling reporting), the
* previously selected startblock will be kept.
*/
static inline void
table_rescan_set_params(TableScanDesc scan, struct ScanKeyData *key,
bool allow_strat, bool allow_sync, bool allow_pagemode)
{
/*
* The literal true in the third position is what distinguishes this call
* from table_rescan(), which passes false there; presumably it tells the
* AM to apply the allow_* values that follow.
*/
scan->rs_rd->rd_tableam->scan_rescan(scan, key, true,
allow_strat, allow_sync,
allow_pagemode);
}
/*
* Update snapshot used by the scan.
*/
extern void table_scan_update_snapshot(TableScanDesc scan, Snapshot snapshot);
/*
* Return next tuple from `scan`, store in slot.
*/
static inline bool
table_scan_getnextslot(TableScanDesc sscan, ScanDirection direction, TupleTableSlot *slot)
{
	bool		result;

	/* Set tts_tableOid from the scan's relation before calling the AM. */
	slot->tts_tableOid = RelationGetRelid(sscan->rs_rd);

	result = sscan->rs_rd->rd_tableam->scan_getnextslot(sscan, direction, slot);
	return result;
}
/* ----------------------------------------------------------------------------
* Parallel table scan related functions.
* ----------------------------------------------------------------------------
*/
/*
* Estimate the size of shared memory needed for a parallel scan of this
* relation.
*/
extern Size table_parallelscan_estimate(Relation rel, Snapshot snapshot);
/*
* Initialize ParallelTableScanDesc for a parallel scan of this
* relation. `pscan` needs to be sized according to parallelscan_estimate()
* for the same relation. Call this just once in the leader process; then,
* individual workers attach via table_beginscan_parallel.
*/
extern void table_parallelscan_initialize(Relation rel,
ParallelTableScanDesc pscan,
Snapshot snapshot);
/*
* Begin a parallel scan. `pscan` needs to have been initialized with
* table_parallelscan_initialize(), for the same relation. The initialization
* does not need to have happened in this backend.
*
* Caller must hold a suitable lock on the relation.
*/
extern TableScanDesc table_beginscan_parallel(Relation rel,
ParallelTableScanDesc pscan);
/*
* Restart a parallel scan. Call this in the leader process. Caller is
* responsible for making sure that all workers have finished the scan
* beforehand.
*/
static inline void
table_parallelscan_reinitialize(Relation rel, ParallelTableScanDesc pscan)
{
/* Delegate to the parallelscan_reinitialize callback of rel's table AM. */
rel->rd_tableam->parallelscan_reinitialize(rel, pscan);
}
/* ----------------------------------------------------------------------------
* Index scan related functions.
* ----------------------------------------------------------------------------
*/
/*
* Prepare to fetch tuples from the relation, as needed when fetching tuples
* for an index scan.
*
* Tuples for an index scan can then be fetched via table_index_fetch_tuple().
*/
static inline IndexFetchTableData *
table_index_fetch_begin(Relation rel)
{
	IndexFetchTableData *fetch;

	/* The AM's index_fetch_begin callback supplies the fetch state. */
	fetch = rel->rd_tableam->index_fetch_begin(rel);
	return fetch;
}
/*
* Reset index fetch. Typically this will release cross index fetch resources
* held in IndexFetchTableData.
*/
static inline void
table_index_fetch_reset(struct IndexFetchTableData *scan)
{
/* The AM is found through the relation recorded in the fetch state. */
scan->rel->rd_tableam->index_fetch_reset(scan);
}
/*
* Release resources and deallocate index fetch.
*/
static inline void
table_index_fetch_end(struct IndexFetchTableData *scan)
{
/* The AM is found through the relation recorded in the fetch state. */
scan->rel->rd_tableam->index_fetch_end(scan);
}
/*
* Fetches, as part of an index scan, tuple at `tid` into `slot`, after doing
* a visibility test according to `snapshot`. If a tuple was found and passed
* the visibility test, returns true, false otherwise.
*
* *call_again needs to be false on the first call to table_index_fetch_tuple() for
* a tid. If there potentially is another tuple matching the tid, *call_again
* will be set to true, signalling that table_index_fetch_tuple() should be called
* again for the same tid.
*
* *all_dead, if all_dead is not NULL, will be set to true by
* table_index_fetch_tuple() iff it is guaranteed that no backend needs to see
* that tuple. Index AMs can use that to avoid returning that tid in future
* searches.
*
* The difference between this function and table_fetch_row_version is that
* this function returns the currently visible version of a row if the AM
* supports storing multiple row versions reachable via a single index entry
* (like heap's HOT). Whereas table_fetch_row_version only evaluates the
* tuple exactly at `tid`. Outside of index entry ->table tuple lookups,
* table_fetch_row_version is what's usually needed.
*/
static inline bool
table_index_fetch_tuple(struct IndexFetchTableData *scan,
						ItemPointer tid,
						Snapshot snapshot,
						TupleTableSlot *slot,
						bool *call_again, bool *all_dead)
{
	bool		found;

	/* The relation, and hence the AM, comes from the fetch state. */
	found = scan->rel->rd_tableam->index_fetch_tuple(scan, tid, snapshot, slot,
													 call_again, all_dead);
	return found;
}
/*
* This is a convenience wrapper around table_index_fetch_tuple() which
* returns whether there are table tuple items corresponding to an index
* entry. This likely is only useful to verify if there's a conflict in a
* unique index.
*/
extern bool table_index_fetch_tuple_check(Relation rel,
ItemPointer tid,
Snapshot snapshot,
bool *all_dead);
/* ------------------------------------------------------------------------
* Functions for non-modifying operations on individual tuples
* ------------------------------------------------------------------------
*/
/*
* Fetch tuple at `tid` into `slot`, after doing a visibility test according to
* `snapshot`. If a tuple was found and passed the visibility test, returns
* true, false otherwise.
*
* See table_index_fetch_tuple's comment about what the difference between
* these functions is. This function is the correct one to use outside of
* index entry->table tuple lookups.
*/
static inline bool
table_fetch_row_version(Relation rel,
						ItemPointer tid,
						Snapshot snapshot,
						TupleTableSlot *slot)
{
	bool		found;

	/* Straight dispatch to the AM's tuple_fetch_row_version callback. */
	found = rel->rd_tableam->tuple_fetch_row_version(rel, tid, snapshot, slot);
	return found;
}
/*
* Verify that `tid` is a potentially valid tuple identifier. That doesn't
* mean that the pointed to row needs to exist or be visible, but that
* attempting to fetch the row (e.g. with table_get_latest_tid() or
* table_fetch_row_version()) should not error out if called with that tid.
*
* `scan` needs to have been started via table_beginscan().
*/
static inline bool
table_tuple_tid_valid(TableScanDesc scan, ItemPointer tid)
{
	bool		valid;

	/* The AM of the scanned relation decides whether tid is acceptable. */
	valid = scan->rs_rd->rd_tableam->tuple_tid_valid(scan, tid);
	return valid;
}
/*
* Return the latest version of the tuple at `tid`, by updating `tid` to
* point at the newest version.
*/
extern void table_get_latest_tid(TableScanDesc scan, ItemPointer tid);
/*
* Return true iff tuple in slot satisfies the snapshot.
*
* This assumes the slot's tuple is valid, and of the appropriate type for the
* AM.
*
* Some AMs might modify the data underlying the tuple as a side-effect. If so
* they ought to mark the relevant buffer dirty.
*/
static inline bool
table_tuple_satisfies_snapshot(Relation rel, TupleTableSlot *slot,
							   Snapshot snapshot)
{
	bool		satisfies;

	/* The visibility decision is entirely up to the AM. */
	satisfies = rel->rd_tableam->tuple_satisfies_snapshot(rel, slot, snapshot);
	return satisfies;
}
/*
* Compute the newest xid among the tuples pointed to by items. This is used
* to compute what snapshots to conflict with when replaying WAL records for
* page-level index vacuums.
*/
static inline TransactionId
table_compute_xid_horizon_for_tuples(Relation rel,
									 ItemPointerData *items,
									 int nitems)
{
	TransactionId horizon;

	/* items/nitems are forwarded unchanged to the AM callback. */
	horizon = rel->rd_tableam->compute_xid_horizon_for_tuples(rel, items, nitems);
	return horizon;
}
/* ----------------------------------------------------------------------------
* Functions for manipulations of physical tuples.
* ----------------------------------------------------------------------------
*/
/*
* Insert a tuple from a slot into table AM routine.
*
* The options bitmask lets the caller specify options that change the
* behaviour of the AM. Several options might be ignored by AMs not supporting
* them.
*
* If the TABLE_INSERT_SKIP_WAL option is specified, the new tuple doesn't
* need to be logged to WAL, even for a non-temp relation. It is the AMs
* choice whether this optimization is supported.
*
* If the TABLE_INSERT_SKIP_FSM option is specified, AMs are free to not reuse
* free space in the relation. This can save some cycles when we know the
* relation is new and doesn't contain useful amounts of free space. It's
* commonly passed directly to RelationGetBufferForTuple; see there for more info.
*
* TABLE_INSERT_FROZEN should only be specified for inserts into
* relfilenodes created during the current subtransaction and when
* there are no prior snapshots or pre-existing portals open.
* This causes rows to be frozen, which is an MVCC violation and
* requires explicit options chosen by user.
*
* TABLE_INSERT_NO_LOGICAL force-disables the emitting of logical decoding
* information for the tuple. This should solely be used during table rewrites
* where RelationIsLogicallyLogged(relation) is not yet accurate for the new
* relation.
*
* Note that most of these options will be applied when inserting into the
* heap's TOAST table, too, if the tuple requires any out-of-line data.
*
*
* The BulkInsertState object (if any; bistate can be NULL for default
* behavior) is also just passed through to RelationGetBufferForTuple. If
* `bistate` is provided, table_finish_bulk_insert() needs to be called.
*
* On return the slot's tts_tid and tts_tableOid are updated to reflect the
* insertion. But note that any toasting of fields within the slot is NOT
* reflected in the slots contents.
*/
static inline void
table_insert(Relation rel, TupleTableSlot *slot, CommandId cid,
int options, struct BulkInsertStateData *bistate)
{
/* Pure pass-through: every argument goes unchanged to tuple_insert. */
rel->rd_tableam->tuple_insert(rel, slot, cid, options,
bistate);
}
/*
* Perform a "speculative insertion". These can be backed out afterwards
* without aborting the whole transaction. Other sessions can wait for the
* speculative insertion to be confirmed, turning it into a regular tuple, or
* aborted, as if it never existed. Speculatively inserted tuples behave as
* "value locks" of short duration, used to implement INSERT .. ON CONFLICT.
*
* A transaction having performed a speculative insertion has to either abort,
* or finish the speculative insertion with
* table_complete_speculative(succeeded = ...).
*/
static inline void
table_insert_speculative(Relation rel, TupleTableSlot *slot, CommandId cid,
int options, struct BulkInsertStateData *bistate,
uint32 specToken)
{
/*
* Same argument list as table_insert() plus specToken, all forwarded
* unchanged to the AM's tuple_insert_speculative callback.
*/
rel->rd_tableam->tuple_insert_speculative(rel, slot, cid, options,
bistate, specToken);
}
/*
* Complete "speculative insertion" started in the same transaction. If
* succeeded is true, the tuple is fully inserted, if false, it's removed.
*/
static inline void
table_complete_speculative(Relation rel, TupleTableSlot *slot,
uint32 specToken, bool succeeded)
{
/* Forward slot, token and outcome to tuple_complete_speculative. */
rel->rd_tableam->tuple_complete_speculative(rel, slot, specToken,
succeeded);
}
/*
* Insert multiple tuples into a table.
*
* This is like table_insert(), but inserts multiple tuples in one
* operation. That's often faster than calling table_insert() in a loop,
* because e.g. the AM can reduce WAL logging and page locking overhead.
*
* Except for taking `nslots` tuples as input, as an array of TupleTableSlots
* in `slots`, the parameters for table_multi_insert() are the same as for
* table_insert().
*
* Note: this leaks memory into the current memory context. You can create a
* temporary context before calling this, if that's a problem.
*/
static inline void
table_multi_insert(Relation rel, TupleTableSlot **slots, int nslots,
CommandId cid, int options, struct BulkInsertStateData *bistate)
{
/* Note the callback is named multi_insert, without a tuple_ prefix. */
rel->rd_tableam->multi_insert(rel, slots, nslots,
cid, options, bistate);
}
/*
* Delete a tuple.
*
* NB: do not call this directly unless prepared to deal with
* concurrent-update conditions. Use simple_table_delete instead.
*
* Input parameters:
* relation - table to be modified (caller must hold suitable lock)
* tid - TID of tuple to be deleted
* cid - delete command ID (used for visibility test, and stored into
* cmax if successful)
* crosscheck - if not InvalidSnapshot, also check tuple against this
* wait - true if should wait for any conflicting update to commit/abort
* Output parameters:
* tmfd - filled in failure cases (see below)
* changingPart - true iff the tuple is being moved to another partition
* table due to an update of the partition key. Otherwise, false.
*
* Normal, successful return value is TM_Ok, which means we did actually
* delete it. Failure return codes are TM_SelfModified, TM_Updated, and
* TM_BeingModified (the last only possible if wait == false).
*
* In the failure cases, the routine fills *tmfd with the tuple's t_ctid,
* t_xmax, and, if possible, t_cmax. See comments for
* struct TM_FailureData for additional info.
*/
static inline TM_Result
table_delete(Relation rel, ItemPointer tid, CommandId cid,
			 Snapshot snapshot, Snapshot crosscheck, bool wait,
			 TM_FailureData *tmfd, bool changingPart)
{
	TM_Result	result;

	/* Argument order matches the tuple_delete callback exactly. */
	result = rel->rd_tableam->tuple_delete(rel, tid, cid, snapshot, crosscheck,
										   wait, tmfd, changingPart);
	return result;
}
/*
* Update a tuple.
*
* NB: do not call this directly unless you are prepared to deal with
* concurrent-update conditions. Use simple_table_update instead.
*
* Input parameters:
* relation - table to be modified (caller must hold suitable lock)
* otid - TID of old tuple to be replaced
* slot - newly constructed tuple data to store
* cid - update command ID (used for visibility test, and stored into
* cmax/cmin if successful)
* crosscheck - if not InvalidSnapshot, also check old tuple against this
* wait - true if should wait for any conflicting update to commit/abort
* Output parameters:
* tmfd - filled in failure cases (see below)
* lockmode - filled with lock mode acquired on tuple
* update_indexes - in success cases this is set to true if new index entries
* are required for this tuple
*
* Normal, successful return value is TM_Ok, which means we did actually
* update it. Failure return codes are TM_SelfModified, TM_Updated, and
* TM_BeingModified (the last only possible if wait == false).
*
* On success, the slot's tts_tid and tts_tableOid are updated to match the new
* stored tuple; in particular, slot->tts_tid is set to the TID where the
* new tuple was inserted, and its HEAP_ONLY_TUPLE flag is set iff a HOT
* update was done. However, any TOAST changes in the new tuple's
* data are not reflected into `slot`.
*
* In the failure cases, the routine fills *tmfd with the tuple's t_ctid,
* t_xmax, and, if possible, t_cmax. See comments for struct TM_FailureData
* for additional info.
*/
static inline TM_Result
table_update(Relation rel, ItemPointer otid, TupleTableSlot *slot,
			 CommandId cid, Snapshot snapshot, Snapshot crosscheck, bool wait,
			 TM_FailureData *tmfd, LockTupleMode *lockmode,
			 bool *update_indexes)
{
	TM_Result	result;

	/* Argument order matches the tuple_update callback exactly. */
	result = rel->rd_tableam->tuple_update(rel, otid, slot, cid, snapshot,
										   crosscheck, wait, tmfd, lockmode,
										   update_indexes);
	return result;
}
/*
* Lock a tuple in the specified mode.
*
* Input parameters:
* relation: relation containing tuple (caller must hold suitable lock)
* tid: TID of tuple to lock
* snapshot: snapshot to use for visibility determinations
* cid: current command ID (used for visibility test, and stored into
* tuple's cmax if lock is successful)
* mode: lock mode desired
* wait_policy: what to do if tuple lock is not available
* flags:
* If TUPLE_LOCK_FLAG_LOCK_UPDATE_IN_PROGRESS, follow the update chain to
* also lock descendant tuples if lock modes don't conflict.
* If TUPLE_LOCK_FLAG_FIND_LAST_VERSION, follow the update chain and lock
* latest version.
*
* Output parameters:
* *slot: contains the target tuple
* *tmfd: filled in failure cases (see below)
*
* Function result may be:
* TM_Ok: lock was successfully acquired
* TM_Invisible: lock failed because tuple was never visible to us
* TM_SelfModified: lock failed because tuple updated by self
* TM_Updated: lock failed because tuple updated by other xact
* TM_Deleted: lock failed because tuple deleted by other xact
* TM_WouldBlock: lock couldn't be acquired and wait_policy is skip
*
* In the failure cases other than TM_Invisible and TM_Deleted, the routine
* fills *tmfd with the tuple's t_ctid, t_xmax, and, if possible, t_cmax. See
* comments for struct TM_FailureData for additional info.
*/
static inline TM_Result
table_lock_tuple(Relation rel, ItemPointer tid, Snapshot snapshot,
				 TupleTableSlot *slot, CommandId cid, LockTupleMode mode,
				 LockWaitPolicy wait_policy, uint8 flags,
				 TM_FailureData *tmfd)
{
	TM_Result	result;

	/* Argument order matches the tuple_lock callback exactly. */
	result = rel->rd_tableam->tuple_lock(rel, tid, snapshot, slot, cid, mode,
										 wait_policy, flags, tmfd);
	return result;
}
/*
* Perform operations necessary to complete insertions made via
* tuple_insert and multi_insert with a BulkInsertState specified. This
* may e.g. be used to flush the relation when inserting with
* TABLE_INSERT_SKIP_WAL specified.
*/
static inline void
table_finish_bulk_insert(Relation rel, int options)
{
	/*
	 * finish_bulk_insert is an optional callback: do nothing when the
	 * relation has no table AM or the AM does not provide it.
	 */
	if (!rel->rd_tableam || !rel->rd_tableam->finish_bulk_insert)
		return;

	rel->rd_tableam->finish_bulk_insert(rel, options);
}
/* ------------------------------------------------------------------------
* DDL related functionality.
* ------------------------------------------------------------------------
*/
/*
* Create storage for `rel` in `newrnode`, with persistence set to
* `persistence`.
*
* This is used both during relation creation and various DDL operations to
* create a new relfilenode that can be filled from scratch. When creating
* new storage for an existing relfilenode, this should be called before the
* relcache entry has been updated.
*
* *freezeXid, *minmulti are set to the xid / multixact horizon for the table
* that pg_class.{relfrozenxid, relminmxid} have to be set to.
*/
static inline void
table_relation_set_new_filenode(Relation rel,
const RelFileNode *newrnode,
char persistence,
TransactionId *freezeXid,
MultiXactId *minmulti)
{
/*
* Pure pass-through; freezeXid and minmulti are output pointers handed
* straight to the AM's relation_set_new_filenode callback.
*/
rel->rd_tableam->relation_set_new_filenode(rel, newrnode, persistence,
freezeXid, minmulti);
}
/*
* Remove all table contents from `rel`, in a non-transactional manner.
* Non-transactional meaning that there's no need to support rollbacks. This
* commonly only is used to perform truncations for relfilenodes created in the
* current transaction.
*/
static inline void
table_relation_nontransactional_truncate(Relation rel)
{
/* Delegate to the AM's relation_nontransactional_truncate callback. */
rel->rd_tableam->relation_nontransactional_truncate(rel);
}
/*
* Copy data from `rel` into the new relfilenode `newrnode`. The new
* relfilenode may not have storage associated before this function is
* called. This is only supposed to be used for low level operations like
* changing a relation's tablespace.
*/
static inline void
table_relation_copy_data(Relation rel, const RelFileNode *newrnode)
{
/* Delegate to the relation_copy_data callback of rel's table AM. */
rel->rd_tableam->relation_copy_data(rel, newrnode);
}
/*
* Copy data from `OldHeap` into `NewHeap`, as part of a CLUSTER or VACUUM
* FULL.
*
* Additional Input parameters:
* - use_sort - if true, the table contents are sorted appropriate for
* `OldIndex`; if false and OldIndex is not InvalidOid, the data is copied
* in that index's order; if false and OldIndex is InvalidOid, no sorting is
* performed
* - OldIndex - see use_sort
* - OldestXmin - computed by vacuum_set_xid_limits(), even when
* not needed for the relation's AM
* - *xid_cutoff - ditto
* - *multi_cutoff - ditto
*
* Output parameters:
* - *xid_cutoff - rel's new relfrozenxid value, may be invalid
* - *multi_cutoff - rel's new relminmxid value, may be invalid
* - *tups_vacuumed - stats, for logging, if appropriate for AM
* - *tups_recently_dead - stats, for logging, if appropriate for AM
*/
static inline void
table_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
Relation OldIndex,
bool use_sort,
TransactionId OldestXmin,
TransactionId *xid_cutoff,
MultiXactId *multi_cutoff,
double *num_tuples,
double *tups_vacuumed,
double *tups_recently_dead)
{
/*
* The callback is looked up on OldHeap's table AM (not NewHeap's); all
* arguments are forwarded unchanged.
*/
OldHeap->rd_tableam->relation_copy_for_cluster(OldHeap, NewHeap, OldIndex,
use_sort, OldestXmin,
xid_cutoff, multi_cutoff,
num_tuples, tups_vacuumed,
tups_recently_dead);
}
/*
* Perform VACUUM on the relation. The VACUUM can be user-triggered or by
* autovacuum. The specific actions performed by the AM will depend heavily on
* the individual AM.
* On entry a transaction needs to already have been established, and the
* table is locked with a ShareUpdateExclusive lock.
*
* Note that neither VACUUM FULL (and CLUSTER), nor ANALYZE go through this
* routine, even if (for ANALYZE) it is part of the same VACUUM command.
*/
static inline void
table_relation_vacuum(Relation rel, struct VacuumParams *params,
BufferAccessStrategy bstrategy)
{
/* Delegate to the relation_vacuum callback of rel's table AM. */
rel->rd_tableam->relation_vacuum(rel, params, bstrategy);
}
/*
* Prepare to analyze block `blockno` of `scan`. The scan needs to have been
* started with table_beginscan_analyze(). Note that this routine might
* acquire resources like locks that are held until
* table_scan_analyze_next_tuple() returns false.
*
* Returns false if block is unsuitable for sampling, true otherwise.
*/
static inline bool
table_scan_analyze_next_block(TableScanDesc scan, BlockNumber blockno,
							  BufferAccessStrategy bstrategy)
{
	bool		suitable;

	/* The AM of the scanned relation decides whether blockno is usable. */
	suitable = scan->rs_rd->rd_tableam->scan_analyze_next_block(scan, blockno,
																bstrategy);
	return suitable;
}
/*
* Iterate over tuples in the block selected with
* table_scan_analyze_next_block() (which needs to have returned true, and
* this routine may not have returned false for the same block before). If a
* tuple that's suitable for sampling is found, true is returned and a tuple
* is stored in `slot`.
*
* *liverows and *deadrows are incremented according to the encountered
* tuples.
*/
static inline bool
table_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin,
							  double *liverows, double *deadrows,
							  TupleTableSlot *slot)
{
	bool		found;

	/* liverows/deadrows are counters passed through to the AM callback. */
	found = scan->rs_rd->rd_tableam->scan_analyze_next_tuple(scan, OldestXmin,
															 liverows, deadrows,
															 slot);
	return found;
}
/*
* table_index_build_scan - scan the table to find tuples to be indexed
*
* This is called back from an access-method-specific index build procedure
* after the AM has done whatever setup it needs. The parent heap relation
* is scanned to find tuples that should be entered into the index. Each
* such tuple is passed to the AM's callback routine, which does the right
* things to add it to the new index. After we return, the AM's index
* build procedure does whatever cleanup it needs.
*
* The total count of live tuples is returned. This is for updating pg_class
* statistics. (It's annoying not to be able to do that here, but we want to
* merge that update with others; see index_update_stats.) Note that the
* index AM itself must keep track of the number of index tuples; we don't do
* so here because the AM might reject some of the tuples for its own reasons,
* such as being unable to store NULLs.
*
* If 'progress', the PROGRESS_SCAN_BLOCKS_TOTAL counter is updated when
* starting the scan, and PROGRESS_SCAN_BLOCKS_DONE is updated as we go along.
*
* A side effect is to set indexInfo->ii_BrokenHotChain to true if we detect
* any potentially broken HOT chains. Currently, we set this if there are any
* RECENTLY_DEAD or DELETE_IN_PROGRESS entries in a HOT chain, without trying
* very hard to detect whether they're really incompatible with the chain tip.
* This only really makes sense for heap AM, it might need to be generalized
* for other AMs later.
*/
static inline double
table_index_build_scan(Relation heap_rel,
					   Relation index_rel,
					   struct IndexInfo *index_nfo,
					   bool allow_sync,
					   bool progress,
					   IndexBuildCallback callback,
					   void *callback_state,
					   TableScanDesc scan)
{
	double		live_tuples;

	/*
	 * A whole-table build uses the range-scan callback with fixed values:
	 * anyvisible off, start at block 0, and InvalidBlockNumber as the block
	 * count (which signals scan to end-of-rel).
	 */
	live_tuples = heap_rel->rd_tableam->index_build_range_scan(heap_rel,
															   index_rel,
															   index_nfo,
															   allow_sync,
															   false,	/* anyvisible */
															   progress,
															   0,	/* start_blockno */
															   InvalidBlockNumber,	/* numblocks */
															   callback,
															   callback_state,
															   scan);
	return live_tuples;
}
/*
* As table_index_build_scan(), except that instead of scanning the complete
* table, only the given number of blocks are scanned. Scan to end-of-rel can
* be signalled by passing InvalidBlockNumber as numblocks. Note that
* restricting the range to scan cannot be done when requesting syncscan.
*
* When "anyvisible" mode is requested, all tuples visible to any transaction
* are indexed and counted as live, including those inserted or deleted by
* transactions that are still in progress.
*/
static inline double
table_index_build_range_scan(Relation heap_rel,
							 Relation index_rel,
							 struct IndexInfo *index_nfo,
							 bool allow_sync,
							 bool anyvisible,
							 bool progress,
							 BlockNumber start_blockno,
							 BlockNumber numblocks,
							 IndexBuildCallback callback,
							 void *callback_state,
							 TableScanDesc scan)
{
	double		live_tuples;

	/* Every argument is forwarded, in order, to the heap relation's AM. */
	live_tuples = heap_rel->rd_tableam->index_build_range_scan(heap_rel, index_rel,
															   index_nfo,
															   allow_sync, anyvisible,
															   progress,
															   start_blockno, numblocks,
															   callback, callback_state,
															   scan);
	return live_tuples;
}
/*
* table_index_validate_scan - second table scan for concurrent index build
*
* See validate_index() for an explanation.
*/
static inline void
table_index_validate_scan(Relation heap_rel,
Relation index_rel,
struct IndexInfo *index_info,
Snapshot snapshot,
struct ValidateIndexState *state)
{
/*
* The callback is looked up on the heap relation's table AM; all
* arguments are forwarded unchanged.
*/
heap_rel->rd_tableam->index_validate_scan(heap_rel,
index_rel,
index_info,
snapshot,
state);
}
/* ----------------------------------------------------------------------------
* Miscellaneous functionality
* ----------------------------------------------------------------------------
*/
/*
* Return the current size of `rel` in bytes. If `forkNumber` is
* InvalidForkNumber, return the relation's overall size, otherwise the size
* for the indicated fork.
*
* Note that the overall size might not be the equivalent of the sum of sizes
* for the individual forks for some AMs, e.g. because the AMs storage does
* not neatly map onto the builtin types of forks.
*/
static inline uint64
table_relation_size(Relation rel, ForkNumber forkNumber)
{
	uint64		nbytes;

	/* The AM interprets forkNumber itself, including InvalidForkNumber. */
	nbytes = rel->rd_tableam->relation_size(rel, forkNumber);
	return nbytes;
}
/*
* table_relation_needs_toast_table - does this relation need a toast table?
*/
static inline bool
table_relation_needs_toast_table(Relation rel)
{
	bool		needs_toast;

	/* The decision is left entirely to the relation's table AM. */
	needs_toast = rel->rd_tableam->relation_needs_toast_table(rel);
	return needs_toast;
}
/* ----------------------------------------------------------------------------
* Planner related functionality
* ----------------------------------------------------------------------------
*/
/*
* Estimate the current size of the relation, as an AM specific workhorse for
* estimate_rel_size(). Look there for an explanation of the parameters.
*/
static inline void
table_relation_estimate_size(Relation rel, int32 *attr_widths,
BlockNumber *pages, double *tuples,
double *allvisfrac)
{
/*
* All pointer arguments are handed straight to the AM's
* relation_estimate_size callback; nothing is computed here.
*/
rel->rd_tableam->relation_estimate_size(rel, attr_widths, pages, tuples,
allvisfrac);
}
/* ----------------------------------------------------------------------------
* Executor related functionality
* ----------------------------------------------------------------------------
*/
/*
* Prepare to fetch / check / return tuples from `tbmres->blockno` as part of
* a bitmap table scan. `scan` needs to have been started via
* table_beginscan_bm(). Returns false if there are no tuples to be found on
* the page, true otherwise.
*
* Note, this is an optionally implemented function, therefore should only be
* used after verifying the presence (at plan time or such).
*/
static inline bool
table_scan_bitmap_next_block(TableScanDesc scan,
							 struct TBMIterateResult *tbmres)
{
	bool		has_tuples;

	/* No presence check here: the caller must know the callback exists. */
	has_tuples = scan->rs_rd->rd_tableam->scan_bitmap_next_block(scan, tbmres);
	return has_tuples;
}
/*
* Fetch the next tuple of a bitmap table scan into `slot` and return true if
* a visible tuple was found, false otherwise.
* table_scan_bitmap_next_block() needs to previously have selected a
* block (i.e. returned true), and no previous
* table_scan_bitmap_next_tuple() for the same block may have
* returned false.
*/
static inline bool
table_scan_bitmap_next_tuple(TableScanDesc scan,
							 struct TBMIterateResult *tbmres,
							 TupleTableSlot *slot)
{
	bool		found;

	/* Dispatch through the AM of the relation being scanned. */
	found = scan->rs_rd->rd_tableam->scan_bitmap_next_tuple(scan, tbmres, slot);
	return found;
}
/*
* Prepare to fetch tuples from the next block in a sample scan. Returns false
* if the sample scan is finished, true otherwise. `scan` needs to have been
* started via table_beginscan_sampling().
*
* This will call the TsmRoutine's NextSampleBlock() callback if necessary
* (i.e. NextSampleBlock is not NULL), or perform a sequential scan over the
* underlying relation.
*/
static inline bool
table_scan_sample_next_block(TableScanDesc scan,
							 struct SampleScanState *scanstate)
{
	bool		more_blocks;

	/* Dispatch through the AM of the relation being scanned. */
	more_blocks = scan->rs_rd->rd_tableam->scan_sample_next_block(scan, scanstate);
	return more_blocks;
}
/*
* Fetch the next sample tuple into `slot` and return true if a visible tuple
* was found, false otherwise. table_scan_sample_next_block() needs to
* previously have selected a block (i.e. returned true), and no previous
* table_scan_sample_next_tuple() for the same block may have returned false.
*
* This will call the TsmRoutine's NextSampleTuple() callback.
*/
static inline bool
table_scan_sample_next_tuple(TableScanDesc scan,
							 struct SampleScanState *scanstate,
							 TupleTableSlot *slot)
{
	bool		found;

	/* Dispatch through the AM of the relation being scanned. */
	found = scan->rs_rd->rd_tableam->scan_sample_next_tuple(scan, scanstate, slot);
	return found;
}
/* ----------------------------------------------------------------------------
* Functions to make modifications a bit simpler.
* ----------------------------------------------------------------------------
*/
/*
* Presumably the simplified counterpart of table_insert() -- confirm against
* the definition, which is not in this header.
*/
extern void simple_table_insert(Relation rel, TupleTableSlot *slot);
/*
* table_delete()'s header comment directs callers that are not prepared to
* deal with concurrent-update conditions to use this function instead.
*/
extern void simple_table_delete(Relation rel, ItemPointer tid,
Snapshot snapshot);
/*
* table_update()'s header comment directs callers that are not prepared to
* deal with concurrent-update conditions to use this function instead.
*/
extern void simple_table_update(Relation rel, ItemPointer otid,
TupleTableSlot *slot, Snapshot snapshot,
bool *update_indexes);
/* ----------------------------------------------------------------------------
* Helper functions to implement parallel scans for block oriented AMs.
* ----------------------------------------------------------------------------
*/
extern Size table_block_parallelscan_estimate(Relation rel);
extern Size table_block_parallelscan_initialize(Relation rel,
ParallelTableScanDesc pscan);
extern void table_block_parallelscan_reinitialize(Relation rel,
ParallelTableScanDesc pscan);
extern BlockNumber table_block_parallelscan_nextpage(Relation rel,
ParallelBlockTableScanDesc pbscan);
extern void table_block_parallelscan_startblock_init(Relation rel,
ParallelBlockTableScanDesc pbscan);
/* ----------------------------------------------------------------------------
* Functions in tableamapi.c
* ----------------------------------------------------------------------------
*/
extern const TableAmRoutine *GetTableAmRoutine(Oid amhandler);
extern const TableAmRoutine *GetHeapamTableAmRoutine(void);
extern bool check_default_table_access_method(char **newval, void **extra,
GucSource source);
#endif /* TABLEAM_H */
|