1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
|
/*-------------------------------------------------------------------------
*
* nbtxlog.h
* header file for postgres btree xlog routines
*
* Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/access/nbtxlog.h
*
*-------------------------------------------------------------------------
*/
#ifndef NBTXLOG_H
#define NBTXLOG_H
#include "access/xlogreader.h"
#include "lib/stringinfo.h"
#include "storage/off.h"
/*
* XLOG records for btree operations
*
* XLOG allows to store some information in high 4 bits of log
* record xl_info field
*/
#define XLOG_BTREE_INSERT_LEAF 0x00 /* add index tuple without split */
#define XLOG_BTREE_INSERT_UPPER 0x10 /* same, on a non-leaf page */
#define XLOG_BTREE_INSERT_META 0x20 /* same, plus update metapage */
#define XLOG_BTREE_SPLIT_L 0x30 /* add index tuple with split */
#define XLOG_BTREE_SPLIT_R 0x40 /* as above, new item on right */
/* 0x50 and 0x60 are unused */
#define XLOG_BTREE_DELETE 0x70 /* delete leaf index tuples for a page */
#define XLOG_BTREE_UNLINK_PAGE 0x80 /* delete a half-dead page */
#define XLOG_BTREE_UNLINK_PAGE_META 0x90 /* same, and update metapage */
#define XLOG_BTREE_NEWROOT 0xA0 /* new root page */
#define XLOG_BTREE_MARK_PAGE_HALFDEAD 0xB0 /* mark a leaf as half-dead */
#define XLOG_BTREE_VACUUM 0xC0 /* delete entries on a page during
* vacuum */
#define XLOG_BTREE_REUSE_PAGE 0xD0 /* old page is about to be reused from
* FSM */
#define XLOG_BTREE_META_CLEANUP 0xE0 /* update cleanup-related data in the
* metapage */
/*
* All that we need to regenerate the meta-data page
*/
typedef struct xl_btree_metadata
{
uint32 version;
BlockNumber root;
uint32 level;
BlockNumber fastroot;
uint32 fastlevel;
TransactionId oldest_btpo_xact;
float8 last_cleanup_num_heap_tuples;
} xl_btree_metadata;
/*
* This is what we need to know about simple (without split) insert.
*
* This data record is used for INSERT_LEAF, INSERT_UPPER, INSERT_META.
* Note that INSERT_META implies it's not a leaf page.
*
* Backup Blk 0: original page (data contains the inserted tuple)
* Backup Blk 1: child's left sibling, if INSERT_UPPER or INSERT_META
* Backup Blk 2: xl_btree_metadata, if INSERT_META
*/
typedef struct xl_btree_insert
{
OffsetNumber offnum;
} xl_btree_insert;
#define SizeOfBtreeInsert (offsetof(xl_btree_insert, offnum) + sizeof(OffsetNumber))
/*
* On insert with split, we save all the items going into the right sibling
* so that we can restore it completely from the log record. This way takes
* less xlog space than the normal approach, because if we did it standardly,
* XLogInsert would almost always think the right page is new and store its
* whole page image. The left page, however, is handled in the normal
* incremental-update fashion.
*
* Note: XLOG_BTREE_SPLIT_L and XLOG_BTREE_SPLIT_R share this data record.
* There are two variants to indicate whether the inserted tuple went into the
* left or right split page (and thus, whether the new item is stored or not).
* We always log the left page high key because suffix truncation can generate
* a new leaf high key using user-defined code. This is also necessary on
* internal pages, since the first right item that the left page's high key
* was based on will have been truncated to zero attributes in the right page
* (the original is unavailable from the right page).
*
* Backup Blk 0: original page / new left page
*
* The left page's data portion contains the new item, if it's the _L variant.
* An IndexTuple representing the high key of the left page must follow with
* either variant.
*
* Backup Blk 1: new right page
*
* The right page's data portion contains the right page's tuples in the form
* used by _bt_restore_page. This includes the new item, if it's the _R
* variant. The right page's tuples also include the right page's high key
* with either variant (moved from the left/original page during the split),
* unless the split happened to be of the rightmost page on its level, where
* there is no high key for new right page.
*
* Backup Blk 2: next block (orig page's rightlink), if any
* Backup Blk 3: child's left sibling, if non-leaf split
*/
typedef struct xl_btree_split
{
uint32 level; /* tree level of page being split */
OffsetNumber firstright; /* first item moved to right page */
OffsetNumber newitemoff; /* new item's offset (useful for _L variant) */
} xl_btree_split;
#define SizeOfBtreeSplit (offsetof(xl_btree_split, newitemoff) + sizeof(OffsetNumber))
/*
* This is what we need to know about delete of individual leaf index tuples.
* The WAL record can represent deletion of any number of index tuples on a
* single index page when *not* executed by VACUUM.
*
* Backup Blk 0: index page
*/
typedef struct xl_btree_delete
{
TransactionId latestRemovedXid;
int nitems;
/* TARGET OFFSET NUMBERS FOLLOW AT THE END */
} xl_btree_delete;
#define SizeOfBtreeDelete (offsetof(xl_btree_delete, nitems) + sizeof(int))
/*
* This is what we need to know about page reuse within btree. This record
* only exists to generate a conflict point for Hot Standby.
*
* Note that we must include a RelFileNode in the record because we don't
* actually register the buffer with the record.
*/
typedef struct xl_btree_reuse_page
{
RelFileNode node;
BlockNumber block;
TransactionId latestRemovedXid;
} xl_btree_reuse_page;
#define SizeOfBtreeReusePage (sizeof(xl_btree_reuse_page))
/*
* This is what we need to know about vacuum of individual leaf index tuples.
* The WAL record can represent deletion of any number of index tuples on a
* single index page when executed by VACUUM.
*
* Note that the WAL record in any vacuum of an index must have at least one
* item to delete.
*/
typedef struct xl_btree_vacuum
{
uint32 ndeleted;
/* DELETED TARGET OFFSET NUMBERS FOLLOW */
} xl_btree_vacuum;
#define SizeOfBtreeVacuum (offsetof(xl_btree_vacuum, ndeleted) + sizeof(uint32))
/*
* This is what we need to know about marking an empty branch for deletion.
* The target identifies the tuple removed from the parent page (note that we
* remove this tuple's downlink and the *following* tuple's key). Note that
* the leaf page is empty, so we don't need to store its content --- it is
* just reinitialized during recovery using the rest of the fields.
*
* Backup Blk 0: leaf block
* Backup Blk 1: top parent
*/
typedef struct xl_btree_mark_page_halfdead
{
OffsetNumber poffset; /* deleted tuple id in parent page */
/* information needed to recreate the leaf page: */
BlockNumber leafblk; /* leaf block ultimately being deleted */
BlockNumber leftblk; /* leaf block's left sibling, if any */
BlockNumber rightblk; /* leaf block's right sibling */
BlockNumber topparent; /* topmost internal page in the branch */
} xl_btree_mark_page_halfdead;
#define SizeOfBtreeMarkPageHalfDead (offsetof(xl_btree_mark_page_halfdead, topparent) + sizeof(BlockNumber))
/*
* This is what we need to know about deletion of a btree page. Note we do
* not store any content for the deleted page --- it is just rewritten as empty
* during recovery, apart from resetting the btpo.xact.
*
* Backup Blk 0: target block being deleted
* Backup Blk 1: target block's left sibling, if any
* Backup Blk 2: target block's right sibling
* Backup Blk 3: leaf block (if different from target)
* Backup Blk 4: metapage (if rightsib becomes new fast root)
*/
typedef struct xl_btree_unlink_page
{
BlockNumber leftsib; /* target block's left sibling, if any */
BlockNumber rightsib; /* target block's right sibling */
/*
* Information needed to recreate the leaf page, when target is an
* internal page.
*/
BlockNumber leafleftsib;
BlockNumber leafrightsib;
BlockNumber topparent; /* next child down in the branch */
TransactionId btpo_xact; /* value of btpo.xact for use in recovery */
/* xl_btree_metadata FOLLOWS IF XLOG_BTREE_UNLINK_PAGE_META */
} xl_btree_unlink_page;
#define SizeOfBtreeUnlinkPage (offsetof(xl_btree_unlink_page, btpo_xact) + sizeof(TransactionId))
/*
* New root log record. There are zero tuples if this is to establish an
* empty root, or two if it is the result of splitting an old root.
*
* Note that although this implies rewriting the metadata page, we don't need
* an xl_btree_metadata record --- the rootblk and level are sufficient.
*
* Backup Blk 0: new root page (2 tuples as payload, if splitting old root)
* Backup Blk 1: left child (if splitting an old root)
* Backup Blk 2: metapage
*/
typedef struct xl_btree_newroot
{
BlockNumber rootblk; /* location of new root (redundant with blk 0) */
uint32 level; /* its tree level */
} xl_btree_newroot;
#define SizeOfBtreeNewroot (offsetof(xl_btree_newroot, level) + sizeof(uint32))
/*
* prototypes for functions in nbtxlog.c
*/
extern void btree_redo(XLogReaderState *record);
extern void btree_desc(StringInfo buf, XLogReaderState *record);
extern const char *btree_identify(uint8 info);
extern void btree_mask(char *pagedata, BlockNumber blkno);
#endif /* NBTXLOG_H */
|