summary | refs | log | tree | commit | diff
path: root/src/backend/access
diff options
context:
space:
mode:
author: Thomas Munro <tmunro@postgresql.org> 2023-04-08 10:38:09 +1200
committer: Thomas Munro <tmunro@postgresql.org> 2023-04-08 16:34:50 +1200
commit: faeedbcefd40bfdf314e048c425b6d9208896d90 (patch)
tree: d6bc53f2196b37e0ce2a408ab44a734382e485d5 /src/backend/access
parent: d73c285af5c29a0b486643b77350bc23fbb6114c (diff)
download: postgresql-faeedbcefd40bfdf314e048c425b6d9208896d90.tar.gz
Introduce PG_IO_ALIGN_SIZE and align all I/O buffers.
In order to have the option to use O_DIRECT/FILE_FLAG_NO_BUFFERING in a later commit, we need the addresses of user space buffers to be well aligned. The exact requirements vary by OS and file system (typically sectors and/or memory pages). The address alignment size is set to 4096, which is enough for currently known systems: it matches modern sectors and common memory page size. There is no standard governing O_DIRECT's requirements so we might eventually have to reconsider this with more information from the field or future systems. Aligning I/O buffers on memory pages is also known to improve regular buffered I/O performance.

Three classes of I/O buffers for regular data pages are adjusted: (1) Heap buffers are now allocated with the new palloc_aligned() or MemoryContextAllocAligned() functions introduced by commit 439f6175. (2) Stack buffers now use a new struct PGIOAlignedBlock to respect PG_IO_ALIGN_SIZE, if possible with this compiler. (3) The buffer pool is also aligned in shared memory.

WAL buffers were already aligned on XLOG_BLCKSZ. It's possible for XLOG_BLCKSZ to be configured smaller than PG_IO_ALIGN_SIZE and thus for O_DIRECT WAL writes to fail to be well aligned, but that's a pre-existing condition and will be addressed by a later commit.

BufFiles are not yet addressed (there's no current plan to use O_DIRECT for those, but they could potentially get some incidental speedup even in plain buffered I/O operations through better alignment).

If we can't align stack objects suitably using the compiler extensions we know about, we disable the use of O_DIRECT by setting PG_O_DIRECT to 0. This avoids the need to consider systems that have O_DIRECT but can't align stack objects the way we want; such systems could in theory be supported with more work but we don't currently know of any such machines, so it's easier to pretend there is no O_DIRECT support instead. That's an existing and tested class of system.

Add assertions that all buffers passed into smgrread(), smgrwrite() and smgrextend() are correctly aligned, unless PG_O_DIRECT is 0 (= stack alignment tricks may be unavailable) or the block size has been set too small to allow arrays of buffers to be all aligned.

Author: Thomas Munro <thomas.munro@gmail.com>
Author: Andres Freund <andres@anarazel.de>
Reviewed-by: Justin Pryzby <pryzby@telsasoft.com>
Discussion: https://postgr.es/m/CA+hUKGK1X532hYqJ_MzFWt0n1zt8trz980D79WbjwnT-yYLZpg@mail.gmail.com
Diffstat (limited to 'src/backend/access')
-rw-r--r-- src/backend/access/gist/gistbuild.c | 9
-rw-r--r-- src/backend/access/hash/hashpage.c | 2
-rw-r--r-- src/backend/access/heap/rewriteheap.c | 2
-rw-r--r-- src/backend/access/nbtree/nbtree.c | 2
-rw-r--r-- src/backend/access/nbtree/nbtsort.c | 8
-rw-r--r-- src/backend/access/spgist/spginsert.c | 2
-rw-r--r-- src/backend/access/transam/generic_xlog.c | 13
-rw-r--r-- src/backend/access/transam/xlog.c | 2
8 files changed, 24 insertions, 16 deletions
diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c
index d2f8da5b02..5e0c1447f9 100644
--- a/src/backend/access/gist/gistbuild.c
+++ b/src/backend/access/gist/gistbuild.c
@@ -415,7 +415,7 @@ gist_indexsortbuild(GISTBuildState *state)
* Write an empty page as a placeholder for the root page. It will be
* replaced with the real root page at the end.
*/
- page = palloc0(BLCKSZ);
+ page = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, MCXT_ALLOC_ZERO);
smgrextend(RelationGetSmgr(state->indexrel), MAIN_FORKNUM, GIST_ROOT_BLKNO,
page, true);
state->pages_allocated++;
@@ -509,7 +509,8 @@ gist_indexsortbuild_levelstate_add(GISTBuildState *state,
levelstate->current_page++;
if (levelstate->pages[levelstate->current_page] == NULL)
- levelstate->pages[levelstate->current_page] = palloc(BLCKSZ);
+ levelstate->pages[levelstate->current_page] =
+ palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
newPage = levelstate->pages[levelstate->current_page];
gistinitpage(newPage, old_page_flags);
@@ -579,7 +580,7 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state,
/* Create page and copy data */
data = (char *) (dist->list);
- target = palloc0(BLCKSZ);
+ target = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, MCXT_ALLOC_ZERO);
gistinitpage(target, isleaf ? F_LEAF : 0);
for (int i = 0; i < dist->block.num; i++)
{
@@ -630,7 +631,7 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state,
if (parent == NULL)
{
parent = palloc0(sizeof(GistSortedBuildLevelState));
- parent->pages[0] = (Page) palloc(BLCKSZ);
+ parent->pages[0] = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
parent->parent = NULL;
gistinitpage(parent->pages[0], 0);
diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c
index 6d8af42260..af3a154266 100644
--- a/src/backend/access/hash/hashpage.c
+++ b/src/backend/access/hash/hashpage.c
@@ -992,7 +992,7 @@ static bool
_hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks)
{
BlockNumber lastblock;
- PGAlignedBlock zerobuf;
+ PGIOAlignedBlock zerobuf;
Page page;
HashPageOpaque ovflopaque;
diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c
index ae0282a70e..424958912c 100644
--- a/src/backend/access/heap/rewriteheap.c
+++ b/src/backend/access/heap/rewriteheap.c
@@ -255,7 +255,7 @@ begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xm
state->rs_old_rel = old_heap;
state->rs_new_rel = new_heap;
- state->rs_buffer = (Page) palloc(BLCKSZ);
+ state->rs_buffer = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
/* new_heap needn't be empty, just locked */
state->rs_blockno = RelationGetNumberOfBlocks(new_heap);
state->rs_buffer_valid = false;
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 992f84834f..2df8849858 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -154,7 +154,7 @@ btbuildempty(Relation index)
Page metapage;
/* Construct metapage. */
- metapage = (Page) palloc(BLCKSZ);
+ metapage = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
_bt_initmetapage(metapage, P_NONE, 0, _bt_allequalimage(index, false));
/*
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index 1207a49689..6ad3f3c54d 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -619,7 +619,7 @@ _bt_blnewpage(uint32 level)
Page page;
BTPageOpaque opaque;
- page = (Page) palloc(BLCKSZ);
+ page = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
/* Zero the page and set up standard page header info */
_bt_pageinit(page, BLCKSZ);
@@ -660,7 +660,9 @@ _bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno)
while (blkno > wstate->btws_pages_written)
{
if (!wstate->btws_zeropage)
- wstate->btws_zeropage = (Page) palloc0(BLCKSZ);
+ wstate->btws_zeropage = (Page) palloc_aligned(BLCKSZ,
+ PG_IO_ALIGN_SIZE,
+ MCXT_ALLOC_ZERO);
/* don't set checksum for all-zero page */
smgrextend(RelationGetSmgr(wstate->index), MAIN_FORKNUM,
wstate->btws_pages_written++,
@@ -1170,7 +1172,7 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
* set to point to "P_NONE"). This changes the index to the "valid" state
* by filling in a valid magic number in the metapage.
*/
- metapage = (Page) palloc(BLCKSZ);
+ metapage = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
_bt_initmetapage(metapage, rootblkno, rootlevel,
wstate->inskey->allequalimage);
_bt_blwritepage(wstate, metapage, BTREE_METAPAGE);
diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c
index 718a88335d..72d2e1551c 100644
--- a/src/backend/access/spgist/spginsert.c
+++ b/src/backend/access/spgist/spginsert.c
@@ -158,7 +158,7 @@ spgbuildempty(Relation index)
Page page;
/* Construct metapage. */
- page = (Page) palloc(BLCKSZ);
+ page = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
SpGistInitMetapage(page);
/*
diff --git a/src/backend/access/transam/generic_xlog.c b/src/backend/access/transam/generic_xlog.c
index 9f67d1c1cd..6c68191ca6 100644
--- a/src/backend/access/transam/generic_xlog.c
+++ b/src/backend/access/transam/generic_xlog.c
@@ -58,14 +58,17 @@ typedef struct
char delta[MAX_DELTA_SIZE]; /* delta between page images */
} PageData;
-/* State of generic xlog record construction */
+/*
+ * State of generic xlog record construction. Must be allocated at an I/O
+ * aligned address.
+ */
struct GenericXLogState
{
+ /* Page images (properly aligned, must be first) */
+ PGIOAlignedBlock images[MAX_GENERIC_XLOG_PAGES];
/* Info about each page, see above */
PageData pages[MAX_GENERIC_XLOG_PAGES];
bool isLogged;
- /* Page images (properly aligned) */
- PGAlignedBlock images[MAX_GENERIC_XLOG_PAGES];
};
static void writeFragment(PageData *pageData, OffsetNumber offset,
@@ -269,7 +272,9 @@ GenericXLogStart(Relation relation)
GenericXLogState *state;
int i;
- state = (GenericXLogState *) palloc(sizeof(GenericXLogState));
+ state = (GenericXLogState *) palloc_aligned(sizeof(GenericXLogState),
+ PG_IO_ALIGN_SIZE,
+ 0);
state->isLogged = RelationNeedsWAL(relation);
for (i = 0; i < MAX_GENERIC_XLOG_PAGES; i++)
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 46821ad605..a5c74fdab8 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -4506,7 +4506,7 @@ XLOGShmemSize(void)
/* xlblocks array */
size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers));
/* extra alignment padding for XLOG I/O buffers */
- size = add_size(size, XLOG_BLCKSZ);
+ size = add_size(size, Max(XLOG_BLCKSZ, PG_IO_ALIGN_SIZE));
/* and the buffers themselves */
size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));