summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNoah Misch <noah@leadboat.com>2020-03-22 09:24:09 -0700
committerNoah Misch <noah@leadboat.com>2020-03-22 09:24:09 -0700
commitde9396326edcbe5cafc06a72016f9d715c350e0e (patch)
tree39975a93da0cc43535fa87ad066d54a3f8867678
parentd0587f52b3bb898db3c0011954de6ae9adc076c8 (diff)
downloadpostgresql-de9396326edcbe5cafc06a72016f9d715c350e0e.tar.gz
Revert "Skip WAL for new relfilenodes, under wal_level=minimal."
This reverts commit cb2fd7eac285b1b0a24eeb2b8ed4456b66c5a09f. Per numerous buildfarm members, it was incompatible with parallel query, and a test case assumed LP64. Back-patch to 9.5 (all supported versions). Discussion: https://postgr.es/m/20200321224920.GB1763544@rfd.leadboat.com
-rw-r--r--contrib/pg_visibility/expected/pg_visibility.out35
-rw-r--r--contrib/pg_visibility/sql/pg_visibility.sql19
-rw-r--r--doc/src/sgml/config.sgml39
-rw-r--r--doc/src/sgml/perform.sgml47
-rw-r--r--src/backend/access/gist/gistutil.c31
-rw-r--r--src/backend/access/gist/gistxlog.c21
-rw-r--r--src/backend/access/heap/heapam.c45
-rw-r--r--src/backend/access/heap/heapam_handler.c22
-rw-r--r--src/backend/access/heap/rewriteheap.c21
-rw-r--r--src/backend/access/nbtree/nbtsort.c41
-rw-r--r--src/backend/access/rmgrdesc/gistdesc.c6
-rw-r--r--src/backend/access/transam/README45
-rw-r--r--src/backend/access/transam/xact.c15
-rw-r--r--src/backend/access/transam/xlogutils.c18
-rw-r--r--src/backend/bootstrap/bootparse.y4
-rw-r--r--src/backend/catalog/storage.c246
-rw-r--r--src/backend/commands/cluster.c19
-rw-r--r--src/backend/commands/copy.c58
-rw-r--r--src/backend/commands/createas.c11
-rw-r--r--src/backend/commands/indexcmds.c2
-rw-r--r--src/backend/commands/matview.c12
-rw-r--r--src/backend/commands/tablecmds.c26
-rw-r--r--src/backend/nodes/copyfuncs.c2
-rw-r--r--src/backend/nodes/equalfuncs.c2
-rw-r--r--src/backend/nodes/outfuncs.c2
-rw-r--r--src/backend/parser/gram.y4
-rw-r--r--src/backend/parser/parse_utilcmd.c4
-rw-r--r--src/backend/storage/buffer/bufmgr.c125
-rw-r--r--src/backend/storage/lmgr/lock.c12
-rw-r--r--src/backend/storage/smgr/md.c36
-rw-r--r--src/backend/storage/smgr/smgr.c35
-rw-r--r--src/backend/utils/cache/relcache.c268
-rw-r--r--src/backend/utils/misc/guc.c12
-rw-r--r--src/backend/utils/misc/postgresql.conf.sample1
-rw-r--r--src/include/access/gist_private.h2
-rw-r--r--src/include/access/gistxlog.h1
-rw-r--r--src/include/access/heapam.h3
-rw-r--r--src/include/access/rewriteheap.h2
-rw-r--r--src/include/access/tableam.h15
-rw-r--r--src/include/catalog/storage.h6
-rw-r--r--src/include/nodes/parsenodes.h3
-rw-r--r--src/include/storage/bufmgr.h4
-rw-r--r--src/include/storage/lock.h3
-rw-r--r--src/include/storage/smgr.h1
-rw-r--r--src/include/utils/rel.h55
-rw-r--r--src/include/utils/relcache.h8
-rw-r--r--src/test/recovery/t/018_wal_optimize.pl372
-rw-r--r--src/test/regress/expected/alter_table.out6
-rw-r--r--src/test/regress/expected/create_table.out13
-rw-r--r--src/test/regress/sql/alter_table.sql7
-rw-r--r--src/test/regress/sql/create_table.sql15
51 files changed, 363 insertions, 1439 deletions
diff --git a/contrib/pg_visibility/expected/pg_visibility.out b/contrib/pg_visibility/expected/pg_visibility.out
index 2abc1b5107..f0dcb897c4 100644
--- a/contrib/pg_visibility/expected/pg_visibility.out
+++ b/contrib/pg_visibility/expected/pg_visibility.out
@@ -1,40 +1,5 @@
CREATE EXTENSION pg_visibility;
--
--- recently-dropped table
---
-\set VERBOSITY sqlstate
-BEGIN;
-CREATE TABLE droppedtest (c int);
-SELECT 'droppedtest'::regclass::oid AS oid \gset
-SAVEPOINT q; DROP TABLE droppedtest; RELEASE q;
-SAVEPOINT q; SELECT * FROM pg_visibility_map(:oid); ROLLBACK TO q;
-ERROR: XX000
--- ERROR: could not open relation with OID 16xxx
-SAVEPOINT q; SELECT 1; ROLLBACK TO q;
- ?column?
-----------
- 1
-(1 row)
-
-SAVEPOINT q; SELECT 1; ROLLBACK TO q;
- ?column?
-----------
- 1
-(1 row)
-
-SELECT pg_relation_size(:oid), pg_relation_filepath(:oid),
- has_table_privilege(:oid, 'SELECT');
- pg_relation_size | pg_relation_filepath | has_table_privilege
-------------------+----------------------+---------------------
- | |
-(1 row)
-
-SELECT * FROM pg_visibility_map(:oid);
-ERROR: XX000
--- ERROR: could not open relation with OID 16xxx
-ROLLBACK;
-\set VERBOSITY default
---
-- check that using the module's functions with unsupported relations will fail
--
-- partitioned tables (the parent ones) don't have visibility maps
diff --git a/contrib/pg_visibility/sql/pg_visibility.sql b/contrib/pg_visibility/sql/pg_visibility.sql
index c78b90521b..c2a7f1d9e4 100644
--- a/contrib/pg_visibility/sql/pg_visibility.sql
+++ b/contrib/pg_visibility/sql/pg_visibility.sql
@@ -1,25 +1,6 @@
CREATE EXTENSION pg_visibility;
--
--- recently-dropped table
---
-\set VERBOSITY sqlstate
-BEGIN;
-CREATE TABLE droppedtest (c int);
-SELECT 'droppedtest'::regclass::oid AS oid \gset
-SAVEPOINT q; DROP TABLE droppedtest; RELEASE q;
-SAVEPOINT q; SELECT * FROM pg_visibility_map(:oid); ROLLBACK TO q;
--- ERROR: could not open relation with OID 16xxx
-SAVEPOINT q; SELECT 1; ROLLBACK TO q;
-SAVEPOINT q; SELECT 1; ROLLBACK TO q;
-SELECT pg_relation_size(:oid), pg_relation_filepath(:oid),
- has_table_privilege(:oid, 'SELECT');
-SELECT * FROM pg_visibility_map(:oid);
--- ERROR: could not open relation with OID 16xxx
-ROLLBACK;
-\set VERBOSITY default
-
---
-- check that using the module's functions with unsupported relations will fail
--
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 9cc5281f01..70854ae298 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -2501,19 +2501,16 @@ include_dir 'conf.d'
levels. This parameter can only be set at server start.
</para>
<para>
- In <literal>minimal</literal> level, no information is logged for
- permanent relations for the remainder of a transaction that creates or
- rewrites them. This can make operations much faster (see
- <xref linkend="populate-pitr"/>). Operations that initiate this
- optimization include:
+ In <literal>minimal</literal> level, WAL-logging of some bulk
+ operations can be safely skipped, which can make those
+ operations much faster (see <xref linkend="populate-pitr"/>).
+ Operations in which this optimization can be applied include:
<simplelist>
- <member><command>ALTER ... SET TABLESPACE</command></member>
+ <member><command>CREATE TABLE AS</command></member>
+ <member><command>CREATE INDEX</command></member>
<member><command>CLUSTER</command></member>
- <member><command>CREATE TABLE</command></member>
- <member><command>REFRESH MATERIALIZED VIEW</command>
- (without <option>CONCURRENTLY</option>)</member>
- <member><command>REINDEX</command></member>
- <member><command>TRUNCATE</command></member>
+ <member><command>COPY</command> into tables that were created or truncated in the same
+ transaction</member>
</simplelist>
But minimal WAL does not contain enough information to reconstruct the
data from a base backup and the WAL logs, so <literal>replica</literal> or
@@ -2910,26 +2907,6 @@ include_dir 'conf.d'
</listitem>
</varlistentry>
- <varlistentry id="guc-wal-skip-threshold" xreflabel="wal_skip_threshold">
- <term><varname>wal_skip_threshold</varname> (<type>integer</type>)
- <indexterm>
- <primary><varname>wal_skip_threshold</varname> configuration parameter</primary>
- </indexterm>
- </term>
- <listitem>
- <para>
- When <varname>wal_level</varname> is <literal>minimal</literal> and a
- transaction commits after creating or rewriting a permanent relation,
- this setting determines how to persist the new data. If the data is
- smaller than this setting, write it to the WAL log; otherwise, use an
- fsync of affected files. Depending on the properties of your storage,
- raising or lowering this value might help if such commits are slowing
- concurrent transactions. The default is two megabytes
- (<literal>2MB</literal>).
- </para>
- </listitem>
- </varlistentry>
-
<varlistentry id="guc-commit-delay" xreflabel="commit_delay">
<term><varname>commit_delay</varname> (<type>integer</type>)
<indexterm>
diff --git a/doc/src/sgml/perform.sgml b/doc/src/sgml/perform.sgml
index 58477ac83a..ab090441cf 100644
--- a/doc/src/sgml/perform.sgml
+++ b/doc/src/sgml/perform.sgml
@@ -1607,8 +1607,8 @@ SELECT * FROM x, y, a, b, c WHERE something AND somethingelse;
needs to be written, because in case of an error, the files
containing the newly loaded data will be removed anyway.
However, this consideration only applies when
- <xref linkend="guc-wal-level"/> is <literal>minimal</literal>
- as all commands must write WAL otherwise.
+ <xref linkend="guc-wal-level"/> is <literal>minimal</literal> for
+ non-partitioned tables as all commands must write WAL otherwise.
</para>
</sect2>
@@ -1708,13 +1708,42 @@ SELECT * FROM x, y, a, b, c WHERE something AND somethingelse;
</para>
<para>
- Aside from avoiding the time for the archiver or WAL sender to process the
- WAL data, doing this will actually make certain commands faster, because
- they do not to write WAL at all if <varname>wal_level</varname>
- is <literal>minimal</literal> and the current subtransaction (or top-level
- transaction) created or truncated the table or index they change. (They
- can guarantee crash safety more cheaply by doing
- an <function>fsync</function> at the end than by writing WAL.)
+ Aside from avoiding the time for the archiver or WAL sender to
+ process the WAL data,
+ doing this will actually make certain commands faster, because they
+ are designed not to write WAL at all if <varname>wal_level</varname>
+ is <literal>minimal</literal>. (They can guarantee crash safety more cheaply
+ by doing an <function>fsync</function> at the end than by writing WAL.)
+ This applies to the following commands:
+ <itemizedlist>
+ <listitem>
+ <para>
+ <command>CREATE TABLE AS SELECT</command>
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ <command>CREATE INDEX</command> (and variants such as
+ <command>ALTER TABLE ADD PRIMARY KEY</command>)
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ <command>ALTER TABLE SET TABLESPACE</command>
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ <command>CLUSTER</command>
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ <command>COPY FROM</command>, when the target table has been
+ created or truncated earlier in the same transaction
+ </para>
+ </listitem>
+ </itemizedlist>
</para>
</sect2>
diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c
index 765329bbcd..dd975b164c 100644
--- a/src/backend/access/gist/gistutil.c
+++ b/src/backend/access/gist/gistutil.c
@@ -1004,44 +1004,23 @@ gistproperty(Oid index_oid, int attno,
}
/*
- * Some indexes are not WAL-logged, but we need LSNs to detect concurrent page
- * splits anyway. This function provides a fake sequence of LSNs for that
- * purpose.
+ * Temporary and unlogged GiST indexes are not WAL-logged, but we need LSNs
+ * to detect concurrent page splits anyway. This function provides a fake
+ * sequence of LSNs for that purpose.
*/
XLogRecPtr
gistGetFakeLSN(Relation rel)
{
+ static XLogRecPtr counter = FirstNormalUnloggedLSN;
+
if (rel->rd_rel->relpersistence == RELPERSISTENCE_TEMP)
{
/*
* Temporary relations are only accessible in our session, so a simple
* backend-local counter will do.
*/
- static XLogRecPtr counter = FirstNormalUnloggedLSN;
-
return counter++;
}
- else if (rel->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT)
- {
- /*
- * WAL-logging on this relation will start after commit, so its LSNs
- * must be distinct numbers smaller than the LSN at the next commit.
- * Emit a dummy WAL record if insert-LSN hasn't advanced after the
- * last call.
- */
- static XLogRecPtr lastlsn = InvalidXLogRecPtr;
- XLogRecPtr currlsn = GetXLogInsertRecPtr();
-
- /* Shouldn't be called for WAL-logging relations */
- Assert(!RelationNeedsWAL(rel));
-
- /* No need for an actual record if we already have a distinct LSN */
- if (!XLogRecPtrIsInvalid(lastlsn) && lastlsn == currlsn)
- currlsn = gistXLogAssignLSN();
-
- lastlsn = currlsn;
- return currlsn;
- }
else
{
/*
diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c
index b60dba052f..d3f3a7b803 100644
--- a/src/backend/access/gist/gistxlog.c
+++ b/src/backend/access/gist/gistxlog.c
@@ -449,9 +449,6 @@ gist_redo(XLogReaderState *record)
case XLOG_GIST_PAGE_DELETE:
gistRedoPageDelete(record);
break;
- case XLOG_GIST_ASSIGN_LSN:
- /* nop. See gistGetFakeLSN(). */
- break;
default:
elog(PANIC, "gist_redo: unknown op code %u", info);
}
@@ -596,24 +593,6 @@ gistXLogPageDelete(Buffer buffer, FullTransactionId xid,
}
/*
- * Write an empty XLOG record to assign a distinct LSN.
- */
-XLogRecPtr
-gistXLogAssignLSN(void)
-{
- int dummy = 0;
-
- /*
- * Records other than SWITCH_WAL must have content. We use an integer 0 to
- * follow the restriction.
- */
- XLogBeginInsert();
- XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
- XLogRegisterData((char *) &dummy, sizeof(dummy));
- return XLogInsert(RM_GIST_ID, XLOG_GIST_ASSIGN_LSN);
-}
-
-/*
* Write XLOG record about reuse of a deleted page.
*/
void
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index a25d539ec4..29694b8aa4 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -21,6 +21,7 @@
* heap_multi_insert - insert multiple tuples into a relation
* heap_delete - delete a tuple from a relation
* heap_update - replace a tuple in a relation with another tuple
+ * heap_sync - sync heap, for when no WAL has been written
*
* NOTES
* This file contains the heap_ routines which implement
@@ -1938,7 +1939,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid,
MarkBufferDirty(buffer);
/* XLOG stuff */
- if (RelationNeedsWAL(relation))
+ if (!(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation))
{
xl_heap_insert xlrec;
xl_heap_header xlhdr;
@@ -2121,7 +2122,7 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples,
/* currently not needed (thus unsupported) for heap_multi_insert() */
AssertArg(!(options & HEAP_INSERT_NO_LOGICAL));
- needwal = RelationNeedsWAL(relation);
+ needwal = !(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation);
saveFreeSpace = RelationGetTargetPageFreeSpace(relation,
HEAP_DEFAULT_FILLFACTOR);
@@ -8920,6 +8921,46 @@ heap2_redo(XLogReaderState *record)
}
/*
+ * heap_sync - sync a heap, for use when no WAL has been written
+ *
+ * This forces the heap contents (including TOAST heap if any) down to disk.
+ * If we skipped using WAL, and WAL is otherwise needed, we must force the
+ * relation down to disk before it's safe to commit the transaction. This
+ * requires writing out any dirty buffers and then doing a forced fsync.
+ *
+ * Indexes are not touched. (Currently, index operations associated with
+ * the commands that use this are WAL-logged and so do not need fsync.
+ * That behavior might change someday, but in any case it's likely that
+ * any fsync decisions required would be per-index and hence not appropriate
+ * to be done here.)
+ */
+void
+heap_sync(Relation rel)
+{
+ /* non-WAL-logged tables never need fsync */
+ if (!RelationNeedsWAL(rel))
+ return;
+
+ /* main heap */
+ FlushRelationBuffers(rel);
+ /* FlushRelationBuffers will have opened rd_smgr */
+ smgrimmedsync(rel->rd_smgr, MAIN_FORKNUM);
+
+ /* FSM is not critical, don't bother syncing it */
+
+ /* toast heap, if any */
+ if (OidIsValid(rel->rd_rel->reltoastrelid))
+ {
+ Relation toastrel;
+
+ toastrel = table_open(rel->rd_rel->reltoastrelid, AccessShareLock);
+ FlushRelationBuffers(toastrel);
+ smgrimmedsync(toastrel->rd_smgr, MAIN_FORKNUM);
+ table_close(toastrel, AccessShareLock);
+ }
+}
+
+/*
* Mask a heap page before performing consistency checks on it.
*/
void
diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index 56b35622f1..ca52846b97 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -555,6 +555,17 @@ tuple_lock_retry:
return result;
}
+static void
+heapam_finish_bulk_insert(Relation relation, int options)
+{
+ /*
+ * If we skipped writing WAL, then we need to sync the heap (but not
+ * indexes since those use WAL anyway / don't go through tableam)
+ */
+ if (options & HEAP_INSERT_SKIP_WAL)
+ heap_sync(relation);
+}
+
/* ------------------------------------------------------------------------
* DDL related callbacks for heap AM.
@@ -687,6 +698,7 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
IndexScanDesc indexScan;
TableScanDesc tableScan;
HeapScanDesc heapScan;
+ bool use_wal;
bool is_system_catalog;
Tuplesortstate *tuplesort;
TupleDesc oldTupDesc = RelationGetDescr(OldHeap);
@@ -701,9 +713,12 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
is_system_catalog = IsSystemRelation(OldHeap);
/*
- * Valid smgr_targblock implies something already wrote to the relation.
- * This may be harmless, but this function hasn't planned for it.
+ * We need to log the copied data in WAL iff WAL archiving/streaming is
+ * enabled AND it's a WAL-logged rel.
*/
+ use_wal = XLogIsNeeded() && RelationNeedsWAL(NewHeap);
+
+ /* use_wal off requires smgr_targblock be initially invalid */
Assert(RelationGetTargetBlock(NewHeap) == InvalidBlockNumber);
/* Preallocate values/isnull arrays */
@@ -713,7 +728,7 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
/* Initialize the rewrite operation */
rwstate = begin_heap_rewrite(OldHeap, NewHeap, OldestXmin, *xid_cutoff,
- *multi_cutoff);
+ *multi_cutoff, use_wal);
/* Set up sorting if wanted */
@@ -2510,6 +2525,7 @@ static const TableAmRoutine heapam_methods = {
.tuple_delete = heapam_tuple_delete,
.tuple_update = heapam_tuple_update,
.tuple_lock = heapam_tuple_lock,
+ .finish_bulk_insert = heapam_finish_bulk_insert,
.tuple_fetch_row_version = heapam_fetch_row_version,
.tuple_get_latest_tid = heap_get_latest_tid,
diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c
index 39e33763df..9c29bc0e0f 100644
--- a/src/backend/access/heap/rewriteheap.c
+++ b/src/backend/access/heap/rewriteheap.c
@@ -136,6 +136,7 @@ typedef struct RewriteStateData
Page rs_buffer; /* page currently being built */
BlockNumber rs_blockno; /* block where page will go */
bool rs_buffer_valid; /* T if any tuples in buffer */
+ bool rs_use_wal; /* must we WAL-log inserts? */
bool rs_logical_rewrite; /* do we need to do logical rewriting */
TransactionId rs_oldest_xmin; /* oldest xmin used by caller to determine
* tuple visibility */
@@ -229,13 +230,15 @@ static void logical_end_heap_rewrite(RewriteState state);
* oldest_xmin xid used by the caller to determine which tuples are dead
* freeze_xid xid before which tuples will be frozen
* cutoff_multi multixact before which multis will be removed
+ * use_wal should the inserts to the new heap be WAL-logged?
*
* Returns an opaque RewriteState, allocated in current memory context,
* to be used in subsequent calls to the other functions.
*/
RewriteState
begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xmin,
- TransactionId freeze_xid, MultiXactId cutoff_multi)
+ TransactionId freeze_xid, MultiXactId cutoff_multi,
+ bool use_wal)
{
RewriteState state;
MemoryContext rw_cxt;
@@ -260,6 +263,7 @@ begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xm
/* new_heap needn't be empty, just locked */
state->rs_blockno = RelationGetNumberOfBlocks(new_heap);
state->rs_buffer_valid = false;
+ state->rs_use_wal = use_wal;
state->rs_oldest_xmin = oldest_xmin;
state->rs_freeze_xid = freeze_xid;
state->rs_cutoff_multi = cutoff_multi;
@@ -318,7 +322,7 @@ end_heap_rewrite(RewriteState state)
/* Write the last page, if any */
if (state->rs_buffer_valid)
{
- if (RelationNeedsWAL(state->rs_new_rel))
+ if (state->rs_use_wal)
log_newpage(&state->rs_new_rel->rd_node,
MAIN_FORKNUM,
state->rs_blockno,
@@ -333,14 +337,18 @@ end_heap_rewrite(RewriteState state)
}
/*
- * When we WAL-logged rel pages, we must nonetheless fsync them. The
+ * If the rel is WAL-logged, must fsync before commit. We use heap_sync
+ * to ensure that the toast table gets fsync'd too.
+ *
+ * It's obvious that we must do this when not WAL-logging. It's less
+ * obvious that we have to do it even if we did WAL-log the pages. The
* reason is the same as in storage.c's RelationCopyStorage(): we're
* writing data that's not in shared buffers, and so a CHECKPOINT
* occurring during the rewriteheap operation won't have fsync'd data we
* wrote before the checkpoint.
*/
if (RelationNeedsWAL(state->rs_new_rel))
- smgrimmedsync(state->rs_new_rel->rd_smgr, MAIN_FORKNUM);
+ heap_sync(state->rs_new_rel);
logical_end_heap_rewrite(state);
@@ -638,6 +646,9 @@ raw_heap_insert(RewriteState state, HeapTuple tup)
{
int options = HEAP_INSERT_SKIP_FSM;
+ if (!state->rs_use_wal)
+ options |= HEAP_INSERT_SKIP_WAL;
+
/*
* While rewriting the heap for VACUUM FULL / CLUSTER, make sure data
* for the TOAST table are not logically decoded. The main heap is
@@ -676,7 +687,7 @@ raw_heap_insert(RewriteState state, HeapTuple tup)
/* Doesn't fit, so write out the existing page */
/* XLOG stuff */
- if (RelationNeedsWAL(state->rs_new_rel))
+ if (state->rs_use_wal)
log_newpage(&state->rs_new_rel->rd_node,
MAIN_FORKNUM,
state->rs_blockno,
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index 9111e2789c..e66cd36dfa 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -31,6 +31,18 @@
* them. They will need to be re-read into shared buffers on first use after
* the build finishes.
*
+ * Since the index will never be used unless it is completely built,
+ * from a crash-recovery point of view there is no need to WAL-log the
+ * steps of the build. After completing the index build, we can just sync
+ * the whole file to disk using smgrimmedsync() before exiting this module.
+ * This can be seen to be sufficient for crash recovery by considering that
+ * it's effectively equivalent to what would happen if a CHECKPOINT occurred
+ * just after the index build. However, it is clearly not sufficient if the
+ * DBA is using the WAL log for PITR or replication purposes, since another
+ * machine would not be able to reconstruct the index from WAL. Therefore,
+ * we log the completed index pages to WAL if and only if WAL archiving is
+ * active.
+ *
* This code isn't concerned about the FSM at all. The caller is responsible
* for initializing that.
*
@@ -557,7 +569,12 @@ _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
wstate.inskey = _bt_mkscankey(wstate.index, NULL);
/* _bt_mkscankey() won't set allequalimage without metapage */
wstate.inskey->allequalimage = _bt_allequalimage(wstate.index, true);
- wstate.btws_use_wal = RelationNeedsWAL(wstate.index);
+
+ /*
+ * We need to log index creation in WAL iff WAL archiving/streaming is
+ * enabled UNLESS the index isn't WAL-logged anyway.
+ */
+ wstate.btws_use_wal = XLogIsNeeded() && RelationNeedsWAL(wstate.index);
/* reserve the metapage */
wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
@@ -1407,15 +1424,21 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
_bt_uppershutdown(wstate, state);
/*
- * When we WAL-logged index pages, we must nonetheless fsync index files.
- * Since we're building outside shared buffers, a CHECKPOINT occurring
- * during the build has no way to flush the previously written data to
- * disk (indeed it won't know the index even exists). A crash later on
- * would replay WAL from the checkpoint, therefore it wouldn't replay our
- * earlier WAL entries. If we do not fsync those pages here, they might
- * still not be on disk when the crash occurs.
+ * If the index is WAL-logged, we must fsync it down to disk before it's
+ * safe to commit the transaction. (For a non-WAL-logged index we don't
+ * care since the index will be uninteresting after a crash anyway.)
+ *
+ * It's obvious that we must do this when not WAL-logging the build. It's
+ * less obvious that we have to do it even if we did WAL-log the index
+ * pages. The reason is that since we're building outside shared buffers,
+ * a CHECKPOINT occurring during the build has no way to flush the
+ * previously written data to disk (indeed it won't know the index even
+ * exists). A crash later on would replay WAL from the checkpoint,
+ * therefore it wouldn't replay our earlier WAL entries. If we do not
+ * fsync those pages here, they might still not be on disk when the crash
+ * occurs.
*/
- if (wstate->btws_use_wal)
+ if (RelationNeedsWAL(wstate->index))
{
RelationOpenSmgr(wstate->index);
smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM);
diff --git a/src/backend/access/rmgrdesc/gistdesc.c b/src/backend/access/rmgrdesc/gistdesc.c
index de309fb122..3377367e12 100644
--- a/src/backend/access/rmgrdesc/gistdesc.c
+++ b/src/backend/access/rmgrdesc/gistdesc.c
@@ -80,9 +80,6 @@ gist_desc(StringInfo buf, XLogReaderState *record)
case XLOG_GIST_PAGE_DELETE:
out_gistxlogPageDelete(buf, (gistxlogPageDelete *) rec);
break;
- case XLOG_GIST_ASSIGN_LSN:
- /* No details to write out */
- break;
}
}
@@ -108,9 +105,6 @@ gist_identify(uint8 info)
case XLOG_GIST_PAGE_DELETE:
id = "PAGE_DELETE";
break;
- case XLOG_GIST_ASSIGN_LSN:
- id = "ASSIGN_LSN";
- break;
}
return id;
diff --git a/src/backend/access/transam/README b/src/backend/access/transam/README
index eb9aac5fd3..b5a2cb2de8 100644
--- a/src/backend/access/transam/README
+++ b/src/backend/access/transam/README
@@ -717,38 +717,6 @@ then restart recovery. This is part of the reason for not writing a WAL
entry until we've successfully done the original action.
-Skipping WAL for New RelFileNode
---------------------------------
-
-Under wal_level=minimal, if a change modifies a relfilenode that ROLLBACK
-would unlink, in-tree access methods write no WAL for that change. Code that
-writes WAL without calling RelationNeedsWAL() must check for this case. This
-skipping is mandatory. If a WAL-writing change preceded a WAL-skipping change
-for the same block, REDO could overwrite the WAL-skipping change. If a
-WAL-writing change followed a WAL-skipping change for the same block, a
-related problem would arise. When a WAL record contains no full-page image,
-REDO expects the page to match its contents from just before record insertion.
-A WAL-skipping change may not reach disk at all, violating REDO's expectation
-under full_page_writes=off. For any access method, CommitTransaction() writes
-and fsyncs affected blocks before recording the commit.
-
-Prefer to do the same in future access methods. However, two other approaches
-can work. First, an access method can irreversibly transition a given fork
-from WAL-skipping to WAL-writing by calling FlushRelationBuffers() and
-smgrimmedsync(). Second, an access method can opt to write WAL
-unconditionally for permanent relations. Under these approaches, the access
-method callbacks must not call functions that react to RelationNeedsWAL().
-
-This applies only to WAL records whose replay would modify bytes stored in the
-new relfilenode. It does not apply to other records about the relfilenode,
-such as XLOG_SMGR_CREATE. Because it operates at the level of individual
-relfilenodes, RelationNeedsWAL() can differ for tightly-coupled relations.
-Consider "CREATE TABLE t (); BEGIN; ALTER TABLE t ADD c text; ..." in which
-ALTER TABLE adds a TOAST relation. The TOAST relation will skip WAL, while
-the table owning it will not. ALTER TABLE SET TABLESPACE will cause a table
-to skip WAL, but that won't affect its indexes.
-
-
Asynchronous Commit
-------------------
@@ -852,12 +820,13 @@ Changes to a temp table are not WAL-logged, hence could reach disk in
advance of T1's commit, but we don't care since temp table contents don't
survive crashes anyway.
-Database writes that skip WAL for new relfilenodes are also safe. In these
-cases it's entirely possible for the data to reach disk before T1's commit,
-because T1 will fsync it down to disk without any sort of interlock. However,
-all these paths are designed to write data that no other transaction can see
-until after T1 commits. The situation is thus not different from ordinary
-WAL-logged updates.
+Database writes made via any of the paths we have introduced to avoid WAL
+overhead for bulk updates are also safe. In these cases it's entirely
+possible for the data to reach disk before T1's commit, because T1 will
+fsync it down to disk without any sort of interlock, as soon as it finishes
+the bulk update. However, all these paths are designed to write data that
+no other transaction can see until after T1 commits. The situation is thus
+not different from ordinary WAL-logged updates.
Transaction Emulation during Recovery
-------------------------------------
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index b6885b01bc..e3c60f23cd 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -2109,13 +2109,6 @@ CommitTransaction(void)
*/
PreCommit_on_commit_actions();
- /*
- * Synchronize files that are created and not WAL-logged during this
- * transaction. This must happen before AtEOXact_RelationMap(), so that we
- * don't see committed-but-broken files after a crash.
- */
- smgrDoPendingSyncs(true);
-
/* close large objects before lower-level cleanup */
AtEOXact_LargeObject(true);
@@ -2349,13 +2342,6 @@ PrepareTransaction(void)
*/
PreCommit_on_commit_actions();
- /*
- * Synchronize files that are created and not WAL-logged during this
- * transaction. This must happen before EndPrepare(), so that we don't see
- * committed-but-broken files after a crash and COMMIT PREPARED.
- */
- smgrDoPendingSyncs(true);
-
/* close large objects before lower-level cleanup */
AtEOXact_LargeObject(true);
@@ -2674,7 +2660,6 @@ AbortTransaction(void)
*/
AfterTriggerEndXact(false); /* 'false' means it's abort */
AtAbort_Portals();
- smgrDoPendingSyncs(false);
AtEOXact_LargeObject(false);
AtAbort_Notify();
AtEOXact_RelationMap(false, is_parallel_worker);
diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c
index 6cb143e161..b217ffa52f 100644
--- a/src/backend/access/transam/xlogutils.c
+++ b/src/backend/access/transam/xlogutils.c
@@ -549,8 +549,6 @@ typedef FakeRelCacheEntryData *FakeRelCacheEntry;
* fields related to physical storage, like rd_rel, are initialized, so the
* fake entry is only usable in low-level operations like ReadBuffer().
*
- * This is also used for syncing WAL-skipped files.
- *
* Caller must free the returned entry with FreeFakeRelcacheEntry().
*/
Relation
@@ -559,20 +557,18 @@ CreateFakeRelcacheEntry(RelFileNode rnode)
FakeRelCacheEntry fakeentry;
Relation rel;
+ Assert(InRecovery);
+
/* Allocate the Relation struct and all related space in one block. */
fakeentry = palloc0(sizeof(FakeRelCacheEntryData));
rel = (Relation) fakeentry;
rel->rd_rel = &fakeentry->pgc;
rel->rd_node = rnode;
-
- /*
- * We will never be working with temp rels during recovery or while
- * syncing WAL-skipped files.
- */
+ /* We will never be working with temp rels during recovery */
rel->rd_backend = InvalidBackendId;
- /* It must be a permanent table here */
+ /* It must be a permanent table if we're in recovery. */
rel->rd_rel->relpersistence = RELPERSISTENCE_PERMANENT;
/* We don't know the name of the relation; use relfilenode instead */
@@ -581,9 +577,9 @@ CreateFakeRelcacheEntry(RelFileNode rnode)
/*
* We set up the lockRelId in case anything tries to lock the dummy
* relation. Note that this is fairly bogus since relNode may be
- * different from the relation's OID. It shouldn't really matter though.
- * In recovery, we are running by ourselves and can't have any lock
- * conflicts. While syncing, we already hold AccessExclusiveLock.
+ * different from the relation's OID. It shouldn't really matter though,
+ * since we are presumably running by ourselves and can't have any lock
+ * conflicts ...
*/
rel->rd_lockInfo.lockRelId.dbId = rnode.dbNode;
rel->rd_lockInfo.lockRelId.relId = rnode.relNode;
diff --git a/src/backend/bootstrap/bootparse.y b/src/backend/bootstrap/bootparse.y
index 5eaca279ee..61e758696f 100644
--- a/src/backend/bootstrap/bootparse.y
+++ b/src/backend/bootstrap/bootparse.y
@@ -306,8 +306,6 @@ Boot_DeclareIndexStmt:
stmt->idxcomment = NULL;
stmt->indexOid = InvalidOid;
stmt->oldNode = InvalidOid;
- stmt->oldCreateSubid = InvalidSubTransactionId;
- stmt->oldFirstRelfilenodeSubid = InvalidSubTransactionId;
stmt->unique = false;
stmt->primary = false;
stmt->isconstraint = false;
@@ -358,8 +356,6 @@ Boot_DeclareUniqueIndexStmt:
stmt->idxcomment = NULL;
stmt->indexOid = InvalidOid;
stmt->oldNode = InvalidOid;
- stmt->oldCreateSubid = InvalidSubTransactionId;
- stmt->oldFirstRelfilenodeSubid = InvalidSubTransactionId;
stmt->unique = true;
stmt->primary = false;
stmt->isconstraint = false;
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index 0ed7c64a05..fddfbf1d8c 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -29,13 +29,9 @@
#include "miscadmin.h"
#include "storage/freespace.h"
#include "storage/smgr.h"
-#include "utils/hsearch.h"
#include "utils/memutils.h"
#include "utils/rel.h"
-/* GUC variables */
-int wal_skip_threshold = 2048; /* in kilobytes */
-
/*
* We keep a list of all relations (represented as RelFileNode values)
* that have been created or deleted in the current transaction. When
@@ -65,14 +61,7 @@ typedef struct PendingRelDelete
struct PendingRelDelete *next; /* linked-list link */
} PendingRelDelete;
-typedef struct pendingSync
-{
- RelFileNode rnode;
- bool is_truncated; /* Has the file experienced truncation? */
-} pendingSync;
-
static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */
-HTAB *pendingSyncHash = NULL;
/*
* RelationCreateStorage
@@ -128,32 +117,6 @@ RelationCreateStorage(RelFileNode rnode, char relpersistence)
pending->next = pendingDeletes;
pendingDeletes = pending;
- /* Queue an at-commit sync. */
- if (relpersistence == RELPERSISTENCE_PERMANENT && !XLogIsNeeded())
- {
- pendingSync *pending;
- bool found;
-
- /* we sync only permanent relations */
- Assert(backend == InvalidBackendId);
-
- if (!pendingSyncHash)
- {
- HASHCTL ctl;
-
- ctl.keysize = sizeof(RelFileNode);
- ctl.entrysize = sizeof(pendingSync);
- ctl.hcxt = TopTransactionContext;
- pendingSyncHash =
- hash_create("pending sync hash",
- 16, &ctl, HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
- }
-
- pending = hash_search(pendingSyncHash, &rnode, HASH_ENTER, &found);
- Assert(!found);
- pending->is_truncated = false;
- }
-
return srel;
}
@@ -312,8 +275,6 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
}
}
- RelationPreTruncate(rel);
-
/*
* We WAL-log the truncation before actually truncating, which means
* trouble if the truncation fails. If we then crash, the WAL replay
@@ -365,28 +326,6 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
}
/*
- * RelationPreTruncate
- * Perform AM-independent work before a physical truncation.
- *
- * If an access method's relation_nontransactional_truncate does not call
- * RelationTruncate(), it must call this before decreasing the table size.
- */
-void
-RelationPreTruncate(Relation rel)
-{
- pendingSync *pending;
-
- if (!pendingSyncHash)
- return;
- RelationOpenSmgr(rel);
-
- pending = hash_search(pendingSyncHash, &(rel->rd_smgr->smgr_rnode.node),
- HASH_FIND, NULL);
- if (pending)
- pending->is_truncated = true;
-}
-
-/*
* Copy a fork's data, block by block.
*
* Note that this requires that there is no dirty data in shared buffers. If
@@ -416,9 +355,7 @@ RelationCopyStorage(SMgrRelation src, SMgrRelation dst,
/*
* We need to log the copied data in WAL iff WAL archiving/streaming is
- * enabled AND it's a permanent relation. This gives the same answer as
- * "RelationNeedsWAL(rel) || copying_initfork", because we know the
- * current operation created a new relfilenode.
+ * enabled AND it's a permanent relation.
*/
use_wal = XLogIsNeeded() &&
(relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork);
@@ -460,40 +397,25 @@ RelationCopyStorage(SMgrRelation src, SMgrRelation dst,
}
/*
- * When we WAL-logged rel pages, we must nonetheless fsync them. The
- * reason is that since we're copying outside shared buffers, a CHECKPOINT
- * occurring during the copy has no way to flush the previously written
- * data to disk (indeed it won't know the new rel even exists). A crash
- * later on would replay WAL from the checkpoint, therefore it wouldn't
- * replay our earlier WAL entries. If we do not fsync those pages here,
- * they might still not be on disk when the crash occurs.
+ * If the rel is WAL-logged, must fsync before commit. We use heap_sync
+ * to ensure that the toast table gets fsync'd too. (For a temp or
+ * unlogged rel we don't care since the data will be gone after a crash
+ * anyway.)
+ *
+ * It's obvious that we must do this when not WAL-logging the copy. It's
+ * less obvious that we have to do it even if we did WAL-log the copied
+ * pages. The reason is that since we're copying outside shared buffers, a
+ * CHECKPOINT occurring during the copy has no way to flush the previously
+ * written data to disk (indeed it won't know the new rel even exists). A
+ * crash later on would replay WAL from the checkpoint, therefore it
+ * wouldn't replay our earlier WAL entries. If we do not fsync those pages
+ * here, they might still not be on disk when the crash occurs.
*/
- if (use_wal || copying_initfork)
+ if (relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork)
smgrimmedsync(dst, forkNum);
}
/*
- * RelFileNodeSkippingWAL - check if a BM_PERMANENT relfilenode is using WAL
- *
- * Changes of certain relfilenodes must not write WAL; see "Skipping WAL for
- * New RelFileNode" in src/backend/access/transam/README. Though it is
- * known from Relation efficiently, this function is intended for the code
- * paths not having access to Relation.
- */
-bool
-RelFileNodeSkippingWAL(RelFileNode rnode)
-{
- if (XLogIsNeeded())
- return false; /* no permanent relfilenode skips WAL */
-
- if (!pendingSyncHash ||
- hash_search(pendingSyncHash, &rnode, HASH_FIND, NULL) == NULL)
- return false;
-
- return true;
-}
-
-/*
* smgrDoPendingDeletes() -- Take care of relation deletes at end of xact.
*
* This also runs when aborting a subxact; we want to clean up a failed
@@ -571,144 +493,6 @@ smgrDoPendingDeletes(bool isCommit)
}
/*
- * smgrDoPendingSyncs() -- Take care of relation syncs at end of xact.
- */
-void
-smgrDoPendingSyncs(bool isCommit)
-{
- PendingRelDelete *pending;
- int nrels = 0,
- maxrels = 0;
- SMgrRelation *srels = NULL;
- HASH_SEQ_STATUS scan;
- pendingSync *pendingsync;
-
- if (XLogIsNeeded())
- return; /* no relation can use this */
-
- Assert(GetCurrentTransactionNestLevel() == 1);
-
- if (!pendingSyncHash)
- return; /* no relation needs sync */
-
- /* Just throw away all pending syncs if any at rollback */
- if (!isCommit)
- {
- pendingSyncHash = NULL;
- return;
- }
-
- AssertPendingSyncs_RelationCache();
-
- /* Skip syncing nodes that smgrDoPendingDeletes() will delete. */
- for (pending = pendingDeletes; pending != NULL; pending = pending->next)
- {
- if (!pending->atCommit)
- continue;
-
- (void) hash_search(pendingSyncHash, (void *) &pending->relnode,
- HASH_REMOVE, NULL);
- }
-
- hash_seq_init(&scan, pendingSyncHash);
- while ((pendingsync = (pendingSync *) hash_seq_search(&scan)))
- {
- ForkNumber fork;
- BlockNumber nblocks[MAX_FORKNUM + 1];
- BlockNumber total_blocks = 0;
- SMgrRelation srel;
-
- srel = smgropen(pendingsync->rnode, InvalidBackendId);
-
- /*
- * We emit newpage WAL records for smaller relations.
- *
- * Small WAL records have a chance to be emitted along with other
- * backends' WAL records. We emit WAL records instead of syncing for
- * files that are smaller than a certain threshold, expecting faster
- * commit. The threshold is defined by the GUC wal_skip_threshold.
- */
- if (!pendingsync->is_truncated)
- {
- for (fork = 0; fork <= MAX_FORKNUM; fork++)
- {
- if (smgrexists(srel, fork))
- {
- BlockNumber n = smgrnblocks(srel, fork);
-
- /* we shouldn't come here for unlogged relations */
- Assert(fork != INIT_FORKNUM);
- nblocks[fork] = n;
- total_blocks += n;
- }
- else
- nblocks[fork] = InvalidBlockNumber;
- }
- }
-
- /*
- * Sync file or emit WAL records for its contents.
- *
- * Although we emit WAL record if the file is small enough, do file
- * sync regardless of the size if the file has experienced a
- * truncation. It is because the file would be followed by trailing
- * garbage blocks after a crash recovery if, while a past longer file
- * had been flushed out, we omitted syncing-out of the file and
- * emitted WAL instead. You might think that we could choose WAL if
- * the current main fork is longer than ever, but there's a case where
- * main fork is longer than ever but FSM fork gets shorter.
- */
- if (pendingsync->is_truncated ||
- total_blocks * BLCKSZ / 1024 >= wal_skip_threshold)
- {
- /* allocate the initial array, or extend it, if needed */
- if (maxrels == 0)
- {
- maxrels = 8;
- srels = palloc(sizeof(SMgrRelation) * maxrels);
- }
- else if (maxrels <= nrels)
- {
- maxrels *= 2;
- srels = repalloc(srels, sizeof(SMgrRelation) * maxrels);
- }
-
- srels[nrels++] = srel;
- }
- else
- {
- /* Emit WAL records for all blocks. The file is small enough. */
- for (fork = 0; fork <= MAX_FORKNUM; fork++)
- {
- int n = nblocks[fork];
- Relation rel;
-
- if (!BlockNumberIsValid(n))
- continue;
-
- /*
- * Emit WAL for the whole file. Unfortunately we don't know
- * what kind of a page this is, so we have to log the full
- * page including any unused space. ReadBufferExtended()
- * counts some pgstat events; unfortunately, we discard them.
- */
- rel = CreateFakeRelcacheEntry(srel->smgr_rnode.node);
- log_newpage_range(rel, fork, 0, n, false);
- FreeFakeRelcacheEntry(rel);
- }
- }
- }
-
- pendingSyncHash = NULL;
-
- if (nrels > 0)
- {
- smgrdosyncall(srels, nrels);
- pfree(srels);
- }
-}
-
-/*
* smgrGetPendingDeletes() -- Get a list of non-temp relations to be deleted.
*
* The return value is the number of relations scheduled for termination.
diff --git a/src/backend/commands/cluster.c b/src/backend/commands/cluster.c
index ccd0c9b286..fc1cea0236 100644
--- a/src/backend/commands/cluster.c
+++ b/src/backend/commands/cluster.c
@@ -1111,25 +1111,6 @@ swap_relation_files(Oid r1, Oid r2, bool target_is_pg_class,
}
/*
- * Recognize that rel1's relfilenode (swapped from rel2) is new in this
- * subtransaction. The rel2 storage (swapped from rel1) may or may not be
- * new.
- */
- {
- Relation rel1,
- rel2;
-
- rel1 = relation_open(r1, NoLock);
- rel2 = relation_open(r2, NoLock);
- rel2->rd_createSubid = rel1->rd_createSubid;
- rel2->rd_newRelfilenodeSubid = rel1->rd_newRelfilenodeSubid;
- rel2->rd_firstRelfilenodeSubid = rel1->rd_firstRelfilenodeSubid;
- RelationAssumeNewRelfilenode(rel1);
- relation_close(rel1, NoLock);
- relation_close(rel2, NoLock);
- }
-
- /*
* In the case of a shared catalog, these next few steps will only affect
* our own database's pg_class row; but that's okay, because they are all
* noncritical updates. That's also an important fact for the case of a
diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c
index 05f1fae6b0..fbde9f88e7 100644
--- a/src/backend/commands/copy.c
+++ b/src/backend/commands/copy.c
@@ -2713,15 +2713,63 @@ CopyFrom(CopyState cstate)
RelationGetRelationName(cstate->rel))));
}
- /*
- * If the target file is new-in-transaction, we assume that checking FSM
- * for free space is a waste of time. This could possibly be wrong, but
- * it's unlikely.
+ /*----------
+ * Check to see if we can avoid writing WAL
+ *
+ * If archive logging/streaming is not enabled *and* either
+ * - table was created in same transaction as this COPY
+ * - data is being written to relfilenode created in this transaction
+ * then we can skip writing WAL. It's safe because if the transaction
+ * doesn't commit, we'll discard the table (or the new relfilenode file).
+ * If it does commit, we'll have done the table_finish_bulk_insert() at
+ * the bottom of this routine first.
+ *
+ * As mentioned in comments in utils/rel.h, the in-same-transaction test
+ * is not always set correctly, since in rare cases rd_newRelfilenodeSubid
+ * can be cleared before the end of the transaction. The exact case is
+ * when a relation sets a new relfilenode twice in same transaction, yet
+ * the second one fails in an aborted subtransaction, e.g.
+ *
+ * BEGIN;
+ * TRUNCATE t;
+ * SAVEPOINT save;
+ * TRUNCATE t;
+ * ROLLBACK TO save;
+ * COPY ...
+ *
+ * Also, if the target file is new-in-transaction, we assume that checking
+ * FSM for free space is a waste of time, even if we must use WAL because
+ * of archiving. This could possibly be wrong, but it's unlikely.
+ *
+ * The comments for table_tuple_insert and RelationGetBufferForTuple
+ * specify that skipping WAL logging is only safe if we ensure that our
+ * tuples do not go into pages containing tuples from any other
+ * transactions --- but this must be the case if we have a new table or
+ * new relfilenode, so we need no additional work to enforce that.
+ *
+ * We currently don't support this optimization if the COPY target is a
+ * partitioned table as we currently only lazily initialize partition
+ * information when routing the first tuple to the partition. We cannot
+ * know at this stage if we can perform this optimization. It should be
+ * possible to improve on this, but it does mean maintaining heap insert
+ * option flags per partition and setting them when we first open the
+ * partition.
+ *
+ * This optimization is not supported for relation types which do not
+ * have any physical storage, with foreign tables and views using
+ * INSTEAD OF triggers entering in this category. Partitioned tables
+ * are not supported as per the description above.
+ *----------
*/
+ /* createSubid is creation check, newRelfilenodeSubid is truncation check */
if (RELKIND_HAS_STORAGE(cstate->rel->rd_rel->relkind) &&
(cstate->rel->rd_createSubid != InvalidSubTransactionId ||
- cstate->rel->rd_firstRelfilenodeSubid != InvalidSubTransactionId))
+ cstate->rel->rd_newRelfilenodeSubid != InvalidSubTransactionId))
+ {
ti_options |= TABLE_INSERT_SKIP_FSM;
+ if (!XLogIsNeeded())
+ ti_options |= TABLE_INSERT_SKIP_WAL;
+ }
/*
* Optimize if new relfilenode was created in this subxact or one of its
diff --git a/src/backend/commands/createas.c b/src/backend/commands/createas.c
index 8e5e4fb95e..3a5676fb39 100644
--- a/src/backend/commands/createas.c
+++ b/src/backend/commands/createas.c
@@ -550,13 +550,16 @@ intorel_startup(DestReceiver *self, int operation, TupleDesc typeinfo)
myState->rel = intoRelationDesc;
myState->reladdr = intoRelationAddr;
myState->output_cid = GetCurrentCommandId(true);
- myState->ti_options = TABLE_INSERT_SKIP_FSM;
- myState->bistate = GetBulkInsertState();
/*
- * Valid smgr_targblock implies something already wrote to the relation.
- * This may be harmless, but this function hasn't planned for it.
+ * We can skip WAL-logging the insertions, unless PITR or streaming
+ * replication is in use. We can skip the FSM in any case.
*/
+ myState->ti_options = TABLE_INSERT_SKIP_FSM |
+ (XLogIsNeeded() ? 0 : TABLE_INSERT_SKIP_WAL);
+ myState->bistate = GetBulkInsertState();
+
+ /* Not using WAL requires smgr_targblock be initially invalid */
Assert(RelationGetTargetBlock(intoRelationDesc) == InvalidBlockNumber);
}
diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c
index c94939df40..4e8263af4b 100644
--- a/src/backend/commands/indexcmds.c
+++ b/src/backend/commands/indexcmds.c
@@ -1192,8 +1192,6 @@ DefineIndex(Oid relationId,
childStmt->relation = NULL;
childStmt->indexOid = InvalidOid;
childStmt->oldNode = InvalidOid;
- childStmt->oldCreateSubid = InvalidSubTransactionId;
- childStmt->oldFirstRelfilenodeSubid = InvalidSubTransactionId;
/*
* Adjust any Vars (both in expressions and in the index's
diff --git a/src/backend/commands/matview.c b/src/backend/commands/matview.c
index 492b2a3ee6..c3954f3e24 100644
--- a/src/backend/commands/matview.c
+++ b/src/backend/commands/matview.c
@@ -457,13 +457,17 @@ transientrel_startup(DestReceiver *self, int operation, TupleDesc typeinfo)
*/
myState->transientrel = transientrel;
myState->output_cid = GetCurrentCommandId(true);
- myState->ti_options = TABLE_INSERT_SKIP_FSM | TABLE_INSERT_FROZEN;
- myState->bistate = GetBulkInsertState();
/*
- * Valid smgr_targblock implies something already wrote to the relation.
- * This may be harmless, but this function hasn't planned for it.
+ * We can skip WAL-logging the insertions, unless PITR or streaming
+ * replication is in use. We can skip the FSM in any case.
*/
+ myState->ti_options = TABLE_INSERT_SKIP_FSM | TABLE_INSERT_FROZEN;
+ if (!XLogIsNeeded())
+ myState->ti_options |= TABLE_INSERT_SKIP_WAL;
+ myState->bistate = GetBulkInsertState();
+
+ /* Not using WAL requires smgr_targblock be initially invalid */
Assert(RelationGetTargetBlock(transientrel) == InvalidBlockNumber);
}
diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c
index 31d718e8ea..729025470d 100644
--- a/src/backend/commands/tablecmds.c
+++ b/src/backend/commands/tablecmds.c
@@ -5041,14 +5041,19 @@ ATRewriteTable(AlteredTableInfo *tab, Oid OIDNewHeap, LOCKMODE lockmode)
newrel = NULL;
/*
- * Prepare a BulkInsertState and options for table_tuple_insert. The FSM
- * is empty, so don't bother using it.
+ * Prepare a BulkInsertState and options for table_tuple_insert. Because
+ * we're building a new heap, we can skip WAL-logging and fsync it to disk
+ * at the end instead (unless WAL-logging is required for archiving or
+ * streaming replication). The FSM is empty too, so don't bother using it.
*/
if (newrel)
{
mycid = GetCurrentCommandId(true);
bistate = GetBulkInsertState();
+
ti_options = TABLE_INSERT_SKIP_FSM;
+ if (!XLogIsNeeded())
+ ti_options |= TABLE_INSERT_SKIP_WAL;
}
else
{
@@ -7714,19 +7719,14 @@ ATExecAddIndex(AlteredTableInfo *tab, Relation rel,
/*
* If TryReuseIndex() stashed a relfilenode for us, we used it for the new
- * index instead of building from scratch. Restore associated fields.
- * This may store InvalidSubTransactionId in both fields, in which case
- * relcache.c will assume it can rebuild the relcache entry. Hence, do
- * this after the CCI that made catalog rows visible to any rebuild. The
- * DROP of the old edition of this index will have scheduled the storage
- * for deletion at commit, so cancel that pending deletion.
+ * index instead of building from scratch. The DROP of the old edition of
+ * this index will have scheduled the storage for deletion at commit, so
+ * cancel that pending deletion.
*/
if (OidIsValid(stmt->oldNode))
{
Relation irel = index_open(address.objectId, NoLock);
- irel->rd_createSubid = stmt->oldCreateSubid;
- irel->rd_firstRelfilenodeSubid = stmt->oldFirstRelfilenodeSubid;
RelationPreserveStorage(irel->rd_node, true);
index_close(irel, NoLock);
}
@@ -12052,11 +12052,7 @@ TryReuseIndex(Oid oldId, IndexStmt *stmt)
/* If it's a partitioned index, there is no storage to share. */
if (irel->rd_rel->relkind != RELKIND_PARTITIONED_INDEX)
- {
stmt->oldNode = irel->rd_node.relNode;
- stmt->oldCreateSubid = irel->rd_createSubid;
- stmt->oldFirstRelfilenodeSubid = irel->rd_firstRelfilenodeSubid;
- }
index_close(irel, NoLock);
}
}
@@ -12992,8 +12988,6 @@ ATExecSetTableSpace(Oid tableOid, Oid newTableSpace, LOCKMODE lockmode)
table_close(pg_class, RowExclusiveLock);
- RelationAssumeNewRelfilenode(rel);
-
relation_close(rel, NoLock);
/* Make sure the reltablespace change is visible */
diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c
index 1a70625dc8..eaab97f753 100644
--- a/src/backend/nodes/copyfuncs.c
+++ b/src/backend/nodes/copyfuncs.c
@@ -3478,8 +3478,6 @@ _copyIndexStmt(const IndexStmt *from)
COPY_STRING_FIELD(idxcomment);
COPY_SCALAR_FIELD(indexOid);
COPY_SCALAR_FIELD(oldNode);
- COPY_SCALAR_FIELD(oldCreateSubid);
- COPY_SCALAR_FIELD(oldFirstRelfilenodeSubid);
COPY_SCALAR_FIELD(unique);
COPY_SCALAR_FIELD(primary);
COPY_SCALAR_FIELD(isconstraint);
diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c
index 2256859dc3..88b912977e 100644
--- a/src/backend/nodes/equalfuncs.c
+++ b/src/backend/nodes/equalfuncs.c
@@ -1345,8 +1345,6 @@ _equalIndexStmt(const IndexStmt *a, const IndexStmt *b)
COMPARE_STRING_FIELD(idxcomment);
COMPARE_SCALAR_FIELD(indexOid);
COMPARE_SCALAR_FIELD(oldNode);
- COMPARE_SCALAR_FIELD(oldCreateSubid);
- COMPARE_SCALAR_FIELD(oldFirstRelfilenodeSubid);
COMPARE_SCALAR_FIELD(unique);
COMPARE_SCALAR_FIELD(primary);
COMPARE_SCALAR_FIELD(isconstraint);
diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c
index 89d00444ed..e084c3f069 100644
--- a/src/backend/nodes/outfuncs.c
+++ b/src/backend/nodes/outfuncs.c
@@ -2653,8 +2653,6 @@ _outIndexStmt(StringInfo str, const IndexStmt *node)
WRITE_STRING_FIELD(idxcomment);
WRITE_OID_FIELD(indexOid);
WRITE_OID_FIELD(oldNode);
- WRITE_UINT_FIELD(oldCreateSubid);
- WRITE_UINT_FIELD(oldFirstRelfilenodeSubid);
WRITE_BOOL_FIELD(unique);
WRITE_BOOL_FIELD(primary);
WRITE_BOOL_FIELD(isconstraint);
diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y
index 13f3755345..7e384f956c 100644
--- a/src/backend/parser/gram.y
+++ b/src/backend/parser/gram.y
@@ -7415,8 +7415,6 @@ IndexStmt: CREATE opt_unique INDEX opt_concurrently opt_index_name
n->idxcomment = NULL;
n->indexOid = InvalidOid;
n->oldNode = InvalidOid;
- n->oldCreateSubid = InvalidSubTransactionId;
- n->oldFirstRelfilenodeSubid = InvalidSubTransactionId;
n->primary = false;
n->isconstraint = false;
n->deferrable = false;
@@ -7445,8 +7443,6 @@ IndexStmt: CREATE opt_unique INDEX opt_concurrently opt_index_name
n->idxcomment = NULL;
n->indexOid = InvalidOid;
n->oldNode = InvalidOid;
- n->oldCreateSubid = InvalidSubTransactionId;
- n->oldFirstRelfilenodeSubid = InvalidSubTransactionId;
n->primary = false;
n->isconstraint = false;
n->deferrable = false;
diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c
index 6a27c35e3b..c1911411d0 100644
--- a/src/backend/parser/parse_utilcmd.c
+++ b/src/backend/parser/parse_utilcmd.c
@@ -1415,8 +1415,6 @@ generateClonedIndexStmt(RangeVar *heapRel, Relation source_idx,
index->idxcomment = NULL;
index->indexOid = InvalidOid;
index->oldNode = InvalidOid;
- index->oldCreateSubid = InvalidSubTransactionId;
- index->oldFirstRelfilenodeSubid = InvalidSubTransactionId;
index->unique = idxrec->indisunique;
index->primary = idxrec->indisprimary;
index->transformed = true; /* don't need transformIndexStmt */
@@ -2015,8 +2013,6 @@ transformIndexConstraint(Constraint *constraint, CreateStmtContext *cxt)
index->idxcomment = NULL;
index->indexOid = InvalidOid;
index->oldNode = InvalidOid;
- index->oldCreateSubid = InvalidSubTransactionId;
- index->oldFirstRelfilenodeSubid = InvalidSubTransactionId;
index->transformed = false;
index->concurrent = false;
index->if_not_exists = false;
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 4f60979ce5..e05e2b3456 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -66,7 +66,7 @@
#define BUF_WRITTEN 0x01
#define BUF_REUSABLE 0x02
-#define RELS_BSEARCH_THRESHOLD 20
+#define DROP_RELS_BSEARCH_THRESHOLD 20
typedef struct PrivateRefCountEntry
{
@@ -105,19 +105,6 @@ typedef struct CkptTsStatus
int index;
} CkptTsStatus;
-/*
- * Type for array used to sort SMgrRelations
- *
- * FlushRelationsAllBuffers shares the same comparator function with
- * DropRelFileNodesAllBuffers. Pointer to this struct and RelFileNode must be
- * compatible.
- */
-typedef struct SMgrSortArray
-{
- RelFileNode rnode; /* This must be the first member */
- SMgrRelation srel;
-} SMgrSortArray;
-
/* GUC variables */
bool zero_damaged_pages = false;
int bgwriter_lru_maxpages = 100;
@@ -3003,7 +2990,7 @@ DropRelFileNodesAllBuffers(RelFileNodeBackend *rnodes, int nnodes)
* an exactly determined value, as it depends on many factors (CPU and RAM
* speeds, amount of shared buffers etc.).
*/
- use_bsearch = n > RELS_BSEARCH_THRESHOLD;
+ use_bsearch = n > DROP_RELS_BSEARCH_THRESHOLD;
/* sort the list of rnodes if necessary */
if (use_bsearch)
@@ -3254,104 +3241,6 @@ FlushRelationBuffers(Relation rel)
}
/* ---------------------------------------------------------------------
- * FlushRelationsAllBuffers
- *
- * This function flushes out of the buffer pool all the pages of all
- * forks of the specified smgr relations. It's equivalent to calling
- * FlushRelationBuffers once per fork per relation. The relations are
- * assumed not to use local buffers.
- * --------------------------------------------------------------------
- */
-void
-FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels)
-{
- int i;
- SMgrSortArray *srels;
- bool use_bsearch;
-
- if (nrels == 0)
- return;
-
- /* fill-in array for qsort */
- srels = palloc(sizeof(SMgrSortArray) * nrels);
-
- for (i = 0; i < nrels; i++)
- {
- Assert(!RelFileNodeBackendIsTemp(smgrs[i]->smgr_rnode));
-
- srels[i].rnode = smgrs[i]->smgr_rnode.node;
- srels[i].srel = smgrs[i];
- }
-
- /*
- * Save the bsearch overhead for low number of relations to sync. See
- * DropRelFileNodesAllBuffers for details.
- */
- use_bsearch = nrels > RELS_BSEARCH_THRESHOLD;
-
- /* sort the list of SMgrRelations if necessary */
- if (use_bsearch)
- pg_qsort(srels, nrels, sizeof(SMgrSortArray), rnode_comparator);
-
- /* Make sure we can handle the pin inside the loop */
- ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
-
- for (i = 0; i < NBuffers; i++)
- {
- SMgrSortArray *srelent = NULL;
- BufferDesc *bufHdr = GetBufferDescriptor(i);
- uint32 buf_state;
-
- /*
- * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
- * and saves some cycles.
- */
-
- if (!use_bsearch)
- {
- int j;
-
- for (j = 0; j < nrels; j++)
- {
- if (RelFileNodeEquals(bufHdr->tag.rnode, srels[j].rnode))
- {
- srelent = &srels[j];
- break;
- }
- }
-
- }
- else
- {
- srelent = bsearch((const void *) &(bufHdr->tag.rnode),
- srels, nrels, sizeof(SMgrSortArray),
- rnode_comparator);
- }
-
- /* buffer doesn't belong to any of the given relfilenodes; skip it */
- if (srelent == NULL)
- continue;
-
- ReservePrivateRefCountEntry();
-
- buf_state = LockBufHdr(bufHdr);
- if (RelFileNodeEquals(bufHdr->tag.rnode, srelent->rnode) &&
- (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
- {
- PinBuffer_Locked(bufHdr);
- LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
- FlushBuffer(bufHdr, srelent->srel);
- LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
- UnpinBuffer(bufHdr, true);
- }
- else
- UnlockBufHdr(bufHdr, buf_state);
- }
-
- pfree(srels);
-}
-
-/* ---------------------------------------------------------------------
* FlushDatabaseBuffers
*
* This function writes all dirty pages of a database out to disk
@@ -3552,15 +3441,13 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
(pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
{
/*
- * If we must not write WAL, due to a relfilenode-specific
- * condition or being in recovery, don't dirty the page. We can
- * set the hint, just not dirty the page as a result so the hint
- * is lost when we evict the page or shutdown.
+ * If we're in recovery we cannot dirty a page because of a hint.
+ * We can set the hint, just not dirty the page as a result so the
+ * hint is lost when we evict the page or shutdown.
*
* See src/backend/storage/page/README for longer discussion.
*/
- if (RecoveryInProgress() ||
- RelFileNodeSkippingWAL(bufHdr->tag.rnode))
+ if (RecoveryInProgress())
return;
/*
diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c
index efb44a25c4..3013ef63d0 100644
--- a/src/backend/storage/lmgr/lock.c
+++ b/src/backend/storage/lmgr/lock.c
@@ -614,18 +614,6 @@ LockHeldByMe(const LOCKTAG *locktag, LOCKMODE lockmode)
return (locallock && locallock->nLocks > 0);
}
-#ifdef USE_ASSERT_CHECKING
-/*
- * GetLockMethodLocalHash -- return the hash of local locks, for modules that
- * evaluate assertions based on all locks held.
- */
-HTAB *
-GetLockMethodLocalHash(void)
-{
- return LockMethodLocalHash;
-}
-#endif
-
/*
* LockHasWaiters -- look up 'locktag' and check if releasing this
* lock would wake up other processes waiting for it.
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index ee9822c6e1..c5b771c531 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -248,10 +248,11 @@ mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
* During replay, we would delete the file and then recreate it, which is fine
* if the contents of the file were repopulated by subsequent WAL entries.
* But if we didn't WAL-log insertions, but instead relied on fsyncing the
- * file after populating it (as we do at wal_level=minimal), the contents of
- * the file would be lost forever. By leaving the empty file until after the
- * next checkpoint, we prevent reassignment of the relfilenode number until
- * it's safe, because relfilenode assignment skips over any existing file.
+ * file after populating it (as for instance CLUSTER and CREATE INDEX do),
+ * the contents of the file would be lost forever. By leaving the empty file
+ * until after the next checkpoint, we prevent reassignment of the relfilenode
+ * number until it's safe, because relfilenode assignment skips over any
+ * existing file.
*
* We do not need to go through this dance for temp relations, though, because
* we never make WAL entries for temp rels, and so a temp rel poses no threat
@@ -876,18 +877,12 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
* mdimmedsync() -- Immediately sync a relation to stable storage.
*
* Note that only writes already issued are synced; this routine knows
- * nothing of dirty buffers that may exist inside the buffer manager. We
- * sync active and inactive segments; smgrDoPendingSyncs() relies on this.
- * Consider a relation skipping WAL. Suppose a checkpoint syncs blocks of
- * some segment, then mdtruncate() renders that segment inactive. If we
- * crash before the next checkpoint syncs the newly-inactive segment, that
- * segment may survive recovery, reintroducing unwanted data into the table.
+ * nothing of dirty buffers that may exist inside the buffer manager.
*/
void
mdimmedsync(SMgrRelation reln, ForkNumber forknum)
{
int segno;
- int min_inactive_seg;
/*
* NOTE: mdnblocks makes sure we have opened all active segments, so that
@@ -895,16 +890,7 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum)
*/
mdnblocks(reln, forknum);
- min_inactive_seg = segno = reln->md_num_open_segs[forknum];
-
- /*
- * Temporarily open inactive segments, then close them after sync. There
- * may be some inactive segments left opened after fsync() error, but that
- * is harmless. We don't bother to clean them up and take a risk of
- * further trouble. The next mdclose() will soon close them.
- */
- while (_mdfd_openseg(reln, forknum, segno, 0) != NULL)
- segno++;
+ segno = reln->md_num_open_segs[forknum];
while (segno > 0)
{
@@ -915,14 +901,6 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum)
(errcode_for_file_access(),
errmsg("could not fsync file \"%s\": %m",
FilePathName(v->mdfd_vfd))));
-
- /* Close inactive segments immediately */
- if (segno > min_inactive_seg)
- {
- FileClose(v->mdfd_vfd);
- _fdvec_resize(reln, forknum, segno - 1);
- }
-
segno--;
}
}
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index 72c9696ad1..360b5bf5bf 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -389,41 +389,6 @@ smgrdounlink(SMgrRelation reln, bool isRedo)
}
/*
- * smgrdosyncall() -- Immediately sync all forks of all given relations
- *
- * All forks of all given relations are synced out to the store.
- *
- * This is equivalent to FlushRelationBuffers() for each smgr relation,
- * then calling smgrimmedsync() for all forks of each relation, but it's
- * significantly quicker so should be preferred when possible.
- */
-void
-smgrdosyncall(SMgrRelation *rels, int nrels)
-{
- int i = 0;
- ForkNumber forknum;
-
- if (nrels == 0)
- return;
-
- FlushRelationsAllBuffers(rels, nrels);
-
- /*
- * Sync the physical file(s).
- */
- for (i = 0; i < nrels; i++)
- {
- int which = rels[i]->smgr_which;
-
- for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
- {
- if (smgrsw[which].smgr_exists(rels[i], forknum))
- smgrsw[which].smgr_immedsync(rels[i], forknum);
- }
- }
-}
-
-/*
* smgrdounlinkall() -- Immediately unlink all forks of all given relations
*
* All forks of all given relations are removed from the store. This
diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c
index 9ee9dc8cc0..76f41dbe36 100644
--- a/src/backend/utils/cache/relcache.c
+++ b/src/backend/utils/cache/relcache.c
@@ -257,9 +257,6 @@ static void RelationReloadIndexInfo(Relation relation);
static void RelationReloadNailed(Relation relation);
static void RelationFlushRelation(Relation relation);
static void RememberToFreeTupleDescAtEOX(TupleDesc td);
-#ifdef USE_ASSERT_CHECKING
-static void AssertPendingSyncConsistency(Relation relation);
-#endif
static void AtEOXact_cleanup(Relation relation, bool isCommit);
static void AtEOSubXact_cleanup(Relation relation, bool isCommit,
SubTransactionId mySubid, SubTransactionId parentSubid);
@@ -1096,8 +1093,6 @@ RelationBuildDesc(Oid targetRelId, bool insertIt)
relation->rd_isnailed = false;
relation->rd_createSubid = InvalidSubTransactionId;
relation->rd_newRelfilenodeSubid = InvalidSubTransactionId;
- relation->rd_firstRelfilenodeSubid = InvalidSubTransactionId;
- relation->rd_droppedSubid = InvalidSubTransactionId;
switch (relation->rd_rel->relpersistence)
{
case RELPERSISTENCE_UNLOGGED:
@@ -1822,8 +1817,6 @@ formrdesc(const char *relationName, Oid relationReltype,
relation->rd_isnailed = true;
relation->rd_createSubid = InvalidSubTransactionId;
relation->rd_newRelfilenodeSubid = InvalidSubTransactionId;
- relation->rd_firstRelfilenodeSubid = InvalidSubTransactionId;
- relation->rd_droppedSubid = InvalidSubTransactionId;
relation->rd_backend = InvalidBackendId;
relation->rd_islocaltemp = false;
@@ -1996,13 +1989,6 @@ RelationIdGetRelation(Oid relationId)
if (RelationIsValid(rd))
{
- /* return NULL for dropped relations */
- if (rd->rd_droppedSubid != InvalidSubTransactionId)
- {
- Assert(!rd->rd_isvalid);
- return NULL;
- }
-
RelationIncrementReferenceCount(rd);
/* revalidate cache entry if necessary */
if (!rd->rd_isvalid)
@@ -2106,7 +2092,7 @@ RelationClose(Relation relation)
#ifdef RELCACHE_FORCE_RELEASE
if (RelationHasReferenceCountZero(relation) &&
relation->rd_createSubid == InvalidSubTransactionId &&
- relation->rd_firstRelfilenodeSubid == InvalidSubTransactionId)
+ relation->rd_newRelfilenodeSubid == InvalidSubTransactionId)
RelationClearRelation(relation, false);
#endif
}
@@ -2145,11 +2131,10 @@ RelationReloadIndexInfo(Relation relation)
HeapTuple pg_class_tuple;
Form_pg_class relp;
- /* Should be called only for invalidated, live indexes */
+ /* Should be called only for invalidated indexes */
Assert((relation->rd_rel->relkind == RELKIND_INDEX ||
relation->rd_rel->relkind == RELKIND_PARTITIONED_INDEX) &&
- !relation->rd_isvalid &&
- relation->rd_droppedSubid == InvalidSubTransactionId);
+ !relation->rd_isvalid);
/* Ensure it's closed at smgr level */
RelationCloseSmgr(relation);
@@ -2445,13 +2430,6 @@ RelationClearRelation(Relation relation, bool rebuild)
return;
}
- /* Mark it invalid until we've finished rebuild */
- relation->rd_isvalid = false;
-
- /* See RelationForgetRelation(). */
- if (relation->rd_droppedSubid != InvalidSubTransactionId)
- return;
-
/*
* Even non-system indexes should not be blown away if they are open and
* have valid index support information. This avoids problems with active
@@ -2464,11 +2442,15 @@ RelationClearRelation(Relation relation, bool rebuild)
relation->rd_refcnt > 0 &&
relation->rd_indexcxt != NULL)
{
+ relation->rd_isvalid = false; /* needs to be revalidated */
if (IsTransactionState())
RelationReloadIndexInfo(relation);
return;
}
+ /* Mark it invalid until we've finished rebuild */
+ relation->rd_isvalid = false;
+
/*
* If we're really done with the relcache entry, blow it away. But if
* someone is still using it, reconstruct the whole deal without moving
@@ -2526,13 +2508,13 @@ RelationClearRelation(Relation relation, bool rebuild)
* problem.
*
* When rebuilding an open relcache entry, we must preserve ref count,
- * rd_*Subid, and rd_toastoid state. Also attempt to preserve the
- * pg_class entry (rd_rel), tupledesc, rewrite-rule, partition key,
- * and partition descriptor substructures in place, because various
- * places assume that these structures won't move while they are
- * working with an open relcache entry. (Note: the refcount
- * mechanism for tupledescs might someday allow us to remove this hack
- * for the tupledesc.)
+ * rd_createSubid/rd_newRelfilenodeSubid, and rd_toastoid state. Also
+ * attempt to preserve the pg_class entry (rd_rel), tupledesc,
+ * rewrite-rule, partition key, and partition descriptor substructures
+ * in place, because various places assume that these structures won't
+ * move while they are working with an open relcache entry. (Note:
+ * the refcount mechanism for tupledescs might someday allow us to
+ * remove this hack for the tupledesc.)
*
* Note that this process does not touch CurrentResourceOwner; which
* is good because whatever ref counts the entry may have do not
@@ -2612,8 +2594,6 @@ RelationClearRelation(Relation relation, bool rebuild)
/* creation sub-XIDs must be preserved */
SWAPFIELD(SubTransactionId, rd_createSubid);
SWAPFIELD(SubTransactionId, rd_newRelfilenodeSubid);
- SWAPFIELD(SubTransactionId, rd_firstRelfilenodeSubid);
- SWAPFIELD(SubTransactionId, rd_droppedSubid);
/* un-swap rd_rel pointers, swap contents instead */
SWAPFIELD(Form_pg_class, rd_rel);
/* ... but actually, we don't have to update newrel->rd_rel */
@@ -2692,12 +2672,12 @@ static void
RelationFlushRelation(Relation relation)
{
if (relation->rd_createSubid != InvalidSubTransactionId ||
- relation->rd_firstRelfilenodeSubid != InvalidSubTransactionId)
+ relation->rd_newRelfilenodeSubid != InvalidSubTransactionId)
{
/*
* New relcache entries are always rebuilt, not flushed; else we'd
- * forget the "new" status of the relation. Ditto for the
- * new-relfilenode status.
+ * forget the "new" status of the relation, which is a useful
+ * optimization to have. Ditto for the new-relfilenode status.
*
* The rel could have zero refcnt here, so temporarily increment the
* refcnt to ensure it's safe to rebuild it. We can assume that the
@@ -2719,7 +2699,10 @@ RelationFlushRelation(Relation relation)
}
/*
- * RelationForgetRelation - caller reports that it dropped the relation
+ * RelationForgetRelation - unconditionally remove a relcache entry
+ *
+ * External interface for destroying a relcache entry when we
+ * drop the relation.
*/
void
RelationForgetRelation(Oid rid)
@@ -2734,19 +2717,7 @@ RelationForgetRelation(Oid rid)
if (!RelationHasReferenceCountZero(relation))
elog(ERROR, "relation %u is still open", rid);
- Assert(relation->rd_droppedSubid == InvalidSubTransactionId);
- if (relation->rd_createSubid != InvalidSubTransactionId ||
- relation->rd_firstRelfilenodeSubid != InvalidSubTransactionId)
- {
- /*
- * In the event of subtransaction rollback, we must not forget
- * rd_*Subid. Mark the entry "dropped" so RelationClearRelation()
- * invalidates it in lieu of destroying it. (If we're in a top
- * transaction, we could opt to destroy the entry.)
- */
- relation->rd_droppedSubid = GetCurrentSubTransactionId();
- }
-
+ /* Unconditionally destroy the relcache entry */
RelationClearRelation(relation, false);
}
@@ -2786,10 +2757,11 @@ RelationCacheInvalidateEntry(Oid relationId)
* relation cache and re-read relation mapping data.
*
* This is currently used only to recover from SI message buffer overflow,
- * so we do not touch relations having new-in-transaction relfilenodes; they
- * cannot be targets of cross-backend SI updates (and our own updates now go
- * through a separate linked list that isn't limited by the SI message
- * buffer size).
+ * so we do not touch new-in-transaction relations; they cannot be targets
+ * of cross-backend SI updates (and our own updates now go through a
+ * separate linked list that isn't limited by the SI message buffer size).
+ * Likewise, we need not discard new-relfilenode-in-transaction hints,
+ * since any invalidation of those would be a local event.
*
* We do this in two phases: the first pass deletes deletable items, and
* the second one rebuilds the rebuildable items. This is essential for
@@ -2840,7 +2812,7 @@ RelationCacheInvalidate(void)
* pending invalidations.
*/
if (relation->rd_createSubid != InvalidSubTransactionId ||
- relation->rd_firstRelfilenodeSubid != InvalidSubTransactionId)
+ relation->rd_newRelfilenodeSubid != InvalidSubTransactionId)
continue;
relcacheInvalsReceived++;
@@ -2952,84 +2924,6 @@ RememberToFreeTupleDescAtEOX(TupleDesc td)
EOXactTupleDescArray[NextEOXactTupleDescNum++] = td;
}
-#ifdef USE_ASSERT_CHECKING
-static void
-AssertPendingSyncConsistency(Relation relation)
-{
- bool relcache_verdict =
- relation->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT &&
- ((relation->rd_createSubid != InvalidSubTransactionId &&
- RELKIND_HAS_STORAGE(relation->rd_rel->relkind)) ||
- relation->rd_firstRelfilenodeSubid != InvalidSubTransactionId);
-
- Assert(relcache_verdict == RelFileNodeSkippingWAL(relation->rd_node));
-
- if (relation->rd_droppedSubid != InvalidSubTransactionId)
- Assert(!relation->rd_isvalid &&
- (relation->rd_createSubid != InvalidSubTransactionId ||
- relation->rd_firstRelfilenodeSubid != InvalidSubTransactionId));
-}
-
-/*
- * AssertPendingSyncs_RelationCache
- *
- * Assert that relcache.c and storage.c agree on whether to skip WAL.
- */
-void
-AssertPendingSyncs_RelationCache(void)
-{
- HASH_SEQ_STATUS status;
- LOCALLOCK *locallock;
- Relation *rels;
- int maxrels;
- int nrels;
- RelIdCacheEnt *idhentry;
- int i;
-
- /*
- * Open every relation that this transaction has locked. If, for some
- * relation, storage.c is skipping WAL and relcache.c is not skipping WAL,
- * a CommandCounterIncrement() typically yields a local invalidation
- * message that destroys the relcache entry. By recreating such entries
- * here, we detect the problem.
- */
- PushActiveSnapshot(GetTransactionSnapshot());
- maxrels = 1;
- rels = palloc(maxrels * sizeof(*rels));
- nrels = 0;
- hash_seq_init(&status, GetLockMethodLocalHash());
- while ((locallock = (LOCALLOCK *) hash_seq_search(&status)) != NULL)
- {
- Oid relid;
- Relation r;
-
- if (locallock->nLocks <= 0)
- continue;
- if ((LockTagType) locallock->tag.lock.locktag_type !=
- LOCKTAG_RELATION)
- continue;
- relid = ObjectIdGetDatum(locallock->tag.lock.locktag_field2);
- r = RelationIdGetRelation(relid);
- if (!RelationIsValid(r))
- continue;
- if (nrels >= maxrels)
- {
- maxrels *= 2;
- rels = repalloc(rels, maxrels * sizeof(*rels));
- }
- rels[nrels++] = r;
- }
-
- hash_seq_init(&status, RelationIdCache);
- while ((idhentry = (RelIdCacheEnt *) hash_seq_search(&status)) != NULL)
- AssertPendingSyncConsistency(idhentry->reldesc);
-
- for (i = 0; i < nrels; i++)
- RelationClose(rels[i]);
- PopActiveSnapshot();
-}
-#endif
-
/*
* AtEOXact_RelationCache
*
@@ -3112,8 +3006,6 @@ AtEOXact_RelationCache(bool isCommit)
static void
AtEOXact_cleanup(Relation relation, bool isCommit)
{
- bool clear_relcache = false;
-
/*
* The relcache entry's ref count should be back to its normal
* not-in-a-transaction state: 0 unless it's nailed in cache.
@@ -3139,31 +3031,17 @@ AtEOXact_cleanup(Relation relation, bool isCommit)
#endif
/*
- * Is the relation live after this transaction ends?
+ * Is it a relation created in the current transaction?
*
- * During commit, clear the relcache entry if it is preserved after
- * relation drop, in order not to orphan the entry. During rollback,
- * clear the relcache entry if the relation is created in the current
- * transaction since it isn't interesting any longer once we are out of
- * the transaction.
- */
- clear_relcache =
- (isCommit ?
- relation->rd_droppedSubid != InvalidSubTransactionId :
- relation->rd_createSubid != InvalidSubTransactionId);
-
- /*
- * Since we are now out of the transaction, reset the subids to zero.
- * That also lets RelationClearRelation() drop the relcache entry.
+ * During commit, reset the flag to zero, since we are now out of the
+ * creating transaction. During abort, simply delete the relcache entry
+ * --- it isn't interesting any longer.
*/
- relation->rd_createSubid = InvalidSubTransactionId;
- relation->rd_newRelfilenodeSubid = InvalidSubTransactionId;
- relation->rd_firstRelfilenodeSubid = InvalidSubTransactionId;
- relation->rd_droppedSubid = InvalidSubTransactionId;
-
- if (clear_relcache)
+ if (relation->rd_createSubid != InvalidSubTransactionId)
{
- if (RelationHasReferenceCountZero(relation))
+ if (isCommit)
+ relation->rd_createSubid = InvalidSubTransactionId;
+ else if (RelationHasReferenceCountZero(relation))
{
RelationClearRelation(relation, false);
return;
@@ -3178,10 +3056,16 @@ AtEOXact_cleanup(Relation relation, bool isCommit)
* eventually. This must be just a WARNING to avoid
* error-during-error-recovery loops.
*/
+ relation->rd_createSubid = InvalidSubTransactionId;
elog(WARNING, "cannot remove relcache entry for \"%s\" because it has nonzero refcount",
RelationGetRelationName(relation));
}
}
+
+ /*
+ * Likewise, reset the hint about the relfilenode being new.
+ */
+ relation->rd_newRelfilenodeSubid = InvalidSubTransactionId;
}
/*
@@ -3245,28 +3129,15 @@ AtEOSubXact_cleanup(Relation relation, bool isCommit,
/*
* Is it a relation created in the current subtransaction?
*
- * During subcommit, mark it as belonging to the parent, instead, as long
- * as it has not been dropped. Otherwise simply delete the relcache entry.
- * --- it isn't interesting any longer.
+ * During subcommit, mark it as belonging to the parent, instead. During
+ * subabort, simply delete the relcache entry.
*/
if (relation->rd_createSubid == mySubid)
{
- /*
- * Valid rd_droppedSubid means the corresponding relation is dropped
- * but the relcache entry is preserved for at-commit pending sync. We
- * need to drop it explicitly here not to make the entry orphan.
- */
- Assert(relation->rd_droppedSubid == mySubid ||
- relation->rd_droppedSubid == InvalidSubTransactionId);
- if (isCommit && relation->rd_droppedSubid == InvalidSubTransactionId)
+ if (isCommit)
relation->rd_createSubid = parentSubid;
else if (RelationHasReferenceCountZero(relation))
{
- /* allow the entry to be removed */
- relation->rd_createSubid = InvalidSubTransactionId;
- relation->rd_newRelfilenodeSubid = InvalidSubTransactionId;
- relation->rd_firstRelfilenodeSubid = InvalidSubTransactionId;
- relation->rd_droppedSubid = InvalidSubTransactionId;
RelationClearRelation(relation, false);
return;
}
@@ -3286,8 +3157,7 @@ AtEOSubXact_cleanup(Relation relation, bool isCommit,
}
/*
- * Likewise, update or drop any new-relfilenode-in-subtransaction record
- * or drop record.
+ * Likewise, update or drop any new-relfilenode-in-subtransaction hint.
*/
if (relation->rd_newRelfilenodeSubid == mySubid)
{
@@ -3296,22 +3166,6 @@ AtEOSubXact_cleanup(Relation relation, bool isCommit,
else
relation->rd_newRelfilenodeSubid = InvalidSubTransactionId;
}
-
- if (relation->rd_firstRelfilenodeSubid == mySubid)
- {
- if (isCommit)
- relation->rd_firstRelfilenodeSubid = parentSubid;
- else
- relation->rd_firstRelfilenodeSubid = InvalidSubTransactionId;
- }
-
- if (relation->rd_droppedSubid == mySubid)
- {
- if (isCommit)
- relation->rd_droppedSubid = parentSubid;
- else
- relation->rd_droppedSubid = InvalidSubTransactionId;
- }
}
@@ -3401,7 +3255,6 @@ RelationBuildLocalRelation(const char *relname,
/* it's being created in this transaction */
rel->rd_createSubid = GetCurrentSubTransactionId();
rel->rd_newRelfilenodeSubid = InvalidSubTransactionId;
- rel->rd_firstRelfilenodeSubid = InvalidSubTransactionId;
/*
* create a new tuple descriptor from the one passed in. We do this
@@ -3699,29 +3552,14 @@ RelationSetNewRelfilenode(Relation relation, char persistence)
*/
CommandCounterIncrement();
- RelationAssumeNewRelfilenode(relation);
-}
-
-/*
- * RelationAssumeNewRelfilenode
- *
- * Code that modifies pg_class.reltablespace or pg_class.relfilenode must call
- * this. The call shall precede any code that might insert WAL records whose
- * replay would modify bytes in the new RelFileNode, and the call shall follow
- * any WAL modifying bytes in the prior RelFileNode. See struct RelationData.
- * Ideally, call this as near as possible to the CommandCounterIncrement()
- * that makes the pg_class change visible (before it or after it); that
- * minimizes the chance of future development adding a forbidden WAL insertion
- * between RelationAssumeNewRelfilenode() and CommandCounterIncrement().
- */
-void
-RelationAssumeNewRelfilenode(Relation relation)
-{
+ /*
+ * Mark the rel as having been given a new relfilenode in the current
+ * (sub) transaction. This is a hint that can be used to optimize later
+ * operations on the rel in the same transaction.
+ */
relation->rd_newRelfilenodeSubid = GetCurrentSubTransactionId();
- if (relation->rd_firstRelfilenodeSubid == InvalidSubTransactionId)
- relation->rd_firstRelfilenodeSubid = relation->rd_newRelfilenodeSubid;
- /* Flag relation as needing eoxact cleanup (to clear these fields) */
+ /* Flag relation as needing eoxact cleanup (to remove the hint) */
EOXactListAdd(relation);
}
@@ -5787,8 +5625,6 @@ load_relcache_init_file(bool shared)
rel->rd_fkeylist = NIL;
rel->rd_createSubid = InvalidSubTransactionId;
rel->rd_newRelfilenodeSubid = InvalidSubTransactionId;
- rel->rd_firstRelfilenodeSubid = InvalidSubTransactionId;
- rel->rd_droppedSubid = InvalidSubTransactionId;
rel->rd_amcache = NULL;
MemSet(&rel->pgstat_info, 0, sizeof(rel->pgstat_info));
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 7d1f1069f1..af876d1f01 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -36,7 +36,6 @@
#include "access/xlog_internal.h"
#include "catalog/namespace.h"
#include "catalog/pg_authid.h"
-#include "catalog/storage.h"
#include "commands/async.h"
#include "commands/prepare.h"
#include "commands/trigger.h"
@@ -2752,17 +2751,6 @@ static struct config_int ConfigureNamesInt[] =
},
{
- {"wal_skip_threshold", PGC_USERSET, WAL_SETTINGS,
- gettext_noop("Size of new file to fsync instead of writing WAL."),
- NULL,
- GUC_UNIT_KB
- },
- &wal_skip_threshold,
- 2048, 0, MAX_KILOBYTES,
- NULL, NULL, NULL
- },
-
- {
{"max_wal_senders", PGC_POSTMASTER, REPLICATION_SENDING,
gettext_noop("Sets the maximum number of simultaneously running WAL sender processes."),
NULL
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index c7e46592fb..aa44f0c9bf 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -215,7 +215,6 @@
# (change requires restart)
#wal_writer_delay = 200ms # 1-10000 milliseconds
#wal_writer_flush_after = 1MB # measured in pages, 0 disables
-#wal_skip_threshold = 2MB
#commit_delay = 0 # range 0-100000, in microseconds
#commit_siblings = 5 # range 1-1000
diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h
index 4bfc628000..18f2b0d98e 100644
--- a/src/include/access/gist_private.h
+++ b/src/include/access/gist_private.h
@@ -455,8 +455,6 @@ extern XLogRecPtr gistXLogSplit(bool page_is_leaf,
BlockNumber origrlink, GistNSN oldnsn,
Buffer leftchild, bool markfollowright);
-extern XLogRecPtr gistXLogAssignLSN(void);
-
/* gistget.c */
extern bool gistgettuple(IndexScanDesc scan, ScanDirection dir);
extern int64 gistgetbitmap(IndexScanDesc scan, TIDBitmap *tbm);
diff --git a/src/include/access/gistxlog.h b/src/include/access/gistxlog.h
index 673afee1e1..55fc843d3a 100644
--- a/src/include/access/gistxlog.h
+++ b/src/include/access/gistxlog.h
@@ -26,7 +26,6 @@
/* #define XLOG_GIST_INSERT_COMPLETE 0x40 */ /* not used anymore */
/* #define XLOG_GIST_CREATE_INDEX 0x50 */ /* not used anymore */
#define XLOG_GIST_PAGE_DELETE 0x60
-#define XLOG_GIST_ASSIGN_LSN 0x70 /* nop, assign new LSN */
/*
* Backup Blk 0: updated page.
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index f279edc473..47fda28daa 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -31,6 +31,7 @@
/* "options" flag bits for heap_insert */
+#define HEAP_INSERT_SKIP_WAL TABLE_INSERT_SKIP_WAL
#define HEAP_INSERT_SKIP_FSM TABLE_INSERT_SKIP_FSM
#define HEAP_INSERT_FROZEN TABLE_INSERT_FROZEN
#define HEAP_INSERT_NO_LOGICAL TABLE_INSERT_NO_LOGICAL
@@ -167,6 +168,8 @@ extern void simple_heap_delete(Relation relation, ItemPointer tid);
extern void simple_heap_update(Relation relation, ItemPointer otid,
HeapTuple tup);
+extern void heap_sync(Relation relation);
+
extern TransactionId heap_compute_xid_horizon_for_tuples(Relation rel,
ItemPointerData *items,
int nitems);
diff --git a/src/include/access/rewriteheap.h b/src/include/access/rewriteheap.h
index e6d7fa1e65..fb2902bd69 100644
--- a/src/include/access/rewriteheap.h
+++ b/src/include/access/rewriteheap.h
@@ -23,7 +23,7 @@ typedef struct RewriteStateData *RewriteState;
extern RewriteState begin_heap_rewrite(Relation OldHeap, Relation NewHeap,
TransactionId OldestXmin, TransactionId FreezeXid,
- MultiXactId MultiXactCutoff);
+ MultiXactId MultiXactCutoff, bool use_wal);
extern void end_heap_rewrite(RewriteState state);
extern void rewrite_heap_tuple(RewriteState state, HeapTuple oldTuple,
HeapTuple newTuple);
diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h
index 94903dd8de..91f84b1107 100644
--- a/src/include/access/tableam.h
+++ b/src/include/access/tableam.h
@@ -128,7 +128,7 @@ typedef struct TM_FailureData
} TM_FailureData;
/* "options" flag bits for table_tuple_insert */
-/* TABLE_INSERT_SKIP_WAL was 0x0001; RelationNeedsWAL() now governs */
+#define TABLE_INSERT_SKIP_WAL 0x0001
#define TABLE_INSERT_SKIP_FSM 0x0002
#define TABLE_INSERT_FROZEN 0x0004
#define TABLE_INSERT_NO_LOGICAL 0x0008
@@ -410,8 +410,9 @@ typedef struct TableAmRoutine
/*
* Perform operations necessary to complete insertions made via
- * tuple_insert and multi_insert with a BulkInsertState specified. In-tree
- * access methods ceased to use this.
+ * tuple_insert and multi_insert with a BulkInsertState specified. This
+ * may for example be used to flush the relation, when the
+ * TABLE_INSERT_SKIP_WAL option was used.
*
* Typically callers of tuple_insert and multi_insert will just pass all
* the flags that apply to them, and each AM has to decide which of them
@@ -1118,6 +1119,10 @@ table_compute_xid_horizon_for_tuples(Relation rel,
* The options bitmask allows the caller to specify options that may change the
* behaviour of the AM. The AM will ignore options that it does not support.
*
+ * If the TABLE_INSERT_SKIP_WAL option is specified, the new tuple doesn't
+ * need to be logged to WAL, even for a non-temp relation. It is the AMs
+ * choice whether this optimization is supported.
+ *
* If the TABLE_INSERT_SKIP_FSM option is specified, AMs are free to not reuse
* free space in the relation. This can save some cycles when we know the
* relation is new and doesn't contain useful amounts of free space.
@@ -1337,7 +1342,9 @@ table_tuple_lock(Relation rel, ItemPointer tid, Snapshot snapshot,
/*
* Perform operations necessary to complete insertions made via
- * tuple_insert and multi_insert with a BulkInsertState specified.
+ * tuple_insert and multi_insert with a BulkInsertState specified. This
+ * e.g. may e.g. used to flush the relation when inserting with
+ * TABLE_INSERT_SKIP_WAL specified.
*/
static inline void
table_finish_bulk_insert(Relation rel, int options)
diff --git a/src/include/catalog/storage.h b/src/include/catalog/storage.h
index bd37bf311c..048003c25e 100644
--- a/src/include/catalog/storage.h
+++ b/src/include/catalog/storage.h
@@ -19,24 +19,18 @@
#include "storage/smgr.h"
#include "utils/relcache.h"
-/* GUC variables */
-extern int wal_skip_threshold;
-
extern SMgrRelation RelationCreateStorage(RelFileNode rnode, char relpersistence);
extern void RelationDropStorage(Relation rel);
extern void RelationPreserveStorage(RelFileNode rnode, bool atCommit);
-extern void RelationPreTruncate(Relation rel);
extern void RelationTruncate(Relation rel, BlockNumber nblocks);
extern void RelationCopyStorage(SMgrRelation src, SMgrRelation dst,
ForkNumber forkNum, char relpersistence);
-extern bool RelFileNodeSkippingWAL(RelFileNode rnode);
/*
* These functions used to be in storage/smgr/smgr.c, which explains the
* naming
*/
extern void smgrDoPendingDeletes(bool isCommit);
-extern void smgrDoPendingSyncs(bool isCommit);
extern int smgrGetPendingDeletes(bool forCommit, RelFileNode **ptr);
extern void AtSubCommit_smgr(void);
extern void AtSubAbort_smgr(void);
diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h
index 9c41bd5915..2039b42449 100644
--- a/src/include/nodes/parsenodes.h
+++ b/src/include/nodes/parsenodes.h
@@ -2782,9 +2782,6 @@ typedef struct IndexStmt
char *idxcomment; /* comment to apply to index, or NULL */
Oid indexOid; /* OID of an existing index, if any */
Oid oldNode; /* relfilenode of existing storage, if any */
- SubTransactionId oldCreateSubid; /* rd_createSubid of oldNode */
- SubTransactionId oldFirstRelfilenodeSubid; /* rd_firstRelfilenodeSubid of
- * oldNode */
bool unique; /* is index unique? */
bool primary; /* is index a primary key? */
bool isconstraint; /* is it for a pkey/unique constraint? */
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index bf3b12a2de..d2a5b52f6e 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -49,9 +49,6 @@ typedef enum
/* forward declared, to avoid having to expose buf_internals.h here */
struct WritebackContext;
-/* forward declared, to avoid including smgr.h here */
-struct SMgrRelationData;
-
/* in globals.c ... this duplicates miscadmin.h */
extern PGDLLIMPORT int NBuffers;
@@ -189,7 +186,6 @@ extern BlockNumber RelationGetNumberOfBlocksInFork(Relation relation,
ForkNumber forkNum);
extern void FlushOneBuffer(Buffer buffer);
extern void FlushRelationBuffers(Relation rel);
-extern void FlushRelationsAllBuffers(struct SMgrRelationData **smgrs, int nrels);
extern void FlushDatabaseBuffers(Oid dbid);
extern void DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber *forkNum,
int nforks, BlockNumber *firstDelBlock);
diff --git a/src/include/storage/lock.h b/src/include/storage/lock.h
index fdabf42721..a89e54dbb0 100644
--- a/src/include/storage/lock.h
+++ b/src/include/storage/lock.h
@@ -546,9 +546,6 @@ extern void LockReleaseSession(LOCKMETHODID lockmethodid);
extern void LockReleaseCurrentOwner(LOCALLOCK **locallocks, int nlocks);
extern void LockReassignCurrentOwner(LOCALLOCK **locallocks, int nlocks);
extern bool LockHeldByMe(const LOCKTAG *locktag, LOCKMODE lockmode);
-#ifdef USE_ASSERT_CHECKING
-extern HTAB *GetLockMethodLocalHash(void);
-#endif
extern bool LockHasWaiters(const LOCKTAG *locktag,
LOCKMODE lockmode, bool sessionLock);
extern VirtualTransactionId *GetLockConflicts(const LOCKTAG *locktag,
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index 79dfe0e373..243822137c 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -89,7 +89,6 @@ extern void smgrcloseall(void);
extern void smgrclosenode(RelFileNodeBackend rnode);
extern void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo);
extern void smgrdounlink(SMgrRelation reln, bool isRedo);
-extern void smgrdosyncall(SMgrRelation *rels, int nrels);
extern void smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo);
extern void smgrextend(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, char *buffer, bool skipFsync);
diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h
index 461f64e611..39cdcddc2b 100644
--- a/src/include/utils/rel.h
+++ b/src/include/utils/rel.h
@@ -66,45 +66,25 @@ typedef struct RelationData
/*----------
* rd_createSubid is the ID of the highest subtransaction the rel has
- * survived into or zero if the rel or its rd_node was created before the
- * current top transaction. (IndexStmt.oldNode leads to the case of a new
- * rel with an old rd_node.) rd_firstRelfilenodeSubid is the ID of the
- * highest subtransaction an rd_node change has survived into or zero if
- * rd_node matches the value it had at the start of the current top
- * transaction. (Rolling back the subtransaction that
- * rd_firstRelfilenodeSubid denotes would restore rd_node to the value it
- * had at the start of the current top transaction. Rolling back any
- * lower subtransaction would not.) Their accuracy is critical to
- * RelationNeedsWAL().
- *
- * rd_newRelfilenodeSubid is the ID of the highest subtransaction the
- * most-recent relfilenode change has survived into or zero if not changed
- * in the current transaction (or we have forgotten changing it). This
- * field is accurate when non-zero, but it can be zero when a relation has
- * multiple new relfilenodes within a single transaction, with one of them
- * occurring in a subsequently aborted subtransaction, e.g.
+ * survived into; or zero if the rel was not created in the current top
+ * transaction. This can be now be relied on, whereas previously it could
+ * be "forgotten" in earlier releases. Likewise, rd_newRelfilenodeSubid is
+ * the ID of the highest subtransaction the relfilenode change has
+ * survived into, or zero if not changed in the current transaction (or we
+ * have forgotten changing it). rd_newRelfilenodeSubid can be forgotten
+ * when a relation has multiple new relfilenodes within a single
+ * transaction, with one of them occurring in a subsequently aborted
+ * subtransaction, e.g.
* BEGIN;
* TRUNCATE t;
* SAVEPOINT save;
* TRUNCATE t;
* ROLLBACK TO save;
* -- rd_newRelfilenodeSubid is now forgotten
- *
- * If every rd_*Subid field is zero, they are read-only outside
- * relcache.c. Files that trigger rd_node changes by updating
- * pg_class.reltablespace and/or pg_class.relfilenode call
- * RelationAssumeNewRelfilenode() to update rd_*Subid.
- *
- * rd_droppedSubid is the ID of the highest subtransaction that a drop of
- * the rel has survived into. In entries visible outside relcache.c, this
- * is always zero.
*/
SubTransactionId rd_createSubid; /* rel was created in current xact */
- SubTransactionId rd_newRelfilenodeSubid; /* highest subxact changing
- * rd_node to current value */
- SubTransactionId rd_firstRelfilenodeSubid; /* highest subxact changing
- * rd_node to any value */
- SubTransactionId rd_droppedSubid; /* dropped with another Subid set */
+ SubTransactionId rd_newRelfilenodeSubid; /* new relfilenode assigned in
+ * current xact */
Form_pg_class rd_rel; /* RELATION tuple */
TupleDesc rd_att; /* tuple descriptor */
@@ -551,16 +531,9 @@ typedef struct ViewOptions
/*
* RelationNeedsWAL
* True if relation needs WAL.
- *
- * Returns false if wal_level = minimal and this relation is created or
- * truncated in the current transaction. See "Skipping WAL for New
- * RelFileNode" in src/backend/access/transam/README.
- */
-#define RelationNeedsWAL(relation) \
- ((relation)->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT && \
- (XLogIsNeeded() || \
- (relation->rd_createSubid == InvalidSubTransactionId && \
- relation->rd_firstRelfilenodeSubid == InvalidSubTransactionId)))
+ */
+#define RelationNeedsWAL(relation) \
+ ((relation)->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT)
/*
* RelationUsesLocalBuffers
diff --git a/src/include/utils/relcache.h b/src/include/utils/relcache.h
index 62239a09e8..d77f5beec6 100644
--- a/src/include/utils/relcache.h
+++ b/src/include/utils/relcache.h
@@ -106,10 +106,9 @@ extern Relation RelationBuildLocalRelation(const char *relname,
char relkind);
/*
- * Routines to manage assignment of new relfilenode to a relation
+ * Routine to manage assignment of new relfilenode to a relation
*/
extern void RelationSetNewRelfilenode(Relation relation, char persistence);
-extern void RelationAssumeNewRelfilenode(Relation relation);
/*
* Routines for flushing/rebuilding relcache entries in various scenarios
@@ -122,11 +121,6 @@ extern void RelationCacheInvalidate(void);
extern void RelationCloseSmgrByOid(Oid relationId);
-#ifdef USE_ASSERT_CHECKING
-extern void AssertPendingSyncs_RelationCache(void);
-#else
-#define AssertPendingSyncs_RelationCache() do {} while (0)
-#endif
extern void AtEOXact_RelationCache(bool isCommit);
extern void AtEOSubXact_RelationCache(bool isCommit, SubTransactionId mySubid,
SubTransactionId parentSubid);
diff --git a/src/test/recovery/t/018_wal_optimize.pl b/src/test/recovery/t/018_wal_optimize.pl
deleted file mode 100644
index 50bb2fef61..0000000000
--- a/src/test/recovery/t/018_wal_optimize.pl
+++ /dev/null
@@ -1,372 +0,0 @@
-# Test WAL replay when some operation has skipped WAL.
-#
-# These tests exercise code that once violated the mandate described in
-# src/backend/access/transam/README section "Skipping WAL for New
-# RelFileNode". The tests work by committing some transactions, initiating an
-# immediate shutdown, and confirming that the expected data survives recovery.
-# For many years, individual commands made the decision to skip WAL, hence the
-# frequent appearance of COPY in these tests.
-use strict;
-use warnings;
-
-use PostgresNode;
-use TestLib;
-use Test::More tests => 34;
-
-sub check_orphan_relfilenodes
-{
- my ($node, $test_name) = @_;
-
- my $db_oid = $node->safe_psql('postgres',
- "SELECT oid FROM pg_database WHERE datname = 'postgres'");
- my $prefix = "base/$db_oid/";
- my $filepaths_referenced = $node->safe_psql(
- 'postgres', "
- SELECT pg_relation_filepath(oid) FROM pg_class
- WHERE reltablespace = 0 AND relpersistence <> 't' AND
- pg_relation_filepath(oid) IS NOT NULL;");
- is_deeply(
- [
- sort(map { "$prefix$_" }
- grep(/^[0-9]+$/, slurp_dir($node->data_dir . "/$prefix")))
- ],
- [ sort split /\n/, $filepaths_referenced ],
- $test_name);
- return;
-}
-
-# We run this same test suite for both wal_level=minimal and replica.
-sub run_wal_optimize
-{
- my $wal_level = shift;
-
- my $node = get_new_node("node_$wal_level");
- $node->init;
- $node->append_conf(
- 'postgresql.conf', qq(
-wal_level = $wal_level
-max_prepared_transactions = 1
-wal_log_hints = on
-wal_skip_threshold = 0
-#wal_debug = on
-));
- $node->start;
-
- # Setup
- my $tablespace_dir = $node->basedir . '/tablespace_other';
- mkdir($tablespace_dir);
- $tablespace_dir = TestLib::perl2host($tablespace_dir);
- $node->safe_psql('postgres',
- "CREATE TABLESPACE other LOCATION '$tablespace_dir';");
-
- # Test direct truncation optimization. No tuples.
- $node->safe_psql(
- 'postgres', "
- BEGIN;
- CREATE TABLE trunc (id serial PRIMARY KEY);
- TRUNCATE trunc;
- COMMIT;");
- $node->stop('immediate');
- $node->start;
- my $result = $node->safe_psql('postgres', "SELECT count(*) FROM trunc;");
- is($result, qq(0), "wal_level = $wal_level, TRUNCATE with empty table");
-
- # Test truncation with inserted tuples within the same transaction.
- # Tuples inserted after the truncation should be seen.
- $node->safe_psql(
- 'postgres', "
- BEGIN;
- CREATE TABLE trunc_ins (id serial PRIMARY KEY);
- INSERT INTO trunc_ins VALUES (DEFAULT);
- TRUNCATE trunc_ins;
- INSERT INTO trunc_ins VALUES (DEFAULT);
- COMMIT;");
- $node->stop('immediate');
- $node->start;
- $result = $node->safe_psql('postgres',
- "SELECT count(*), min(id) FROM trunc_ins;");
- is($result, qq(1|2), "wal_level = $wal_level, TRUNCATE INSERT");
-
- # Same for prepared transaction.
- # Tuples inserted after the truncation should be seen.
- $node->safe_psql(
- 'postgres', "
- BEGIN;
- CREATE TABLE twophase (id serial PRIMARY KEY);
- INSERT INTO twophase VALUES (DEFAULT);
- TRUNCATE twophase;
- INSERT INTO twophase VALUES (DEFAULT);
- PREPARE TRANSACTION 't';
- COMMIT PREPARED 't';");
- $node->stop('immediate');
- $node->start;
- $result = $node->safe_psql('postgres',
- "SELECT count(*), min(id) FROM trunc_ins;");
- is($result, qq(1|2), "wal_level = $wal_level, TRUNCATE INSERT PREPARE");
-
- # Writing WAL at end of xact, instead of syncing.
- $node->safe_psql(
- 'postgres', "
- SET wal_skip_threshold = '1TB';
- BEGIN;
- CREATE TABLE noskip (id serial PRIMARY KEY);
- INSERT INTO noskip (SELECT FROM generate_series(1, 20000) a) ;
- COMMIT;");
- $node->stop('immediate');
- $node->start;
- $result = $node->safe_psql('postgres', "SELECT count(*) FROM noskip;");
- is($result, qq(20000), "wal_level = $wal_level, end-of-xact WAL");
-
- # Data file for COPY query in subsequent tests
- my $basedir = $node->basedir;
- my $copy_file = "$basedir/copy_data.txt";
- TestLib::append_to_file(
- $copy_file, qq(20000,30000
-20001,30001
-20002,30002));
-
- # Test truncation with inserted tuples using both INSERT and COPY. Tuples
- # inserted after the truncation should be seen.
- $node->safe_psql(
- 'postgres', "
- BEGIN;
- CREATE TABLE ins_trunc (id serial PRIMARY KEY, id2 int);
- INSERT INTO ins_trunc VALUES (DEFAULT, generate_series(1,10000));
- TRUNCATE ins_trunc;
- INSERT INTO ins_trunc (id, id2) VALUES (DEFAULT, 10000);
- COPY ins_trunc FROM '$copy_file' DELIMITER ',';
- INSERT INTO ins_trunc (id, id2) VALUES (DEFAULT, 10000);
- COMMIT;");
- $node->stop('immediate');
- $node->start;
- $result = $node->safe_psql('postgres', "SELECT count(*) FROM ins_trunc;");
- is($result, qq(5), "wal_level = $wal_level, TRUNCATE COPY INSERT");
-
- # Test truncation with inserted tuples using COPY. Tuples copied after
- # the truncation should be seen.
- $node->safe_psql(
- 'postgres', "
- BEGIN;
- CREATE TABLE trunc_copy (id serial PRIMARY KEY, id2 int);
- INSERT INTO trunc_copy VALUES (DEFAULT, generate_series(1,3000));
- TRUNCATE trunc_copy;
- COPY trunc_copy FROM '$copy_file' DELIMITER ',';
- COMMIT;");
- $node->stop('immediate');
- $node->start;
- $result =
- $node->safe_psql('postgres', "SELECT count(*) FROM trunc_copy;");
- is($result, qq(3), "wal_level = $wal_level, TRUNCATE COPY");
-
- # Like previous test, but rollback SET TABLESPACE in a subtransaction.
- $node->safe_psql(
- 'postgres', "
- BEGIN;
- CREATE TABLE spc_abort (id serial PRIMARY KEY, id2 int);
- INSERT INTO spc_abort VALUES (DEFAULT, generate_series(1,3000));
- TRUNCATE spc_abort;
- SAVEPOINT s;
- ALTER TABLE spc_abort SET TABLESPACE other; ROLLBACK TO s;
- COPY spc_abort FROM '$copy_file' DELIMITER ',';
- COMMIT;");
- $node->stop('immediate');
- $node->start;
- $result = $node->safe_psql('postgres', "SELECT count(*) FROM spc_abort;");
- is($result, qq(3),
- "wal_level = $wal_level, SET TABLESPACE abort subtransaction");
-
- # in different subtransaction patterns
- $node->safe_psql(
- 'postgres', "
- BEGIN;
- CREATE TABLE spc_commit (id serial PRIMARY KEY, id2 int);
- INSERT INTO spc_commit VALUES (DEFAULT, generate_series(1,3000));
- TRUNCATE spc_commit;
- SAVEPOINT s; ALTER TABLE spc_commit SET TABLESPACE other; RELEASE s;
- COPY spc_commit FROM '$copy_file' DELIMITER ',';
- COMMIT;");
- $node->stop('immediate');
- $node->start;
- $result =
- $node->safe_psql('postgres', "SELECT count(*) FROM spc_commit;");
- is($result, qq(3),
- "wal_level = $wal_level, SET TABLESPACE commit subtransaction");
-
- $node->safe_psql(
- 'postgres', "
- BEGIN;
- CREATE TABLE spc_nest (id serial PRIMARY KEY, id2 int);
- INSERT INTO spc_nest VALUES (DEFAULT, generate_series(1,3000));
- TRUNCATE spc_nest;
- SAVEPOINT s;
- ALTER TABLE spc_nest SET TABLESPACE other;
- SAVEPOINT s2;
- ALTER TABLE spc_nest SET TABLESPACE pg_default;
- ROLLBACK TO s2;
- SAVEPOINT s2;
- ALTER TABLE spc_nest SET TABLESPACE pg_default;
- RELEASE s2;
- ROLLBACK TO s;
- COPY spc_nest FROM '$copy_file' DELIMITER ',';
- COMMIT;");
- $node->stop('immediate');
- $node->start;
- $result = $node->safe_psql('postgres', "SELECT count(*) FROM spc_nest;");
- is($result, qq(3),
- "wal_level = $wal_level, SET TABLESPACE nested subtransaction");
-
- $node->safe_psql(
- 'postgres', "
- CREATE TABLE spc_hint (id int);
- INSERT INTO spc_hint VALUES (1);
- BEGIN;
- ALTER TABLE spc_hint SET TABLESPACE other;
- CHECKPOINT;
- SELECT * FROM spc_hint; -- set hint bit
- INSERT INTO spc_hint VALUES (2);
- COMMIT;");
- $node->stop('immediate');
- $node->start;
- $result = $node->safe_psql('postgres', "SELECT count(*) FROM spc_hint;");
- is($result, qq(2), "wal_level = $wal_level, SET TABLESPACE, hint bit");
-
- $node->safe_psql(
- 'postgres', "
- BEGIN;
- CREATE TABLE idx_hint (c int PRIMARY KEY);
- SAVEPOINT q; INSERT INTO idx_hint VALUES (1); ROLLBACK TO q;
- CHECKPOINT;
- INSERT INTO idx_hint VALUES (1); -- set index hint bit
- INSERT INTO idx_hint VALUES (2);
- COMMIT;");
- $node->stop('immediate');
- $node->start;
- $result = $node->psql('postgres',);
- my ($ret, $stdout, $stderr) =
- $node->psql('postgres', "INSERT INTO idx_hint VALUES (2);");
- is($ret, qq(3), "wal_level = $wal_level, unique index LP_DEAD");
- like(
- $stderr,
- qr/violates unique/,
- "wal_level = $wal_level, unique index LP_DEAD message");
-
- # UPDATE touches two buffers for one row.
- $node->safe_psql(
- 'postgres', "
- BEGIN;
- CREATE TABLE upd (id serial PRIMARY KEY, id2 int);
- INSERT INTO upd (id, id2) VALUES (DEFAULT, generate_series(1,10000));
- COPY upd FROM '$copy_file' DELIMITER ',';
- UPDATE upd SET id2 = id2 + 1;
- DELETE FROM upd;
- COMMIT;");
- $node->stop('immediate');
- $node->start;
- $result = $node->safe_psql('postgres', "SELECT count(*) FROM upd;");
- is($result, qq(0),
- "wal_level = $wal_level, UPDATE touches two buffers for one row");
-
- # Test consistency of COPY with INSERT for table created in the same
- # transaction.
- $node->safe_psql(
- 'postgres', "
- BEGIN;
- CREATE TABLE ins_copy (id serial PRIMARY KEY, id2 int);
- INSERT INTO ins_copy VALUES (DEFAULT, 1);
- COPY ins_copy FROM '$copy_file' DELIMITER ',';
- COMMIT;");
- $node->stop('immediate');
- $node->start;
- $result = $node->safe_psql('postgres', "SELECT count(*) FROM ins_copy;");
- is($result, qq(4), "wal_level = $wal_level, INSERT COPY");
-
- # Test consistency of COPY that inserts more to the same table using
- # triggers. If the INSERTS from the trigger go to the same block data
- # is copied to, and the INSERTs are WAL-logged, WAL replay will fail when
- # it tries to replay the WAL record but the "before" image doesn't match,
- # because not all changes were WAL-logged.
- $node->safe_psql(
- 'postgres', "
- BEGIN;
- CREATE TABLE ins_trig (id serial PRIMARY KEY, id2 text);
- CREATE FUNCTION ins_trig_before_row_trig() RETURNS trigger
- LANGUAGE plpgsql as \$\$
- BEGIN
- IF new.id2 NOT LIKE 'triggered%' THEN
- INSERT INTO ins_trig
- VALUES (DEFAULT, 'triggered row before' || NEW.id2);
- END IF;
- RETURN NEW;
- END; \$\$;
- CREATE FUNCTION ins_trig_after_row_trig() RETURNS trigger
- LANGUAGE plpgsql as \$\$
- BEGIN
- IF new.id2 NOT LIKE 'triggered%' THEN
- INSERT INTO ins_trig
- VALUES (DEFAULT, 'triggered row after' || NEW.id2);
- END IF;
- RETURN NEW;
- END; \$\$;
- CREATE TRIGGER ins_trig_before_row_insert
- BEFORE INSERT ON ins_trig
- FOR EACH ROW EXECUTE PROCEDURE ins_trig_before_row_trig();
- CREATE TRIGGER ins_trig_after_row_insert
- AFTER INSERT ON ins_trig
- FOR EACH ROW EXECUTE PROCEDURE ins_trig_after_row_trig();
- COPY ins_trig FROM '$copy_file' DELIMITER ',';
- COMMIT;");
- $node->stop('immediate');
- $node->start;
- $result = $node->safe_psql('postgres', "SELECT count(*) FROM ins_trig;");
- is($result, qq(9), "wal_level = $wal_level, COPY with INSERT triggers");
-
- # Test consistency of INSERT, COPY and TRUNCATE in same transaction block
- # with TRUNCATE triggers.
- $node->safe_psql(
- 'postgres', "
- BEGIN;
- CREATE TABLE trunc_trig (id serial PRIMARY KEY, id2 text);
- CREATE FUNCTION trunc_trig_before_stat_trig() RETURNS trigger
- LANGUAGE plpgsql as \$\$
- BEGIN
- INSERT INTO trunc_trig VALUES (DEFAULT, 'triggered stat before');
- RETURN NULL;
- END; \$\$;
- CREATE FUNCTION trunc_trig_after_stat_trig() RETURNS trigger
- LANGUAGE plpgsql as \$\$
- BEGIN
- INSERT INTO trunc_trig VALUES (DEFAULT, 'triggered stat before');
- RETURN NULL;
- END; \$\$;
- CREATE TRIGGER trunc_trig_before_stat_truncate
- BEFORE TRUNCATE ON trunc_trig
- FOR EACH STATEMENT EXECUTE PROCEDURE trunc_trig_before_stat_trig();
- CREATE TRIGGER trunc_trig_after_stat_truncate
- AFTER TRUNCATE ON trunc_trig
- FOR EACH STATEMENT EXECUTE PROCEDURE trunc_trig_after_stat_trig();
- INSERT INTO trunc_trig VALUES (DEFAULT, 1);
- TRUNCATE trunc_trig;
- COPY trunc_trig FROM '$copy_file' DELIMITER ',';
- COMMIT;");
- $node->stop('immediate');
- $node->start;
- $result =
- $node->safe_psql('postgres', "SELECT count(*) FROM trunc_trig;");
- is($result, qq(4),
- "wal_level = $wal_level, TRUNCATE COPY with TRUNCATE triggers");
-
- # Test redo of temp table creation.
- $node->safe_psql(
- 'postgres', "
- CREATE TEMP TABLE temp (id serial PRIMARY KEY, id2 text);");
- $node->stop('immediate');
- $node->start;
- check_orphan_relfilenodes($node,
- "wal_level = $wal_level, no orphan relfilenode remains");
-
- return;
-}
-
-# Run same test suite for multiple wal_level values.
-run_wal_optimize("minimal");
-run_wal_optimize("replica");
diff --git a/src/test/regress/expected/alter_table.out b/src/test/regress/expected/alter_table.out
index 7c2181ac2f..fb6d86a269 100644
--- a/src/test/regress/expected/alter_table.out
+++ b/src/test/regress/expected/alter_table.out
@@ -1984,12 +1984,6 @@ select * from another;
(3 rows)
drop table another;
--- Create an index that skips WAL, then perform a SET DATA TYPE that skips
--- rewriting the index.
-begin;
-create table skip_wal_skip_rewrite_index (c varchar(10) primary key);
-alter table skip_wal_skip_rewrite_index alter c type varchar(20);
-commit;
-- table's row type
create table tab1 (a int, b text);
create table tab2 (x int, y tab1);
diff --git a/src/test/regress/expected/create_table.out b/src/test/regress/expected/create_table.out
index 6acf31725f..c5e95edbed 100644
--- a/src/test/regress/expected/create_table.out
+++ b/src/test/regress/expected/create_table.out
@@ -331,19 +331,6 @@ CREATE TABLE default_expr_agg (a int DEFAULT (generate_series(1,3)));
ERROR: set-returning functions are not allowed in DEFAULT expressions
LINE 1: CREATE TABLE default_expr_agg (a int DEFAULT (generate_serie...
^
--- Verify that subtransaction rollback restores rd_createSubid.
-BEGIN;
-CREATE TABLE remember_create_subid (c int);
-SAVEPOINT q; DROP TABLE remember_create_subid; ROLLBACK TO q;
-COMMIT;
-DROP TABLE remember_create_subid;
--- Verify that subtransaction rollback restores rd_firstRelfilenodeSubid.
-CREATE TABLE remember_node_subid (c int);
-BEGIN;
-ALTER TABLE remember_node_subid ALTER c TYPE bigint;
-SAVEPOINT q; DROP TABLE remember_node_subid; ROLLBACK TO q;
-COMMIT;
-DROP TABLE remember_node_subid;
--
-- Partitioned tables
--
diff --git a/src/test/regress/sql/alter_table.sql b/src/test/regress/sql/alter_table.sql
index 1b1315f316..3801f19c58 100644
--- a/src/test/regress/sql/alter_table.sql
+++ b/src/test/regress/sql/alter_table.sql
@@ -1360,13 +1360,6 @@ select * from another;
drop table another;
--- Create an index that skips WAL, then perform a SET DATA TYPE that skips
--- rewriting the index.
-begin;
-create table skip_wal_skip_rewrite_index (c varchar(10) primary key);
-alter table skip_wal_skip_rewrite_index alter c type varchar(20);
-commit;
-
-- table's row type
create table tab1 (a int, b text);
create table tab2 (x int, y tab1);
diff --git a/src/test/regress/sql/create_table.sql b/src/test/regress/sql/create_table.sql
index a670438c48..00ef81a685 100644
--- a/src/test/regress/sql/create_table.sql
+++ b/src/test/regress/sql/create_table.sql
@@ -318,21 +318,6 @@ CREATE TABLE default_expr_agg (a int DEFAULT (select 1));
-- invalid use of set-returning function
CREATE TABLE default_expr_agg (a int DEFAULT (generate_series(1,3)));
--- Verify that subtransaction rollback restores rd_createSubid.
-BEGIN;
-CREATE TABLE remember_create_subid (c int);
-SAVEPOINT q; DROP TABLE remember_create_subid; ROLLBACK TO q;
-COMMIT;
-DROP TABLE remember_create_subid;
-
--- Verify that subtransaction rollback restores rd_firstRelfilenodeSubid.
-CREATE TABLE remember_node_subid (c int);
-BEGIN;
-ALTER TABLE remember_node_subid ALTER c TYPE bigint;
-SAVEPOINT q; DROP TABLE remember_node_subid; ROLLBACK TO q;
-COMMIT;
-DROP TABLE remember_node_subid;
-
--
-- Partitioned tables
--