Back-port changes of Jan 16 and 17 to "revoke" pending fsync requests during

DROP TABLE and DROP DATABASE. Should prevent unexpected "permission denied" failures on Windows, and is cleaner on other platforms too since we no longer have to take it on faith that ENOENT is okay during an fsync attempt. Patched as far back as 8.1; per recent discussion I think we are not going to worry about Windows-specific issues in 8.0 anymore.
author: Tom Lane <tgl@sss.pgh.pa.us> 2007-01-27 20:15:55 +0000
committer: Tom Lane <tgl@sss.pgh.pa.us> 2007-01-27 20:15:55 +0000
commit: 722ad326f17ab6cf623d350baa9679dcad23088b (patch)
tree: e23398200004a90fe111d0079d8011b41f2c28da
parent: 03d128147719acac9de6f7e118525db9370fbf05 (diff)
download: postgresql-722ad326f17ab6cf623d350baa9679dcad23088b.tar.gz
4 files changed, 287 insertions, 87 deletions
diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c
index 5fa0f6155d..e12b7be407 100644
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -15,7 +15,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/commands/dbcommands.c,v 1.173.2.1 2005/11/22 18:23:07 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/commands/dbcommands.c,v 1.173.2.2 2007/01/27 20:15:55 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -42,6 +42,7 @@
 #include "storage/fd.h"
 #include "storage/freespace.h"
 #include "storage/procarray.h"
+#include "storage/smgr.h"
 #include "utils/acl.h"
 #include "utils/array.h"
 #include "utils/builtins.h"
@@ -665,6 +666,12 @@ dropdb(const char *dbname)
 	FreeSpaceMapForgetDatabase(db_id);
 
 	/*
+	 * Tell bgwriter to forget any pending fsync requests for files in the
+	 * database; else it'll fail at next checkpoint.
+	 */
+	ForgetDatabaseFsyncRequests(db_id);
+
+	/*
 	 * On Windows, force a checkpoint so that the bgwriter doesn't hold any
 	 * open files, which would cause rmdir() to fail.
 	 */
diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c
index 62e51fa4f1..fefcbdf67b 100644
--- a/src/backend/postmaster/bgwriter.c
+++ b/src/backend/postmaster/bgwriter.c
@@ -2,7 +2,7 @@
  *
  * bgwriter.c
  *
- * The background writer (bgwriter) is new in Postgres 8.0.  It attempts
+ * The background writer (bgwriter) is new as of Postgres 8.0.  It attempts
  * to keep regular backends from having to write out dirty shared buffers
  * (which they would only do when needing to free a shared buffer to read in
  * another page).  In the best scenario all writes from shared buffers will
@@ -37,7 +37,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.21.2.1 2005/12/08 19:19:31 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.21.2.2 2007/01/27 20:15:55 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -101,8 +101,8 @@
 typedef struct
 {
 	RelFileNode rnode;
-	BlockNumber segno;
-	/* might add a request-type field later */
+	BlockNumber segno;			/* see md.c for special values */
+	/* might add a real request-type field later; not needed yet */
 } BgWriterRequest;
 
 typedef struct
@@ -626,6 +626,11 @@ RequestCheckpoint(bool waitforit, bool warnontime)
  * the backend calls this routine to pass over knowledge that the relation
  * is dirty and must be fsync'd before next checkpoint.
  *
+ * segno specifies which segment (not block!) of the relation needs to be
+ * fsync'd.  (Since the valid range is much less than BlockNumber, we can
+ * use high values for special flags; that's all internal to md.c, which
+ * see for details.)
+ *
  * If we are unable to pass over the request (at present, this can happen
  * if the shared memory queue is full), we return false.  That forces
  * the backend to do its own fsync.  We hope that will be even more seldom.
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 6160e734f4..57582f55b3 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.118.2.1 2006/11/20 01:08:02 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.118.2.2 2007/01/27 20:15:55 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -28,6 +28,26 @@
 #include "utils/memutils.h"
 
 
+/* interval for calling AbsorbFsyncRequests in mdsync */
+#define FSYNCS_PER_ABSORB		10
+
+/* special values for the segno arg to RememberFsyncRequest */
+#define FORGET_RELATION_FSYNC	(InvalidBlockNumber)
+#define FORGET_DATABASE_FSYNC	(InvalidBlockNumber-1)
+
+/*
+ * On Windows, we have to interpret EACCES as possibly meaning the same as
+ * ENOENT, because if a file is unlinked-but-not-yet-gone on that platform,
+ * that's what you get.  Ugh.  This code is designed so that we don't
+ * actually believe these cases are okay without further evidence (namely,
+ * a pending fsync request getting revoked ... see mdsync).
+ */
+#ifndef WIN32
+#define FILE_POSSIBLY_DELETED(err)  ((err) == ENOENT)
+#else
+#define FILE_POSSIBLY_DELETED(err)  ((err) == ENOENT || (err) == EACCES)
+#endif
+
 /*
  *	The magnetic disk storage manager keeps track of open file
  *	descriptors in its own descriptor pool.  This is done to make it
@@ -90,9 +110,8 @@ static MemoryContext MdCxt;		/* context for all md.c allocations */
  * we keep track of pending fsync operations: we need to remember all relation
  * segments that have been written since the last checkpoint, so that we can
  * fsync them down to disk before completing the next checkpoint.  This hash
- * table remembers the pending operations.	We use a hash table not because
- * we want to look up individual operations, but simply as a convenient way
- * of eliminating duplicate requests.
+ * table remembers the pending operations.	We use a hash table mostly as
+ * a convenient way of eliminating duplicate requests.
  *
  * (Regular backends do not track pending operations locally, but forward
  * them to the bgwriter.)
@@ -101,6 +120,12 @@ typedef struct
 {
 	RelFileNode rnode;			/* the targeted relation */
 	BlockNumber segno;			/* which segment */
+} PendingOperationTag;
+
+typedef struct
+{
+	PendingOperationTag tag;	/* hash table key (must be first!) */
+	int			failures;		/* number of failed attempts to fsync */
 } PendingOperationEntry;
 
 static HTAB *pendingOpsTable = NULL;
@@ -143,7 +168,7 @@ mdinit(void)
 		HASHCTL		hash_ctl;
 
 		MemSet(&hash_ctl, 0, sizeof(hash_ctl));
-		hash_ctl.keysize = sizeof(PendingOperationEntry);
+		hash_ctl.keysize = sizeof(PendingOperationTag);
 		hash_ctl.entrysize = sizeof(PendingOperationEntry);
 		hash_ctl.hash = tag_hash;
 		hash_ctl.hcxt = MdCxt;
@@ -226,6 +251,12 @@ mdunlink(RelFileNode rnode, bool isRedo)
 	int			save_errno = 0;
 	char	   *path;
 
+	/*
+	 * We have to clean out any pending fsync requests for the doomed relation,
+	 * else the next mdsync() will fail.
+	 */
+	ForgetRelationFsyncRequests(rnode);
+
 	path = relpath(rnode);
 
 	/* Delete the first segment, or only segment if not doing segmenting */
@@ -372,7 +403,7 @@ mdopen(SMgrRelation reln, bool allowNotFound)
 		if (fd < 0)
 		{
 			pfree(path);
-			if (allowNotFound && errno == ENOENT)
+			if (allowNotFound && FILE_POSSIBLY_DELETED(errno))
 				return NULL;
 			ereport(ERROR,
 					(errcode_for_file_access(),
@@ -725,82 +756,130 @@ mdimmedsync(SMgrRelation reln)
 bool
 mdsync(void)
 {
-	HASH_SEQ_STATUS hstat;
-	PendingOperationEntry *entry;
+	bool		need_retry;
 
 	if (!pendingOpsTable)
 		return false;
 
 	/*
-	 * If we are in the bgwriter, the sync had better include all fsync
-	 * requests that were queued by backends before the checkpoint REDO point
-	 * was determined.	We go that a little better by accepting all requests
-	 * queued up to the point where we start fsync'ing.
+	 * The fsync table could contain requests to fsync relations that have
+	 * been deleted (unlinked) by the time we get to them.  Rather than
+	 * just hoping an ENOENT (or EACCES on Windows) error can be ignored,
+	 * what we will do is retry the whole process after absorbing fsync
+	 * request messages again.  Since mdunlink() queues a "revoke" message
+	 * before actually unlinking, the fsync request is guaranteed to be gone
+	 * the second time if it really was this case.  DROP DATABASE likewise
+	 * has to tell us to forget fsync requests before it starts deletions.
 	 */
-	AbsorbFsyncRequests();
+	do {
+		HASH_SEQ_STATUS hstat;
+		PendingOperationEntry *entry;
+		int			absorb_counter;
+
+		need_retry = false;
 
-	hash_seq_init(&hstat, pendingOpsTable);
-	while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
-	{
 		/*
-		 * If fsync is off then we don't have to bother opening the file at
-		 * all.  (We delay checking until this point so that changing fsync on
-		 * the fly behaves sensibly.)
+		 * If we are in the bgwriter, the sync had better include all fsync
+		 * requests that were queued by backends before the checkpoint REDO
+		 * point was determined. We go that a little better by accepting all
+		 * requests queued up to the point where we start fsync'ing.
 		 */
-		if (enableFsync)
-		{
-			SMgrRelation reln;
-			MdfdVec    *seg;
-
-			/*
-			 * Find or create an smgr hash entry for this relation. This may
-			 * seem a bit unclean -- md calling smgr?  But it's really the
-			 * best solution.  It ensures that the open file reference isn't
-			 * permanently leaked if we get an error here. (You may say "but
-			 * an unreferenced SMgrRelation is still a leak!" Not really,
-			 * because the only case in which a checkpoint is done by a
-			 * process that isn't about to shut down is in the bgwriter, and
-			 * it will periodically do smgrcloseall().	This fact justifies
-			 * our not closing the reln in the success path either, which is a
-			 * good thing since in non-bgwriter cases we couldn't safely do
-			 * that.)  Furthermore, in many cases the relation will have been
-			 * dirtied through this same smgr relation, and so we can save a
-			 * file open/close cycle.
-			 */
-			reln = smgropen(entry->rnode);
+		AbsorbFsyncRequests();
 
+		absorb_counter = FSYNCS_PER_ABSORB;
+		hash_seq_init(&hstat, pendingOpsTable);
+		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
+		{
 			/*
-			 * It is possible that the relation has been dropped or truncated
-			 * since the fsync request was entered.  Therefore, we have to
-			 * allow file-not-found errors.  This applies both during
-			 * _mdfd_getseg() and during FileSync, since fd.c might have
-			 * closed the file behind our back.
+			 * If fsync is off then we don't have to bother opening the file
+			 * at all.  (We delay checking until this point so that changing
+			 * fsync on the fly behaves sensibly.)
 			 */
-			seg = _mdfd_getseg(reln,
-							   entry->segno * ((BlockNumber) RELSEG_SIZE),
-							   true);
-			if (seg)
+			if (enableFsync)
 			{
-				if (FileSync(seg->mdfd_vfd) < 0 &&
-					errno != ENOENT)
+				SMgrRelation reln;
+				MdfdVec    *seg;
+
+				/*
+				 * If in bgwriter, we want to absorb pending requests every so
+				 * often to prevent overflow of the fsync request queue.  This
+				 * could result in deleting the current entry out from under
+				 * our hashtable scan, so the procedure is to fall out of the
+				 * scan and start over from the top of the function.
+				 */
+				if (--absorb_counter <= 0)
 				{
-					ereport(LOG,
-							(errcode_for_file_access(),
-							 errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
-									entry->segno,
-									entry->rnode.spcNode,
-									entry->rnode.dbNode,
-									entry->rnode.relNode)));
-					return false;
+					need_retry = true;
+					break;
+				}
+
+				/*
+				 * Find or create an smgr hash entry for this relation. This
+				 * may seem a bit unclean -- md calling smgr?  But it's really
+				 * the best solution.  It ensures that the open file reference
+				 * isn't permanently leaked if we get an error here. (You may
+				 * say "but an unreferenced SMgrRelation is still a leak!" Not
+				 * really, because the only case in which a checkpoint is done
+				 * by a process that isn't about to shut down is in the
+				 * bgwriter, and it will periodically do smgrcloseall(). This
+				 * fact justifies our not closing the reln in the success path
+				 * either, which is a good thing since in non-bgwriter cases
+				 * we couldn't safely do that.)  Furthermore, in many cases
+				 * the relation will have been dirtied through this same smgr
+				 * relation, and so we can save a file open/close cycle.
+				 */
+				reln = smgropen(entry->tag.rnode);
+
+				/*
+				 * It is possible that the relation has been dropped or
+				 * truncated since the fsync request was entered.  Therefore,
+				 * allow ENOENT, but only if we didn't fail once already on
+				 * this file.  This applies both during _mdfd_getseg() and
+				 * during FileSync, since fd.c might have closed the file
+				 * behind our back.
+				 */
+				seg = _mdfd_getseg(reln,
+								   entry->tag.segno * ((BlockNumber) RELSEG_SIZE),
+								   true);
+				if (seg == NULL ||
+					FileSync(seg->mdfd_vfd) < 0)
+				{
+					/*
+					 * XXX is there any point in allowing more than one try?
+					 * Don't see one at the moment, but easy to change the
+					 * test here if so.
+					 */
+					if (!FILE_POSSIBLY_DELETED(errno) ||
+						++(entry->failures) > 1)
+					{
+						ereport(LOG,
+								(errcode_for_file_access(),
+								 errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
+										entry->tag.segno,
+										entry->tag.rnode.spcNode,
+										entry->tag.rnode.dbNode,
+										entry->tag.rnode.relNode)));
+						return false;
+					}
+					else
+						ereport(DEBUG1,
+								(errcode_for_file_access(),
+								 errmsg("could not fsync segment %u of relation %u/%u/%u, but retrying: %m",
+										entry->tag.segno,
+										entry->tag.rnode.spcNode,
+										entry->tag.rnode.dbNode,
+										entry->tag.rnode.relNode)));
+					need_retry = true;
+					continue;	/* don't delete the hashtable entry */
 				}
 			}
-		}
 
-		/* Okay, delete this entry */
-		if (hash_search(pendingOpsTable, entry,
-						HASH_REMOVE, NULL) == NULL)
-			elog(ERROR, "pendingOpsTable corrupted");
-	}
+			/* Okay, delete this entry */
+			if (hash_search(pendingOpsTable, &entry->tag,
+							HASH_REMOVE, NULL) == NULL)
+				elog(ERROR, "pendingOpsTable corrupted");
+		}
+	} while (need_retry);
 
 	return true;
 }
@@ -822,14 +901,8 @@ register_dirty_segment(SMgrRelation reln, MdfdVec *seg)
 {
 	if (pendingOpsTable)
 	{
-		PendingOperationEntry entry;
-
-		/* ensure any pad bytes in the struct are zeroed */
-		MemSet(&entry, 0, sizeof(entry));
-		entry.rnode = reln->smgr_rnode;
-		entry.segno = seg->mdfd_segno;
-
-		(void) hash_search(pendingOpsTable, &entry, HASH_ENTER, NULL);
+		/* push it into local pending-ops table */
+		RememberFsyncRequest(reln->smgr_rnode, seg->mdfd_segno);
 		return true;
 	}
 	else
@@ -848,22 +921,135 @@ register_dirty_segment(SMgrRelation reln, MdfdVec *seg)
  *
  * We stuff the fsync request into the local hash table for execution
  * during the bgwriter's next checkpoint.
+ *
+ * The range of possible segment numbers is way less than the range of
+ * BlockNumber, so we can reserve high values of segno for special purposes.
+ * We define two: FORGET_RELATION_FSYNC means to drop pending fsyncs for
+ * a relation, and FORGET_DATABASE_FSYNC means to drop pending fsyncs for
+ * a whole database.  (These are a tad slow because the hash table has to be
+ * searched linearly, but it doesn't seem worth rethinking the table structure
+ * for them.)
  */
 void
 RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
 {
-	PendingOperationEntry entry;
-
 	Assert(pendingOpsTable);
 
-	/* ensure any pad bytes in the struct are zeroed */
-	MemSet(&entry, 0, sizeof(entry));
-	entry.rnode = rnode;
-	entry.segno = segno;
+	if (segno == FORGET_RELATION_FSYNC)
+	{
+		/* Remove any pending requests for the entire relation */
+		HASH_SEQ_STATUS hstat;
+		PendingOperationEntry *entry;
+
+		hash_seq_init(&hstat, pendingOpsTable);
+		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
+		{
+			if (RelFileNodeEquals(entry->tag.rnode, rnode))
+			{
+				/* Okay, delete this entry */
+				if (hash_search(pendingOpsTable, &entry->tag,
+								HASH_REMOVE, NULL) == NULL)
+					elog(ERROR, "pendingOpsTable corrupted");
+			}
+		}
+	}
+	else if (segno == FORGET_DATABASE_FSYNC)
+	{
+		/* Remove any pending requests for the entire database */
+		HASH_SEQ_STATUS hstat;
+		PendingOperationEntry *entry;
+
+		hash_seq_init(&hstat, pendingOpsTable);
+		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
+		{
+			if (entry->tag.rnode.dbNode == rnode.dbNode)
+			{
+				/* Okay, delete this entry */
+				if (hash_search(pendingOpsTable, &entry->tag,
+								HASH_REMOVE, NULL) == NULL)
+					elog(ERROR, "pendingOpsTable corrupted");
+			}
+		}
+	}
+	else
+	{
+		/* Normal case: enter a request to fsync this segment */
+		PendingOperationTag key;
+		PendingOperationEntry *entry;
+		bool		found;
+
+		/* ensure any pad bytes in the hash key are zeroed */
+		MemSet(&key, 0, sizeof(key));
+		key.rnode = rnode;
+		key.segno = segno;
+
+		entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
+													  &key,
+													  HASH_ENTER,
+													  &found);
+		if (!found)				/* new entry, so initialize it */
+			entry->failures = 0;
+	}
+}
+
+/*
+ * ForgetRelationFsyncRequests -- ensure any fsyncs for a rel are forgotten
+ */
+void
+ForgetRelationFsyncRequests(RelFileNode rnode)
+{
+	if (pendingOpsTable)
+	{
+		/* standalone backend or startup process: fsync state is local */
+		RememberFsyncRequest(rnode, FORGET_RELATION_FSYNC);
+	}
+	else if (IsUnderPostmaster)
+	{
+		/*
+		 * Notify the bgwriter about it.  If we fail to queue the revoke
+		 * message, we have to sleep and try again ... ugly, but hopefully
+		 * won't happen often.
+		 *
+		 * XXX should we CHECK_FOR_INTERRUPTS in this loop?  Escaping with
+		 * an error would leave the no-longer-used file still present on
+		 * disk, which would be bad, so I'm inclined to assume that the
+		 * bgwriter will always empty the queue soon.
+		 */
+		while (!ForwardFsyncRequest(rnode, FORGET_RELATION_FSYNC))
+			pg_usleep(10000L);	/* 10 msec seems a good number */
+		/*
+		 * Note we don't wait for the bgwriter to actually absorb the
+		 * revoke message; see mdsync() for the implications.
+		 */
+	}
+}
+
+/*
+ * ForgetDatabaseFsyncRequests -- ensure any fsyncs for a DB are forgotten
+ */
+void
+ForgetDatabaseFsyncRequests(Oid dbid)
+{
+	RelFileNode rnode;
+
+	rnode.dbNode = dbid;
+	rnode.spcNode = 0;
+	rnode.relNode = 0;
 
-	(void) hash_search(pendingOpsTable, &entry, HASH_ENTER, NULL);
+	if (pendingOpsTable)
+	{
+		/* standalone backend or startup process: fsync state is local */
+		RememberFsyncRequest(rnode, FORGET_DATABASE_FSYNC);
+	}
+	else if (IsUnderPostmaster)
+	{
+		/* see notes in ForgetRelationFsyncRequests */
+		while (!ForwardFsyncRequest(rnode, FORGET_DATABASE_FSYNC))
+			pg_usleep(10000L);	/* 10 msec seems a good number */
+	}
 }
 
+
 /*
  *	_fdvec_alloc() -- Make a MdfdVec object.
  */
@@ -967,7 +1153,7 @@ _mdfd_getseg(SMgrRelation reln, BlockNumber blkno, bool allowNotFound)
 								(segstogo == 1 || InRecovery) ? O_CREAT : 0);
 			if (v->mdfd_chain == NULL)
 			{
-				if (allowNotFound && errno == ENOENT)
+				if (allowNotFound && FILE_POSSIBLY_DELETED(errno))
 					return NULL;
 				ereport(ERROR,
 						(errcode_for_file_access(),
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index 910d49565c..872853aea0 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/storage/smgr.h,v 1.53 2005/10/15 02:49:46 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/storage/smgr.h,v 1.53.2.1 2007/01/27 20:15:55 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -107,6 +107,8 @@ extern bool mdimmedsync(SMgrRelation reln);
 extern bool mdsync(void);
 
 extern void RememberFsyncRequest(RelFileNode rnode, BlockNumber segno);
+extern void ForgetRelationFsyncRequests(RelFileNode rnode);
+extern void ForgetDatabaseFsyncRequests(Oid dbid);
 
 /* smgrtype.c */
 extern Datum smgrout(PG_FUNCTION_ARGS);
author	Tom Lane <tgl@sss.pgh.pa.us>	2007-01-27 20:15:55 +0000
committer	Tom Lane <tgl@sss.pgh.pa.us>	2007-01-27 20:15:55 +0000
commit	722ad326f17ab6cf623d350baa9679dcad23088b (patch)
tree	e23398200004a90fe111d0079d8011b41f2c28da
parent	03d128147719acac9de6f7e118525db9370fbf05 (diff)
download	postgresql-722ad326f17ab6cf623d350baa9679dcad23088b.tar.gz