summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTom Lane <tgl@sss.pgh.pa.us>2003-08-10 19:48:08 +0000
committerTom Lane <tgl@sss.pgh.pa.us>2003-08-10 19:48:08 +0000
commitffafacc1f6e403b81180aa20d0a4184ce92beef8 (patch)
treea85699334c2bcffee5ccaec9d4b37e90de438d67
parent18c10877a9b7d5e3766f6f34c827892a33872567 (diff)
downloadpostgresql-ffafacc1f6e403b81180aa20d0a4184ce92beef8.tar.gz
Repair potential deadlock created by recent changes to recycle btree
index pages: when _bt_getbuf asks the FSM for a free index page, it is possible (and, in some cases, even moderately likely) that the answer will be the same page that _bt_split is trying to split. _bt_getbuf already knew that the returned page might not be free, but it wasn't prepared for the possibility that even trying to lock the page could be problematic. Fix by doing a conditional rather than unconditional grab of the page lock.
-rw-r--r--src/backend/access/nbtree/nbtpage.c42
-rw-r--r--src/backend/storage/buffer/bufmgr.c33
-rw-r--r--src/include/storage/bufmgr.h3
3 files changed, 67 insertions, 11 deletions
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index b0aff28a59..23bc2fb062 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -9,7 +9,7 @@
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.69 2003/08/08 21:41:27 momjian Exp $
+ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.70 2003/08/10 19:48:08 tgl Exp $
*
* NOTES
* Postgres btree pages look like ordinary relation pages. The opaque
@@ -409,6 +409,22 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access)
* that the page is still free. (For example, an already-free
* page could have been re-used between the time the last VACUUM
* scanned it and the time the VACUUM made its FSM updates.)
+ *
+ * In fact, it's worse than that: we can't even assume that it's
+ * safe to take a lock on the reported page. If somebody else
+ * has a lock on it, or even worse our own caller does, we could
+ * deadlock. (The own-caller scenario is actually not improbable.
+ * Consider an index on a serial or timestamp column. Nearly all
+ * splits will be at the rightmost page, so it's entirely likely
+ * that _bt_split will call us while holding a lock on the page most
+ * recently acquired from FSM. A VACUUM running concurrently with
+ * the previous split could well have placed that page back in FSM.)
+ *
+ * To get around that, we ask for only a conditional lock on the
+ * reported page. If we fail, then someone else is using the page,
+ * and we may reasonably assume it's not free. (If we happen to be
+ * wrong, the worst consequence is the page will be lost to use till
+ * the next VACUUM, which is no big problem.)
*/
for (;;)
{
@@ -416,16 +432,24 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access)
if (blkno == InvalidBlockNumber)
break;
buf = ReadBuffer(rel, blkno);
- LockBuffer(buf, access);
- page = BufferGetPage(buf);
- if (_bt_page_recyclable(page))
+ if (ConditionalLockBuffer(buf))
{
- /* Okay to use page. Re-initialize and return it */
- _bt_pageinit(page, BufferGetPageSize(buf));
- return buf;
+ page = BufferGetPage(buf);
+ if (_bt_page_recyclable(page))
+ {
+ /* Okay to use page. Re-initialize and return it */
+ _bt_pageinit(page, BufferGetPageSize(buf));
+ return buf;
+ }
+ elog(DEBUG2, "FSM returned nonrecyclable page");
+ _bt_relbuf(rel, buf);
+ }
+ else
+ {
+ elog(DEBUG2, "FSM returned nonlockable page");
+ /* couldn't get lock, so just drop pin */
+ ReleaseBuffer(buf);
}
- elog(DEBUG2, "FSM returned nonrecyclable page");
- _bt_relbuf(rel, buf);
}
/*
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 2da3b4e364..1be30d30e2 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v 1.139 2003/08/04 02:40:03 momjian Exp $
+ * $Header: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v 1.140 2003/08/10 19:48:08 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -1937,6 +1937,37 @@ LockBuffer(Buffer buffer, int mode)
}
/*
+ * Acquire the cntx_lock for the buffer, but only if we don't have to wait.
+ *
+ * This assumes the caller wants BUFFER_LOCK_EXCLUSIVE mode.
+ */
+bool
+ConditionalLockBuffer(Buffer buffer)
+{
+ BufferDesc *buf;
+
+ Assert(BufferIsValid(buffer));
+ if (BufferIsLocal(buffer))
+ return true; /* act as though we got it */
+
+ buf = &(BufferDescriptors[buffer - 1]);
+
+ if (LWLockConditionalAcquire(buf->cntx_lock, LW_EXCLUSIVE))
+ {
+ /*
+ * This is not the best place to set cntxDirty flag (eg indices do
+ * not always change buffer they lock in excl mode). But please
+ * remember that it's critical to set cntxDirty *before* logging
+ * changes with XLogInsert() - see comments in BufferSync().
+ */
+ buf->cntxDirty = true;
+
+ return true;
+ }
+ return false;
+}
+
+/*
* LockBufferForCleanup - lock a buffer in preparation for deleting items
*
* Items may be deleted from a disk page only when the caller (a) holds an
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 5d78a3b9b2..bfbf64e207 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $Id: bufmgr.h,v 1.69 2003/08/04 02:40:14 momjian Exp $
+ * $Id: bufmgr.h,v 1.70 2003/08/10 19:48:08 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -180,6 +180,7 @@ extern void SetBufferCommitInfoNeedsSave(Buffer buffer);
extern void UnlockBuffers(void);
extern void LockBuffer(Buffer buffer, int mode);
+extern bool ConditionalLockBuffer(Buffer buffer);
extern void LockBufferForCleanup(Buffer buffer);
extern void AbortBufferIO(void);