summaryrefslogtreecommitdiff
path: root/src/backend/access/gin/ginvacuum.c
diff options
context:
space:
mode:
authorHeikki Linnakangas <heikki.linnakangas@iki.fi>2014-01-22 18:51:48 +0200
committerHeikki Linnakangas <heikki.linnakangas@iki.fi>2014-01-22 19:20:58 +0200
commit36a35c550ac114caa423bcbe339d3515db0cd957 (patch)
tree3bd40801d0bc4ee3ac6ff668f9f2ae221aaada49 /src/backend/access/gin/ginvacuum.c
parent243ee266339bd4a049ff92e101010242169b7287 (diff)
downloadpostgresql-36a35c550ac114caa423bcbe339d3515db0cd957.tar.gz
Compress GIN posting lists, for smaller index size.
GIN posting lists are now encoded using varbyte-encoding, which allows them to fit in much smaller space than the straight ItemPointer array format used before. The new encoding is used for both the lists stored in-line in entry tree items, and in posting tree leaf pages. To maintain backwards-compatibility and keep pg_upgrade working, the code can still read old-style pages and tuples. Posting tree leaf pages in the new format are flagged with GIN_COMPRESSED flag, to distinguish old and new format pages. Likewise, entry tree tuples in the new format have a GIN_ITUP_COMPRESSED flag set in a bit that was previously unused. This patch bumps GIN_CURRENT_VERSION from 1 to 2. New indexes created with version 9.4 will therefore have version number 2 in the metapage, while old pg_upgraded indexes will have version 1. The code treats them the same, but it might be come handy in the future, if we want to drop support for the uncompressed format. Alexander Korotkov and me. Reviewed by Tomas Vondra and Amit Langote.
Diffstat (limited to 'src/backend/access/gin/ginvacuum.c')
-rw-r--r--src/backend/access/gin/ginvacuum.c232
1 files changed, 124 insertions, 108 deletions
diff --git a/src/backend/access/gin/ginvacuum.c b/src/backend/access/gin/ginvacuum.c
index 4212effbe4..6bf65c3293 100644
--- a/src/backend/access/gin/ginvacuum.c
+++ b/src/backend/access/gin/ginvacuum.c
@@ -20,8 +20,9 @@
#include "postmaster/autovacuum.h"
#include "storage/indexfsm.h"
#include "storage/lmgr.h"
+#include "utils/memutils.h"
-typedef struct
+typedef struct GinVacuumState
{
Relation index;
IndexBulkDeleteResult *result;
@@ -29,56 +30,58 @@ typedef struct
void *callback_state;
GinState ginstate;
BufferAccessStrategy strategy;
+ MemoryContext tmpCxt;
} GinVacuumState;
-
/*
- * Vacuums a list of item pointers. The original size of the list is 'nitem',
- * returns the number of items remaining afterwards.
+ * Vacuums an uncompressed posting list. The size of the must can be specified
+ * in number of items (nitems).
*
- * If *cleaned == NULL on entry, the original array is left unmodified; if
- * any items are removed, a palloc'd copy of the result is stored in *cleaned.
- * Otherwise *cleaned should point to the original array, in which case it's
- * modified directly.
+ * If none of the items need to be removed, returns NULL. Otherwise returns
+ * a new palloc'd array with the remaining items. The number of remaining
+ * items is returned in *nremaining.
*/
-static int
-ginVacuumPostingList(GinVacuumState *gvs, ItemPointerData *items, int nitem,
- ItemPointerData **cleaned)
+ItemPointer
+ginVacuumItemPointers(GinVacuumState *gvs, ItemPointerData *items,
+ int nitem, int *nremaining)
{
int i,
- j = 0;
-
- Assert(*cleaned == NULL || *cleaned == items);
+ remaining = 0;
+ ItemPointer tmpitems = NULL;
/*
- * just scan over ItemPointer array
+ * Iterate over TIDs array
*/
for (i = 0; i < nitem; i++)
{
if (gvs->callback(items + i, gvs->callback_state))
{
gvs->result->tuples_removed += 1;
- if (!*cleaned)
+ if (!tmpitems)
{
- *cleaned = (ItemPointerData *) palloc(sizeof(ItemPointerData) * nitem);
- if (i != 0)
- memcpy(*cleaned, items, sizeof(ItemPointerData) * i);
+ /*
+ * First TID to be deleted: allocate memory to hold the
+ * remaining items.
+ */
+ tmpitems = palloc(sizeof(ItemPointerData) * nitem);
+ memcpy(tmpitems, items, sizeof(ItemPointerData) * i);
}
}
else
{
gvs->result->num_index_tuples += 1;
- if (i != j)
- (*cleaned)[j] = items[i];
- j++;
+ if (tmpitems)
+ tmpitems[remaining] = items[i];
+ remaining++;
}
}
- return j;
+ *nremaining = remaining;
+ return tmpitems;
}
/*
- * fills WAL record for vacuum leaf page
+ * Create a WAL record for vacuuming entry tree leaf page.
*/
static void
xlogVacuumPage(Relation index, Buffer buffer)
@@ -86,65 +89,64 @@ xlogVacuumPage(Relation index, Buffer buffer)
Page page = BufferGetPage(buffer);
XLogRecPtr recptr;
XLogRecData rdata[3];
- ginxlogVacuumPage data;
- char *backup;
- char itups[BLCKSZ];
- uint32 len = 0;
+ ginxlogVacuumPage xlrec;
+ uint16 lower;
+ uint16 upper;
+ /* This is only used for entry tree leaf pages. */
+ Assert(!GinPageIsData(page));
Assert(GinPageIsLeaf(page));
if (!RelationNeedsWAL(index))
return;
- data.node = index->rd_node;
- data.blkno = BufferGetBlockNumber(buffer);
+ xlrec.node = index->rd_node;
+ xlrec.blkno = BufferGetBlockNumber(buffer);
+
+ /* Assume we can omit data between pd_lower and pd_upper */
+ lower = ((PageHeader) page)->pd_lower;
+ upper = ((PageHeader) page)->pd_upper;
+
+ Assert(lower < BLCKSZ);
+ Assert(upper < BLCKSZ);
- if (GinPageIsData(page))
+ if (lower >= SizeOfPageHeaderData &&
+ upper > lower &&
+ upper <= BLCKSZ)
{
- backup = GinDataPageGetData(page);
- data.nitem = GinPageGetOpaque(page)->maxoff;
- if (data.nitem)
- len = MAXALIGN(sizeof(ItemPointerData) * data.nitem);
+ xlrec.hole_offset = lower;
+ xlrec.hole_length = upper - lower;
}
else
{
- char *ptr;
- OffsetNumber i;
-
- ptr = backup = itups;
- for (i = FirstOffsetNumber; i <= PageGetMaxOffsetNumber(page); i++)
- {
- IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, i));
-
- memcpy(ptr, itup, IndexTupleSize(itup));
- ptr += MAXALIGN(IndexTupleSize(itup));
- }
-
- data.nitem = PageGetMaxOffsetNumber(page);
- len = ptr - backup;
+ /* No "hole" to compress out */
+ xlrec.hole_offset = 0;
+ xlrec.hole_length = 0;
}
- rdata[0].buffer = buffer;
- rdata[0].buffer_std = (GinPageIsData(page)) ? FALSE : TRUE;
- rdata[0].len = 0;
- rdata[0].data = NULL;
- rdata[0].next = rdata + 1;
+ rdata[0].data = (char *) &xlrec;
+ rdata[0].len = sizeof(ginxlogVacuumPage);
+ rdata[0].buffer = InvalidBuffer;
+ rdata[0].next = &rdata[1];
- rdata[1].buffer = InvalidBuffer;
- rdata[1].len = sizeof(ginxlogVacuumPage);
- rdata[1].data = (char *) &data;
-
- if (len == 0)
+ if (xlrec.hole_length == 0)
{
+ rdata[1].data = (char *) page;
+ rdata[1].len = BLCKSZ;
+ rdata[1].buffer = InvalidBuffer;
rdata[1].next = NULL;
}
else
{
- rdata[1].next = rdata + 2;
-
+ /* must skip the hole */
+ rdata[1].data = (char *) page;
+ rdata[1].len = xlrec.hole_offset;
+ rdata[1].buffer = InvalidBuffer;
+ rdata[1].next = &rdata[2];
+
+ rdata[2].data = (char *) page + (xlrec.hole_offset + xlrec.hole_length);
+ rdata[2].len = BLCKSZ - (xlrec.hole_offset + xlrec.hole_length);
rdata[2].buffer = InvalidBuffer;
- rdata[2].len = len;
- rdata[2].data = backup;
rdata[2].next = NULL;
}
@@ -158,6 +160,7 @@ ginVacuumPostingTreeLeaves(GinVacuumState *gvs, BlockNumber blkno, bool isRoot,
Buffer buffer;
Page page;
bool hasVoidPage = FALSE;
+ MemoryContext oldCxt;
buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno,
RBM_NORMAL, gvs->strategy);
@@ -169,7 +172,6 @@ ginVacuumPostingTreeLeaves(GinVacuumState *gvs, BlockNumber blkno, bool isRoot,
* again). New scan can't start but previously started ones work
* concurrently.
*/
-
if (isRoot)
LockBufferForCleanup(buffer);
else
@@ -179,32 +181,14 @@ ginVacuumPostingTreeLeaves(GinVacuumState *gvs, BlockNumber blkno, bool isRoot,
if (GinPageIsLeaf(page))
{
- OffsetNumber newMaxOff,
- oldMaxOff = GinPageGetOpaque(page)->maxoff;
- ItemPointerData *cleaned = NULL;
-
- newMaxOff = ginVacuumPostingList(gvs,
- (ItemPointer) GinDataPageGetData(page), oldMaxOff, &cleaned);
-
- /* saves changes about deleted tuple ... */
- if (oldMaxOff != newMaxOff)
- {
- START_CRIT_SECTION();
-
- if (newMaxOff > 0)
- memcpy(GinDataPageGetData(page), cleaned, sizeof(ItemPointerData) * newMaxOff);
- pfree(cleaned);
- GinPageGetOpaque(page)->maxoff = newMaxOff;
+ oldCxt = MemoryContextSwitchTo(gvs->tmpCxt);
+ ginVacuumPostingTreeLeaf(gvs->index, buffer, gvs);
+ MemoryContextSwitchTo(oldCxt);
+ MemoryContextReset(gvs->tmpCxt);
- MarkBufferDirty(buffer);
- xlogVacuumPage(gvs->index, buffer);
-
- END_CRIT_SECTION();
-
- /* if root is a leaf page, we don't desire further processing */
- if (!isRoot && GinPageGetOpaque(page)->maxoff < FirstOffsetNumber)
- hasVoidPage = TRUE;
- }
+ /* if root is a leaf page, we don't desire further processing */
+ if (!isRoot && !hasVoidPage && GinDataLeafPageIsEmpty(page))
+ hasVoidPage = TRUE;
}
else
{
@@ -224,7 +208,7 @@ ginVacuumPostingTreeLeaves(GinVacuumState *gvs, BlockNumber blkno, bool isRoot,
}
/*
- * if we have root and theres void pages in tree, then we don't release
+ * if we have root and there are empty pages in tree, then we don't release
* lock to go further processing and guarantee that tree is unused
*/
if (!(isRoot && hasVoidPage))
@@ -391,6 +375,7 @@ ginScanToDelete(GinVacuumState *gvs, BlockNumber blkno, bool isRoot,
Buffer buffer;
Page page;
bool meDelete = FALSE;
+ bool isempty;
if (isRoot)
{
@@ -429,7 +414,12 @@ ginScanToDelete(GinVacuumState *gvs, BlockNumber blkno, bool isRoot,
}
}
- if (GinPageGetOpaque(page)->maxoff < FirstOffsetNumber)
+ if (GinPageIsLeaf(page))
+ isempty = GinDataLeafPageIsEmpty(page);
+ else
+ isempty = GinPageGetOpaque(page)->maxoff < FirstOffsetNumber;
+
+ if (isempty)
{
/* we never delete the left- or rightmost branch */
if (me->leftBlkno != InvalidBlockNumber && !GinPageRightMost(page))
@@ -513,22 +503,47 @@ ginVacuumEntryPage(GinVacuumState *gvs, Buffer buffer, BlockNumber *roots, uint3
}
else if (GinGetNPosting(itup) > 0)
{
+ int nitems;
+ ItemPointer uncompressed;
+
/*
- * if we already created a temporary page, make changes in place
+ * Vacuum posting list with proper function for compressed and
+ * uncompressed format.
*/
- ItemPointerData *cleaned = (tmppage == origpage) ? NULL : GinGetPosting(itup);
- int newN;
-
- newN = ginVacuumPostingList(gvs, GinGetPosting(itup), GinGetNPosting(itup), &cleaned);
+ if (GinItupIsCompressed(itup))
+ uncompressed = ginPostingListDecode((GinPostingList *) GinGetPosting(itup), &nitems);
+ else
+ {
+ uncompressed = (ItemPointer) GinGetPosting(itup);
+ nitems = GinGetNPosting(itup);
+ }
- if (GinGetNPosting(itup) != newN)
+ uncompressed = ginVacuumItemPointers(gvs, uncompressed, nitems,
+ &nitems);
+ if (uncompressed)
{
+ /*
+ * Some ItemPointers were deleted, recreate tuple.
+ */
OffsetNumber attnum;
Datum key;
GinNullCategory category;
+ GinPostingList *plist;
+ int plistsize;
+
+ if (nitems > 0)
+ {
+ plist = ginCompressPostingList(uncompressed, nitems, GinMaxItemSize, NULL);
+ plistsize = SizeOfGinPostingList(plist);
+ }
+ else
+ {
+ plist = NULL;
+ plistsize = 0;
+ }
/*
- * Some ItemPointers were deleted, recreate tuple.
+ * if we already created a temporary page, make changes in place
*/
if (tmppage == origpage)
{
@@ -538,15 +553,6 @@ ginVacuumEntryPage(GinVacuumState *gvs, Buffer buffer, BlockNumber *roots, uint3
*/
tmppage = PageGetTempPageCopy(origpage);
- if (newN > 0)
- {
- Size pos = ((char *) GinGetPosting(itup)) - ((char *) origpage);
-
- memcpy(tmppage + pos, cleaned, sizeof(ItemPointerData) * newN);
- }
-
- pfree(cleaned);
-
/* set itup pointer to new page */
itup = (IndexTuple) PageGetItem(tmppage, PageGetItemId(tmppage, i));
}
@@ -554,7 +560,10 @@ ginVacuumEntryPage(GinVacuumState *gvs, Buffer buffer, BlockNumber *roots, uint3
attnum = gintuple_get_attrnum(&gvs->ginstate, itup);
key = gintuple_get_key(&gvs->ginstate, itup, &category);
itup = GinFormTuple(&gvs->ginstate, attnum, key, category,
- GinGetPosting(itup), newN, true);
+ (char *) plist, plistsize,
+ nitems, true);
+ if (plist)
+ pfree(plist);
PageIndexTupleDelete(tmppage, i);
if (PageAddItem(tmppage, (Item) itup, IndexTupleSize(itup), i, false, false) != i)
@@ -583,6 +592,11 @@ ginbulkdelete(PG_FUNCTION_ARGS)
BlockNumber rootOfPostingTree[BLCKSZ / (sizeof(IndexTupleData) + sizeof(ItemId))];
uint32 nRoot;
+ gvs.tmpCxt = AllocSetContextCreate(CurrentMemoryContext,
+ "Gin vacuum temporary context",
+ ALLOCSET_DEFAULT_MINSIZE,
+ ALLOCSET_DEFAULT_INITSIZE,
+ ALLOCSET_DEFAULT_MAXSIZE);
gvs.index = index;
gvs.callback = callback;
gvs.callback_state = callback_state;
@@ -683,6 +697,8 @@ ginbulkdelete(PG_FUNCTION_ARGS)
LockBuffer(buffer, GIN_EXCLUSIVE);
}
+ MemoryContextDelete(gvs.tmpCxt);
+
PG_RETURN_POINTER(gvs.result);
}