diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index b2d1901..b9a057e 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -3992,7 +3992,7 @@ log_heap_clean(Relation reln, Buffer buffer, OffsetNumber *redirected, int nredirected, OffsetNumber *nowdead, int ndead, OffsetNumber *nowunused, int nunused, - TransactionId latestRemovedXid) + TransactionId latestRemovedXid, bool defragment) { xl_heap_clean xlrec; uint8 info; @@ -4005,6 +4005,7 @@ log_heap_clean(Relation reln, Buffer buffer, xlrec.node = reln->rd_node; xlrec.block = BufferGetBlockNumber(buffer); xlrec.latestRemovedXid = latestRemovedXid; + xlrec.flags = defragment ? HEAP_CLEAN_DEFRAGMENT : 0; xlrec.nredirected = nredirected; xlrec.ndead = ndead; @@ -4308,6 +4309,7 @@ heap_xlog_clean(XLogRecPtr lsn, XLogRecord *record) int ndead; int nunused; Size freespace; + bool defragment; /* * We're about to remove tuples. In Hot Standby mode, ensure that there's @@ -4325,11 +4327,15 @@ heap_xlog_clean(XLogRecPtr lsn, XLogRecord *record) if (record->xl_info & XLR_BKP_BLOCK_1) return; + defragment = (xlrec->flags & HEAP_CLEAN_DEFRAGMENT) != 0; buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, xlrec->block, RBM_NORMAL); if (!BufferIsValid(buffer)) return; - LockBufferForCleanup(buffer); + if (defragment) + LockBufferForCleanup(buffer); + else + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); page = (Page) BufferGetPage(buffer); if (XLByteLE(lsn, PageGetLSN(page))) @@ -4351,7 +4357,8 @@ heap_xlog_clean(XLogRecPtr lsn, XLogRecord *record) heap_page_prune_execute(buffer, redirected, nredirected, nowdead, ndead, - nowunused, nunused); + nowunused, nunused, + defragment); freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */ diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c index 61f2ce4..f3fecce 100644 --- a/src/backend/access/heap/pruneheap.c +++ b/src/backend/access/heap/pruneheap.c @@ -108,15 
+108,19 @@ heap_page_prune_opt(Relation relation, Buffer buffer, TransactionId OldestXmin) if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree) { - /* OK, try to get exclusive buffer lock */ + /* + * Try to get buffer cleanup lock. There's little point in pruning + * the page unless we can also defragment. + */ if (!ConditionalLockBufferForCleanup(buffer)) return; /* * Now that we have buffer lock, get accurate information about the * page's free space, and recheck the heuristic about whether to - * prune. (We needn't recheck PageIsPrunable, since no one else could - * have pruned while we hold pin.) + * prune. (We don't recheck PageIsPrunable(); if vacuum cleaned up + * the page despite our pin, it will necessarily have skipped + * the defrag step, so we may as well do it now.) */ if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree) { @@ -124,7 +128,8 @@ heap_page_prune_opt(Relation relation, Buffer buffer, TransactionId OldestXmin) * needed */ /* OK to prune */ - (void) heap_page_prune(relation, buffer, OldestXmin, true, &ignore); + (void) heap_page_prune(relation, buffer, OldestXmin, true, &ignore, + true); } /* And release buffer lock */ @@ -134,9 +139,10 @@ heap_page_prune_opt(Relation relation, Buffer buffer, TransactionId OldestXmin) /* - * Prune and repair fragmentation in the specified page. + * Prune and (optionally) repair fragmentation in the specified page. * - * Caller must have pin and buffer cleanup lock on the page. + * Caller must have pin and exclusive lock on the page. To defragment, a + * cleanup lock is required. * * OldestXmin is the cutoff XID used to distinguish whether tuples are DEAD * or RECENTLY_DEAD (see HeapTupleSatisfiesVacuum). 
@@ -151,7 +157,8 @@ heap_page_prune_opt(Relation relation, Buffer buffer, TransactionId OldestXmin) */ int heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin, - bool report_stats, TransactionId *latestRemovedXid) + bool report_stats, TransactionId *latestRemovedXid, + bool defragment) { int ndeleted = 0; Page page = BufferGetPage(buffer); @@ -201,8 +208,9 @@ heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin, /* Any error while applying the changes is critical */ START_CRIT_SECTION(); - /* Have we found any prunable items? */ - if (prstate.nredirected > 0 || prstate.ndead > 0 || prstate.nunused > 0) + /* Have we found any prunable items (or do we need to defrag anyway)? */ + if (prstate.nredirected > 0 || prstate.ndead > 0 || prstate.nunused > 0 + || (defragment && PageNeedsDefrag(page))) { /* * Apply the planned item changes, then repair page fragmentation, and @@ -211,7 +219,8 @@ heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin, heap_page_prune_execute(buffer, prstate.redirected, prstate.nredirected, prstate.nowdead, prstate.ndead, - prstate.nowunused, prstate.nunused); + prstate.nowunused, prstate.nunused, + defragment); /* * Update the page's pd_prune_xid field to either zero, or the lowest @@ -220,11 +229,21 @@ heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin, ((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid; /* - * Also clear the "page is full" flag, since there's no point in - * repeating the prune/defrag process until something else happens to - * the page. + * If we were able to defragment, also clear the "page is full" flag, + * since there's no point in repeating this work until something else + * happens to the page; also clear the "needs defrag" flag, if set. + * + * If we were not able to defragment, set the "needs defrag" flag so + * that the next vacuum will try to clean it up even if no new dead + * tuples have been created. 
*/ - PageClearFull(page); + if (defragment) + { + PageClearFull(page); + PageClearNeedsDefrag(page); + } + else + PageSetNeedsDefrag(page); MarkBufferDirty(buffer); @@ -239,7 +258,7 @@ heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin, prstate.redirected, prstate.nredirected, prstate.nowdead, prstate.ndead, prstate.nowunused, prstate.nunused, - prstate.latestRemovedXid); + prstate.latestRemovedXid, defragment); PageSetLSN(BufferGetPage(buffer), recptr); PageSetTLI(BufferGetPage(buffer), ThisTimeLineID); @@ -252,15 +271,15 @@ heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin, * pd_prune_xid field, update it and mark the buffer dirty. This is * treated as a non-WAL-logged hint. * - * Also clear the "page is full" flag if it is set, since there's no - * point in repeating the prune/defrag process until something else - * happens to the page. + * Also clear the "page is full" flag, since there's no point in + * repeating this work until something else happens to the page. */ if (((PageHeader) page)->pd_prune_xid != prstate.new_prune_xid || PageIsFull(page)) { ((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid; - PageClearFull(page); + if (defragment) + PageClearFull(page); SetBufferCommitInfoNeedsSave(buffer); } } @@ -643,7 +662,7 @@ void heap_page_prune_execute(Buffer buffer, OffsetNumber *redirected, int nredirected, OffsetNumber *nowdead, int ndead, - OffsetNumber *nowunused, int nunused) + OffsetNumber *nowunused, int nunused, bool defragment) { Page page = (Page) BufferGetPage(buffer); OffsetNumber *offnum; @@ -684,7 +703,8 @@ heap_page_prune_execute(Buffer buffer, * Finally, repair any fragmentation, and update the page's hint bit about * whether it has free pointers. 
*/ - PageRepairFragmentation(page); + if (defragment) + PageRepairFragmentation(page); } diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c index b197b45..c794dc2 100644 --- a/src/backend/commands/vacuumlazy.c +++ b/src/backend/commands/vacuumlazy.c @@ -124,7 +124,7 @@ static void lazy_cleanup_index(Relation indrel, IndexBulkDeleteResult *stats, LVRelStats *vacrelstats); static int lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer, - int tupindex, LVRelStats *vacrelstats); + int tupindex, LVRelStats *vacrelstats, bool defragment); static void lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats); static BlockNumber count_nondeletable_pages(Relation onerel, LVRelStats *vacrelstats); @@ -418,6 +418,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, bool all_visible_according_to_vm; bool all_visible; bool has_dead_tuples; + bool defragment; if (blkno == next_not_all_visible_block) { @@ -485,8 +486,13 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, buf = ReadBufferExtended(onerel, MAIN_FORKNUM, blkno, RBM_NORMAL, vac_strategy); - /* We need buffer cleanup lock so that we can prune HOT chains. */ - LockBufferForCleanup(buf); + /* + * If we can get a buffer cleanup lock, we'll prune and defragment + * the page. But if someone's holding a pin, we don't want to get + * stuck waiting for it, so we'll just prune and leave defragmentation + * for another time. + */ + defragment = LockBufferForPossibleCleanup(buf); page = BufferGetPage(buf); @@ -567,7 +573,8 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, * We count tuples removed by the pruning step as removed by VACUUM. 
*/ tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin, false, - &vacrelstats->latestRemovedXid); + &vacrelstats->latestRemovedXid, + defragment); /* * Now scan the page to collect vacuumable items and check for tuples @@ -759,7 +766,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, vacrelstats->num_dead_tuples > 0) { /* Remove tuples from heap */ - lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats); + lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats, defragment); /* * Forget the now-vacuumed tuples, and press on, but be careful @@ -772,8 +779,11 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, freespace = PageGetHeapFreeSpace(page); - /* Update the all-visible flag on the page */ - if (!PageIsAllVisible(page) && all_visible) + /* + * Update the all-visible flag on the page. We skip this if we weren't + * able to defragment, so that the next vacuum will (hopefully) do so. + */ + if (!PageIsAllVisible(page) && all_visible && defragment) { PageSetAllVisible(page); SetBufferCommitInfoNeedsSave(buf); @@ -926,21 +936,27 @@ lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats) Buffer buf; Page page; Size freespace; + bool defragment; vacuum_delay_point(); tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]); buf = ReadBufferExtended(onerel, MAIN_FORKNUM, tblk, RBM_NORMAL, vac_strategy); - LockBufferForCleanup(buf); - tupindex = lazy_vacuum_page(onerel, tblk, buf, tupindex, vacrelstats); + defragment = LockBufferForPossibleCleanup(buf); + tupindex = lazy_vacuum_page(onerel, tblk, buf, tupindex, vacrelstats, + defragment); - /* Now that we've compacted the page, record its available space */ - page = BufferGetPage(buf); - freespace = PageGetHeapFreeSpace(page); + /* If we've compacted the page, record its available space */ + if (defragment) + { + page = BufferGetPage(buf); + freespace = PageGetHeapFreeSpace(page); + } UnlockReleaseBuffer(buf); - RecordPageWithFreeSpace(onerel, tblk, freespace); + if (defragment) + 
RecordPageWithFreeSpace(onerel, tblk, freespace); npages++; } @@ -964,7 +980,7 @@ lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats) */ static int lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer, - int tupindex, LVRelStats *vacrelstats) + int tupindex, LVRelStats *vacrelstats, bool defragment) { Page page = BufferGetPage(buffer); OffsetNumber unused[MaxOffsetNumber]; @@ -987,7 +1003,8 @@ lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer, unused[uncnt++] = toff; } - PageRepairFragmentation(page); + if (defragment) + PageRepairFragmentation(page); MarkBufferDirty(buffer); @@ -999,7 +1016,7 @@ lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer, recptr = log_heap_clean(onerel, buffer, NULL, 0, NULL, 0, unused, uncnt, - vacrelstats->latestRemovedXid); + vacrelstats->latestRemovedXid, defragment); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); } diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index e59af33..093f4a3 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -2564,6 +2564,50 @@ ConditionalLockBufferForCleanup(Buffer buffer) return false; } +/* + * LockBufferForPossibleCleanup + * + * Exclusive lock the target buffer. If the pin count happens to be exactly + * one, then return true, indicating that we've acquired a cleanup lock. If + * not, return false, but retain the buffer lock. 
+ */ +bool +LockBufferForPossibleCleanup(Buffer buffer) +{ + volatile BufferDesc *bufHdr; + bool got_cleanup_lock; + + Assert(BufferIsValid(buffer)); + Assert(PinCountWaitBuf == NULL); + + if (BufferIsLocal(buffer)) + { + /* There should be exactly one pin */ + if (LocalRefCount[-buffer - 1] != 1) + elog(ERROR, "incorrect local pin count: %d", + LocalRefCount[-buffer - 1]); + /* Nobody else to wait for */ + return true; + } + + /* There should be exactly one local pin */ + if (PrivateRefCount[buffer - 1] != 1) + elog(ERROR, "incorrect local pin count: %d", + PrivateRefCount[buffer - 1]); + + bufHdr = &BufferDescriptors[buffer - 1]; + + /* Get buffer lock. */ + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + + /* Check whether it's a cleanup lock. */ + LockBufHdr(bufHdr); + Assert(bufHdr->refcount > 0); + got_cleanup_lock = (bufHdr->refcount == 1); + UnlockBufHdr(bufHdr); + + return got_cleanup_lock; +} /* * Functions for buffer I/O handling diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 776ea5c..3a9417d 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -133,7 +133,7 @@ extern XLogRecPtr log_heap_clean(Relation reln, Buffer buffer, OffsetNumber *redirected, int nredirected, OffsetNumber *nowdead, int ndead, OffsetNumber *nowunused, int nunused, - TransactionId latestRemovedXid); + TransactionId latestRemovedXid, bool defragment); extern XLogRecPtr log_heap_freeze(Relation reln, Buffer buffer, TransactionId cutoff_xid, OffsetNumber *offsets, int offcnt); @@ -147,11 +147,13 @@ extern void heap_page_prune_opt(Relation relation, Buffer buffer, TransactionId OldestXmin); extern int heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin, - bool report_stats, TransactionId *latestRemovedXid); + bool report_stats, TransactionId *latestRemovedXid, + bool defragment); extern void heap_page_prune_execute(Buffer buffer, OffsetNumber *redirected, int nredirected, OffsetNumber *nowdead, int ndead, - 
OffsetNumber *nowunused, int nunused); + OffsetNumber *nowunused, int nunused, + bool defragment); extern void heap_get_root_tuples(Page page, OffsetNumber *root_offsets); /* in heap/syncscan.c */ diff --git a/src/include/access/htup.h b/src/include/access/htup.h index 966e2d0..0854ca6 100644 --- a/src/include/access/htup.h +++ b/src/include/access/htup.h @@ -689,6 +689,7 @@ typedef struct xl_heap_clean RelFileNode node; BlockNumber block; TransactionId latestRemovedXid; + uint16 flags; uint16 nredirected; uint16 ndead; /* OFFSET NUMBERS FOLLOW */ @@ -696,6 +697,9 @@ typedef struct xl_heap_clean #define SizeOfHeapClean (offsetof(xl_heap_clean, ndead) + sizeof(uint16)) +#define HEAP_CLEAN_DEFRAGMENT 0x0001 /* defragment page during cleanup */ + + /* * Cleanup_info is required in some cases during a lazy VACUUM. * Used for reporting the results of HeapTupleHeaderAdvanceLatestRemovedXid() diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index 49b5d31..ab568ca 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -208,6 +208,7 @@ extern void LockBuffer(Buffer buffer, int mode); extern bool ConditionalLockBuffer(Buffer buffer); extern void LockBufferForCleanup(Buffer buffer); extern bool ConditionalLockBufferForCleanup(Buffer buffer); +extern bool LockBufferForPossibleCleanup(Buffer buffer); extern bool HoldingBufferPinThatDelaysRecovery(void); extern void AbortBufferIO(void); diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h index 42d6b10..2d4c721 100644 --- a/src/include/storage/bufpage.h +++ b/src/include/storage/bufpage.h @@ -154,8 +154,9 @@ typedef PageHeaderData *PageHeader; * tuple? 
*/ #define PD_ALL_VISIBLE 0x0004 /* all tuples on page are visible to * everyone */ +#define PD_NEEDS_DEFRAG 0x0008 /* page pruned without defrag */ -#define PD_VALID_FLAG_BITS 0x0007 /* OR of all valid pd_flags bits */ +#define PD_VALID_FLAG_BITS 0x000f /* OR of all valid pd_flags bits */ /* * Page layout version number 0 is for pre-7.3 Postgres releases. @@ -345,6 +346,13 @@ typedef PageHeaderData *PageHeader; #define PageClearAllVisible(page) \ (((PageHeader) (page))->pd_flags &= ~PD_ALL_VISIBLE) +#define PageNeedsDefrag(page) \ + (((PageHeader) (page))->pd_flags & PD_NEEDS_DEFRAG) +#define PageSetNeedsDefrag(page) \ + (((PageHeader) (page))->pd_flags |= PD_NEEDS_DEFRAG) +#define PageClearNeedsDefrag(page) \ + (((PageHeader) (page))->pd_flags &= ~PD_NEEDS_DEFRAG) + #define PageIsPrunable(page, oldestxmin) \ ( \ AssertMacro(TransactionIdIsNormal(oldestxmin)), \