From 0822ccf1c1df26abf50e865c62a69a302fcfc58f Mon Sep 17 00:00:00 2001 From: Masahiko Sawada Date: Tue, 17 Jan 2023 17:20:37 +0700 Subject: [PATCH v23 18/18] Use TIDStore for storing dead tuple TID during lazy vacuum Previously, we used an array of ItemPointerData to store dead tuple TIDs, which was not space efficient and slow to lookup. Also, we had the 1GB limit on its size. Now we use TIDStore to store dead tuple TIDs. Since the TIDStore, backed by the radix tree, incrementally allocates the memory, we get rid of the 1GB limit. Since we are no longer able to exactly estimate the maximum number of TIDs that can be stored, the pg_stat_progress_vacuum view shows the progress information based on the amount of memory in bytes. The column names are also changed to max_dead_tuple_bytes and dead_tuple_bytes. In addition, since the TIDStore uses the radix tree internally, the minimum amount of memory required by TIDStore is 1MB, the initial DSA segment size. Due to that, we increase the minimum value of maintenance_work_mem (also autovacuum_work_mem) from 1MB to 2MB. 
XXX: needs to bump catalog version --- doc/src/sgml/monitoring.sgml | 8 +- src/backend/access/heap/vacuumlazy.c | 218 +++++++-------------- src/backend/catalog/system_views.sql | 2 +- src/backend/commands/vacuum.c | 78 +------- src/backend/commands/vacuumparallel.c | 62 +++--- src/backend/postmaster/autovacuum.c | 6 +- src/backend/storage/lmgr/lwlock.c | 2 + src/backend/utils/misc/guc_tables.c | 2 +- src/include/commands/progress.h | 4 +- src/include/commands/vacuum.h | 25 +-- src/include/storage/lwlock.h | 1 + src/test/regress/expected/cluster.out | 2 +- src/test/regress/expected/create_index.out | 2 +- src/test/regress/expected/rules.out | 4 +- src/test/regress/sql/cluster.sql | 2 +- src/test/regress/sql/create_index.sql | 2 +- 16 files changed, 142 insertions(+), 278 deletions(-) diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index d936aa3da3..0230c74e3d 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -6870,10 +6870,10 @@ FROM pg_stat_get_backend_idset() AS backendid; - max_dead_tuples bigint + max_dead_tuple_bytes bigint - Number of dead tuples that we can store before needing to perform + Amount of dead tuple data that we can store before needing to perform an index vacuum cycle, based on . @@ -6881,10 +6881,10 @@ FROM pg_stat_get_backend_idset() AS backendid; - num_dead_tuples bigint + dead_tuple_bytes bigint - Number of dead tuples collected since the last index vacuum cycle. + Amount of dead tuple data collected since the last index vacuum cycle. 
diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index 8f14cf85f3..3537df16fd 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -40,6 +40,7 @@ #include "access/heapam_xlog.h" #include "access/htup_details.h" #include "access/multixact.h" +#include "access/tidstore.h" #include "access/transam.h" #include "access/visibilitymap.h" #include "access/xact.h" @@ -188,7 +189,7 @@ typedef struct LVRelState * lazy_vacuum_heap_rel, which marks the same LP_DEAD line pointers as * LP_UNUSED during second heap pass. */ - VacDeadItems *dead_items; /* TIDs whose index tuples we'll delete */ + TidStore *dead_items; /* TIDs whose index tuples we'll delete */ BlockNumber rel_pages; /* total number of pages */ BlockNumber scanned_pages; /* # pages examined (not skipped via VM) */ BlockNumber removed_pages; /* # pages removed by relation truncation */ @@ -220,11 +221,14 @@ typedef struct LVRelState typedef struct LVPagePruneState { bool hastup; /* Page prevents rel truncation? */ - bool has_lpdead_items; /* includes existing LP_DEAD items */ + + /* collected offsets of LP_DEAD items including existing ones */ + OffsetNumber deadoffsets[MaxHeapTuplesPerPage]; + int num_offsets; /* * State describes the proper VM bit states to set for the page following - * pruning and freezing. all_visible implies !has_lpdead_items, but don't + * pruning and freezing. all_visible implies num_offsets == 0, but don't * trust all_frozen result unless all_visible is also set to true. */ bool all_visible; /* Every item visible to all? 
*/ @@ -259,8 +263,9 @@ static bool lazy_scan_noprune(LVRelState *vacrel, Buffer buf, static void lazy_vacuum(LVRelState *vacrel); static bool lazy_vacuum_all_indexes(LVRelState *vacrel); static void lazy_vacuum_heap_rel(LVRelState *vacrel); -static int lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, - Buffer buffer, int index, Buffer vmbuffer); +static void lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, + OffsetNumber *offsets, int num_offsets, + Buffer buffer, Buffer vmbuffer); static bool lazy_check_wraparound_failsafe(LVRelState *vacrel); static void lazy_cleanup_all_indexes(LVRelState *vacrel); static IndexBulkDeleteResult *lazy_vacuum_one_index(Relation indrel, @@ -825,21 +830,21 @@ lazy_scan_heap(LVRelState *vacrel) blkno, next_unskippable_block, next_fsm_block_to_vacuum = 0; - VacDeadItems *dead_items = vacrel->dead_items; + TidStore *dead_items = vacrel->dead_items; Buffer vmbuffer = InvalidBuffer; bool next_unskippable_allvis, skipping_current_range; const int initprog_index[] = { PROGRESS_VACUUM_PHASE, PROGRESS_VACUUM_TOTAL_HEAP_BLKS, - PROGRESS_VACUUM_MAX_DEAD_TUPLES + PROGRESS_VACUUM_MAX_DEAD_TUPLE_BYTES }; int64 initprog_val[3]; /* Report that we're scanning the heap, advertising total # of blocks */ initprog_val[0] = PROGRESS_VACUUM_PHASE_SCAN_HEAP; initprog_val[1] = rel_pages; - initprog_val[2] = dead_items->max_items; + initprog_val[2] = tidstore_max_memory(vacrel->dead_items); pgstat_progress_update_multi_param(3, initprog_index, initprog_val); /* Set up an initial range of skippable blocks using the visibility map */ @@ -906,8 +911,7 @@ lazy_scan_heap(LVRelState *vacrel) * dead_items TIDs, pause and do a cycle of vacuuming before we tackle * this page. 
*/ - Assert(dead_items->max_items >= MaxHeapTuplesPerPage); - if (dead_items->max_items - dead_items->num_items < MaxHeapTuplesPerPage) + if (tidstore_is_full(vacrel->dead_items)) { /* * Before beginning index vacuuming, we release any pin we may @@ -1018,7 +1022,7 @@ lazy_scan_heap(LVRelState *vacrel) */ lazy_scan_prune(vacrel, buf, blkno, page, &prunestate); - Assert(!prunestate.all_visible || !prunestate.has_lpdead_items); + Assert(!prunestate.all_visible || (prunestate.num_offsets == 0)); /* Remember the location of the last page with nonremovable tuples */ if (prunestate.hastup) @@ -1034,14 +1038,12 @@ lazy_scan_heap(LVRelState *vacrel) * performed here can be thought of as the one-pass equivalent of * a call to lazy_vacuum(). */ - if (prunestate.has_lpdead_items) + if (prunestate.num_offsets > 0) { Size freespace; - lazy_vacuum_heap_page(vacrel, blkno, buf, 0, vmbuffer); - - /* Forget the LP_DEAD items that we just vacuumed */ - dead_items->num_items = 0; + lazy_vacuum_heap_page(vacrel, blkno, prunestate.deadoffsets, + prunestate.num_offsets, buf, vmbuffer); /* * Periodically perform FSM vacuuming to make newly-freed @@ -1078,7 +1080,16 @@ lazy_scan_heap(LVRelState *vacrel) * with prunestate-driven visibility map and FSM steps (just like * the two-pass strategy). */ - Assert(dead_items->num_items == 0); + Assert(tidstore_num_tids(dead_items) == 0); + } + else if (prunestate.num_offsets > 0) + { + /* Save details of the LP_DEAD items from the page */ + tidstore_add_tids(dead_items, blkno, prunestate.deadoffsets, + prunestate.num_offsets); + + pgstat_progress_update_param(PROGRESS_VACUUM_DEAD_TUPLE_BYTES, + tidstore_memory_usage(dead_items)); } /* @@ -1145,7 +1156,7 @@ lazy_scan_heap(LVRelState *vacrel) * There should never be LP_DEAD items on a page with PD_ALL_VISIBLE * set, however. 
*/ - else if (prunestate.has_lpdead_items && PageIsAllVisible(page)) + else if ((prunestate.num_offsets > 0) && PageIsAllVisible(page)) { elog(WARNING, "page containing LP_DEAD items is marked as all-visible in relation \"%s\" page %u", vacrel->relname, blkno); @@ -1193,7 +1204,7 @@ lazy_scan_heap(LVRelState *vacrel) * Final steps for block: drop cleanup lock, record free space in the * FSM */ - if (prunestate.has_lpdead_items && vacrel->do_index_vacuuming) + if ((prunestate.num_offsets > 0) && vacrel->do_index_vacuuming) { /* * Wait until lazy_vacuum_heap_rel() to save free space. This @@ -1249,7 +1260,7 @@ lazy_scan_heap(LVRelState *vacrel) * Do index vacuuming (call each index's ambulkdelete routine), then do * related heap vacuuming */ - if (dead_items->num_items > 0) + if (tidstore_num_tids(dead_items) > 0) lazy_vacuum(vacrel); /* @@ -1543,13 +1554,11 @@ lazy_scan_prune(LVRelState *vacrel, HTSV_Result res; int tuples_deleted, tuples_frozen, - lpdead_items, live_tuples, recently_dead_tuples; int nnewlpdead; HeapPageFreeze pagefrz; int64 fpi_before = pgWalUsage.wal_fpi; - OffsetNumber deadoffsets[MaxHeapTuplesPerPage]; HeapTupleFreeze frozen[MaxHeapTuplesPerPage]; Assert(BufferGetBlockNumber(buf) == blkno); @@ -1571,7 +1580,6 @@ retry: pagefrz.NoFreezePageRelminMxid = vacrel->NewRelminMxid; tuples_deleted = 0; tuples_frozen = 0; - lpdead_items = 0; live_tuples = 0; recently_dead_tuples = 0; @@ -1580,9 +1588,9 @@ retry: * * We count tuples removed by the pruning step as tuples_deleted. Its * final value can be thought of as the number of tuples that have been - * deleted from the table. It should not be confused with lpdead_items; - * lpdead_items's final value can be thought of as the number of tuples - * that were deleted from indexes. + * deleted from the table. It should not be confused with + * prunestate->deadoffsets; prunestate->deadoffsets's final value can + * be thought of as the number of tuples that were deleted from indexes. 
*/ tuples_deleted = heap_page_prune(rel, buf, vacrel->vistest, InvalidTransactionId, 0, &nnewlpdead, @@ -1593,7 +1601,7 @@ retry: * requiring freezing among remaining tuples with storage */ prunestate->hastup = false; - prunestate->has_lpdead_items = false; + prunestate->num_offsets = 0; prunestate->all_visible = true; prunestate->all_frozen = true; prunestate->visibility_cutoff_xid = InvalidTransactionId; @@ -1638,7 +1646,7 @@ retry: * (This is another case where it's useful to anticipate that any * LP_DEAD items will become LP_UNUSED during the ongoing VACUUM.) */ - deadoffsets[lpdead_items++] = offnum; + prunestate->deadoffsets[prunestate->num_offsets++] = offnum; continue; } @@ -1875,7 +1883,7 @@ retry: */ #ifdef USE_ASSERT_CHECKING /* Note that all_frozen value does not matter when !all_visible */ - if (prunestate->all_visible && lpdead_items == 0) + if (prunestate->all_visible && prunestate->num_offsets == 0) { TransactionId cutoff; bool all_frozen; @@ -1888,28 +1896,9 @@ retry: } #endif - /* - * Now save details of the LP_DEAD items from the page in vacrel - */ - if (lpdead_items > 0) + if (prunestate->num_offsets > 0) { - VacDeadItems *dead_items = vacrel->dead_items; - ItemPointerData tmp; - vacrel->lpdead_item_pages++; - prunestate->has_lpdead_items = true; - - ItemPointerSetBlockNumber(&tmp, blkno); - - for (int i = 0; i < lpdead_items; i++) - { - ItemPointerSetOffsetNumber(&tmp, deadoffsets[i]); - dead_items->items[dead_items->num_items++] = tmp; - } - - Assert(dead_items->num_items <= dead_items->max_items); - pgstat_progress_update_param(PROGRESS_VACUUM_NUM_DEAD_TUPLES, - dead_items->num_items); /* * It was convenient to ignore LP_DEAD items in all_visible earlier on @@ -1928,7 +1917,7 @@ retry: /* Finally, add page-local counts to whole-VACUUM counts */ vacrel->tuples_deleted += tuples_deleted; vacrel->tuples_frozen += tuples_frozen; - vacrel->lpdead_items += lpdead_items; + vacrel->lpdead_items += prunestate->num_offsets; vacrel->live_tuples += 
live_tuples; vacrel->recently_dead_tuples += recently_dead_tuples; } @@ -2129,8 +2118,7 @@ lazy_scan_noprune(LVRelState *vacrel, } else { - VacDeadItems *dead_items = vacrel->dead_items; - ItemPointerData tmp; + TidStore *dead_items = vacrel->dead_items; /* * Page has LP_DEAD items, and so any references/TIDs that remain in @@ -2139,17 +2127,10 @@ lazy_scan_noprune(LVRelState *vacrel, */ vacrel->lpdead_item_pages++; - ItemPointerSetBlockNumber(&tmp, blkno); + tidstore_add_tids(dead_items, blkno, deadoffsets, lpdead_items); - for (int i = 0; i < lpdead_items; i++) - { - ItemPointerSetOffsetNumber(&tmp, deadoffsets[i]); - dead_items->items[dead_items->num_items++] = tmp; - } - - Assert(dead_items->num_items <= dead_items->max_items); - pgstat_progress_update_param(PROGRESS_VACUUM_NUM_DEAD_TUPLES, - dead_items->num_items); + pgstat_progress_update_param(PROGRESS_VACUUM_DEAD_TUPLE_BYTES, + tidstore_memory_usage(dead_items)); vacrel->lpdead_items += lpdead_items; @@ -2198,7 +2179,7 @@ lazy_vacuum(LVRelState *vacrel) if (!vacrel->do_index_vacuuming) { Assert(!vacrel->do_index_cleanup); - vacrel->dead_items->num_items = 0; + tidstore_reset(vacrel->dead_items); return; } @@ -2227,7 +2208,7 @@ lazy_vacuum(LVRelState *vacrel) BlockNumber threshold; Assert(vacrel->num_index_scans == 0); - Assert(vacrel->lpdead_items == vacrel->dead_items->num_items); + Assert(vacrel->lpdead_items == tidstore_num_tids(vacrel->dead_items)); Assert(vacrel->do_index_vacuuming); Assert(vacrel->do_index_cleanup); @@ -2254,8 +2235,8 @@ lazy_vacuum(LVRelState *vacrel) * cases then this may need to be reconsidered. 
*/ threshold = (double) vacrel->rel_pages * BYPASS_THRESHOLD_PAGES; - bypass = (vacrel->lpdead_item_pages < threshold && - vacrel->lpdead_items < MAXDEADITEMS(32L * 1024L * 1024L)); + bypass = (vacrel->lpdead_item_pages < threshold) && + tidstore_memory_usage(vacrel->dead_items) < (32L * 1024L * 1024L); } if (bypass) @@ -2300,7 +2281,7 @@ lazy_vacuum(LVRelState *vacrel) * Forget the LP_DEAD items that we just vacuumed (or just decided to not * vacuum) */ - vacrel->dead_items->num_items = 0; + tidstore_reset(vacrel->dead_items); } /* @@ -2373,7 +2354,7 @@ lazy_vacuum_all_indexes(LVRelState *vacrel) * place). */ Assert(vacrel->num_index_scans > 0 || - vacrel->dead_items->num_items == vacrel->lpdead_items); + tidstore_num_tids(vacrel->dead_items) == vacrel->lpdead_items); Assert(allindexes || vacrel->failsafe_active); /* @@ -2410,10 +2391,11 @@ lazy_vacuum_all_indexes(LVRelState *vacrel) static void lazy_vacuum_heap_rel(LVRelState *vacrel) { - int index = 0; BlockNumber vacuumed_pages = 0; Buffer vmbuffer = InvalidBuffer; LVSavedErrInfo saved_err_info; + TidStoreIter *iter; + TidStoreIterResult *result; Assert(vacrel->do_index_vacuuming); Assert(vacrel->do_index_cleanup); @@ -2428,7 +2410,8 @@ lazy_vacuum_heap_rel(LVRelState *vacrel) VACUUM_ERRCB_PHASE_VACUUM_HEAP, InvalidBlockNumber, InvalidOffsetNumber); - while (index < vacrel->dead_items->num_items) + iter = tidstore_begin_iterate(vacrel->dead_items); + while ((result = tidstore_iterate_next(iter)) != NULL) { BlockNumber blkno; Buffer buf; @@ -2437,7 +2420,7 @@ lazy_vacuum_heap_rel(LVRelState *vacrel) vacuum_delay_point(); - blkno = ItemPointerGetBlockNumber(&vacrel->dead_items->items[index]); + blkno = result->blkno; vacrel->blkno = blkno; /* @@ -2451,7 +2434,8 @@ lazy_vacuum_heap_rel(LVRelState *vacrel) buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, blkno, RBM_NORMAL, vacrel->bstrategy); LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); - index = lazy_vacuum_heap_page(vacrel, blkno, buf, index, vmbuffer); + 
lazy_vacuum_heap_page(vacrel, blkno, result->offsets, result->num_offsets, + buf, vmbuffer); /* Now that we've vacuumed the page, record its available space */ page = BufferGetPage(buf); @@ -2461,6 +2445,7 @@ lazy_vacuum_heap_rel(LVRelState *vacrel) RecordPageWithFreeSpace(vacrel->rel, blkno, freespace); vacuumed_pages++; } + tidstore_end_iterate(iter); vacrel->blkno = InvalidBlockNumber; if (BufferIsValid(vmbuffer)) @@ -2470,36 +2455,30 @@ lazy_vacuum_heap_rel(LVRelState *vacrel) * We set all LP_DEAD items from the first heap pass to LP_UNUSED during * the second heap pass. No more, no less. */ - Assert(index > 0); Assert(vacrel->num_index_scans > 1 || - (index == vacrel->lpdead_items && + (tidstore_num_tids(vacrel->dead_items) == vacrel->lpdead_items && vacuumed_pages == vacrel->lpdead_item_pages)); ereport(DEBUG2, - (errmsg("table \"%s\": removed %lld dead item identifiers in %u pages", - vacrel->relname, (long long) index, vacuumed_pages))); + (errmsg("table \"%s\": removed " UINT64_FORMAT " dead item identifiers in %u pages", + vacrel->relname, tidstore_num_tids(vacrel->dead_items), vacuumed_pages))); /* Revert to the previous phase information for error traceback */ restore_vacuum_error_info(vacrel, &saved_err_info); } /* - * lazy_vacuum_heap_page() -- free page's LP_DEAD items listed in the - * vacrel->dead_items array. + * lazy_vacuum_heap_page() -- free page's LP_DEAD items. * * Caller must have an exclusive buffer lock on the buffer (though a full * cleanup lock is also acceptable). vmbuffer must be valid and already have * a pin on blkno's visibility map page. - * - * index is an offset into the vacrel->dead_items array for the first listed - * LP_DEAD item on the page. The return value is the first index immediately - * after all LP_DEAD items for the same page in the array. 
*/ -static int -lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer, - int index, Buffer vmbuffer) +static void +lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, + OffsetNumber *deadoffsets, int num_offsets, Buffer buffer, + Buffer vmbuffer) { - VacDeadItems *dead_items = vacrel->dead_items; Page page = BufferGetPage(buffer); OffsetNumber unused[MaxHeapTuplesPerPage]; int nunused = 0; @@ -2518,16 +2497,11 @@ lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer, START_CRIT_SECTION(); - for (; index < dead_items->num_items; index++) + for (int i = 0; i < num_offsets; i++) { - BlockNumber tblk; - OffsetNumber toff; ItemId itemid; + OffsetNumber toff = deadoffsets[i]; - tblk = ItemPointerGetBlockNumber(&dead_items->items[index]); - if (tblk != blkno) - break; /* past end of tuples for this block */ - toff = ItemPointerGetOffsetNumber(&dead_items->items[index]); itemid = PageGetItemId(page, toff); Assert(ItemIdIsDead(itemid) && !ItemIdHasStorage(itemid)); @@ -2597,7 +2571,6 @@ lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer, /* Revert to the previous phase information for error traceback */ restore_vacuum_error_info(vacrel, &saved_err_info); - return index; } /* @@ -3093,46 +3066,6 @@ count_nondeletable_pages(LVRelState *vacrel, bool *lock_waiter_detected) return vacrel->nonempty_pages; } -/* - * Returns the number of dead TIDs that VACUUM should allocate space to - * store, given a heap rel of size vacrel->rel_pages, and given current - * maintenance_work_mem setting (or current autovacuum_work_mem setting, - * when applicable). - * - * See the comments at the head of this file for rationale. - */ -static int -dead_items_max_items(LVRelState *vacrel) -{ - int64 max_items; - int vac_work_mem = IsAutoVacuumWorkerProcess() && - autovacuum_work_mem != -1 ? 
- autovacuum_work_mem : maintenance_work_mem; - - if (vacrel->nindexes > 0) - { - BlockNumber rel_pages = vacrel->rel_pages; - - max_items = MAXDEADITEMS(vac_work_mem * 1024L); - max_items = Min(max_items, INT_MAX); - max_items = Min(max_items, MAXDEADITEMS(MaxAllocSize)); - - /* curious coding here to ensure the multiplication can't overflow */ - if ((BlockNumber) (max_items / MaxHeapTuplesPerPage) > rel_pages) - max_items = rel_pages * MaxHeapTuplesPerPage; - - /* stay sane if small maintenance_work_mem */ - max_items = Max(max_items, MaxHeapTuplesPerPage); - } - else - { - /* One-pass case only stores a single heap page's TIDs at a time */ - max_items = MaxHeapTuplesPerPage; - } - - return (int) max_items; -} - /* * Allocate dead_items (either using palloc, or in dynamic shared memory). * Sets dead_items in vacrel for caller. @@ -3143,11 +3076,9 @@ dead_items_max_items(LVRelState *vacrel) static void dead_items_alloc(LVRelState *vacrel, int nworkers) { - VacDeadItems *dead_items; - int max_items; - - max_items = dead_items_max_items(vacrel); - Assert(max_items >= MaxHeapTuplesPerPage); + int vac_work_mem = IsAutoVacuumWorkerProcess() && + autovacuum_work_mem != -1 ? + autovacuum_work_mem * 1024L : maintenance_work_mem * 1024L; /* * Initialize state for a parallel vacuum. As of now, only one worker can @@ -3174,7 +3105,7 @@ dead_items_alloc(LVRelState *vacrel, int nworkers) else vacrel->pvs = parallel_vacuum_init(vacrel->rel, vacrel->indrels, vacrel->nindexes, nworkers, - max_items, + vac_work_mem, MaxHeapTuplesPerPage, vacrel->verbose ? 
INFO : DEBUG2, vacrel->bstrategy); @@ -3187,11 +3118,8 @@ dead_items_alloc(LVRelState *vacrel, int nworkers) } /* Serial VACUUM case */ - dead_items = (VacDeadItems *) palloc(vac_max_items_to_alloc_size(max_items)); - dead_items->max_items = max_items; - dead_items->num_items = 0; - - vacrel->dead_items = dead_items; + vacrel->dead_items = tidstore_create(vac_work_mem, MaxHeapTuplesPerPage, + NULL); } /* diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 8608e3fa5b..a526e607fe 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -1165,7 +1165,7 @@ CREATE VIEW pg_stat_progress_vacuum AS END AS phase, S.param2 AS heap_blks_total, S.param3 AS heap_blks_scanned, S.param4 AS heap_blks_vacuumed, S.param5 AS index_vacuum_count, - S.param6 AS max_dead_tuples, S.param7 AS num_dead_tuples + S.param6 AS max_dead_tuple_bytes, S.param7 AS dead_tuple_bytes FROM pg_stat_get_progress_info('VACUUM') AS S LEFT JOIN pg_database D ON S.datid = D.oid; diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 7b1a4b127e..d8e680ca20 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -97,7 +97,6 @@ static bool vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params, static double compute_parallel_delay(void); static VacOptValue get_vacoptval_from_boolean(DefElem *def); static bool vac_tid_reaped(ItemPointer itemptr, void *state); -static int vac_cmp_itemptr(const void *left, const void *right); /* * Primary entry point for manual VACUUM and ANALYZE commands @@ -2303,16 +2302,16 @@ get_vacoptval_from_boolean(DefElem *def) */ IndexBulkDeleteResult * vac_bulkdel_one_index(IndexVacuumInfo *ivinfo, IndexBulkDeleteResult *istat, - VacDeadItems *dead_items) + TidStore *dead_items) { /* Do bulk deletion */ istat = index_bulk_delete(ivinfo, istat, vac_tid_reaped, (void *) dead_items); ereport(ivinfo->message_level, - (errmsg("scanned index \"%s\" to 
remove %d row versions", + (errmsg("scanned index \"%s\" to remove " UINT64_FORMAT " row versions", RelationGetRelationName(ivinfo->index), - dead_items->num_items))); + tidstore_num_tids(dead_items)))); return istat; } @@ -2343,82 +2342,15 @@ vac_cleanup_one_index(IndexVacuumInfo *ivinfo, IndexBulkDeleteResult *istat) return istat; } -/* - * Returns the total required space for VACUUM's dead_items array given a - * max_items value. - */ -Size -vac_max_items_to_alloc_size(int max_items) -{ - Assert(max_items <= MAXDEADITEMS(MaxAllocSize)); - - return offsetof(VacDeadItems, items) + sizeof(ItemPointerData) * max_items; -} - /* * vac_tid_reaped() -- is a particular tid deletable? * * This has the right signature to be an IndexBulkDeleteCallback. - * - * Assumes dead_items array is sorted (in ascending TID order). */ static bool vac_tid_reaped(ItemPointer itemptr, void *state) { - VacDeadItems *dead_items = (VacDeadItems *) state; - int64 litem, - ritem, - item; - ItemPointer res; - - litem = itemptr_encode(&dead_items->items[0]); - ritem = itemptr_encode(&dead_items->items[dead_items->num_items - 1]); - item = itemptr_encode(itemptr); - - /* - * Doing a simple bound check before bsearch() is useful to avoid the - * extra cost of bsearch(), especially if dead items on the heap are - * concentrated in a certain range. Since this function is called for - * every index tuple, it pays to be really fast. - */ - if (item < litem || item > ritem) - return false; - - res = (ItemPointer) bsearch((void *) itemptr, - (void *) dead_items->items, - dead_items->num_items, - sizeof(ItemPointerData), - vac_cmp_itemptr); - - return (res != NULL); -} - -/* - * Comparator routines for use with qsort() and bsearch(). 
- */ -static int -vac_cmp_itemptr(const void *left, const void *right) -{ - BlockNumber lblk, - rblk; - OffsetNumber loff, - roff; - - lblk = ItemPointerGetBlockNumber((ItemPointer) left); - rblk = ItemPointerGetBlockNumber((ItemPointer) right); - - if (lblk < rblk) - return -1; - if (lblk > rblk) - return 1; - - loff = ItemPointerGetOffsetNumber((ItemPointer) left); - roff = ItemPointerGetOffsetNumber((ItemPointer) right); - - if (loff < roff) - return -1; - if (loff > roff) - return 1; + TidStore *dead_items = (TidStore *) state; - return 0; + return tidstore_lookup_tid(dead_items, itemptr); } diff --git a/src/backend/commands/vacuumparallel.c b/src/backend/commands/vacuumparallel.c index bcd40c80a1..5c7e6ed99c 100644 --- a/src/backend/commands/vacuumparallel.c +++ b/src/backend/commands/vacuumparallel.c @@ -103,6 +103,9 @@ typedef struct PVShared /* Counter for vacuuming and cleanup */ pg_atomic_uint32 idx; + + /* Handle of the shared TidStore */ + tidstore_handle dead_items_handle; } PVShared; /* Status used during parallel index vacuum or cleanup */ @@ -166,7 +169,8 @@ struct ParallelVacuumState PVIndStats *indstats; /* Shared dead items space among parallel vacuum workers */ - VacDeadItems *dead_items; + TidStore *dead_items; + dsa_area *dead_items_area; /* Points to buffer usage area in DSM */ BufferUsage *buffer_usage; @@ -222,20 +226,23 @@ static void parallel_vacuum_error_callback(void *arg); */ ParallelVacuumState * parallel_vacuum_init(Relation rel, Relation *indrels, int nindexes, - int nrequested_workers, int max_items, - int elevel, BufferAccessStrategy bstrategy) + int nrequested_workers, int vac_work_mem, + int max_offset, int elevel, + BufferAccessStrategy bstrategy) { ParallelVacuumState *pvs; ParallelContext *pcxt; PVShared *shared; - VacDeadItems *dead_items; + TidStore *dead_items; PVIndStats *indstats; BufferUsage *buffer_usage; WalUsage *wal_usage; + void *area_space; + dsa_area *dead_items_dsa; bool *will_parallel_vacuum; Size 
est_indstats_len; Size est_shared_len; - Size est_dead_items_len; + Size dsa_minsize = dsa_minimum_size(); int nindexes_mwm = 0; int parallel_workers = 0; int querylen; @@ -283,9 +290,8 @@ parallel_vacuum_init(Relation rel, Relation *indrels, int nindexes, shm_toc_estimate_chunk(&pcxt->estimator, est_shared_len); shm_toc_estimate_keys(&pcxt->estimator, 1); - /* Estimate size for dead_items -- PARALLEL_VACUUM_KEY_DEAD_ITEMS */ - est_dead_items_len = vac_max_items_to_alloc_size(max_items); - shm_toc_estimate_chunk(&pcxt->estimator, est_dead_items_len); + /* Estimate size for dead tuple DSA -- PARALLEL_VACUUM_KEY_DEAD_ITEMS */ + shm_toc_estimate_chunk(&pcxt->estimator, dsa_minsize); shm_toc_estimate_keys(&pcxt->estimator, 1); /* @@ -351,6 +357,16 @@ parallel_vacuum_init(Relation rel, Relation *indrels, int nindexes, shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_INDEX_STATS, indstats); pvs->indstats = indstats; + /* Prepare DSA space for dead items */ + area_space = shm_toc_allocate(pcxt->toc, dsa_minsize); + shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_DEAD_ITEMS, area_space); + dead_items_dsa = dsa_create_in_place(area_space, dsa_minsize, + LWTRANCHE_PARALLEL_VACUUM_DSA, + pcxt->seg); + dead_items = tidstore_create(vac_work_mem, max_offset, dead_items_dsa); + pvs->dead_items = dead_items; + pvs->dead_items_area = dead_items_dsa; + /* Prepare shared information */ shared = (PVShared *) shm_toc_allocate(pcxt->toc, est_shared_len); MemSet(shared, 0, est_shared_len); @@ -360,6 +376,7 @@ parallel_vacuum_init(Relation rel, Relation *indrels, int nindexes, (nindexes_mwm > 0) ? 
maintenance_work_mem / Min(parallel_workers, nindexes_mwm) : maintenance_work_mem; + shared->dead_items_handle = tidstore_get_handle(dead_items); pg_atomic_init_u32(&(shared->cost_balance), 0); pg_atomic_init_u32(&(shared->active_nworkers), 0); @@ -368,15 +385,6 @@ parallel_vacuum_init(Relation rel, Relation *indrels, int nindexes, shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_SHARED, shared); pvs->shared = shared; - /* Prepare the dead_items space */ - dead_items = (VacDeadItems *) shm_toc_allocate(pcxt->toc, - est_dead_items_len); - dead_items->max_items = max_items; - dead_items->num_items = 0; - MemSet(dead_items->items, 0, sizeof(ItemPointerData) * max_items); - shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_DEAD_ITEMS, dead_items); - pvs->dead_items = dead_items; - /* * Allocate space for each worker's BufferUsage and WalUsage; no need to * initialize @@ -434,6 +442,9 @@ parallel_vacuum_end(ParallelVacuumState *pvs, IndexBulkDeleteResult **istats) istats[i] = NULL; } + tidstore_destroy(pvs->dead_items); + dsa_detach(pvs->dead_items_area); + DestroyParallelContext(pvs->pcxt); ExitParallelMode(); @@ -442,7 +453,7 @@ parallel_vacuum_end(ParallelVacuumState *pvs, IndexBulkDeleteResult **istats) } /* Returns the dead items space */ -VacDeadItems * +TidStore * parallel_vacuum_get_dead_items(ParallelVacuumState *pvs) { return pvs->dead_items; @@ -940,7 +951,9 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc) Relation *indrels; PVIndStats *indstats; PVShared *shared; - VacDeadItems *dead_items; + TidStore *dead_items; + void *area_space; + dsa_area *dead_items_area; BufferUsage *buffer_usage; WalUsage *wal_usage; int nindexes; @@ -984,10 +997,10 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc) PARALLEL_VACUUM_KEY_INDEX_STATS, false); - /* Set dead_items space */ - dead_items = (VacDeadItems *) shm_toc_lookup(toc, - PARALLEL_VACUUM_KEY_DEAD_ITEMS, - false); + /* Set dead items */ + area_space = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_DEAD_ITEMS, false); + 
dead_items_area = dsa_attach_in_place(area_space, seg); + dead_items = tidstore_attach(dead_items_area, shared->dead_items_handle); /* Set cost-based vacuum delay */ VacuumCostActive = (VacuumCostDelay > 0); @@ -1033,6 +1046,9 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc) InstrEndParallelQuery(&buffer_usage[ParallelWorkerNumber], &wal_usage[ParallelWorkerNumber]); + tidstore_detach(pvs.dead_items); + dsa_detach(dead_items_area); + /* Pop the error context stack */ error_context_stack = errcallback.previous; diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c index f5ea381c53..d88db3e1f8 100644 --- a/src/backend/postmaster/autovacuum.c +++ b/src/backend/postmaster/autovacuum.c @@ -3397,12 +3397,12 @@ check_autovacuum_work_mem(int *newval, void **extra, GucSource source) return true; /* - * We clamp manually-set values to at least 1MB. Since + * We clamp manually-set values to at least 2MB. Since * maintenance_work_mem is always set to at least this value, do the same * here. 
*/ - if (*newval < 1024) - *newval = 1024; + if (*newval < 2048) + *newval = 2048; return true; } diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c index 55b3a04097..c223a7dc94 100644 --- a/src/backend/storage/lmgr/lwlock.c +++ b/src/backend/storage/lmgr/lwlock.c @@ -192,6 +192,8 @@ static const char *const BuiltinTrancheNames[] = { "LogicalRepLauncherDSA", /* LWTRANCHE_LAUNCHER_HASH: */ "LogicalRepLauncherHash", + /* LWTRANCHE_PARALLEL_VACUUM_DSA: */ + "ParallelVacuumDSA", }; StaticAssertDecl(lengthof(BuiltinTrancheNames) == diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 4ac808ed22..422914f0a9 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -2312,7 +2312,7 @@ struct config_int ConfigureNamesInt[] = GUC_UNIT_KB }, &maintenance_work_mem, - 65536, 1024, MAX_KILOBYTES, + 65536, 2048, MAX_KILOBYTES, NULL, NULL, NULL }, diff --git a/src/include/commands/progress.h b/src/include/commands/progress.h index e5add41352..b209d3cf84 100644 --- a/src/include/commands/progress.h +++ b/src/include/commands/progress.h @@ -23,8 +23,8 @@ #define PROGRESS_VACUUM_HEAP_BLKS_SCANNED 2 #define PROGRESS_VACUUM_HEAP_BLKS_VACUUMED 3 #define PROGRESS_VACUUM_NUM_INDEX_VACUUMS 4 -#define PROGRESS_VACUUM_MAX_DEAD_TUPLES 5 -#define PROGRESS_VACUUM_NUM_DEAD_TUPLES 6 +#define PROGRESS_VACUUM_MAX_DEAD_TUPLE_BYTES 5 +#define PROGRESS_VACUUM_DEAD_TUPLE_BYTES 6 /* Phases of vacuum (as advertised via PROGRESS_VACUUM_PHASE) */ #define PROGRESS_VACUUM_PHASE_SCAN_HEAP 1 diff --git a/src/include/commands/vacuum.h b/src/include/commands/vacuum.h index 689dbb7702..a3ebb169ef 100644 --- a/src/include/commands/vacuum.h +++ b/src/include/commands/vacuum.h @@ -17,6 +17,7 @@ #include "access/htup.h" #include "access/genam.h" #include "access/parallel.h" +#include "access/tidstore.h" #include "catalog/pg_class.h" #include "catalog/pg_statistic.h" #include "catalog/pg_type.h" @@ -276,21 
+277,6 @@ struct VacuumCutoffs MultiXactId MultiXactCutoff; }; -/* - * VacDeadItems stores TIDs whose index tuples are deleted by index vacuuming. - */ -typedef struct VacDeadItems -{ - int max_items; /* # slots allocated in array */ - int num_items; /* current # of entries */ - - /* Sorted array of TIDs to delete from indexes */ - ItemPointerData items[FLEXIBLE_ARRAY_MEMBER]; -} VacDeadItems; - -#define MAXDEADITEMS(avail_mem) \ - (((avail_mem) - offsetof(VacDeadItems, items)) / sizeof(ItemPointerData)) - /* GUC parameters */ extern PGDLLIMPORT int default_statistics_target; /* PGDLLIMPORT for PostGIS */ extern PGDLLIMPORT int vacuum_freeze_min_age; @@ -339,18 +325,17 @@ extern Relation vacuum_open_relation(Oid relid, RangeVar *relation, LOCKMODE lmode); extern IndexBulkDeleteResult *vac_bulkdel_one_index(IndexVacuumInfo *ivinfo, IndexBulkDeleteResult *istat, - VacDeadItems *dead_items); + TidStore *dead_items); extern IndexBulkDeleteResult *vac_cleanup_one_index(IndexVacuumInfo *ivinfo, IndexBulkDeleteResult *istat); -extern Size vac_max_items_to_alloc_size(int max_items); /* in commands/vacuumparallel.c */ extern ParallelVacuumState *parallel_vacuum_init(Relation rel, Relation *indrels, int nindexes, int nrequested_workers, - int max_items, int elevel, - BufferAccessStrategy bstrategy); + int vac_work_mem, int max_offset, + int elevel, BufferAccessStrategy bstrategy); extern void parallel_vacuum_end(ParallelVacuumState *pvs, IndexBulkDeleteResult **istats); -extern VacDeadItems *parallel_vacuum_get_dead_items(ParallelVacuumState *pvs); +extern TidStore *parallel_vacuum_get_dead_items(ParallelVacuumState *pvs); extern void parallel_vacuum_bulkdel_all_indexes(ParallelVacuumState *pvs, long num_table_tuples, int num_index_scans); diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h index 07002fdfbe..537b34b30c 100644 --- a/src/include/storage/lwlock.h +++ b/src/include/storage/lwlock.h @@ -207,6 +207,7 @@ typedef enum BuiltinTrancheIds 
LWTRANCHE_PGSTATS_DATA, LWTRANCHE_LAUNCHER_DSA, LWTRANCHE_LAUNCHER_HASH, + LWTRANCHE_PARALLEL_VACUUM_DSA, LWTRANCHE_FIRST_USER_DEFINED } BuiltinTrancheIds; diff --git a/src/test/regress/expected/cluster.out b/src/test/regress/expected/cluster.out index 2eec483eaa..e04f50726f 100644 --- a/src/test/regress/expected/cluster.out +++ b/src/test/regress/expected/cluster.out @@ -526,7 +526,7 @@ create index cluster_sort on clstr_4 (hundred, thousand, tenthous); -- ensure we don't use the index in CLUSTER nor the checking SELECTs set enable_indexscan = off; -- Use external sort: -set maintenance_work_mem = '1MB'; +set maintenance_work_mem = '2MB'; cluster clstr_4 using cluster_sort; select * from (select hundred, lag(hundred) over () as lhundred, diff --git a/src/test/regress/expected/create_index.out b/src/test/regress/expected/create_index.out index 6cd57e3eaa..d1889b9d10 100644 --- a/src/test/regress/expected/create_index.out +++ b/src/test/regress/expected/create_index.out @@ -1214,7 +1214,7 @@ DROP TABLE unlogged_hash_table; -- CREATE INDEX hash_ovfl_index ON hash_ovfl_heap USING hash (x int4_ops); -- Test hash index build tuplesorting. 
Force hash tuplesort using low -- maintenance_work_mem setting and fillfactor: -SET maintenance_work_mem = '1MB'; +SET maintenance_work_mem = '2MB'; CREATE INDEX hash_tuplesort_idx ON tenk1 USING hash (stringu1 name_ops) WITH (fillfactor = 10); EXPLAIN (COSTS OFF) SELECT count(*) FROM tenk1 WHERE stringu1 = 'TVAAAA'; diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index e7a2f5856a..f6ae02eb14 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -2020,8 +2020,8 @@ pg_stat_progress_vacuum| SELECT s.pid, s.param3 AS heap_blks_scanned, s.param4 AS heap_blks_vacuumed, s.param5 AS index_vacuum_count, - s.param6 AS max_dead_tuples, - s.param7 AS num_dead_tuples + s.param6 AS max_dead_tuple_bytes, + s.param7 AS dead_tuple_bytes FROM (pg_stat_get_progress_info('VACUUM'::text) s(pid, datid, relid, param1, param2, param3, param4, param5, param6, param7, param8, param9, param10, param11, param12, param13, param14, param15, param16, param17, param18, param19, param20) LEFT JOIN pg_database d ON ((s.datid = d.oid))); pg_stat_recovery_prefetch| SELECT stats_reset, diff --git a/src/test/regress/sql/cluster.sql b/src/test/regress/sql/cluster.sql index a4cfaae807..a4cb5b98a5 100644 --- a/src/test/regress/sql/cluster.sql +++ b/src/test/regress/sql/cluster.sql @@ -258,7 +258,7 @@ create index cluster_sort on clstr_4 (hundred, thousand, tenthous); set enable_indexscan = off; -- Use external sort: -set maintenance_work_mem = '1MB'; +set maintenance_work_mem = '2MB'; cluster clstr_4 using cluster_sort; select * from (select hundred, lag(hundred) over () as lhundred, diff --git a/src/test/regress/sql/create_index.sql b/src/test/regress/sql/create_index.sql index a3738833b2..edb5e4b4f3 100644 --- a/src/test/regress/sql/create_index.sql +++ b/src/test/regress/sql/create_index.sql @@ -367,7 +367,7 @@ DROP TABLE unlogged_hash_table; -- Test hash index build tuplesorting. 
Force hash tuplesort using low -- maintenance_work_mem setting and fillfactor: -SET maintenance_work_mem = '1MB'; +SET maintenance_work_mem = '2MB'; CREATE INDEX hash_tuplesort_idx ON tenk1 USING hash (stringu1 name_ops) WITH (fillfactor = 10); EXPLAIN (COSTS OFF) SELECT count(*) FROM tenk1 WHERE stringu1 = 'TVAAAA'; -- 2.31.1