From 195c9bbf595512038240c51fc2142decfef4699e Mon Sep 17 00:00:00 2001
From: Matthias van de Meent
Date: Thu, 6 Jul 2023 21:57:53 +0200
Subject: [PATCH v1] Implement a reloption that forces updated tuples to go to
 other pages

Without this, there is no way to efficiently make a table more compact
after a hole has been created.

With max_local_update configured, page-local updates are only available
in the first max_local_update MB of the table; all other updates go
through the free space map to find a destination page for the new tuple
version.

This is intended as a debug/maintenance option so that a table can be
made less fragmented after a large part of it was updated at once and
left many pages nearly empty; e.g. when someone updated a table with
fillfactor=100 using an unqualified UPDATE statement.
---
 src/backend/access/common/reloptions.c    | 13 +++++++-
 src/backend/access/heap/heapam.c          |  7 ++--
 src/include/utils/rel.h                   | 19 +++++++++++
 src/test/regress/expected/alter_table.out | 18 ++++++++++
 src/test/regress/expected/update.out      | 40 +++++++++++++++++++++++
 src/test/regress/sql/alter_table.sql      |  8 +++++
 src/test/regress/sql/update.sql           | 23 +++++++++++++
 7 files changed, 124 insertions(+), 4 deletions(-)

diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c
index 11cc431677..2cb5d90d58 100644
--- a/src/backend/access/common/reloptions.c
+++ b/src/backend/access/common/reloptions.c
@@ -382,6 +382,15 @@ static relopt_int intRelOpts[] =
 		},
 		-1, 0, 1024
 	},
+	{
+		{
+			"max_local_update",
+			"Updates to tuples not located in the first max_local_update MB of the table will always try to insert the new tuple on a different page.",
+			RELOPT_KIND_HEAP,
+			ShareUpdateExclusiveLock
+		},
+		-1, -1, (MaxBlockNumber / (1024 * 1024 / BLCKSZ))
+	},
 
 	/* list terminator */
 	{{NULL}}
@@ -1882,7 +1891,9 @@ default_reloptions(Datum reloptions, bool validate, relopt_kind kind)
 		{"vacuum_index_cleanup", RELOPT_TYPE_ENUM,
 		offsetof(StdRdOptions, vacuum_index_cleanup)},
 		{"vacuum_truncate", RELOPT_TYPE_BOOL,
-		offsetof(StdRdOptions, vacuum_truncate)}
+		offsetof(StdRdOptions, vacuum_truncate)},
+		{"max_local_update", RELOPT_TYPE_INT,
+		offsetof(StdRdOptions, max_local_update)}
 	};
 
 	return (bytea *) build_reloptions(reloptions, validate, kind,
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 7ed72abe59..9e7e852375 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -3495,7 +3495,8 @@ l2:
 
 	newtupsize = MAXALIGN(newtup->t_len);
 
-	if (need_toast || newtupsize > pagefree)
+	if (need_toast || newtupsize > pagefree ||
+		!RelationUpdateTupleOnPageLocally(relation, -1, block))
 	{
 		TransactionId xmax_lock_old_tuple;
 		uint16		infomask_lock_old_tuple,
@@ -3507,7 +3508,7 @@ l2:
 		 * temporarily mark it locked, while we release the page-level lock.
 		 *
 		 * To satisfy the rule that any xid potentially appearing in a buffer
-		 * written out to disk, we unfortunately have to WAL log this
+		 * must be written out to disk, we unfortunately have to WAL log this
 		 * temporary modification.  We can reuse xl_heap_lock for this
 		 * purpose.  If we crash/error before following through with the
 		 * actual update, xmax will be of an aborted transaction, allowing
@@ -3622,7 +3623,7 @@
 		 */
 		for (;;)
 		{
-			if (newtupsize > pagefree)
+			if (newtupsize > pagefree || !RelationUpdateTupleOnPageLocally(relation, -1, block))
 			{
 				/* It doesn't fit, must use RelationGetBufferForTuple. */
 				newbuf = RelationGetBufferForTuple(relation, heaptup->t_len,
diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h
index 1426a353cd..65d183ff8c 100644
--- a/src/include/utils/rel.h
+++ b/src/include/utils/rel.h
@@ -342,6 +342,7 @@ typedef struct StdRdOptions
 	int			parallel_workers;	/* max number of parallel workers */
 	StdRdOptIndexCleanup vacuum_index_cleanup;	/* controls index vacuuming */
 	bool		vacuum_truncate;	/* enables vacuum to truncate a relation */
+	int			max_local_update;	/* in MB; updates beyond this must find a new page via the FSM */
 } StdRdOptions;
 
 #define HEAP_MIN_FILLFACTOR			10
@@ -377,6 +378,24 @@ typedef struct StdRdOptions
 #define RelationGetTargetPageFreeSpace(relation, defaultff) \
 	(BLCKSZ * (100 - RelationGetFillFactor(relation, defaultff)) / 100)
 
+/*
+ * RelationGetMaxLocalUpdate
+ *		Returns the size of the relation's local update section (MB).
+ */
+#define RelationGetMaxLocalUpdate(relation, defaultmlu) \
+	((relation)->rd_options ? \
+	 ((StdRdOptions *) (relation)->rd_options)->max_local_update : (defaultmlu))
+
+/*
+ * RelationUpdateTupleOnPageLocally
+ *		Is an update on blockno allowed to put the new tuple on the current
+ *		page, or should we instead try to find a different page?
+ */
+#define RelationUpdateTupleOnPageLocally(relation, defaultmlu, blockno) \
+	((RelationGetMaxLocalUpdate((relation), (defaultmlu)) == -1) || \
+	 ((blockno) < (BlockNumber) (RelationGetMaxLocalUpdate((relation), (defaultmlu)) * \
+								 (1024 * 1024) / BLCKSZ)))
+
 /*
  * RelationIsUsedAsCatalogTable
  *		Returns whether the relation should be treated as a catalog table
diff --git a/src/test/regress/expected/alter_table.out b/src/test/regress/expected/alter_table.out
index 3b708c7976..859a87dde5 100644
--- a/src/test/regress/expected/alter_table.out
+++ b/src/test/regress/expected/alter_table.out
@@ -2758,6 +2758,24 @@ select * from my_locks order by 1;
  pg_toast  | ShareUpdateExclusiveLock
 (2 rows)
 
+commit;
+begin; alter table alterlock set (max_local_update = 8);
+select * from my_locks order by 1;
+  relname  |       max_lockmode       
+-----------+--------------------------
+ alterlock | ShareUpdateExclusiveLock
+ pg_toast  | ShareUpdateExclusiveLock
+(2 rows)
+
+commit;
+begin; alter table alterlock reset (max_local_update);
+select * from my_locks order by 1;
+  relname  |       max_lockmode       
+-----------+--------------------------
+ alterlock | ShareUpdateExclusiveLock
+ pg_toast  | ShareUpdateExclusiveLock
+(2 rows)
+
 commit;
 begin; alter table alterlock set (toast.autovacuum_enabled = off);
 select * from my_locks order by 1;
diff --git a/src/test/regress/expected/update.out b/src/test/regress/expected/update.out
index c809f88f54..c164f86e43 100644
--- a/src/test/regress/expected/update.out
+++ b/src/test/regress/expected/update.out
@@ -1026,3 +1026,43 @@ update hash_parted set b = b + 8 where b = 1;
 drop table hash_parted;
 drop operator class custom_opclass using hash;
 drop function dummy_hashint4(a int4, seed int8);
+create table block_local_updates(id int, b int) with (fillfactor = 10, autovacuum_enabled = false);
+insert into block_local_updates select generate_series(1, 88), 0;
+select (ctid::text::point)[0], count(*) from block_local_updates group by 1 order by 1;
+ ctid | count 
+------+-------
+    0 |    22
+    1 |    22
+    2 |    22
+    3 |    22
+(4 rows)
+
+-- FF 10=>100 -> all blocks have ~ 90% space left
+alter table block_local_updates set (fillfactor = 100);
+-- vacuum to clear FULL bits on all pages
+vacuum (disable_page_skipping true) block_local_updates;
+-- 10% space of each page is updated => 20% full, ~80% space left.
+update block_local_updates set b = 1;
+-- all tuples still on same page, 22 each
+select (ctid::text::point)[0], count(*) from block_local_updates group by 1 order by 1;
+ ctid | count 
+------+-------
+    0 |    22
+    1 |    22
+    2 |    22
+    3 |    22
+(4 rows)
+
+-- max_local_update=0 -> no update in the table may take the easy block-local path
+alter table block_local_updates set (max_local_update=0);
+-- 80% space left, all updates would be page-local if not for max_local_update
+update block_local_updates set b = 2;
+-- all tuples moved to first page, 88 total
+select (ctid::text::point)[0], count(*) from block_local_updates group by 1 order by 1;
+ ctid | count 
+------+-------
+    0 |    88
+(1 row)
+
+-- cleanup
+drop table block_local_updates;
diff --git a/src/test/regress/sql/alter_table.sql b/src/test/regress/sql/alter_table.sql
index 58ea20ac3d..7e067f4d2a 100644
--- a/src/test/regress/sql/alter_table.sql
+++ b/src/test/regress/sql/alter_table.sql
@@ -1770,6 +1770,14 @@ begin; alter table alterlock reset (fillfactor);
 select * from my_locks order by 1;
 commit;
 
+begin; alter table alterlock set (max_local_update = 8);
+select * from my_locks order by 1;
+commit;
+
+begin; alter table alterlock reset (max_local_update);
+select * from my_locks order by 1;
+commit;
+
 begin; alter table alterlock set (toast.autovacuum_enabled = off);
 select * from my_locks order by 1;
 commit;
diff --git a/src/test/regress/sql/update.sql b/src/test/regress/sql/update.sql
index 7a7bee77b9..dcf09e9671 100644
--- a/src/test/regress/sql/update.sql
+++ b/src/test/regress/sql/update.sql
@@ -667,3 +667,26 @@ update hash_parted set b = b + 8 where b = 1;
 drop table hash_parted;
 drop operator class custom_opclass using hash;
 drop function dummy_hashint4(a int4, seed int8);
+
+create table block_local_updates(id int, b int) with (fillfactor = 10, autovacuum_enabled = false);
+insert into block_local_updates select generate_series(1, 88), 0;
+
+select (ctid::text::point)[0], count(*) from block_local_updates group by 1 order by 1;
+-- FF 10=>100 -> all blocks have ~ 90% space left
+alter table block_local_updates set (fillfactor = 100);
+-- vacuum to clear FULL bits on all pages
+vacuum (disable_page_skipping true) block_local_updates;
+-- 10% space of each page is updated => 20% full, ~80% space left.
+update block_local_updates set b = 1;
+-- all tuples still on same page, 22 each
+select (ctid::text::point)[0], count(*) from block_local_updates group by 1 order by 1;
+
+-- max_local_update=0 -> no update in the table may take the easy block-local path
+alter table block_local_updates set (max_local_update=0);
+-- 80% space left, all updates would be page-local if not for max_local_update
+update block_local_updates set b = 2;
+-- all tuples moved to first page, 88 total
+select (ctid::text::point)[0], count(*) from block_local_updates group by 1 order by 1;
+
+-- cleanup
+drop table block_local_updates;
-- 
2.40.1
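
A rough usage sketch of how the option might be applied to re-compact a bloated table, not part of the patch itself; the table name big_table, its id column, and the 64 MB threshold below are illustrative only:

-- Only the first 64 MB of big_table may receive page-local updates;
-- new versions of tuples beyond that point are placed via the free
-- space map instead (in the regression test above they all end up
-- on the first page).
alter table big_table set (max_local_update = 64);

-- Rewrite the rows so the tail of the table is emptied out.
update big_table set id = id;

-- Vacuum can then truncate the now-empty pages at the end of the table.
vacuum big_table;

-- Restore the default behaviour of page-local updates everywhere.
alter table big_table reset (max_local_update);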