From 13f3aa76bbae8818654f8a71a25e989d3204a044 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Thu, 6 Jul 2023 21:57:53 +0200 Subject: [PATCH v1] Add heap reloption local_update_limit Without this, there is no way to efficiently make a table more compact when a hole or area with few tuples was created. With local_update_limit configured, local updates are only available for the first MBs of the table, all other updates will go through the visibility map to find a destination page for the new tuple version. This is intended as a debug/maintenance option so that tables will tend to less fragmentation if a large part of the table was updated at once and left many pages nearly empty; such as when someone updated a table with fillfactor=100 using an unqualified UPDATE statement. --- doc/src/sgml/ref/create_table.sgml | 34 +++++++++++++++++++ src/backend/access/common/reloptions.c | 13 +++++++- src/backend/access/heap/heapam.c | 7 ++-- src/bin/psql/tab-complete.c | 1 + src/include/utils/rel.h | 19 +++++++++++ src/test/regress/expected/alter_table.out | 18 ++++++++++ src/test/regress/expected/update.out | 40 +++++++++++++++++++++++ src/test/regress/sql/alter_table.sql | 8 +++++ src/test/regress/sql/update.sql | 23 +++++++++++++ 9 files changed, 159 insertions(+), 4 deletions(-) diff --git a/doc/src/sgml/ref/create_table.sgml b/doc/src/sgml/ref/create_table.sgml index 10ef699fab..d49da8b161 100644 --- a/doc/src/sgml/ref/create_table.sgml +++ b/doc/src/sgml/ref/create_table.sgml @@ -1459,6 +1459,40 @@ WITH ( MODULUS numeric_literal, REM + + local_update_limit (integer) + + local_update_limit storage parameter + + + + + The local update limit for a table configures in which part of the table + tuples will prefer updates on the same page. -1 (unlimited) is the + default. 
When the limit is configured
+ + + + toast_tuple_target (integer) diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c index 469de9bb49..225ea9f019 100644 --- a/src/backend/access/common/reloptions.c +++ b/src/backend/access/common/reloptions.c @@ -382,6 +382,15 @@ static relopt_int intRelOpts[] = }, -1, 0, 1024 }, + { + { + "local_update_limit", + "Updates in the table that are not located the first local_update_limit MB of the table will always try to insert the new tuple on a different page.", + RELOPT_KIND_HEAP, + ShareUpdateExclusiveLock + }, + -1, -1, (MaxBlockNumber / (1024 * 1024 / BLCKSZ)) + }, /* list terminator */ {{NULL}} @@ -1882,7 +1891,9 @@ default_reloptions(Datum reloptions, bool validate, relopt_kind kind) {"vacuum_index_cleanup", RELOPT_TYPE_ENUM, offsetof(StdRdOptions, vacuum_index_cleanup)}, {"vacuum_truncate", RELOPT_TYPE_BOOL, - offsetof(StdRdOptions, vacuum_truncate)} + offsetof(StdRdOptions, vacuum_truncate)}, + {"local_update_limit", RELOPT_TYPE_INT, + offsetof(StdRdOptions, local_update_limit)} }; return (bytea *) build_reloptions(reloptions, validate, kind, diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 6a66214a58..d2af4afc48 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -3496,7 +3496,8 @@ l2: newtupsize = MAXALIGN(newtup->t_len); - if (need_toast || newtupsize > pagefree) + if (need_toast || newtupsize > pagefree || + !RelationUpdateTupleOnPageLocally(relation, -1, block)) { TransactionId xmax_lock_old_tuple; uint16 infomask_lock_old_tuple, @@ -3508,7 +3509,7 @@ l2: * temporarily mark it locked, while we release the page-level lock. * * To satisfy the rule that any xid potentially appearing in a buffer - * written out to disk, we unfortunately have to WAL log this + * must be written out to disk, we unfortunately have to WAL log this * temporary modification. We can reuse xl_heap_lock for this * purpose. 
If we crash/error before following through with the * actual update, xmax will be of an aborted transaction, allowing @@ -3623,7 +3624,7 @@ l2: */ for (;;) { - if (newtupsize > pagefree) + if (newtupsize > pagefree || !RelationUpdateTupleOnPageLocally(relation, -1, block)) { /* It doesn't fit, must use RelationGetBufferForTuple. */ newbuf = RelationGetBufferForTuple(relation, heaptup->t_len, diff --git a/src/bin/psql/tab-complete.c b/src/bin/psql/tab-complete.c index 779fdc90cb..48dcacd352 100644 --- a/src/bin/psql/tab-complete.c +++ b/src/bin/psql/tab-complete.c @@ -1304,6 +1304,7 @@ static const char *const table_storage_parameters[] = { "autovacuum_vacuum_scale_factor", "autovacuum_vacuum_threshold", "fillfactor", + "local_update_limit", "log_autovacuum_min_duration", "parallel_workers", "toast.autovacuum_enabled", diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h index 1426a353cd..22411fd7c8 100644 --- a/src/include/utils/rel.h +++ b/src/include/utils/rel.h @@ -342,6 +342,7 @@ typedef struct StdRdOptions int parallel_workers; /* max number of parallel workers */ StdRdOptIndexCleanup vacuum_index_cleanup; /* controls index vacuuming */ bool vacuum_truncate; /* enables vacuum to truncate a relation */ + int local_update_limit; /* after this block, always use tuple routing */ } StdRdOptions; #define HEAP_MIN_FILLFACTOR 10 @@ -377,6 +378,24 @@ typedef struct StdRdOptions #define RelationGetTargetPageFreeSpace(relation, defaultff) \ (BLCKSZ * (100 - RelationGetFillFactor(relation, defaultff)) / 100) +/* + * RelationGetLocalUpdateLimit + * Returns the size of the relation's local update section (MB). + */ +#define RelationGetLocalUpdateLimit(relation, defaultlul) \ + ((relation)->rd_options ? \ + ((StdRdOptions *) (relation)->rd_options)->local_update_limit : (defaultlul)) + +/* + * RelationUpdateTupleLocally + * Is an update on blockno allowed to put the new tuple on the current + * page, or should we instead try to find a different page? 
+ */ +#define RelationUpdateTupleOnPageLocally(relation, defaultmlu, blockno) \ + ((RelationGetLocalUpdateLimit((relation), (defaultmlu)) == -1) || \ + ((blockno) < (BlockNumber) (RelationGetLocalUpdateLimit((relation), (defaultmlu)) * \ + (1024 * 1024) / BLCKSZ))) + /* * RelationIsUsedAsCatalogTable * Returns whether the relation should be treated as a catalog table diff --git a/src/test/regress/expected/alter_table.out b/src/test/regress/expected/alter_table.out index cd814ff321..d8076f2e61 100644 --- a/src/test/regress/expected/alter_table.out +++ b/src/test/regress/expected/alter_table.out @@ -2758,6 +2758,24 @@ select * from my_locks order by 1; pg_toast | ShareUpdateExclusiveLock (2 rows) +commit; +begin; alter table alterlock set (local_update_limit = 8); +select * from my_locks order by 1; + relname | max_lockmode +-----------+-------------------------- + alterlock | ShareUpdateExclusiveLock + pg_toast | ShareUpdateExclusiveLock +(2 rows) + +commit; +begin; alter table alterlock reset (local_update_limit); +select * from my_locks order by 1; + relname | max_lockmode +-----------+-------------------------- + alterlock | ShareUpdateExclusiveLock + pg_toast | ShareUpdateExclusiveLock +(2 rows) + commit; begin; alter table alterlock set (toast.autovacuum_enabled = off); select * from my_locks order by 1; diff --git a/src/test/regress/expected/update.out b/src/test/regress/expected/update.out index c809f88f54..dd8632ed18 100644 --- a/src/test/regress/expected/update.out +++ b/src/test/regress/expected/update.out @@ -1026,3 +1026,43 @@ update hash_parted set b = b + 8 where b = 1; drop table hash_parted; drop operator class custom_opclass using hash; drop function dummy_hashint4(a int4, seed int8); +create table block_local_updates(id int, b int) with (fillfactor = 10, autovacuum_enabled = false); +insert into block_local_updates select generate_series(1, 88), 0; +select (ctid::text::point)[0], count(*) from block_local_updates group by 1 order by 1; + ctid | 
count +------+------- + 0 | 22 + 1 | 22 + 2 | 22 + 3 | 22 +(4 rows) + +-- FF 10=>100 -> all blocks have ~ 90% space left +alter table block_local_updates set (fillfactor = 100); +-- vacuum to clear FULL bits on all pages +vacuum (disable_page_skipping true) block_local_updates; +-- 10% space of each page is updated => 20% full, ~80% space left. +update block_local_updates set b = 1; +-- all tuples still on same page, 22 each +select (ctid::text::point)[0], count(*) from block_local_updates group by 1 order by 1; + ctid | count +------+------- + 0 | 22 + 1 | 22 + 2 | 22 + 3 | 22 +(4 rows) + +-- local_update_limit=0 -> all updates in the table must try to find a different page +alter table block_local_updates set (local_update_limit=0); +-- 80% space left, all updates would be page-local if not for local_update_limit +update block_local_updates set b = 2; +-- all tuples moved to first page, 77 total +select (ctid::text::point)[0], count(*) from block_local_updates group by 1 order by 1; + ctid | count +------+------- + 0 | 88 +(1 row) + +-- cleanup +drop table block_local_updates; diff --git a/src/test/regress/sql/alter_table.sql b/src/test/regress/sql/alter_table.sql index ff8c498419..eb8c8ded0c 100644 --- a/src/test/regress/sql/alter_table.sql +++ b/src/test/regress/sql/alter_table.sql @@ -1770,6 +1770,14 @@ begin; alter table alterlock reset (fillfactor); select * from my_locks order by 1; commit; +begin; alter table alterlock set (local_update_limit = 8); +select * from my_locks order by 1; +commit; + +begin; alter table alterlock reset (local_update_limit); +select * from my_locks order by 1; +commit; + begin; alter table alterlock set (toast.autovacuum_enabled = off); select * from my_locks order by 1; commit; diff --git a/src/test/regress/sql/update.sql b/src/test/regress/sql/update.sql index 7a7bee77b9..2a5b6d0d1e 100644 --- a/src/test/regress/sql/update.sql +++ b/src/test/regress/sql/update.sql @@ -667,3 +667,26 @@ update hash_parted set b = b + 8 where b = 
1; drop table hash_parted; drop operator class custom_opclass using hash; drop function dummy_hashint4(a int4, seed int8); + +create table block_local_updates(id int, b int) with (fillfactor = 10, autovacuum_enabled = false); +insert into block_local_updates select generate_series(1, 88), 0; + +select (ctid::text::point)[0], count(*) from block_local_updates group by 1 order by 1; +-- FF 10=>100 -> all blocks have ~ 90% space left +alter table block_local_updates set (fillfactor = 100); +-- vacuum to clear FULL bits on all pages +vacuum (disable_page_skipping true) block_local_updates; +-- 10% space of each page is updated => 20% full, ~80% space left. +update block_local_updates set b = 1; +-- all tuples still on same page, 22 each +select (ctid::text::point)[0], count(*) from block_local_updates group by 1 order by 1; + +-- local_update_limit=0 -> all updates in the table must try to find a different page +alter table block_local_updates set (local_update_limit=0); +-- 80% space left, all updates would be page-local if not for local_update_limit +update block_local_updates set b = 2; +-- all tuples moved to first page, 77 total +select (ctid::text::point)[0], count(*) from block_local_updates group by 1 order by 1; + +-- cleanup +drop table block_local_updates; -- 2.40.1