From a9073083671689f64ae25b95b4ded8083d870de2 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 12 Nov 2025 21:47:44 +0200 Subject: [PATCH v26 04/10] Add pg_upgrade for 64 bit multixact offsets Author: Maxim Orlov Author: Heikki Linnakangas --- src/backend/access/transam/multixact.c | 56 --- src/bin/pg_upgrade/Makefile | 3 + src/bin/pg_upgrade/meson.build | 4 + src/bin/pg_upgrade/multixact_new.c | 101 ++++++ src/bin/pg_upgrade/multixact_new.h | 23 ++ src/bin/pg_upgrade/multixact_old.c | 297 ++++++++++++++++ src/bin/pg_upgrade/multixact_old.h | 29 ++ src/bin/pg_upgrade/pg_upgrade.c | 108 +++++- src/bin/pg_upgrade/pg_upgrade.h | 5 + src/bin/pg_upgrade/slru_io.c | 242 +++++++++++++ src/bin/pg_upgrade/slru_io.h | 52 +++ .../pg_upgrade/t/007_multixact_conversion.pl | 329 ++++++++++++++++++ src/test/perl/PostgreSQL/Test/Cluster.pm | 21 +- src/tools/pgindent/typedefs.list | 3 + 14 files changed, 1204 insertions(+), 69 deletions(-) create mode 100644 src/bin/pg_upgrade/multixact_new.c create mode 100644 src/bin/pg_upgrade/multixact_new.h create mode 100644 src/bin/pg_upgrade/multixact_old.c create mode 100644 src/bin/pg_upgrade/multixact_old.h create mode 100644 src/bin/pg_upgrade/slru_io.c create mode 100644 src/bin/pg_upgrade/slru_io.h create mode 100644 src/bin/pg_upgrade/t/007_multixact_conversion.pl diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c index 34a745c07be..e0323ec1014 100644 --- a/src/backend/access/transam/multixact.c +++ b/src/backend/access/transam/multixact.c @@ -1824,48 +1824,6 @@ BootStrapMultiXact(void) SimpleLruZeroAndWritePage(MultiXactMemberCtl, 0); } -/* - * MaybeExtendOffsetSlru - * Extend the offsets SLRU area, if necessary - * - * After a binary upgrade from <= 9.2, the pg_multixact/offsets SLRU area might - * contain files that are shorter than necessary; this would occur if the old - * installation had used multixacts beyond the first page (files cannot be - * copied, because the on-disk 
representation is different). pg_upgrade would - * update pg_control to set the next offset value to be at that position, so - * that tuples marked as locked by such MultiXacts would be seen as visible - * without having to consult multixact. However, trying to create and use a - * new MultiXactId would result in an error because the page on which the new - * value would reside does not exist. This routine is in charge of creating - * such pages. - */ -static void -MaybeExtendOffsetSlru(void) -{ - int64 pageno; - LWLock *lock; - - pageno = MultiXactIdToOffsetPage(MultiXactState->nextMXact); - lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno); - - LWLockAcquire(lock, LW_EXCLUSIVE); - - if (!SimpleLruDoesPhysicalPageExist(MultiXactOffsetCtl, pageno)) - { - int slotno; - - /* - * Fortunately for us, SimpleLruWritePage is already prepared to deal - * with creating a new segment file even if the page we're writing is - * not the first in it, so this is enough. - */ - slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno); - SimpleLruWritePage(MultiXactOffsetCtl, slotno); - } - - LWLockRelease(lock); -} - /* * This must be called ONCE during postmaster or standalone-backend startup. * @@ -2058,20 +2016,6 @@ MultiXactSetNextMXact(MultiXactId nextMulti, MultiXactState->nextMXact = nextMulti; MultiXactState->nextOffset = nextMultiOffset; LWLockRelease(MultiXactGenLock); - - /* - * During a binary upgrade, make sure that the offsets SLRU is large - * enough to contain the next value that would be created. - * - * We need to do this pretty early during the first startup in binary - * upgrade mode: before StartupMultiXact() in fact, because this routine - * is called even before that by StartupXLOG(). And we can't do it - * earlier than at this point, because during that first call of this - * routine we determine the MultiXactState->nextMXact value that - * MaybeExtendOffsetSlru needs. 
-	 */
-	if (IsBinaryUpgrade)
-		MaybeExtendOffsetSlru();
 }
 
 /*
diff --git a/src/bin/pg_upgrade/Makefile b/src/bin/pg_upgrade/Makefile
index 69fcf593cae..42995d53b0b 100644
--- a/src/bin/pg_upgrade/Makefile
+++ b/src/bin/pg_upgrade/Makefile
@@ -18,11 +18,14 @@ OBJS = \
 	file.o \
 	function.o \
 	info.o \
+	multixact_new.o \
+	multixact_old.o \
 	option.o \
 	parallel.o \
 	pg_upgrade.o \
 	relfilenumber.o \
 	server.o \
+	slru_io.o \
 	tablespace.o \
 	task.o \
 	util.o \
diff --git a/src/bin/pg_upgrade/meson.build b/src/bin/pg_upgrade/meson.build
index ac992f0d14b..fff0db3b560 100644
--- a/src/bin/pg_upgrade/meson.build
+++ b/src/bin/pg_upgrade/meson.build
@@ -8,11 +8,14 @@ pg_upgrade_sources = files(
   'file.c',
   'function.c',
   'info.c',
+  'multixact_new.c',
+  'multixact_old.c',
   'option.c',
   'parallel.c',
   'pg_upgrade.c',
   'relfilenumber.c',
   'server.c',
+  'slru_io.c',
   'tablespace.c',
   'task.c',
   'util.c',
@@ -47,6 +50,7 @@ tests += {
       't/004_subscription.pl',
       't/005_char_signedness.pl',
       't/006_transfer_modes.pl',
+      't/007_multixact_conversion.pl',
     ],
     'test_kwargs': {'priority': 40}, # pg_upgrade tests are slow
   },
diff --git a/src/bin/pg_upgrade/multixact_new.c b/src/bin/pg_upgrade/multixact_new.c
new file mode 100644
index 00000000000..8284a2015fc
--- /dev/null
+++ b/src/bin/pg_upgrade/multixact_new.c
@@ -0,0 +1,119 @@
+/*
+ * multixact_new.c
+ *
+ * Functions to write multixacts in the v19 format with 64-bit
+ * MultiXactOffsets
+ *
+ * Copyright (c) 2025, PostgreSQL Global Development Group
+ * src/bin/pg_upgrade/multixact_new.c
+ */
+
+#include "postgres_fe.h"
+
+#include "access/multixact_internal.h"
+#include "multixact_new.h"
+
+/*
+ * Create a writer for the new cluster's pg_multixact SLRUs.
+ *
+ * The offsets SLRU is positioned at the page holding 'firstMulti' and the
+ * members SLRU at the page holding 'firstOffset'.  The returned state must be
+ * released with FreeMultiXactWrite(), which also flushes the last pages.
+ */
+MultiXactWriter *
+AllocMultiXactWrite(const char *pgdata, MultiXactId firstMulti,
+					MultiXactOffset firstOffset)
+{
+	MultiXactWriter *state = pg_malloc(sizeof(*state));
+	char		dir[MAXPGPATH] = {0};
+
+	snprintf(dir, sizeof(dir), "%s/pg_multixact/offsets", pgdata);
+	state->offset = AllocSlruWrite(dir, false);
+	SlruWriteSwitchPage(state->offset, MultiXactIdToOffsetPage(firstMulti));
+
+	snprintf(dir, sizeof(dir), "%s/pg_multixact/members", pgdata);
+	state->members = AllocSlruWrite(dir, true /* use long segment names */ );
+	SlruWriteSwitchPage(state->members, MXOffsetToMemberPage(firstOffset));
+
+	return state;
+}
+
+/*
+ * Write a new multixact with members.
+ *
+ * Stores 'offset' in the offsets SLRU entry of 'multi', and the 'nmembers'
+ * XIDs and status flags of 'members' at consecutive member offsets starting
+ * from 'offset'.
+ *
+ * Simplified version of the corresponding server function, hence the name.
+ */
+void
+RecordNewMultiXact(MultiXactWriter *state, MultiXactOffset offset,
+				   MultiXactId multi, int nmembers, MultiXactMember *members)
+{
+	int64		pageno;
+	int64		prev_pageno;
+	int			entryno;
+	char	   *buf;
+	MultiXactOffset *offptr;
+
+	pageno = MultiXactIdToOffsetPage(multi);
+	entryno = MultiXactIdToOffsetEntry(multi);
+
+	/* Store the offset */
+	buf = SlruWriteSwitchPage(state->offset, pageno);
+	offptr = (MultiXactOffset *) buf;
+	offptr[entryno] = offset;
+
+	/* Store the members */
+	prev_pageno = -1;
+	for (int i = 0; i < nmembers; i++, offset++)
+	{
+		TransactionId *memberptr;
+		uint32	   *flagsptr;
+		uint32		flagsval;
+		int			bshift;
+		int			flagsoff;
+		int			memberoff;
+
+		Assert(members[i].status <= MultiXactStatusUpdate);
+
+		pageno = MXOffsetToMemberPage(offset);
+		memberoff = MXOffsetToMemberOffset(offset);
+		flagsoff = MXOffsetToFlagsOffset(offset);
+		bshift = MXOffsetToFlagsBitShift(offset);
+
+		if (pageno != prev_pageno)
+		{
+			buf = SlruWriteSwitchPage(state->members, pageno);
+			prev_pageno = pageno;
+		}
+
+		memberptr = (TransactionId *) (buf + memberoff);
+
+		*memberptr = members[i].xid;
+
+		flagsptr = (uint32 *) (buf + flagsoff);
+
+		/*
+		 * Do the shifts in unsigned arithmetic, so that a mask reaching the
+		 * top bit of the flags word cannot overflow a signed int.
+		 */
+		flagsval = *flagsptr;
+		flagsval &= ~((((uint32) 1 << MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift);
+		flagsval |= ((uint32) members[i].status << bshift);
+		*flagsptr = flagsval;
+	}
+}
+
+/*
+ * Flush and close both SLRU writers and free the writer state.
+ */
+void
+FreeMultiXactWrite(MultiXactWriter *state)
+{
+	FreeSlruWrite(state->offset);
+	FreeSlruWrite(state->members);
+
+	pfree(state);
+}
diff --git a/src/bin/pg_upgrade/multixact_new.h b/src/bin/pg_upgrade/multixact_new.h
new file mode 100644
index 00000000000..f66e6af7e45
--- /dev/null
+++ b/src/bin/pg_upgrade/multixact_new.h
@@ -0,0 +1,30 @@
+/*
+ * multixact_new.h
+ *
+ * Copyright (c) 2025, PostgreSQL Global Development Group
+ * src/bin/pg_upgrade/multixact_new.h
+ */
+
+#ifndef MULTIXACT_NEW_H
+#define MULTIXACT_NEW_H
+
+#include "access/multixact.h"
+
+#include "slru_io.h"
+
+/* State for writing multixacts in the new format */
+typedef struct MultiXactWriter
+{
+	SlruSegState *offset;		/* writer for pg_multixact/offsets */
+	SlruSegState *members;		/* writer for pg_multixact/members */
+} MultiXactWriter;
+
+extern MultiXactWriter *AllocMultiXactWrite(const char *pgdata,
+											MultiXactId firstMulti,
+											MultiXactOffset firstOffset);
+extern void RecordNewMultiXact(MultiXactWriter *state, MultiXactOffset offset,
+							   MultiXactId multi, int nmembers,
+							   MultiXactMember *members);
+extern void FreeMultiXactWrite(MultiXactWriter *writer);
+
+#endif							/* MULTIXACT_NEW_H */
diff --git a/src/bin/pg_upgrade/multixact_old.c b/src/bin/pg_upgrade/multixact_old.c
new file mode 100644
index 00000000000..7bf7db4b009
--- /dev/null
+++ b/src/bin/pg_upgrade/multixact_old.c
@@ -0,0 +1,297 @@
+/*
+ * multixact_old.c
+ *
+ * Functions to read pre-v19 multixacts
+ *
+ * Copyright (c) 2025, PostgreSQL Global Development Group
+ * src/bin/pg_upgrade/multixact_old.c
+ */
+
+#include "postgres_fe.h"
+
+#include "multixact_old.h"
+#include "pg_upgrade.h"
+
+/*
+ * NOTE: below are a bunch of definitions that are copy-pasted from
+ * multixact.c from version 18.  The only difference is that we use the
+ * OldMultiXactOffset type equal to uint32 instead of MultiXactOffset which
+ * became uint64.
+ */
+
+/* We need four bytes per offset */
+#define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(OldMultiXactOffset))
+
+static inline int64
+MultiXactIdToOffsetPage(MultiXactId multi)
+{
+	return multi / MULTIXACT_OFFSETS_PER_PAGE;
+}
+
+static inline int
+MultiXactIdToOffsetEntry(MultiXactId multi)
+{
+	return multi % MULTIXACT_OFFSETS_PER_PAGE;
+}
+
+/*
+ * The situation for members is a bit more complex: we store one byte of
+ * additional flag bits for each TransactionId.
To do this without getting
+ * into alignment issues, we store four bytes of flags, and then the
+ * corresponding 4 Xids.  Each such 5-word (20-byte) set we call a "group", and
+ * are stored as a whole in pages.  Thus, with 8kB BLCKSZ, we keep 409 groups
+ * per page.  This wastes 12 bytes per page, but that's OK -- simplicity (and
+ * performance) trumps space efficiency here.
+ *
+ * Note that the "offset" macros work with byte offset, not array indexes, so
+ * arithmetic must be done using "char *" pointers.
+ */
+/* We need eight bits per xact, so one xact fits in a byte */
+#define MXACT_MEMBER_BITS_PER_XACT			8
+#define MXACT_MEMBER_FLAGS_PER_BYTE			1
+#define MXACT_MEMBER_XACT_BITMASK	((1 << MXACT_MEMBER_BITS_PER_XACT) - 1)
+
+/* how many full bytes of flags are there in a group? */
+#define MULTIXACT_FLAGBYTES_PER_GROUP		4
+#define MULTIXACT_MEMBERS_PER_MEMBERGROUP	\
+	(MULTIXACT_FLAGBYTES_PER_GROUP * MXACT_MEMBER_FLAGS_PER_BYTE)
+/* size in bytes of a complete group */
+#define MULTIXACT_MEMBERGROUP_SIZE \
+	(sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP)
+#define MULTIXACT_MEMBERGROUPS_PER_PAGE (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE)
+#define MULTIXACT_MEMBERS_PER_PAGE	\
+	(MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP)
+
+/* page in which a member is to be found */
+static inline int64
+MXOffsetToMemberPage(OldMultiXactOffset offset)
+{
+	return offset / MULTIXACT_MEMBERS_PER_PAGE;
+}
+
+/* Location (byte offset within page) of flag word for a given member */
+static inline int
+MXOffsetToFlagsOffset(OldMultiXactOffset offset)
+{
+	OldMultiXactOffset group = offset / MULTIXACT_MEMBERS_PER_MEMBERGROUP;
+	int			grouponpg = group % MULTIXACT_MEMBERGROUPS_PER_PAGE;
+	int			byteoff = grouponpg * MULTIXACT_MEMBERGROUP_SIZE;
+
+	return byteoff;
+}
+
+/* Location (byte offset within page) of TransactionId of given member */
+static inline int
+MXOffsetToMemberOffset(OldMultiXactOffset offset)
+{
+	int			member_in_group = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP;
+
+	return MXOffsetToFlagsOffset(offset) +
+		MULTIXACT_FLAGBYTES_PER_GROUP +
+		member_in_group * sizeof(TransactionId);
+}
+
+static inline int
+MXOffsetToFlagsBitShift(OldMultiXactOffset offset)
+{
+	int			member_in_group = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP;
+	int			bshift = member_in_group * MXACT_MEMBER_BITS_PER_XACT;
+
+	return bshift;
+}
+
+/*
+ * Construct reader of old multixacts.
+ *
+ * Returns the malloced state used by all the other calls in this module.
+ */
+OldMultiXactReader *
+AllocOldMultiXactRead(char *pgdata, MultiXactId nextMulti,
+					  OldMultiXactOffset nextOffset)
+{
+	OldMultiXactReader *state = pg_malloc(sizeof(*state));
+	char		dir[MAXPGPATH] = {0};
+
+	state->nextMXact = nextMulti;
+	state->nextOffset = nextOffset;
+
+	snprintf(dir, sizeof(dir), "%s/pg_multixact/offsets", pgdata);
+	state->offset = AllocSlruRead(dir, false);
+
+	snprintf(dir, sizeof(dir), "%s/pg_multixact/members", pgdata);
+	state->members = AllocSlruRead(dir, false);
+
+	return state;
+}
+
+/*
+ * This is a simplified version of the GetMultiXactIdMembers() server function.
+ *
+ * - Only return the updating member, if any.  Upgrade only cares about the
+ *   updaters.  If there is no updating member, return the first locking-only
+ *   member.  We don't have any way to represent "no members", but we also don't
+ *   need to preserve all the locking members.
+ *
+ * - We don't need to worry about locking and some corner cases because there's
+ *   no concurrent activity.
+ */
+void
+GetOldMultiXactIdSingleMember(OldMultiXactReader *state, MultiXactId multi,
+							  TransactionId *result, MultiXactStatus *status)
+{
+	MultiXactId nextMXact,
+				tmpMXact;
+	OldMultiXactOffset nextOffset;
+	int64		pageno,
+				prev_pageno;
+	int			entryno,
+				length;
+	char	   *buf;
+	OldMultiXactOffset *offptr,
+				offset;
+	TransactionId result_xid = InvalidTransactionId;
+	bool		result_isupdate = false;
+
+	nextMXact = state->nextMXact;
+	nextOffset = state->nextOffset;
+
+	/*
+	 * See GetMultiXactIdMembers in multixact.c
+	 *
+	 * Find out the offset at which we need to start reading MultiXactMembers
+	 * and the number of members in the multixact.  We determine the latter as
+	 * the difference between this multixact's starting offset and the next
+	 * one's.  However, there are some corner cases to worry about:
+	 *
+	 * 1. This multixact may be the latest one created, in which case there is
+	 * no next one to look at.  In this case the nextOffset value we just
+	 * saved is the correct endpoint.
+	 *
+	 * 2. The next multixact may still be in process of being filled in...
+	 * This cannot happen during upgrade.
+	 *
+	 * 3. Because GetNewMultiXactId increments offset zero to offset one to
+	 * handle case #2, there is an ambiguity near the point of offset
+	 * wraparound.  If we see next multixact's offset is one, is that our
+	 * multixact's actual endpoint, or did it end at zero with a subsequent
+	 * increment?  We handle this using the knowledge that if the zero'th
+	 * member slot wasn't filled, it'll contain zero, and zero isn't a valid
+	 * transaction ID so it can't be a multixact member.  Therefore, if we
+	 * read a zero from the members array, just ignore it.
+	 */
+
+	pageno = MultiXactIdToOffsetPage(multi);
+	entryno = MultiXactIdToOffsetEntry(multi);
+
+	buf = SlruReadSwitchPage(state->offset, pageno);
+	offptr = (OldMultiXactOffset *) buf;
+	offptr += entryno;
+	offset = *offptr;
+
+	Assert(offset != 0);
+
+	/*
+	 * Use the same increment rule as GetNewMultiXactId(), that is, don't
+	 * handle wraparound explicitly until needed.
+	 */
+	tmpMXact = multi + 1;
+
+	if (nextMXact == tmpMXact)
+	{
+		/* Corner case 1: there is no next multixact */
+		length = nextOffset - offset;
+	}
+	else
+	{
+		OldMultiXactOffset nextMXOffset;
+
+		/* handle wraparound if needed */
+		if (tmpMXact < FirstMultiXactId)
+			tmpMXact = FirstMultiXactId;
+
+		prev_pageno = pageno;
+
+		pageno = MultiXactIdToOffsetPage(tmpMXact);
+		entryno = MultiXactIdToOffsetEntry(tmpMXact);
+
+		if (pageno != prev_pageno)
+			buf = SlruReadSwitchPage(state->offset, pageno);
+
+		offptr = (OldMultiXactOffset *) buf;
+		offptr += entryno;
+		nextMXOffset = *offptr;
+
+		/*
+		 * Corner case 2: next multixact is still being filled in, this must
+		 * not happen during upgrade.
+		 */
+		Assert(nextMXOffset != 0);
+
+		length = nextMXOffset - offset;
+	}
+
+	prev_pageno = -1;
+	for (int i = 0; i < length; i++, offset++)
+	{
+		TransactionId *xactptr;
+		uint32	   *flagsptr;
+		int			flagsoff;
+		int			bshift;
+		int			memberoff;
+		MultiXactStatus st;
+
+		pageno = MXOffsetToMemberPage(offset);
+		memberoff = MXOffsetToMemberOffset(offset);
+
+		if (pageno != prev_pageno)
+		{
+			buf = SlruReadSwitchPage(state->members, pageno);
+			prev_pageno = pageno;
+		}
+
+		xactptr = (TransactionId *) (buf + memberoff);
+		if (!TransactionIdIsValid(*xactptr))
+		{
+			/* Corner case 3: we must be looking at unused slot zero */
+			Assert(offset == 0);
+			continue;
+		}
+
+		flagsoff = MXOffsetToFlagsOffset(offset);
+		bshift = MXOffsetToFlagsBitShift(offset);
+		flagsptr = (uint32 *) (buf + flagsoff);
+
+		st = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK;
+
+		/* Verify that there is a single update Xid among the given members. */
+		if (ISUPDATE_from_mxstatus(st))
+		{
+			if (result_isupdate)
+				pg_fatal("multixact %u has more than one updating member",
+						 multi);
+			result_xid = *xactptr;
+			result_isupdate = true;
+		}
+		else if (!TransactionIdIsValid(result_xid))
+			result_xid = *xactptr;
+	}
+
+	/* A multixid with zero members should not happen */
+	Assert(TransactionIdIsValid(result_xid));
+
+	*result = result_xid;
+	*status = result_isupdate ? MultiXactStatusUpdate :
+		MultiXactStatusForKeyShare;
+}
+
+/*
+ * Frees the malloced reader.
+ */
+void
+FreeOldMultiXactReader(OldMultiXactReader *state)
+{
+	FreeSlruRead(state->offset);
+	FreeSlruRead(state->members);
+
+	pfree(state);
+}
diff --git a/src/bin/pg_upgrade/multixact_old.h b/src/bin/pg_upgrade/multixact_old.h
new file mode 100644
index 00000000000..8eb5af2ccaf
--- /dev/null
+++ b/src/bin/pg_upgrade/multixact_old.h
@@ -0,0 +1,35 @@
+/*
+ * multixact_old.h
+ *
+ * Copyright (c) 2025, PostgreSQL Global Development Group
+ * src/bin/pg_upgrade/multixact_old.h
+ */
+
+#ifndef MULTIXACT_OLD_H
+#define MULTIXACT_OLD_H
+
+#include "access/multixact.h"
+#include "slru_io.h"
+
+/* pre-v19 multixact member offsets are 32 bits wide */
+typedef uint32 OldMultiXactOffset;
+
+typedef struct OldMultiXactReader
+{
+	MultiXactId nextMXact;		/* next multixact ID of the old cluster */
+	OldMultiXactOffset nextOffset;	/* next member offset of the old cluster */
+
+	SlruSegState *offset;		/* reader for pg_multixact/offsets */
+	SlruSegState *members;		/* reader for pg_multixact/members */
+} OldMultiXactReader;
+
+extern OldMultiXactReader *AllocOldMultiXactRead(char *pgdata,
+												 MultiXactId nextMulti,
+												 OldMultiXactOffset nextOffset);
+extern void GetOldMultiXactIdSingleMember(OldMultiXactReader *state,
+										  MultiXactId multi,
+										  TransactionId *result,
+										  MultiXactStatus *status);
+extern void FreeOldMultiXactReader(OldMultiXactReader *reader);
+
+#endif							/* MULTIXACT_OLD_H */
diff --git a/src/bin/pg_upgrade/pg_upgrade.c b/src/bin/pg_upgrade/pg_upgrade.c
index 490e98fa26f..0fdd05c127c 100644
--- a/src/bin/pg_upgrade/pg_upgrade.c
+++ b/src/bin/pg_upgrade/pg_upgrade.c
@@ -48,6 +48,8 @@
 #include "common/logging.h"
 #include "common/restricted_token.h"
 #include "fe_utils/string_utils.h"
+#include "multixact_new.h"
+#include "multixact_old.h"
 #include "pg_upgrade.h"
 
 /*
@@ -769,6 +771,81 @@ copy_subdir_files(const char *old_subdir, const char *new_subdir)
 	check_ok();
 }
 
+/*
+ * Convert pg_multixact/offsets and /members to new format with 64-bit offsets.
+ */
+static void
+convert_multixacts(MultiXactId *new_nxtmulti, MultiXactOffset *new_nxtmxoff)
+{
+	MultiXactId oldest_multi,
+				next_multi;
+	OldMultiXactReader *old_reader;
+	MultiXactWriter *new_writer;
+	MultiXactOffset next_offset;
+
+	/*
+	 * The range of valid multi XIDs is unchanged by the conversion (they are
+	 * referenced from the heap tables), but the members SLRU is rewritten to
+	 * start from offset 1.
+	 */
+	oldest_multi = old_cluster.controldata.chkpnt_oldstMulti;
+	next_multi = old_cluster.controldata.chkpnt_nxtmulti;
+	next_offset = 1;
+
+	old_reader = AllocOldMultiXactRead(old_cluster.pgdata,
+									   old_cluster.controldata.chkpnt_nxtmulti,
+									   old_cluster.controldata.chkpnt_nxtmxoff);
+	new_writer = AllocMultiXactWrite(new_cluster.pgdata,
+									 oldest_multi, next_offset);
+
+	/* handle wraparound */
+	if (next_multi < FirstMultiXactId)
+		next_multi = FirstMultiXactId;
+
+	/*
+	 * Read multixids from old files one by one, and write them back in the
+	 * new format.
+	 */
+	for (MultiXactId multi = oldest_multi; multi != next_multi;)
+	{
+		TransactionId xid;
+		MultiXactStatus status;
+		MultiXactMember member;
+
+		/*
+		 * Read the old multixid.  The locking-only XIDs that may be part of
+		 * multi-xids don't matter after upgrade, as there can be no
+		 * transactions running across upgrade.  So as a little optimization,
+		 * we only read one member from each multixid: the one updating one,
+		 * or if there was no update, arbitrarily the first locking xid.
+		 */
+		GetOldMultiXactIdSingleMember(old_reader, multi, &xid, &status);
+
+		/* Write it out in new format */
+		member.xid = xid;
+		member.status = status;
+		RecordNewMultiXact(new_writer, next_offset, multi, 1, &member);
+
+		next_offset += 1;
+		multi++;
+		/* handle wraparound */
+		if (multi < FirstMultiXactId)
+			multi = FirstMultiXactId;
+	}
+
+	/*
+	 * Update the nextMXact/Offset values in the control file to match what we
+	 * wrote.  Only nextOffset differs, unless nextMXact was wrapped above.
+	 */
+	Assert(next_multi == Max(old_cluster.controldata.chkpnt_nxtmulti, FirstMultiXactId));
+	*new_nxtmulti = next_multi;
+	*new_nxtmxoff = next_offset;
+
+	/* Release resources */
+	FreeMultiXactWrite(new_writer);
+	FreeOldMultiXactReader(old_reader);
+}
+
 static void
 copy_xact_xlog_xid(void)
 {
@@ -816,8 +893,29 @@ copy_xact_xlog_xid(void)
 	if (old_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER &&
 		new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER)
 	{
-		copy_subdir_files("pg_multixact/offsets", "pg_multixact/offsets");
-		copy_subdir_files("pg_multixact/members", "pg_multixact/members");
+		MultiXactId new_nxtmulti = old_cluster.controldata.chkpnt_nxtmulti;
+		MultiXactOffset new_nxtmxoff = old_cluster.controldata.chkpnt_nxtmxoff;
+
+		/*
+		 * If the old server is before the
+		 * MULTIXACTOFFSET_FORMATCHANGE_CAT_VER it must have 32-bit multixid
+		 * offsets, thus it should be converted.
+		 */
+		if (old_cluster.controldata.cat_ver < MULTIXACTOFFSET_FORMATCHANGE_CAT_VER &&
+			new_cluster.controldata.cat_ver >= MULTIXACTOFFSET_FORMATCHANGE_CAT_VER)
+		{
+			remove_new_subdir("pg_multixact/members", false);
+			remove_new_subdir("pg_multixact/offsets", false);
+
+			prep_status("Converting pg_multixact/offsets to 64-bit");
+			convert_multixacts(&new_nxtmulti, &new_nxtmxoff);
+			check_ok();
+		}
+		else
+		{
+			copy_subdir_files("pg_multixact/offsets", "pg_multixact/offsets");
+			copy_subdir_files("pg_multixact/members", "pg_multixact/members");
+		}
 
 		prep_status("Setting next multixact ID and offset for new cluster");
 
@@ -826,10 +924,8 @@ copy_xact_xlog_xid(void)
 		 * counters here and the oldest multi present on system.
*/
 		exec_prog(UTILITY_LOG_FILE, NULL, true, true,
-				  "\"%s/pg_resetwal\" -O %u -m %u,%u \"%s\"",
-				  new_cluster.bindir,
-				  old_cluster.controldata.chkpnt_nxtmxoff,
-				  old_cluster.controldata.chkpnt_nxtmulti,
+				  "\"%s/pg_resetwal\" -O %" PRIu64 " -m %u,%u \"%s\"",
+				  new_cluster.bindir, new_nxtmxoff, new_nxtmulti,
 				  old_cluster.controldata.chkpnt_oldstMulti,
 				  new_cluster.pgdata);
 		check_ok();
diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h
index e86336f4be9..127b2cb00fa 100644
--- a/src/bin/pg_upgrade/pg_upgrade.h
+++ b/src/bin/pg_upgrade/pg_upgrade.h
@@ -114,6 +114,11 @@ extern char *output_files[];
  */
 #define MULTIXACT_FORMATCHANGE_CAT_VER 201301231
 
+/*
+ * Switch from 32-bit to 64-bit for multixid offsets.
+ */
+#define MULTIXACTOFFSET_FORMATCHANGE_CAT_VER 999999999
+
 /*
  * large object chunk size added to pg_controldata,
  * commit 5f93c37805e7485488480916b4585e098d3cc883
diff --git a/src/bin/pg_upgrade/slru_io.c b/src/bin/pg_upgrade/slru_io.c
new file mode 100644
index 00000000000..010094184be
--- /dev/null
+++ b/src/bin/pg_upgrade/slru_io.c
@@ -0,0 +1,266 @@
+/*
+ * slru_io.c
+ *
+ * Routines for reading and writing SLRU files during upgrade.
+ *
+ * Copyright (c) 2025, PostgreSQL Global Development Group
+ * src/bin/pg_upgrade/slru_io.c
+ */
+
+#include "postgres_fe.h"
+
+#include <fcntl.h>
+
+#include "common/fe_memutils.h"
+#include "common/file_perm.h"
+#include "common/file_utils.h"
+#include "port/pg_iovec.h"
+#include "pg_upgrade.h"
+#include "slru_io.h"
+
+static SlruSegState *AllocSlruSegState(const char *dir);
+static char *SlruFileName(SlruSegState *state, int64 segno);
+static void SlruFlush(SlruSegState *state);
+
+/*
+ * Allocate the state shared by readers and writers, with no segment open.
+ * The caller sets the 'writing' and 'long_segment_names' fields.
+ */
+static SlruSegState *
+AllocSlruSegState(const char *dir)
+{
+	SlruSegState *state = pg_malloc(sizeof(*state));
+
+	state->dir = pstrdup(dir);
+	state->fn = NULL;
+	state->fd = -1;
+	state->segno = -1;
+	state->pageno = 0;
+
+	return state;
+}
+
+/* similar to the backend function with the same name */
+static char *
+SlruFileName(SlruSegState *state, int64 segno)
+{
+	if (state->long_segment_names)
+	{
+		Assert(segno >= 0 && segno <= INT64CONST(0xFFFFFFFFFFFFFFF));
+		return psprintf("%s/%015" PRIX64, state->dir, segno);
+	}
+	else
+	{
+		Assert(segno >= 0 && segno <= INT64CONST(0xFFFFFF));
+		return psprintf("%s/%04X", state->dir, (unsigned int) segno);
+	}
+}
+
+/*
+ * Create slru reader for dir.
+ *
+ * Returns the malloced memory used by all the other read calls in this module.
+ */
+SlruSegState *
+AllocSlruRead(const char *dir, bool long_segment_names)
+{
+	SlruSegState *state = AllocSlruSegState(dir);
+
+	state->writing = false;
+	state->long_segment_names = long_segment_names;
+
+	return state;
+}
+
+/*
+ * Open given page for reading.
+ *
+ * Reading can be done in random order.
+ */
+char *
+SlruReadSwitchPageSlow(SlruSegState *state, uint64 pageno)
+{
+	int64		segno;
+
+	Assert(!state->writing);	/* read only mode */
+
+	if (state->segno != -1 && pageno == state->pageno)
+		return state->buf.data;
+
+	segno = pageno / SLRU_PAGES_PER_SEGMENT;
+	if (segno != state->segno)
+	{
+		if (state->segno != -1)
+		{
+			close(state->fd);
+			state->fd = -1;
+
+			pg_free(state->fn);
+			state->fn = NULL;
+
+			state->segno = -1;
+		}
+
+		/* Open new segment */
+		state->fn = SlruFileName(state, segno);
+		if ((state->fd = open(state->fn, O_RDONLY | PG_BINARY, 0)) < 0)
+			pg_fatal("could not open file \"%s\": %m", state->fn);
+	}
+
+	state->segno = segno;
+
+	{
+		struct iovec iovec = {
+			.iov_base = &state->buf,
+			.iov_len = BLCKSZ,
+		};
+		off_t		offset = (pageno % SLRU_PAGES_PER_SEGMENT) * BLCKSZ;
+		ssize_t		nread;
+
+		nread = pg_preadv(state->fd, &iovec, 1, offset);
+		if (nread < 0)
+			pg_fatal("could not read file \"%s\": %m", state->fn);
+
+		/*
+		 * A short read would leave stale data from the previous page in the
+		 * buffer, so treat it as an error too.
+		 */
+		if (nread != BLCKSZ)
+			pg_fatal("could not read file \"%s\": read %zd of %d",
+					 state->fn, nread, BLCKSZ);
+
+		state->pageno = pageno;
+	}
+
+	return state->buf.data;
+}
+
+/*
+ * Closes the open segment, if any, and frees the malloced reader.
+ */
+void
+FreeSlruRead(SlruSegState *state)
+{
+	Assert(!state->writing);	/* read only mode */
+
+	if (state->fd != -1)
+		close(state->fd);
+	pg_free(state->fn);
+	pg_free(state->dir);
+	pg_free(state);
+}
+
+/*
+ * Create slru writer for dir.
+ *
+ * Returns the malloced memory used by all the other write calls in this module.
+ */
+SlruSegState *
+AllocSlruWrite(const char *dir, bool long_segment_names)
+{
+	SlruSegState *state = AllocSlruSegState(dir);
+
+	state->writing = true;
+	state->long_segment_names = long_segment_names;
+
+	return state;
+}
+
+/*
+ * Open the given page for writing.
+ *
+ * NOTE: This uses O_EXCL when stepping to a new segment, so this assumes that
+ * each segment is written in full before moving on to next one.  This
+ * limitation would be easy to lift if needed, but it fits the usage pattern of
+ * current callers.
+ */
+char *
+SlruWriteSwitchPageSlow(SlruSegState *state, uint64 pageno)
+{
+	int64		segno;
+	off_t		offset;
+
+	Assert(state->writing);
+
+	if (state->segno != -1 && pageno == state->pageno)
+		return state->buf.data;
+
+	segno = pageno / SLRU_PAGES_PER_SEGMENT;
+	offset = (pageno % SLRU_PAGES_PER_SEGMENT) * BLCKSZ;
+
+	SlruFlush(state);
+	memset(state->buf.data, 0, BLCKSZ);
+
+	if (segno != state->segno)
+	{
+		if (state->segno != -1)
+		{
+			close(state->fd);
+			state->fd = -1;
+
+			pg_free(state->fn);
+			state->fn = NULL;
+
+			state->segno = -1;
+		}
+
+		/* Create the segment */
+		state->fn = SlruFileName(state, segno);
+		if ((state->fd = open(state->fn, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
+							  pg_file_create_mode)) < 0)
+		{
+			pg_fatal("could not create file \"%s\": %m", state->fn);
+		}
+
+		state->segno = segno;
+
+		if (offset > 0)
+		{
+			if (pg_pwrite_zeros(state->fd, offset, 0) < 0)
+				pg_fatal("could not write file \"%s\": %m", state->fn);
+		}
+	}
+
+	state->pageno = pageno;
+
+	return state->buf.data;
+}
+
+/*
+ * Write the buffered page to its position in the currently open segment.
+ * Does nothing if no segment has been opened yet.
+ */
+static void
+SlruFlush(SlruSegState *state)
+{
+	struct iovec iovec = {
+		.iov_base = &state->buf,
+		.iov_len = BLCKSZ,
+	};
+	off_t		offset;
+
+	if (state->segno == -1)
+		return;
+
+	offset = (state->pageno % SLRU_PAGES_PER_SEGMENT) * BLCKSZ;
+
+	if (pg_pwritev_with_retry(state->fd, &iovec, 1, offset) < 0)
+		pg_fatal("could not write file \"%s\": %m", state->fn);
+}
+
+/*
+ * Flushes the last page, closes the segment and frees the malloced writer.
+ */
+void
+FreeSlruWrite(SlruSegState *state)
+{
+	Assert(state->writing);
+
+	SlruFlush(state);
+
+	if (state->fd != -1)
+		close(state->fd);
+	pg_free(state->fn);
+	pg_free(state->dir);
+	pg_free(state);
+}
diff --git a/src/bin/pg_upgrade/slru_io.h b/src/bin/pg_upgrade/slru_io.h
new file mode 100644
index 00000000000..5c80a679b4d
--- /dev/null
+++ b/src/bin/pg_upgrade/slru_io.h
@@ -0,0 +1,52 @@
+/*
+ * slru_io.h
+ *
+ * Copyright (c) 2025, PostgreSQL Global Development Group
+ * src/bin/pg_upgrade/slru_io.h
+ */
+
+#ifndef SLRU_IO_H
+#define SLRU_IO_H
+
+/*
+ * State for reading or writing an SLRU, with a one page buffer.
+ */
+typedef struct SlruSegState
+{
+	bool		writing;		/* true for a writer, false for a reader */
+	bool		long_segment_names; /* 15 hex digit segment file names
+									 * instead of 4 */
+
+	char	   *dir;			/* directory holding the segment files */
+	char	   *fn;				/* path of the open segment, or NULL */
+	int			fd;				/* descriptor of the open segment, or -1 */
+	int64		segno;			/* number of the open segment, or -1 */
+	uint64		pageno;			/* page held in buf; only meaningful while
+								 * segno != -1 */
+
+	PGAlignedBlock buf;			/* the one page buffer */
+} SlruSegState;
+
+extern SlruSegState *AllocSlruRead(const char *dir, bool long_segment_names);
+extern char *SlruReadSwitchPageSlow(SlruSegState *state, uint64 pageno);
+extern void FreeSlruRead(SlruSegState *state);
+
+static inline char *
+SlruReadSwitchPage(SlruSegState *state, uint64 pageno)
+{
+	if (state->segno != -1 && pageno == state->pageno)
+		return state->buf.data;
+	return SlruReadSwitchPageSlow(state, pageno);
+}
+
+extern SlruSegState *AllocSlruWrite(const char *dir, bool long_segment_names);
+extern char *SlruWriteSwitchPageSlow(SlruSegState *state, uint64 pageno);
+extern void FreeSlruWrite(SlruSegState *state);
+
+static inline char *
+SlruWriteSwitchPage(SlruSegState *state, uint64 pageno)
+{
+	if (state->segno != -1 && pageno == state->pageno)
+		return state->buf.data;
+	return SlruWriteSwitchPageSlow(state, pageno);
+}
+
+#endif							/* SLRU_IO_H */
diff --git a/src/bin/pg_upgrade/t/007_multixact_conversion.pl b/src/bin/pg_upgrade/t/007_multixact_conversion.pl
new file mode 100644
index 00000000000..fe8da9aded2
--- /dev/null
+++ b/src/bin/pg_upgrade/t/007_multixact_conversion.pl
@@ -0,0 +1,329 @@
+# Copyright (c) 2025, PostgreSQL Global Development Group
+
+# Version 19 expanded MultiXactOffset from 32 to 64 bits. Upgrading
+# across that requires rewriting the SLRU files to the new format.
+# This file contains tests for the conversion.
+#
+# To run, set 'oldinstall' ENV variable to point to a pre-v19
+# installation. If it's not set, or if it points to a v19 or above
+# installation, this still performs a very basic test, upgrading a
+# cluster with some multixacts. It's not very interesting, however,
+# because there's no conversion involved in that case.
+
+use strict;
+use warnings FATAL => 'all';
+
+use Math::BigInt;
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+# Temp dir for dumps.
+my $tempdir = PostgreSQL::Test::Utils::tempdir;
+
+# A workload that consumes multixids.  The purpose of this is to
+# generate some multixids in the old cluster, so that we can test
+# upgrading them.  The workload is a mix of KEY SHARE locking queries
+# and UPDATEs, and commits and aborts.  It consumes around 3000
+# multixids with 30000 members.  That's enough to span more than one
+# multixids 'offsets' page, and more than one 'members' segment.
+#
+# The workload leaves behind a table called 'mxofftest' containing a
+# small number of rows referencing some of the generated multixids.
+#
+# Because this function is used to generate test data on the old
+# installation, it needs to work with older PostgreSQL server
+# versions.
+#
+# The first argument is the cluster to connect to, the second argument
+# is a cluster using the new version.  We need the 'psql' binary from
+# the new version, the new cluster is otherwise unused.  (We need to
+# use the new 'psql' because some of the more advanced background psql
+# perl module features depend on a fairly recent psql version.)
+sub mxact_workload
+{
+	my $node = shift;       # Cluster to connect to
+	my $binnode = shift;    # Use the psql binary from this cluster
+
+	my $connstr = $node->connstr('postgres');
+
+	$node->start;
+	$node->safe_psql('postgres', qq[
+		CREATE TABLE mxofftest (id INT PRIMARY KEY, n_updated INT)
+		  WITH (AUTOVACUUM_ENABLED=FALSE);
+		INSERT INTO mxofftest SELECT G, 0 FROM GENERATE_SERIES(1, 50) G;
+	]);
+
+	my $nclients = 20;
+	my $update_every = 13;
+	my $abort_every = 11;
+	my @connections = ();
+
+	# Open $nclients connections to the database.  Start a transaction
+	# in each connection.
+	for (1 .. $nclients)
+	{
+		# Use the psql binary from the new installation.  The
+		# BackgroundPsql functionality doesn't work with older psql
+		# versions.
+		my $conn = $binnode->background_psql('',
+			connstr => $connstr);
+		$conn->query_safe("SET enable_seqscan=off");
+		$conn->query_safe("BEGIN");
+
+		push(@connections, $conn);
+	}
+
+	# Run queries, cycling through the connections in a round-robin
+	# fashion.  We keep a transaction open in each connection at all
+	# times, and lock/update the rows.  With $nclients connections,
+	# each SELECT FOR KEY SHARE query generates a new multixid,
+	# containing the XIDs of all the transactions running at the
+	# time.
+	for (my $i = 0; $i < 3000; $i++)
+	{
+		my $conn = $connections[ $i % $nclients ];
+
+		my $sql;
+		if ($i % $abort_every == 0)
+		{
+			$sql = "ABORT; ";
+		}
+		else
+		{
+			$sql = "COMMIT; ";
+		}
+		$sql .= "BEGIN; ";
+
+		if ($i % $update_every == 0)
+		{
+			$sql .= qq[
+			  UPDATE mxofftest SET n_updated = n_updated + 1 WHERE id = ${i} % 50;
+			];
+		}
+		else
+		{
+			my $threshold = int($i / 3000 * 50);
+			$sql .= qq[
+			  select count(*) from (
+				SELECT * FROM mxofftest WHERE id >= $threshold FOR KEY SHARE
+			  ) as x
+			];
+		}
+		$conn->query_safe($sql);
+	}
+
+	for my $conn (@connections)
+	{
+		$conn->quit();
+	}
+
+	$node->stop;
+	return;
+}
+
+# Read NextMultiOffset from the control file
+#
+# Note: This is used on both the old and the new installation, so the
+# command arguments and the output parsing used here must work with
+# all PostgreSQL versions supported by the test.
+sub read_next_mxoff
+{
+	my $node = shift;
+
+	my $pg_controldata_path = $node->installed_command('pg_controldata');
+	my ($stdout, $stderr) =
+	  run_command([ $pg_controldata_path, $node->data_dir ]);
+	$stdout =~ /^Latest checkpoint's NextMultiOffset:\s*(.*)$/m
+	  or die "could not read NextMultiOffset from pg_controldata";
+	return $1;
+}
+
+# Reset a cluster's oldest multixact-offset to given offset.
+#
+# Note: This is only used on the old installation, and only when it
+# is pre-v19: pg_resetwal's output is parsed and a dummy 'members'
+# segment is written by hand, using the pre-v19 SLRU file format.
+sub reset_mxoff_pre_v19
+{
+	my $node = shift;
+	my $offset = shift;
+
+	my $pg_resetwal_path = $node->installed_command('pg_resetwal');
+	# Get block size
+	my ($out, $err) =
+	  run_command([ $pg_resetwal_path, '--dry-run', $node->data_dir ]);
+	$out =~ /^Database block size: *(\d+)$/m or die "could not read block size";
+	my $blcksz = $1;
+	# SLRU_PAGES_PER_SEGMENT is always 32 on pre-19 version
+	my $slru_pages_per_segment = 32;
+
+	# Verify that no multixids are currently in use. Resetting would
+	# destroy them. (A freshly initialized cluster has no multixids.)
+	$out =~ /^Latest checkpoint's NextMultiXactId: *(\d+)$/m or die "could not read NextMultiXactId";
+	my $next_mxid = $1;
+	$out =~ /^Latest checkpoint's oldestMultiXid: *(\d+)$/m or die "could not read oldestMultiXid";
+	my $oldest_mxid = $1;
+	die "cluster has some multixids in use" unless $next_mxid == $oldest_mxid;
+
+	# Reset to new offset using pg_resetwal
+	my @cmd = (
+		$pg_resetwal_path,
+		'--pgdata' => $node->data_dir,
+		'--multixact-offset' => $offset);
+	command_ok(\@cmd, 'set oldest multixact-offset');
+
+	# pg_resetwal just updates the control file. The cluster will
+	# refuse to start up, if the SLRU segment corresponding to the
+	# offset does not exist. Create a dummy segment that covers the
+	# given offset, filled with zeros. But first remove any old
+	# segments.
+	unlink glob $node->data_dir . "/pg_multixact/members/*";
+	# Number of member offsets covered by one segment file.
+	my $mult = $slru_pages_per_segment * int($blcksz / 20) * 4;
+	my $segname = sprintf "%04X", int($offset / $mult);
+
+	my $path = $node->data_dir . "/pg_multixact/members/" . $segname;
+
+	my $null_block = "\x00" x $blcksz;
+	open(my $dh, '>', $path)
+	  || die "could not open $path for writing $!";
+	for (1 .. $slru_pages_per_segment)
+	{
+		print $dh $null_block;
+	}
+	close($dh) || die "could not close $path: $!";
+}
+
+# Dump contents of the 'mxofftest' table, created by mxact_workload
+sub get_dump_for_comparison
+{
+	my ($node, $file_prefix) = @_;
+
+	my $contents = $node->safe_psql('postgres',
+		"SELECT ctid, xmin, xmax, * FROM mxofftest");
+
+	my $dumpfile = $tempdir . '/' . $file_prefix . '.sql';
+	open(my $dh, '>', $dumpfile)
+	  || die "could not open $dumpfile for writing $!";
+	print $dh $contents;
+	close($dh);
+
+	return $dumpfile;
+}
+
+# Main test workhorse routine.
+# Run pg_upgrade, then dump the table on both clusters and compare.
+sub upgrade_and_compare
+{
+	my $tag = shift;
+	my $oldnode = shift;
+	my $newnode = shift;
+
+	command_ok(
+		[
+			'pg_upgrade', '--no-sync',
+			'--old-datadir' => $oldnode->data_dir,
+			'--new-datadir' => $newnode->data_dir,
+			'--old-bindir' => $oldnode->config_data('--bindir'),
+			'--new-bindir' => $newnode->config_data('--bindir'),
+			'--socketdir' => $newnode->host,
+			'--old-port' => $oldnode->port,
+			'--new-port' => $newnode->port,
+		],
+		'run of pg_upgrade for new instance');
+
+	# Note: we do this *after* running pg_upgrade, to ensure that we
+	# don't set all the hint bits before upgrade by doing the SELECT
+	# on the table.
+	$oldnode->start;
+	my $old_dump = get_dump_for_comparison($oldnode, "oldnode_${tag}_dump");
+	$oldnode->stop;
+
+	$newnode->start;
+	my $new_dump = get_dump_for_comparison($newnode, "newnode_${tag}_dump");
+	$newnode->stop;
+
+	compare_files($old_dump, $new_dump,
+		'dump outputs from original and restored regression databases match');
+}
+
+my $old_version;
+
+# Basic scenario: Create a cluster using old installation, run
+# multixid-creating workload on it, then upgrade.
+#
+# This works even if the old and new version are the same,
+# although it's not very interesting as the conversion routines only
+# run when upgrading from a pre-v19 cluster.
+{
+	my $tag = 'basic';
+	my $old =
+	  PostgreSQL::Test::Cluster->new("${tag}_oldnode",
+		install_path => $ENV{oldinstall});
+	my $new = PostgreSQL::Test::Cluster->new("${tag}_newnode");
+
+	$old->init(extra => ['-k']);
+
+	$old_version = $old->pg_version;
+	note "old installation is version $old_version\n";
+
+	# Run the workload, recording NextMultiOffset before and after it
+	my $start_mxoff = read_next_mxoff($old);
+	mxact_workload($old, $new);
+	my $finish_mxoff = read_next_mxoff($old);
+
+	$new->init;
+	upgrade_and_compare($tag, $old, $new);
+
+	my $new_next_mxoff = read_next_mxoff($new);
+
+	note ">>> case #${tag}\n"
+	  . " oldnode mxoff from ${start_mxoff} to ${finish_mxoff}\n"
+	  . " newnode mxoff ${new_next_mxoff}\n";
+}
+
+# Wraparound scenario: This is the same as the basic scenario, but the
+# old cluster goes through mxoffset wraparound.
+#
+# This requires the old installation to be older than version 19,
+# because the hacks we use to reset the old cluster to a state just
+# before the wraparound rely on the pre-v19 file format. In version
+# 19, offsets no longer wrap around anyway.
+SKIP:
+{
+	skip
+	  "skipping mxoffset conversion tests because upgrading from the old version does not require conversion"
+	  if ($old_version >= '19devel');
+
+	my $tag = 'wraparound';
+	my $old =
+	  PostgreSQL::Test::Cluster->new("${tag}_oldnode",
+		install_path => $ENV{oldinstall});
+	my $new = PostgreSQL::Test::Cluster->new("${tag}_newnode");
+
+	$old->init(extra => ['-k']);
+
+	# Reset NextMultiOffset in the old cluster to 2^32 - 5001, just before 32-bit wraparound.
+	reset_mxoff_pre_v19($old, 0xFFFFEC77);
+
+	# Run the workload. This crosses the wraparound.
+	my $start_mxoff = read_next_mxoff($old);
+	mxact_workload($old, $new);
+	my $finish_mxoff = read_next_mxoff($old);
+
+	# Verify that wraparound happened.
+	cmp_ok($finish_mxoff, '<', $start_mxoff,
+		"mxoff wrapped around in old cluster");
+
+	$new->init;
+	upgrade_and_compare($tag, $old, $new);
+
+	my $new_next_mxoff = read_next_mxoff($new);
+
+	note ">>> case #${tag}\n"
+	  . " oldnode mxoff from ${start_mxoff} to ${finish_mxoff}\n"
+	  . " newnode mxoff ${new_next_mxoff}\n";
+}
+
+done_testing();
diff --git a/src/test/perl/PostgreSQL/Test/Cluster.pm b/src/test/perl/PostgreSQL/Test/Cluster.pm
index 35413f14019..34f07d52cd8 100644
--- a/src/test/perl/PostgreSQL/Test/Cluster.pm
+++ b/src/test/perl/PostgreSQL/Test/Cluster.pm
@@ -1793,13 +1793,20 @@ sub _get_env
 	return (%inst_env);
 }
 
-# Private routine to get an installation path qualified command.
-#
-# IPC::Run maintains a cache, %cmd_cache, mapping commands to paths. Tests
-# which use nodes spanning more than one postgres installation path need to
-# avoid confusing which installation's binaries get run. Setting $ENV{PATH} is
-# insufficient, as IPC::Run does not check to see if the path has changed since
-# caching a command.
+=pod
+
+=item $node->installed_command(cmd)
+
+Get an installation path qualified command.
+
+IPC::Run maintains a cache, %cmd_cache, mapping commands to paths. Tests
+which use nodes spanning more than one postgres installation path need to
+avoid confusing which installation's binaries get run. Setting $ENV{PATH} is
+insufficient, as IPC::Run does not check to see if the path has changed since
+caching a command.
+ +=cut + sub installed_command { my ($self, $cmd) = @_; diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 23bce72ae64..f9ddd06ec1d 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1725,6 +1725,7 @@ MultiXactMember MultiXactOffset MultiXactStateData MultiXactStatus +MultiXactWriter MultirangeIOData MultirangeParseState MultirangeType @@ -1808,6 +1809,7 @@ OffsetVarNodes_context Oid OidOptions OkeysState +OldMultiXactReader OldToNewMapping OldToNewMappingData OnCommitAction @@ -2804,6 +2806,7 @@ SlruCtlData SlruErrorCause SlruPageStatus SlruScanCallback +SlruSegState SlruShared SlruSharedData SlruWriteAll -- 2.47.3