diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c index 699497c..8d28a5c 100644 --- a/src/backend/access/transam/multixact.c +++ b/src/backend/access/transam/multixact.c @@ -197,8 +197,9 @@ typedef struct MultiXactStateData MultiXactOffset nextOffset; /* - * Oldest multixact that is still on disk. Anything older than this - * should not be consulted. These values are updated by vacuum. + * Oldest multixact that may still be referenced from a relation. + * Anything older than this should not be consulted. These values are + * updated by vacuum. */ MultiXactId oldestMultiXactId; Oid oldestMultiXactDB; @@ -211,6 +212,18 @@ typedef struct MultiXactStateData */ MultiXactId lastCheckpointedOldest; + /* + * This is the oldest file that actually exist on the disk. This value + * is initialized by scanning pg_multixact/offsets, and subsequently + * updated each time we complete a truncation. We need a flag to + * indicate whether this has been initialized yet. + */ + MultiXactId oldestMultiXactOnDisk; + bool oldestMultiXactOnDiskValid; + + /* Has TrimMultiXact been called yet? */ + bool didTrimMultiXact; + /* support for anti-wraparound measures */ MultiXactId multiVacLimit; MultiXactId multiWarnLimit; @@ -342,6 +355,8 @@ static char *mxstatus_to_string(MultiXactStatus status); /* management of SLRU infrastructure */ static int ZeroMultiXactOffsetPage(int pageno, bool writeXlog); static int ZeroMultiXactMemberPage(int pageno, bool writeXlog); +static MultiXactOffset GetOldestMultiXactOnDisk(void); +static MultiXactOffset GetOldestReferencedOffset(MultiXactId oldestMXact); static bool MultiXactOffsetPagePrecedes(int page1, int page2); static bool MultiXactMemberPagePrecedes(int page1, int page2); static bool MultiXactOffsetPrecedes(MultiXactOffset offset1, @@ -1975,12 +1990,6 @@ StartupMultiXact(void) */ pageno = MXOffsetToMemberPage(offset); MultiXactMemberCtl->shared->latest_page_number = pageno; - - /* - * compute the oldest member we need to keep around to avoid old member - * data overrun. - */ - DetermineSafeOldestOffset(MultiXactState->oldestMultiXactId); } /* @@ -1997,7 +2006,10 @@ TrimMultiXact(void) int pageno; int entryno; int flagsoff; - + MultiXactId lastCheckpointedOldest; + MultiXactId oldestMultiXactId; + MultiXactId earliest; + MultiXactOffset oldestOffset; /* Clean up offsets state */ LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE); @@ -2066,6 +2078,51 @@ TrimMultiXact(void) } LWLockRelease(MultiXactMemberControlLock); + + /* + * Read values from shared memory so that we can establish member + * wraparound defenses. + */ + LWLockAcquire(MultiXactGenLock, LW_SHARED); + lastCheckpointedOldest = MultiXactState->lastCheckpointedOldest; + oldestMultiXactId = MultiXactState->oldestMultiXactId; + LWLockRelease(MultiXactGenLock); + + /* + * Determine an initial safe stop point for multixact member wraparound. + * If we are starting up without entering recovery, none of this work has + * been done yet. Even if we did recovery, the stop point might not be + * set yet if the checks in TruncateMultiXact skipped truncation every + * time. + * + * PostgreSQL 9.3.0 through 9.3.6 and PostgreSQL 9.4.0 through 9.4.1 + * had bugs that could allow users who reached those release through + * pg_upgrade from an earlier release to end up with the bogus value of 1 + * for datminmxid, and that value could sometimes propagate itself back + * into pg_control. To defend against that, if the oldest multixact + * value that we have doesn't actually exist on disk, use the oldest one + * that does to set the safe stop point. + */ + earliest = GetOldestMultiXactOnDisk(); + Assert(MultiXactIdIsValid(lastCheckpointedOldest)); + if (MultiXactIdPrecedes(lastCheckpointedOldest, earliest)) + DetermineSafeOldestOffset(earliest); + else + DetermineSafeOldestOffset(lastCheckpointedOldest); + + /* + * Determine the oldest offset that we still need to worry about, for + * purposes of establishing how aggressively to vacuum. This is based + * on the oldest value that we believe to be present in any table, rather + * than the oldest value we believe to be present on disk. That's so that + * vacuum doesn't go crazy trying to remove multixacts that it's already + * cleaned out but which have not yet been removed by a checkpoint. + */ + oldestOffset = GetOldestReferencedOffset(oldestMultiXactId); + LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); + MultiXactState->oldestOffset = oldestOffset; + MultiXactState->didTrimMultiXact = true; /* now fully initialized */ + LWLockRelease(MultiXactGenLock); } /* @@ -2165,12 +2222,19 @@ SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid) MultiXactId multiStopLimit; MultiXactId multiWrapLimit; MultiXactId curMulti; - MultiXactOffset oldestOffset; + MultiXactOffset oldestOffset = 0; /* keep compiler happy */ MultiXactOffset nextOffset; + bool did_trim; Assert(MultiXactIdIsValid(oldest_datminmxid)); /* + * We can read this without a lock, because it only changes when nothing + * else is running. + */ + did_trim = MultiXactState->didTrimMultiXact; + + /* * We pretend that a wrap will happen halfway through the multixact ID * space, but that's not really true, because multixacts wrap differently * from transaction IDs. Note that, separately from any concern about @@ -2221,32 +2285,13 @@ SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid) /* * Determine the offset of the oldest multixact that might still be - * referenced. Normally, we can read the offset from the multixact itself, - * but there's an important special case: if there are no multixacts in - * existence at all, oldest_datminmxid obviously can't point to one. It - * will instead point to the multixact ID that will be assigned the next - * time one is needed. - * - * NB: oldest_dataminmxid is the oldest multixact that might still be - * referenced from a table, unlike in DetermineSafeOldestOffset, where we - * do this same computation based on the oldest value that might still - * exist in the SLRU. This is because here we're trying to compute a - * threshold for activating autovacuum, which can only remove references - * to multixacts, whereas there we are computing a threshold for creating - * new multixacts, which requires the old ones to have first been - * truncated away by a checkpoint. + * referenced, if we're done with recovery. It isn't safe to do this + * any earlier, because the database might be inconsistent. + * Fortunately, we don't need it then anyway, because this only controls + * the behavior of vacuum. */ - LWLockAcquire(MultiXactGenLock, LW_SHARED); - if (MultiXactState->nextMXact == oldest_datminmxid) - { - oldestOffset = MultiXactState->nextOffset; - LWLockRelease(MultiXactGenLock); - } - else - { - LWLockRelease(MultiXactGenLock); - oldestOffset = find_multixact_start(oldest_datminmxid); - } + if (did_trim) + oldestOffset = GetOldestReferencedOffset(oldest_datminmxid); /* Grab lock for just long enough to set the new limit values */ LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); @@ -2275,11 +2320,11 @@ SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid) */ if ((MultiXactIdPrecedes(multiVacLimit, curMulti) || (nextOffset - oldestOffset > MULTIXACT_MEMBER_SAFE_THRESHOLD)) && - IsUnderPostmaster && !InRecovery) + IsUnderPostmaster && did_trim) SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); /* Give an immediate warning if past the wrap warn point */ - if (MultiXactIdPrecedes(multiWarnLimit, curMulti) && !InRecovery) + if (MultiXactIdPrecedes(multiWarnLimit, curMulti) && did_trim) { char *oldest_datname; @@ -2319,6 +2364,49 @@ SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid) } /* + * Get the offset of the oldest MultiXact that might still be referenced from + * a table somewhere. + */ +static MultiXactOffset +GetOldestReferencedOffset(MultiXactId oldestMXact) +{ + MultiXactId earliest; + MultiXactOffset oldestOffset; + + /* + * Because of bugs in early 9.3.X and 9.4.X releases (see comments in + * TrimMultiXact for details), oldest_datminmxid might point to a + * nonexistent multixact. If so, use the oldest one that actually + * exists. Anything before this can't be successfully used anyway. + */ + earliest = GetOldestMultiXactOnDisk(); + if (MultiXactIdPrecedes(oldestMXact, earliest)) + oldestMXact = earliest; + + LWLockAcquire(MultiXactGenLock, LW_SHARED); + if (MultiXactState->nextMXact == oldestMXact) + { + /* + * If there are no multixacts in existence at all, oldestMXact + * obviously can't point to one. It will instead point to the + * multixact ID that will be assigned the next time one is needed. + * But that means we can't look up its members, because it doesn't + * exist yet. Instead, return the offset that will be assigned to + * it when it gets created. + */ + oldestOffset = MultiXactState->nextOffset; + LWLockRelease(MultiXactGenLock); + } + else + { + LWLockRelease(MultiXactGenLock); + oldestOffset = find_multixact_start(oldestMXact); + } + + return oldestOffset; +} + +/* * Ensure the next-to-be-assigned MultiXactId is at least minMulti, * and similarly nextOffset is at least minMultiOffset. * @@ -2821,8 +2909,7 @@ TruncateMultiXact(void) { MultiXactId oldestMXact; MultiXactOffset oldestOffset; - MultiXactOffset nextOffset; - mxtruncinfo trunc; + MultiXactOffset nextOffset; MultiXactId earliest; MembersLiveRange range; @@ -2835,19 +2922,20 @@ TruncateMultiXact(void) Assert(MultiXactIdIsValid(oldestMXact)); /* - * Note we can't just plow ahead with the truncation; it's possible that - * there are no segments to truncate, which is a problem because we are - * going to attempt to read the offsets page to determine where to - * truncate the members SLRU. So we first scan the directory to determine - * the earliest offsets page number that we can read without error. + * We must be careful here, because we may be in recovery. If we're here, + * we've replayed at least one checkpoint, but have not yet reached the + * minimum recovery point, so the truncation may have already been done. + * + * But even if we're in normal running, we still need to be careful. + * PostgreSQL 9.3.0 through 9.3.6 and PostgreSQL 9.4.0 through 9.4.1 + * had bugs that could allow users who reached those release through + * pg_upgrade from an earlier release to end up with the bogus value of 1 + * for datminmxid, and that value could sometimes propagate itself back + * into pg_control. So it's possible that oldestMXact precedes the + * earliest value on disk even in normal running. If that happens, we + * skip truncation. */ - trunc.earliestExistingPage = -1; - SlruScanDirectory(MultiXactOffsetCtl, SlruScanDirCbFindEarliest, &trunc); - earliest = trunc.earliestExistingPage * MULTIXACT_OFFSETS_PER_PAGE; - if (earliest < FirstMultiXactId) - earliest = FirstMultiXactId; - - /* nothing to do */ + earliest = GetOldestMultiXactOnDisk(); if (MultiXactIdPrecedes(oldestMXact, earliest)) return; @@ -2879,7 +2967,15 @@ TruncateMultiXact(void) SimpleLruTruncate(MultiXactOffsetCtl, MultiXactIdToOffsetPage(oldestMXact)); - + /* Update oldest-on-disk value in shared memory. */ + earliest = range.rangeStart * MULTIXACT_OFFSETS_PER_PAGE; + if (earliest < FirstMultiXactId) + earliest = FirstMultiXactId; + LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); + Assert(MultiXactState->oldestMultiXactOnDiskValid); + MultiXactState->oldestMultiXactOnDiskValid = earliest; + LWLockRelease(MultiXactGenLock); + /* * Now, and only now, we can advance the stop point for multixact members. * If we did it any sooner, the segments we deleted above might already @@ -2889,6 +2985,47 @@ TruncateMultiXact(void) } /* + * Scan pg_multixact/offsets to determine the earliest offsets page number + * that we can read without error. + */ +static MultiXactOffset +GetOldestMultiXactOnDisk(void) +{ + mxtruncinfo trunc; + MultiXactId earliest; + bool valid; + + /* Read values from shared memory. */ + LWLockAcquire(MultiXactGenLock, LW_SHARED); + earliest = MultiXactState->oldestMultiXactOnDisk; + valid = MultiXactState->oldestMultiXactOnDiskValid; + LWLockRelease(MultiXactGenLock); + + /* If the value we read is valid, just return it. */ + if (valid) + return earliest; + + /* + * We haven't scanned the directory yet, so do that now. This will + * give us the earliest offsets page number that we can read without + * error. + */ + trunc.earliestExistingPage = -1; + SlruScanDirectory(MultiXactOffsetCtl, SlruScanDirCbFindEarliest, &trunc); + earliest = trunc.earliestExistingPage * MULTIXACT_OFFSETS_PER_PAGE; + if (earliest < FirstMultiXactId) + earliest = FirstMultiXactId; + + /* Update oldest-on-disk value in shared memory. */ + LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); + MultiXactState->oldestMultiXactOnDisk = earliest; + MultiXactState->oldestMultiXactOnDiskValid = true; + LWLockRelease(MultiXactGenLock); + + return earliest; +} + +/* * Decide which of two MultiXactOffset page numbers is "older" for truncation * purposes. *