From 023db6aaeee8cfdbe1d89bfd6ae7c13dd3a60465 Mon Sep 17 00:00:00 2001 From: Andrey Borodin Date: Mon, 1 Jan 2018 20:55:14 +0500 Subject: [PATCH] SLRU checksums patch --- src/backend/access/transam/clog.c | 2 +- src/backend/access/transam/commit_ts.c | 2 +- src/backend/access/transam/multixact.c | 4 +- src/backend/access/transam/slru.c | 26 +++++ src/backend/access/transam/subtrans.c | 2 +- src/backend/commands/async.c | 2 +- src/backend/storage/lmgr/predicate.c | 2 +- src/bin/pg_upgrade/pg_upgrade.c | 188 +++++++++++++++++++++++++++++++-- src/bin/pg_upgrade/pg_upgrade.h | 4 + src/include/access/slru.h | 1 + src/include/catalog/catversion.h | 2 +- src/include/storage/checksum.h | 11 +- src/include/storage/checksum_impl.h | 46 ++++++++ 13 files changed, 273 insertions(+), 19 deletions(-) diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c index bbf9ce1a3a..3d9dba0414 100644 --- a/src/backend/access/transam/clog.c +++ b/src/backend/access/transam/clog.c @@ -58,7 +58,7 @@ /* We need two bits per xact, so four xacts fit in a byte */ #define CLOG_BITS_PER_XACT 2 #define CLOG_XACTS_PER_BYTE 4 -#define CLOG_XACTS_PER_PAGE (BLCKSZ * CLOG_XACTS_PER_BYTE) +#define CLOG_XACTS_PER_PAGE ((BLCKSZ - CHKSUMSZ) * CLOG_XACTS_PER_BYTE) #define CLOG_XACT_BITMASK ((1 << CLOG_BITS_PER_XACT) - 1) #define TransactionIdToPage(xid) ((xid) / (TransactionId) CLOG_XACTS_PER_PAGE) diff --git a/src/backend/access/transam/commit_ts.c b/src/backend/access/transam/commit_ts.c index 7b7bf2b2bf..621f6cf482 100644 --- a/src/backend/access/transam/commit_ts.c +++ b/src/backend/access/transam/commit_ts.c @@ -64,7 +64,7 @@ typedef struct CommitTimestampEntry sizeof(RepOriginId)) #define COMMIT_TS_XACTS_PER_PAGE \ - (BLCKSZ / SizeOfCommitTimestampEntry) + ((BLCKSZ - CHKSUMSZ) / SizeOfCommitTimestampEntry) #define TransactionIdToCTsPage(xid) \ ((xid) / (TransactionId) COMMIT_TS_XACTS_PER_PAGE) diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c index ba01e94328..f04c23c649 100644 --- a/src/backend/access/transam/multixact.c +++ b/src/backend/access/transam/multixact.c @@ -106,7 +106,7 @@ */ /* We need four bytes per offset */ -#define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset)) +#define MULTIXACT_OFFSETS_PER_PAGE ((BLCKSZ - CHKSUMSZ) / sizeof(MultiXactOffset)) #define MultiXactIdToOffsetPage(xid) \ ((xid) / (MultiXactOffset) MULTIXACT_OFFSETS_PER_PAGE) @@ -138,7 +138,7 @@ /* size in bytes of a complete group */ #define MULTIXACT_MEMBERGROUP_SIZE \ (sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP) -#define MULTIXACT_MEMBERGROUPS_PER_PAGE (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE) +#define MULTIXACT_MEMBERGROUPS_PER_PAGE ((BLCKSZ - CHKSUMSZ) / MULTIXACT_MEMBERGROUP_SIZE) #define MULTIXACT_MEMBERS_PER_PAGE \ (MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP) diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c index 94b6e6612a..651afd51e9 100644 --- a/src/backend/access/transam/slru.c +++ b/src/backend/access/transam/slru.c @@ -55,9 +55,15 @@ #include "access/transam.h" #include "access/xlog.h" #include "pgstat.h" +#include "storage/checksum.h" #include "storage/fd.h" #include "storage/shmem.h" #include "miscadmin.h" +#include "utils/guc.h" +#include "utils/memutils.h" + +/* GUC variable */ +extern bool ignore_checksum_failure; #define SlruFileName(ctl, path, seg) \ @@ -376,6 +382,7 @@ SimpleLruReadPage(SlruCtl ctl, int pageno, bool write_ok, TransactionId xid) { SlruShared shared = ctl->shared; + int16 checksum; /* Outer loop handles restart if we must wait for someone else's I/O */ for (;;) @@ -426,6 +433,19 @@ SimpleLruReadPage(SlruCtl ctl, int pageno, bool write_ok, /* Do the read */ ok = SlruPhysicalReadPage(ctl, pageno, slotno); + if (DataChecksumsEnabled() && ok) + { + checksum = pg_getchecksum_slru_page(shared->page_buffer[slotno]); + if (checksum != pg_checksum_slru_page(shared->page_buffer[slotno])) + { + elog(LOG, "CHECKSUM: Page Is not Verified."); + if (!ignore_checksum_failure) + { + elog(ERROR, "CHECKSUM: ERROR ignore_checksum_failure turned off."); + } + } + } + /* Set the LSNs for this newly read-in page to zero */ SimpleLruZeroLSNs(ctl, slotno); @@ -539,6 +559,12 @@ SlruInternalWritePage(SlruCtl ctl, int slotno, SlruFlush fdata) /* Release control lock while doing I/O */ LWLockRelease(shared->ControlLock); + /* + * Update checksum on the page. We do not need to copy the page since page + * contents cannot be modified under the lock. + */ + pg_setchecksum_slru_page(shared->page_buffer[slotno]); + /* Do the write */ ok = SlruPhysicalWritePage(ctl, pageno, slotno, fdata); diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c index f640661130..80420cb7a4 100644 --- a/src/backend/access/transam/subtrans.c +++ b/src/backend/access/transam/subtrans.c @@ -49,7 +49,7 @@ */ /* We need four bytes per xact */ -#define SUBTRANS_XACTS_PER_PAGE (BLCKSZ / sizeof(TransactionId)) +#define SUBTRANS_XACTS_PER_PAGE ((BLCKSZ - CHKSUMSZ) / sizeof(TransactionId)) #define TransactionIdToPage(xid) ((xid) / (TransactionId) SUBTRANS_XACTS_PER_PAGE) #define TransactionIdToEntry(xid) ((xid) % (TransactionId) SUBTRANS_XACTS_PER_PAGE) diff --git a/src/backend/commands/async.c b/src/backend/commands/async.c index f7de742a56..8f672635e3 100644 --- a/src/backend/commands/async.c +++ b/src/backend/commands/async.c @@ -150,7 +150,7 @@ * than that, so changes in that data structure won't affect user-visible * restrictions. */ -#define NOTIFY_PAYLOAD_MAX_LENGTH (BLCKSZ - NAMEDATALEN - 128) +#define NOTIFY_PAYLOAD_MAX_LENGTH (BLCKSZ - NAMEDATALEN - 128 - CHKSUMSZ) /* * Struct representing an entry in the global notify queue diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c index 251a359bff..de8b74820e 100644 --- a/src/backend/storage/lmgr/predicate.c +++ b/src/backend/storage/lmgr/predicate.c @@ -315,7 +315,7 @@ static SlruCtlData OldSerXidSlruCtlData; #define OLDSERXID_PAGESIZE BLCKSZ #define OLDSERXID_ENTRYSIZE sizeof(SerCommitSeqNo) -#define OLDSERXID_ENTRIESPERPAGE (OLDSERXID_PAGESIZE / OLDSERXID_ENTRYSIZE) +#define OLDSERXID_ENTRIESPERPAGE ((OLDSERXID_PAGESIZE - CHKSUMSZ) / OLDSERXID_ENTRYSIZE) /* * Set maximum pages based on the lesser of the number needed to track all diff --git a/src/bin/pg_upgrade/pg_upgrade.c b/src/bin/pg_upgrade/pg_upgrade.c index c10103f0bf..5751285b3f 100644 --- a/src/bin/pg_upgrade/pg_upgrade.c +++ b/src/bin/pg_upgrade/pg_upgrade.c @@ -404,17 +404,183 @@ copy_subdir_files(const char *old_subdir, const char *new_subdir) check_ok(); } +#include "storage/checksum.h" + +#include +#include +#include + +#define SLRU_PAGES_PER_SEGMENT 32 +#define SLRU_SEGMENT_SIZE (BLCKSZ * SLRU_PAGES_PER_SEGMENT) + +#define CLOG_BYTES_PER_PAGE_NEW (BLCKSZ - CHKSUMSZ) +#define CLOG_BYTES_PER_SEGMENT_NEW (BLCKSZ - CHKSUMSZ) * SLRU_PAGES_PER_SEGMENT + +static void write_xact_data_to_file(char *file_name, uint32 local_start, char *data, uint32 length) +{ + int dest_fd; + int local_end = local_start + length; + char *buffer = pg_malloc(SLRU_SEGMENT_SIZE); + + Assert(length <= CLOG_BYTES_PER_SEGMENT_NEW); + + if ((dest_fd = open(file_name, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, + S_IRUSR | S_IWUSR)) < 0) + pg_fatal("could not create file \"%s\": %s\n", file_name, strerror(errno)); + + if (ftruncate(dest_fd, SLRU_SEGMENT_SIZE) < 0) + pg_fatal("could not set size of file \"%s\": %s\n", file_name, strerror(errno)); + + while (local_start < local_end) + { + int nbytes; + int page = local_start / CLOG_BYTES_PER_PAGE_NEW; + int page_start = local_start - (page * CLOG_BYTES_PER_PAGE_NEW); + int delta = Min((page + 1) * CLOG_BYTES_PER_PAGE_NEW, local_end) - local_start; + + if (lseek(dest_fd, page * BLCKSZ, SEEK_SET) < 0) + pg_fatal("could not set position in file \"%s\": %s\n", file_name, strerror(errno)); + + nbytes = read(dest_fd, buffer, BLCKSZ); + + if (nbytes < 0) + pg_fatal("could not read file \"%s\": %s\n", file_name, strerror(errno)); + + memmove(buffer + page_start, data, delta); + + pg_setchecksum_slru_page(buffer); + + if (lseek(dest_fd, page * BLCKSZ, SEEK_SET) < 0) + pg_fatal("could not set position in file \"%s\": %s\n", file_name, strerror(errno)); + + if (write(dest_fd, buffer, BLCKSZ) != BLCKSZ) + { + pg_fatal("could not write file \"%s\": %s\n", file_name, strerror(errno)); + } + + local_start += delta; + data += delta; + } + + pg_free(buffer); + close(dest_fd); +} + +static void +distribute_xact_data(char *buffer, int nbytes, int oldsegno, const char *new_subdir) +{ + uint64 start = oldsegno * ((uint64) SLRU_SEGMENT_SIZE); + uint64 end = start + nbytes; + + while (start < end) + { + int new_segno = start / (CLOG_BYTES_PER_SEGMENT_NEW); + uint64 local_start = start - new_segno * CLOG_BYTES_PER_SEGMENT_NEW; + uint64 local_end = Min(end, ((uint64)new_segno + 1) * CLOG_BYTES_PER_SEGMENT_NEW); + int64 length = local_end - start; + char new_file[MAXPGPATH]; + + Assert(length > 0); + Assert(length == (uint32)length); + Assert(local_start == (uint32)local_start); + + snprintf(new_file, sizeof(new_file), "%s/%s/%04X", new_cluster.pgdata, new_subdir, new_segno); + + write_xact_data_to_file(new_file, (uint32)local_start, buffer, (uint32)length); + + start +=length; + buffer +=length; + } +} + +static void +upgrade_one_xact_file(const char *old_file, int segno, const char *new_subdir) +{ + char *buffer = pg_malloc(SLRU_SEGMENT_SIZE); + int src_fd; + ssize_t nbytes; + + if ((src_fd = open(old_file, O_RDONLY | PG_BINARY, 0)) < 0) + pg_fatal("could not open file \"%s\": %s\n", old_file, strerror(errno)); + + nbytes = read(src_fd, buffer, SLRU_SEGMENT_SIZE); + + if (nbytes < 0) + pg_fatal("could not read file \"%s\": %s\n", old_file, strerror(errno)); + + distribute_xact_data(buffer, nbytes, segno, new_subdir); + + pg_free(buffer); + close(src_fd); +} + +static void +upgrade_xact_files(const char *old_subdir, const char *new_subdir) +{ + char old_path[MAXPGPATH]; + char old_file[MAXPGPATH]; + + DIR *cldir; + struct dirent *clde; + int segno; + + remove_new_subdir(new_subdir, false); + + snprintf(old_path, sizeof(old_path), "%s/%s", old_cluster.pgdata, old_subdir); + + prep_status("Upgrading old %s to new cluster", old_subdir); + + + if ((cldir = opendir(old_path)) == NULL) + { + pg_fatal("could not open dir \"%s\": %s\n", old_path, strerror(errno)); + } + + while (errno = 0, (clde = readdir(cldir)) != NULL) + { + size_t len; + + len = strlen(clde->d_name); + + if ((len == 4 || len == 5 || len == 6) && + strspn(clde->d_name, "0123456789ABCDEF") == len) + { + segno = (int) strtol(clde->d_name, NULL, 16); + snprintf(old_file, sizeof(old_file), "%s/%s", old_path, clde->d_name); + + upgrade_one_xact_file(old_file, segno, new_subdir); + } + } + + if (errno) + { + pg_fatal("could not read dir \"%s\": %s\n", old_path, strerror(errno)); + } + check_ok(); +} + static void copy_xact_xlog_xid(void) { - /* - * Copy old commit logs to new data dir. pg_clog has been renamed to - * pg_xact in post-10 clusters. - */ - copy_subdir_files(GET_MAJOR_VERSION(old_cluster.major_version) < 1000 ? - "pg_clog" : "pg_xact", - GET_MAJOR_VERSION(new_cluster.major_version) < 1000 ? - "pg_clog" : "pg_xact"); + bool slru_changed = (new_cluster.controldata.cat_ver >= SLRU_FORMAT_CHANGE_CAT_VER && + old_cluster.controldata.cat_ver < SLRU_FORMAT_CHANGE_CAT_VER); + char *xact_old_subdir = GET_MAJOR_VERSION(old_cluster.major_version) < 1000 ? + "pg_clog" : "pg_xact"; + char *xact_new_subdir = GET_MAJOR_VERSION(new_cluster.major_version) < 1000 ? + "pg_clog" : "pg_xact"; + + if (slru_changed) + { + upgrade_xact_files(xact_old_subdir, xact_new_subdir); + } + else + { + /* + * Copy old commit logs to new data dir. pg_clog has been renamed to + * pg_xact in post-10 clusters. + */ + copy_subdir_files(xact_old_subdir, xact_new_subdir); + } /* set the next transaction id and epoch of the new cluster */ prep_status("Setting next transaction ID and epoch for new cluster"); @@ -442,7 +608,8 @@ copy_xact_xlog_xid(void) * server doesn't attempt to read multis older than the cutoff value. */ if (old_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER && - new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER) + new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER && + !slru_changed) { copy_subdir_files("pg_multixact/offsets", "pg_multixact/offsets"); copy_subdir_files("pg_multixact/members", "pg_multixact/members"); @@ -462,7 +629,8 @@ copy_xact_xlog_xid(void) new_cluster.pgdata); check_ok(); } - else if (new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER) + else if (new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER || + slru_changed) { /* * Remove offsets/0000 file created by initdb that no longer matches diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h index a21dd48c42..2c9350d0fb 100644 --- a/src/bin/pg_upgrade/pg_upgrade.h +++ b/src/bin/pg_upgrade/pg_upgrade.h @@ -131,6 +131,10 @@ extern char *output_files[]; */ #define JSONB_FORMAT_CHANGE_CAT_VER 201409291 +/* + * change in SLRU format to add checksums + */ +#define SLRU_FORMAT_CHANGE_CAT_VER 201803181 /* * Each relation is represented by a relinfo structure. diff --git a/src/include/access/slru.h b/src/include/access/slru.h index 20114c4d44..e7b9662764 100644 --- a/src/include/access/slru.h +++ b/src/include/access/slru.h @@ -14,6 +14,7 @@ #define SLRU_H #include "access/xlogdefs.h" +#include "storage/checksum.h" #include "storage/lwlock.h" diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 3934582efc..d6b15761d8 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -53,6 +53,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 201712251 +#define CATALOG_VERSION_NO 201803181 #endif diff --git a/src/include/storage/checksum.h b/src/include/storage/checksum.h index b85f714712..f4bd19b242 100644 --- a/src/include/storage/checksum.h +++ b/src/include/storage/checksum.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * checksum.h - * Checksum implementation for data pages. + * Checksum implementation for data pages and SLRU pages. * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California @@ -21,4 +21,13 @@ */ extern uint16 pg_checksum_page(char *page, BlockNumber blkno); +extern uint16 pg_checksum_slru_page(char *page); + +extern uint16 pg_getchecksum_slru_page(char *page); + +extern void pg_setchecksum_slru_page(char *page); + +/* Size of checksum in bytes default 2 bytes (uint16) */ +#define CHKSUMSZ 2 + #endif /* CHECKSUM_H */ diff --git a/src/include/storage/checksum_impl.h b/src/include/storage/checksum_impl.h index bffd061de8..edc9d7e1dc 100644 --- a/src/include/storage/checksum_impl.h +++ b/src/include/storage/checksum_impl.h @@ -101,6 +101,7 @@ */ #include "storage/bufpage.h" +#include "storage/checksum.h" /* number of checksums to calculate in parallel */ #define N_SUMS 32 @@ -205,3 +206,48 @@ pg_checksum_page(char *page, BlockNumber blkno) */ return (checksum % 65535) + 1; } + + +#define SLRU_CHECKSUM_UINT16_OFFSET (BLCKSZ / sizeof(uint16) - 1) +/* + * Compute the checksum for a Postgres SLRU page. The page must be aligned on a + * 4-byte boundary. + * + * The checksum save itself to the last 2 bytes (CHKSUMSZ = 2 bytes) of the page + */ +uint16 +pg_checksum_slru_page(char *page) +{ + uint16 *page_casted = (uint16*) page; + uint16 save_checksum; + uint32 checksum; + + save_checksum = page_casted[SLRU_CHECKSUM_UINT16_OFFSET]; + page_casted[SLRU_CHECKSUM_UINT16_OFFSET] = 0; + + checksum = (pg_checksum_block(page, BLCKSZ) % 65535) + 1; + + page_casted[SLRU_CHECKSUM_UINT16_OFFSET] = save_checksum; + + return checksum; +} + +/* + * Get the checksum for a Postgres SLRU page. + */ +uint16 +pg_getchecksum_slru_page(char *page) +{ + uint16 *page_casted = (uint16*) page; + return page_casted[SLRU_CHECKSUM_UINT16_OFFSET]; +} + +/* + * Compute and install the checksum for a Postgres SLRU page. + */ +void +pg_setchecksum_slru_page(char *page) +{ + uint16 *page_casted = (uint16*) page; + page_casted[SLRU_CHECKSUM_UINT16_OFFSET] = pg_checksum_slru_page(page); +} -- 2.14.3 (Apple Git-98)