From 98d5255f0b1e2edbbd95897030a1dd070cc52334 Mon Sep 17 00:00:00 2001 From: David Christensen Date: Tue, 1 Nov 2022 18:48:08 -0400 Subject: [PATCH v2 4/5] Add extended page checksums feature This is an example page feature which uses 7 bytes of the reserved page space to store a 56-bit page checksum (computed with komihash) in place of the 16-bit checksum kept in pd_checksum. It also serves as an illustration of writing/using a page feature. --- src/backend/access/transam/xlog.c | 4 +- src/backend/backup/basebackup.c | 27 +- src/backend/storage/page/bufpage.c | 45 +- src/backend/utils/misc/guc_tables.c | 11 + src/bin/initdb/initdb.c | 18 +- src/bin/pg_controldata/pg_controldata.c | 3 + src/common/pagefeat.c | 5 + src/include/common/komihash.h | 569 ++++++++++++++++++++++++ src/include/common/pagefeat.h | 2 + src/include/storage/checksum.h | 5 + src/include/storage/checksum_impl.h | 198 +++++++++ 11 files changed, 868 insertions(+), 19 deletions(-) create mode 100644 src/include/common/komihash.h diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 27953367aa..7d4bf6bba9 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -4234,7 +4234,9 @@ bool DataChecksumsEnabled(void) { Assert(ControlFile != NULL); - return (ControlFile->data_checksum_version > 0); + return (ControlFile->data_checksum_version > 0) || \ + PageFeatureSetHasFeature(ControlFile->page_features, PF_EXT_CHECKSUMS); + } /* diff --git a/src/backend/backup/basebackup.c b/src/backend/backup/basebackup.c index 74fb529380..9a0825bdb9 100644 --- a/src/backend/backup/basebackup.c +++ b/src/backend/backup/basebackup.c @@ -25,6 +25,7 @@ #include "commands/defrem.h" #include "common/compression.h" #include "common/file_perm.h" +#include "common/pagefeat.h" #include "lib/stringinfo.h" #include "miscadmin.h" #include "nodes/pg_list.h" @@ -1492,7 +1493,7 @@ sendFile(bbsink *sink, const char *readfilename, const char 
*tarfilename, int fd; BlockNumber blkno = 0; bool block_retry = false; - uint16 checksum; + uint64 checksum, page_checksum; int checksum_failures = 0; off_t cnt; int i; @@ -1608,9 +1609,23 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename, */ if (!PageIsNew(page) && PageGetLSN(page) < sink->bbs_state->startptr) { - checksum = pg_checksum_page((char *) page, blkno + segmentno * RELSEG_SIZE); - phdr = (PageHeader) page; - if (phdr->pd_checksum != checksum) + char *extended_checksum_loc = NULL; + + /* are we using extended checksums? */ + if ((extended_checksum_loc = PageGetFeatureOffset(page, PF_EXT_CHECKSUMS))) + { + /* 56-bit checksum stored in high 7 bytes */ + page_checksum = pg_get_checksum56_page(page, (uint64*)extended_checksum_loc); + checksum = pg_checksum56_page(page, blkno + segmentno * RELSEG_SIZE, (uint64*)extended_checksum_loc); + } + else + { + phdr = (PageHeader) page; + page_checksum = (uint32)phdr->pd_checksum; + checksum = pg_checksum_page(page, blkno + segmentno * RELSEG_SIZE); + } + + if (page_checksum != checksum) { /* * Retry the block on the first failure. 
It's @@ -1661,9 +1676,9 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename, ereport(WARNING, (errmsg("checksum verification failed in " "file \"%s\", block %u: calculated " - "%X but expected %X", + "%llu but expected %llu", - readfilename, blkno, checksum, + readfilename, blkno, (unsigned long long) checksum, - phdr->pd_checksum))); + (unsigned long long) page_checksum))); if (checksum_failures == 5) ereport(WARNING, (errmsg("further checksum verification " diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c index 0433aade03..9577296428 100644 --- a/src/backend/storage/page/bufpage.c +++ b/src/backend/storage/page/bufpage.c @@ -107,8 +107,9 @@ PageIsVerifiedExtended(Page page, BlockNumber blkno, int flags) bool checksum_failure = false; bool header_sane = false; bool all_zeroes = false; - uint16 checksum = 0; - + uint64 checksum = 0; + uint64 page_checksum = 0; + char *extended_checksum_loc = NULL; /* * Don't verify page data unless the page passes basic non-zero test */ @@ -116,9 +117,20 @@ PageIsVerifiedExtended(Page page, BlockNumber blkno, int flags) { if (DataChecksumsEnabled()) { - checksum = pg_checksum_page((char *) page, blkno); - - if (checksum != p->pd_checksum) + /* are we using extended checksums? 
*/ + if ((extended_checksum_loc = PageGetFeatureOffset(page, PF_EXT_CHECKSUMS))) + { + /* 56-bit checksum stored in high 7 bytes */ + page_checksum = pg_get_checksum56_page(page, (uint64*)extended_checksum_loc); + checksum = pg_checksum56_page(page, blkno, (uint64*)extended_checksum_loc); + } + else + { + /* traditional checksums in the pd_checksum field */ + page_checksum = (uint32)p->pd_checksum; + checksum = (uint32)pg_checksum_page((char *) page, blkno); + } + if (checksum != page_checksum) checksum_failure = true; } @@ -163,8 +175,8 @@ PageIsVerifiedExtended(Page page, BlockNumber blkno, int flags) if ((flags & PIV_LOG_WARNING) != 0) ereport(WARNING, (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("page verification failed, calculated checksum %u but expected %u", - checksum, p->pd_checksum))); + errmsg("page verification failed, calculated checksum %llu but expected %llu", + (unsigned long long) checksum, (unsigned long long) page_checksum))); if ((flags & PIV_REPORT_STAT) != 0) pgstat_report_checksum_failure(); @@ -1524,6 +1536,7 @@ char * PageSetChecksumCopy(Page page, BlockNumber blkno) { static char *pageCopy = NULL; + char *extended_checksum_loc = NULL; /* If we don't need a checksum, just return the passed-in data */ if (PageIsNew(page) || !DataChecksumsEnabled()) @@ -1539,7 +1552,13 @@ PageSetChecksumCopy(Page page, BlockNumber blkno) pageCopy = MemoryContextAlloc(TopMemoryContext, BLCKSZ); memcpy(pageCopy, (char *) page, BLCKSZ); - ((PageHeader) pageCopy)->pd_checksum = pg_checksum_page(pageCopy, blkno); + + if ((extended_checksum_loc = PageGetFeatureOffset(pageCopy, PF_EXT_CHECKSUMS))) + pg_set_checksum56_page(pageCopy, + pg_checksum56_page(pageCopy, blkno, (uint64*)extended_checksum_loc), + (uint64*)extended_checksum_loc); + else + ((PageHeader) pageCopy)->pd_checksum = pg_checksum_page(pageCopy, blkno); return pageCopy; } @@ -1552,9 +1571,17 @@ PageSetChecksumCopy(Page page, BlockNumber blkno) void PageSetChecksumInplace(Page page, BlockNumber blkno) { + char *extended_checksum_loc = NULL; + /* 
If we don't need a checksum, just return */ if (PageIsNew(page) || !DataChecksumsEnabled()) return; - ((PageHeader) page)->pd_checksum = pg_checksum_page((char *) page, blkno); + /* are we using extended checksums? */ + if ((extended_checksum_loc = PageGetFeatureOffset(page, PF_EXT_CHECKSUMS))) + pg_set_checksum56_page(page, + pg_checksum56_page(page, blkno, (uint64*)extended_checksum_loc), + (uint64*)extended_checksum_loc); + else + ((PageHeader) page)->pd_checksum = pg_checksum_page(page, blkno); } diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 4c30d0d37a..24a38642a0 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -1804,6 +1804,17 @@ struct config_bool ConfigureNamesBool[] = NULL, NULL, NULL }, + { + {"extended_checksums", PGC_INTERNAL, PRESET_OPTIONS, + gettext_noop("Shows whether extended checksums are turned on for this cluster."), + NULL, + GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE | GUC_RUNTIME_COMPUTED + }, + &page_feature_extended_checksums, + false, + NULL, NULL, NULL + }, + { {"syslog_sequence_numbers", PGC_SIGHUP, LOGGING_WHERE, gettext_noop("Add sequence number to syslog messages to avoid duplicate suppression."), diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index 40561d5d61..876e0bbe97 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -150,6 +150,7 @@ static bool do_sync = true; static bool sync_only = false; static bool show_setting = false; static bool data_checksums = false; +static bool extended_checksums = false; static char *xlog_dir = NULL; static char *str_wal_segment_size_mb = NULL; static int wal_segment_size_mb; @@ -1322,10 +1323,11 @@ bootstrap_template1(void) unsetenv("PGCLIENTENCODING"); snprintf(cmd, sizeof(cmd), - "\"%s\" --boot -X %d %s %s %s %s", + "\"%s\" --boot -X %d %s %s %s %s %s", backend_exec, wal_segment_size_mb * (1024 * 1024), data_checksums ? "-k" : "", + extended_checksums ? 
"-e extended_checksums" : "", boot_options, extra_options, debug ? "-d 5" : ""); @@ -2148,6 +2150,7 @@ usage(const char *progname) printf(_(" -g, --allow-group-access allow group read/execute on data directory\n")); printf(_(" --icu-locale=LOCALE set ICU locale ID for new databases\n")); printf(_(" -k, --data-checksums use data page checksums\n")); + printf(_(" -K, --extended-checksums use extended data page checksums\n")); printf(_(" --locale=LOCALE set default locale for new databases\n")); printf(_(" --lc-collate=, --lc-ctype=, --lc-messages=LOCALE\n" " --lc-monetary=, --lc-numeric=, --lc-time=LOCALE\n" @@ -2803,6 +2806,7 @@ main(int argc, char *argv[]) {"waldir", required_argument, NULL, 'X'}, {"wal-segsize", required_argument, NULL, 12}, {"data-checksums", no_argument, NULL, 'k'}, + {"extended-checksums", no_argument, NULL, 'K'}, {"allow-group-access", no_argument, NULL, 'g'}, {"discard-caches", no_argument, NULL, 14}, {"locale-provider", required_argument, NULL, 15}, @@ -2848,7 +2852,7 @@ main(int argc, char *argv[]) /* process command-line options */ - while ((c = getopt_long(argc, argv, "A:dD:E:gkL:nNsST:U:WX:", long_options, &option_index)) != -1) + while ((c = getopt_long(argc, argv, "A:dD:E:gkKL:nNsST:U:WX:", long_options, &option_index)) != -1) { switch (c) { @@ -2900,6 +2904,9 @@ main(int argc, char *argv[]) case 'k': data_checksums = true; break; + case 'K': + extended_checksums = true; + break; case 'L': share_path = pg_strdup(optarg); break; @@ -3015,6 +3022,9 @@ main(int argc, char *argv[]) if (pwprompt && pwfilename) pg_fatal("password prompt and password file cannot be specified together"); + if (data_checksums && extended_checksums) + pg_fatal("data checksums and extended data checksums cannot be specified together"); + check_authmethod_unspecified(&authmethodlocal); check_authmethod_unspecified(&authmethodhost); @@ -3068,7 +3078,9 @@ main(int argc, char *argv[]) printf("\n"); - if (data_checksums) + if (extended_checksums) + printf(_("Extended 
data page checksums are enabled.\n")); + else if (data_checksums) printf(_("Data page checksums are enabled.\n")); else printf(_("Data page checksums are disabled.\n")); diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c index c1006ad5d8..bc6be4844a 100644 --- a/src/bin/pg_controldata/pg_controldata.c +++ b/src/bin/pg_controldata/pg_controldata.c @@ -331,5 +331,8 @@ main(int argc, char *argv[]) mock_auth_nonce_str); printf(_("Reserved page size for features: %d\n"), PageFeatureSetCalculateSize(ControlFile->page_features)); + printf(_("Using extended checksums: %s\n"), + PageFeatureSetHasFeature(ControlFile->page_features, PF_EXT_CHECKSUMS) \ + ? _("yes") : _("no")); return 0; } diff --git a/src/common/pagefeat.c b/src/common/pagefeat.c index 75d5bffce2..2037713ccb 100644 --- a/src/common/pagefeat.c +++ b/src/common/pagefeat.c @@ -20,6 +20,9 @@ int reserved_page_size; PageFeatureSet cluster_page_features; +/* status GUCs, display only. set by XLog startup */ +bool page_feature_extended_checksums; + /* * A "page feature" is an optional cluster-defined additional data field that * is stored in the "reserved_page_size" area in the footer of a given Page. @@ -44,6 +47,8 @@ typedef struct PageFeatureDesc * or the attempt to set the GUC will fail. */ static PageFeatureDesc feature_descs[PF_MAX_FEATURE] = { + /* PF_EXT_CHECKSUMS */ + { 7, "extended_checksums" }, /* occupies the 7 bytes atop the 1-byte trailer */ }; diff --git a/src/include/common/komihash.h b/src/include/common/komihash.h new file mode 100644 index 0000000000..867a7f09b1 --- /dev/null +++ b/src/include/common/komihash.h @@ -0,0 +1,569 @@ +/** + * komihash.h version 4.3.1 + * + * The inclusion file for the "komihash" hash function. 
+ * + * Description is available at https://github.com/avaneev/komihash + * + * License + * + * Copyright (c) 2021-2022 Aleksey Vaneev + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#ifndef KOMIHASH_INCLUDED +#define KOMIHASH_INCLUDED + +#include <stdint.h> +#include <string.h> + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeclaration-after-statement" + +// Macros that apply byte-swapping. 
+ +#if defined( __GNUC__ ) || defined( __clang__ ) + + #define KOMIHASH_BYTESW32( v ) __builtin_bswap32( v ) + #define KOMIHASH_BYTESW64( v ) __builtin_bswap64( v ) + +#elif defined( _MSC_VER ) + + #define KOMIHASH_BYTESW32( v ) _byteswap_ulong( v ) + #define KOMIHASH_BYTESW64( v ) _byteswap_uint64( v ) + +#else // defined( _MSC_VER ) + + #define KOMIHASH_BYTESW32( v ) ( \ + ( v & 0xFF000000 ) >> 24 | \ + ( v & 0x00FF0000 ) >> 8 | \ + ( v & 0x0000FF00 ) << 8 | \ + ( v & 0x000000FF ) << 24 ) + + #define KOMIHASH_BYTESW64( v ) ( \ + ( v & 0xFF00000000000000 ) >> 56 | \ + ( v & 0x00FF000000000000 ) >> 40 | \ + ( v & 0x0000FF0000000000 ) >> 24 | \ + ( v & 0x000000FF00000000 ) >> 8 | \ + ( v & 0x00000000FF000000 ) << 8 | \ + ( v & 0x0000000000FF0000 ) << 24 | \ + ( v & 0x000000000000FF00 ) << 40 | \ + ( v & 0x00000000000000FF ) << 56 ) + +#endif // defined( _MSC_VER ) + +// Endianness-definition macro, can be defined externally (e.g. =1, if +// endianness-correction is unnecessary in any case, to reduce its associated +// overhead). + +#if !defined( KOMIHASH_LITTLE_ENDIAN ) + #if defined( _WIN32 ) || defined( __LITTLE_ENDIAN__ ) || \ + ( defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ) + + #define KOMIHASH_LITTLE_ENDIAN 1 + + #elif defined( __BIG_ENDIAN__ ) || \ + ( defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ ) + + #define KOMIHASH_LITTLE_ENDIAN 0 + + #else // defined( __BIG_ENDIAN__ ) + + #warning KOMIHASH: cannot determine endianness, assuming little-endian. + + #define KOMIHASH_LITTLE_ENDIAN 1 + + #endif // defined( __BIG_ENDIAN__ ) +#endif // !defined( KOMIHASH_LITTLE_ENDIAN ) + +// Macros that apply byte-swapping, used for endianness-correction. 
+ +#if KOMIHASH_LITTLE_ENDIAN + + #define KOMIHASH_EC32( v ) ( v ) + #define KOMIHASH_EC64( v ) ( v ) + +#else // KOMIHASH_LITTLE_ENDIAN + + #define KOMIHASH_EC32( v ) KOMIHASH_BYTESW32( v ) + #define KOMIHASH_EC64( v ) KOMIHASH_BYTESW64( v ) + +#endif // KOMIHASH_LITTLE_ENDIAN + +// Likelihood macros that are used for manually-guided micro-optimization. + +#if defined( __GNUC__ ) || defined( __clang__ ) + + #define KOMIHASH_LIKELY( x ) __builtin_expect( x, 1 ) + #define KOMIHASH_UNLIKELY( x ) __builtin_expect( x, 0 ) + +#else // likelihood macros + + #define KOMIHASH_LIKELY( x ) ( x ) + #define KOMIHASH_UNLIKELY( x ) ( x ) + +#endif // likelihood macros + +// In-memory data prefetch macro (temporal locality=1, in case a collision +// resolution would be necessary). + +#if defined( __GNUC__ ) || defined( __clang__ ) + + #define KOMIHASH_PREFETCH( addr ) __builtin_prefetch( addr, 0, 1 ) + +#else // prefetch macro + + #define KOMIHASH_PREFETCH( addr ) + +#endif // prefetch macro + +/** + * An auxiliary function that returns an unsigned 32-bit value created out of + * a sequence of bytes in memory. This function is used to convert endianness + * of in-memory 32-bit unsigned values, and to avoid unaligned memory + * accesses. + * + * @param p Pointer to 4 bytes in memory. Alignment is unimportant. + */ + +static inline uint32_t kh_lu32ec( const uint8_t* const p ) +{ + uint32_t v; + memcpy( &v, p, 4 ); + + return( KOMIHASH_EC32( v )); +} + +/** + * An auxiliary function that returns an unsigned 64-bit value created out of + * a sequence of bytes in memory. This function is used to convert endianness + * of in-memory 64-bit unsigned values, and to avoid unaligned memory + * accesses. + * + * @param p Pointer to 8 bytes in memory. Alignment is unimportant. 
+ */ + +static inline uint64_t kh_lu64ec( const uint8_t* const p ) +{ + uint64_t v; + memcpy( &v, p, 8 ); + + return( KOMIHASH_EC64( v )); +} + +/** + * Function builds an unsigned 64-bit value out of remaining bytes in a + * message, and pads it with the "final byte". This function can only be + * called if less than 8 bytes are left to read. The message should be "long", + * permitting Msg[ -3 ] reads. + * + * @param Msg Message pointer, alignment is unimportant. + * @param MsgLen Message's remaining length, in bytes; can be 0. + * @param fb Final byte used for padding. + */ + +static inline uint64_t kh_lpu64ec_l3( const uint8_t* const Msg, + const size_t MsgLen, uint64_t fb ) +{ + if( MsgLen < 4 ) + { + const uint8_t* const Msg3 = Msg + MsgLen - 3; + const int ml8 = (int) ( MsgLen << 3 ); + const uint64_t m = (uint64_t) Msg3[ 0 ] | (uint64_t) Msg3[ 1 ] << 8 | + (uint64_t) Msg3[ 2 ] << 16; + return( fb << ml8 | m >> ( 24 - ml8 )); + } + + const int ml8 = (int) ( MsgLen << 3 ); + const uint64_t mh = kh_lu32ec( Msg + MsgLen - 4 ); + const uint64_t ml = kh_lu32ec( Msg ); + + return( fb << ml8 | ml | ( mh >> ( 64 - ml8 )) << 32 ); +} + +/** + * Function builds an unsigned 64-bit value out of remaining bytes in a + * message, and pads it with the "final byte". This function can only be + * called if less than 8 bytes are left to read. Can be used on "short" + * messages, but MsgLen should be greater than 0. + * + * @param Msg Message pointer, alignment is unimportant. + * @param MsgLen Message's remaining length, in bytes; cannot be 0. + * @param fb Final byte used for padding. 
+ */ + +static inline uint64_t kh_lpu64ec_nz( const uint8_t* const Msg, + const size_t MsgLen, uint64_t fb ) +{ + if( MsgLen < 4 ) + { + fb <<= ( MsgLen << 3 ); + uint64_t m = Msg[ 0 ]; + + if( MsgLen > 1 ) + { + m |= (uint64_t) Msg[ 1 ] << 8; + + if( MsgLen > 2 ) + { + m |= (uint64_t) Msg[ 2 ] << 16; + } + } + + return( fb | m ); + } + + const int ml8 = (int) ( MsgLen << 3 ); + const uint64_t mh = kh_lu32ec( Msg + MsgLen - 4 ); + const uint64_t ml = kh_lu32ec( Msg ); + + return( fb << ml8 | ml | ( mh >> ( 64 - ml8 )) << 32 ); +} + +/** + * Function builds an unsigned 64-bit value out of remaining bytes in a + * message, and pads it with the "final byte". This function can only be + * called if less than 8 bytes are left to read. The message should be "long", + * permitting Msg[ -4 ] reads. + * + * @param Msg Message pointer, alignment is unimportant. + * @param MsgLen Message's remaining length, in bytes; can be 0. + * @param fb Final byte used for padding. + */ + +static inline uint64_t kh_lpu64ec_l4( const uint8_t* const Msg, + const size_t MsgLen, uint64_t fb ) +{ + if( MsgLen < 5 ) + { + const int ml8 = (int) ( MsgLen << 3 ); + + return( fb << ml8 | + (uint64_t) kh_lu32ec( Msg + MsgLen - 4 ) >> ( 32 - ml8 )); + } + else + { + const int ml8 = (int) ( MsgLen << 3 ); + + return( fb << ml8 | kh_lu64ec( Msg + MsgLen - 8 ) >> ( 64 - ml8 )); + } +} + +#if defined( __SIZEOF_INT128__ ) + + /** + * 64-bit by 64-bit unsigned multiplication. + * + * @param m1 Multiplier 1. + * @param m2 Multiplier 2. + * @param[out] rl The lower half of the 128-bit result. + * @param[out] rh The higher half of the 128-bit result. 
+ */ + + static inline void kh_m128( const uint64_t m1, const uint64_t m2, + uint64_t* const rl, uint64_t* const rh ) + { + const __uint128_t r = (__uint128_t) m1 * m2; + + *rl = (uint64_t) r; + *rh = (uint64_t) ( r >> 64 ); + } + +#elif defined( _MSC_VER ) && defined( _M_X64 ) + + #include <intrin.h> + + static inline void kh_m128( const uint64_t m1, const uint64_t m2, + uint64_t* const rl, uint64_t* const rh ) + { + *rl = _umul128( m1, m2, rh ); + } + +#else // defined( _MSC_VER ) + + // _umul128() code for 32-bit systems, adapted from mullu(), + // from https://go.dev/src/runtime/softfloat64.go + // Licensed under BSD-style license. + + static inline uint64_t kh__emulu( const uint32_t x, const uint32_t y ) + { + return( x * (uint64_t) y ); + } + + static inline void kh_m128( const uint64_t u, const uint64_t v, + uint64_t* const rl, uint64_t* const rh ) + { + *rl = u * v; + + const uint32_t u0 = (uint32_t) u; + const uint32_t v0 = (uint32_t) v; + const uint64_t w0 = kh__emulu( u0, v0 ); + const uint32_t u1 = (uint32_t) ( u >> 32 ); + const uint32_t v1 = (uint32_t) ( v >> 32 ); + const uint64_t t = kh__emulu( u1, v0 ) + ( w0 >> 32 ); + const uint64_t w1 = (uint32_t) t + kh__emulu( u0, v1 ); + + *rh = kh__emulu( u1, v1 ) + ( w1 >> 32 ) + ( t >> 32 ); + } + +#endif // defined( _MSC_VER ) + +// Common hashing round with 16-byte input, using the "r1l" and "r1h" +// temporary variables. + +#define KOMIHASH_HASH16( m ) \ + kh_m128( Seed1 ^ kh_lu64ec( m ), \ + Seed5 ^ kh_lu64ec( m + 8 ), &r1l, &r1h ); \ + Seed5 += r1h; \ + Seed1 = Seed5 ^ r1l; + +// Common hashing round without input, using the "r2l" and "r2h" temporary +// variables. + +#define KOMIHASH_HASHROUND() \ + kh_m128( Seed1, Seed5, &r2l, &r2h ); \ + Seed5 += r2h; \ + Seed1 = Seed5 ^ r2l; + +// Common hashing finalization round, with the final hashing input expected in +// the "r2l" and "r2h" temporary variables. 
+ +#define KOMIHASH_HASHFIN() \ + kh_m128( r2l, r2h, &r1l, &r1h ); \ + Seed5 += r1h; \ + Seed1 = Seed5 ^ r1l; \ + KOMIHASH_HASHROUND(); + +/** + * KOMIHASH hash function. Produces and returns a 64-bit hash value of the + * specified message, string, or binary data block. Designed for 64-bit + * hash-table and hash-map uses. Produces identical hashes on both big- and + * little-endian systems. + * + * @param Msg0 The message to produce a hash from. The alignment of this + * pointer is unimportant. + * @param MsgLen Message's length, in bytes. + * @param UseSeed Optional value, to use instead of the default seed. To use + * the default seed, set to 0. The UseSeed value can have any bit length and + * statistical quality, and is used only as an additional entropy source. May + * need endianness-correction if this value is shared between big- and + * little-endian systems. + */ + +static inline uint64_t komihash( const void* const Msg0, size_t MsgLen, + const uint64_t UseSeed ) +{ + const uint8_t* Msg = (const uint8_t*) Msg0; + + // The seeds are initialized to the first mantissa bits of PI. + + uint64_t Seed1 = 0x243F6A8885A308D3 ^ ( UseSeed & 0x5555555555555555 ); + uint64_t Seed5 = 0x452821E638D01377 ^ ( UseSeed & 0xAAAAAAAAAAAAAAAA ); + uint64_t r1l, r1h, r2l, r2h; + + // The three instructions in the "KOMIHASH_HASHROUND" macro represent the + // simplest constant-less PRNG, scalable to any even-sized state + // variables, with the `Seed1` being the PRNG output (2^64 PRNG period). + // It passes `PractRand` tests with rare non-systematic "unusual" + // evaluations. + // + // To make this PRNG reliable, self-starting, and eliminate a risk of + // stopping, the following variant can be used, which is a "register + // checker-board", a source of raw entropy. The PRNG is available as the + // komirand() function. Not required for hashing (but works for it) since + // the input entropy is usually available in abundance during hashing. 
+ // + // Seed5 += r2h + 0xAAAAAAAAAAAAAAAA; + // + // (the `0xAAAA...` constant should match register's size; essentially, + // it is a replication of the `10` bit-pair; it is not an arbitrary + // constant). + + KOMIHASH_HASHROUND(); // Required for PerlinNoise. + + if( KOMIHASH_LIKELY( MsgLen < 16 )) + { + KOMIHASH_PREFETCH( Msg ); + + r2l = Seed1; + r2h = Seed5; + + if( MsgLen > 7 ) + { + // The following two XOR instructions are equivalent to mixing a + // message with a cryptographic one-time-pad (bitwise modulo 2 + // addition). Message's statistics and distribution are thus + // unimportant. + + r2h ^= kh_lpu64ec_l3( Msg + 8, MsgLen - 8, + 1 << ( Msg[ MsgLen - 1 ] >> 7 )); + + r2l ^= kh_lu64ec( Msg ); + } + else + if( KOMIHASH_LIKELY( MsgLen != 0 )) + { + r2l ^= kh_lpu64ec_nz( Msg, MsgLen, + 1 << ( Msg[ MsgLen - 1 ] >> 7 )); + } + + KOMIHASH_HASHFIN(); + + return( Seed1 ); + } + + if( KOMIHASH_LIKELY( MsgLen < 32 )) + { + KOMIHASH_PREFETCH( Msg ); + + KOMIHASH_HASH16( Msg ); + + const uint64_t fb = 1 << ( Msg[ MsgLen - 1 ] >> 7 ); + + if( MsgLen > 23 ) + { + r2h = Seed5 ^ kh_lpu64ec_l4( Msg + 24, MsgLen - 24, fb ); + r2l = Seed1 ^ kh_lu64ec( Msg + 16 ); + } + else + { + r2l = Seed1 ^ kh_lpu64ec_l4( Msg + 16, MsgLen - 16, fb ); + r2h = Seed5; + } + + KOMIHASH_HASHFIN(); + + return( Seed1 ); + } + + if( MsgLen > 63 ) + { + uint64_t Seed2 = 0x13198A2E03707344 ^ Seed1; + uint64_t Seed3 = 0xA4093822299F31D0 ^ Seed1; + uint64_t Seed4 = 0x082EFA98EC4E6C89 ^ Seed1; + uint64_t Seed6 = 0xBE5466CF34E90C6C ^ Seed5; + uint64_t Seed7 = 0xC0AC29B7C97C50DD ^ Seed5; + uint64_t Seed8 = 0x3F84D5B5B5470917 ^ Seed5; + uint64_t r3l, r3h, r4l, r4h; + + do + { + KOMIHASH_PREFETCH( Msg ); + + kh_m128( Seed1 ^ kh_lu64ec( Msg ), + Seed5 ^ kh_lu64ec( Msg + 8 ), &r1l, &r1h ); + + kh_m128( Seed2 ^ kh_lu64ec( Msg + 16 ), + Seed6 ^ kh_lu64ec( Msg + 24 ), &r2l, &r2h ); + + kh_m128( Seed3 ^ kh_lu64ec( Msg + 32 ), + Seed7 ^ kh_lu64ec( Msg + 40 ), &r3l, &r3h ); + + kh_m128( Seed4 ^ kh_lu64ec( 
Msg + 48 ), + Seed8 ^ kh_lu64ec( Msg + 56 ), &r4l, &r4h ); + + Msg += 64; + MsgLen -= 64; + + // Such "shifting" arrangement (below) does not increase + // individual SeedN's PRNG period beyond 2^64, but reduces a + // chance of any occassional synchronization between PRNG lanes + // happening. Practically, Seed1-4 together become a single + // "fused" 256-bit PRNG value, having a summary PRNG period of + // 2^66. + + Seed5 += r1h; + Seed6 += r2h; + Seed7 += r3h; + Seed8 += r4h; + Seed2 = Seed5 ^ r2l; + Seed3 = Seed6 ^ r3l; + Seed4 = Seed7 ^ r4l; + Seed1 = Seed8 ^ r1l; + + } while( KOMIHASH_LIKELY( MsgLen > 63 )); + + Seed5 ^= Seed6 ^ Seed7 ^ Seed8; + Seed1 ^= Seed2 ^ Seed3 ^ Seed4; + } + + KOMIHASH_PREFETCH( Msg ); + + if( KOMIHASH_LIKELY( MsgLen > 31 )) + { + KOMIHASH_HASH16( Msg ); + KOMIHASH_HASH16( Msg + 16 ); + + Msg += 32; + MsgLen -= 32; + } + + if( MsgLen > 15 ) + { + KOMIHASH_HASH16( Msg ); + + Msg += 16; + MsgLen -= 16; + } + + const uint64_t fb = 1 << ( Msg[ MsgLen - 1 ] >> 7 ); + + if( MsgLen > 7 ) + { + r2h = Seed5 ^ kh_lpu64ec_l4( Msg + 8, MsgLen - 8, fb ); + r2l = Seed1 ^ kh_lu64ec( Msg ); + } + else + { + r2l = Seed1 ^ kh_lpu64ec_l4( Msg, MsgLen, fb ); + r2h = Seed5; + } + + KOMIHASH_HASHFIN(); + + return( Seed1 ); +} + +/** + * Simple, reliable, self-starting yet efficient PRNG, with 2^64 period. + * 0.62 cycles/byte performance. Self-starts in 4 iterations, which is a + * suggested "warming up" initialization before using its output. + * + * @param[in,out] Seed1 Seed value 1. Can be initialized to any value + * (even 0). This is the usual "PRNG seed" value. + * @param[in,out] Seed2 Seed value 2, a supporting variable. Best initialized + * to the same value as Seed1. + * @return The next uniformly-random 64-bit value. 
+ */ + +static inline uint64_t komirand( uint64_t* const Seed1, uint64_t* const Seed2 ) +{ + uint64_t r1l, r1h; + + kh_m128( *Seed1, *Seed2, &r1l, &r1h ); + *Seed2 += r1h + 0xAAAAAAAAAAAAAAAA; + *Seed1 = *Seed2 ^ r1l; + + return( *Seed1 ); +} + +#pragma GCC diagnostic pop + +#endif // KOMIHASH_INCLUDED diff --git a/src/include/common/pagefeat.h b/src/include/common/pagefeat.h index cf0f04ecdb..c286062af6 100644 --- a/src/include/common/pagefeat.h +++ b/src/include/common/pagefeat.h @@ -16,6 +16,7 @@ /* revealed for GUCs */ extern int reserved_page_size; +extern bool page_feature_extended_checksums; /* forward declaration to avoid circular includes */ typedef Pointer Page; @@ -27,6 +28,7 @@ extern PageFeatureSet cluster_page_features; /* bit offset for features flags */ typedef enum { + PF_EXT_CHECKSUMS = 0, /* must be first */ PF_MAX_FEATURE /* must be last */ } PageFeature; diff --git a/src/include/storage/checksum.h b/src/include/storage/checksum.h index 1904fabd5a..4c14b89a29 100644 --- a/src/include/storage/checksum.h +++ b/src/include/storage/checksum.h @@ -20,5 +20,10 @@ * 4-byte boundary. 
*/ extern uint16 pg_checksum_page(char *page, BlockNumber blkno); +extern uint32 pg_checksum32_page(char *page, BlockNumber blkno, char*offset); +extern uint64 pg_checksum64_page(char *page, BlockNumber blkno, uint64*offset); +extern uint64 pg_checksum56_page(char *page, BlockNumber blkno, uint64*offset); +extern void pg_set_checksum56_page(char *page, uint64 checksum, uint64 *cksumloc); +extern uint64 pg_get_checksum56_page(char *page, uint64 *cksumloc); #endif /* CHECKSUM_H */ diff --git a/src/include/storage/checksum_impl.h b/src/include/storage/checksum_impl.h index d2eb75f769..5c6f549b5f 100644 --- a/src/include/storage/checksum_impl.h +++ b/src/include/storage/checksum_impl.h @@ -101,6 +101,7 @@ */ #include "storage/bufpage.h" +#include "common/komihash.h" /* number of checksums to calculate in parallel */ #define N_SUMS 32 @@ -213,3 +214,200 @@ pg_checksum_page(char *page, BlockNumber blkno) */ return (uint16) ((checksum % 65535) + 1); } + + +/* + * Compute and return a 32-bit checksum for a Postgres page. + * + * Beware that the 16-bit portion of the page that cksum points to is + * transiently zeroed, as is the pd_checksum field. The storage location for + * this is determined by the PageFeatures in play for the cluster, so the + * caller supplies it via cksum. + * + * The checksum includes the block number (to detect the case where a page is + * somehow moved to a different location), the page header (excluding the + * checksum itself), and the page data. + * + * The high bits of this are stored in the overflow storage area of the page + * pointed to by *cksum, leaving the pd_checksum field with the same checksum + * you'd expect if running the pg_checksum_page function. 
+ */ +uint32 +pg_checksum32_page(char *page, BlockNumber blkno, char *cksum) +{ + PGChecksummablePage *cpage = (PGChecksummablePage *) page; + uint16 save_pd,save_ext,*ptr; + uint32 checksum; + + /* We only calculate the checksum for properly-initialized pages */ + Assert(!PageIsNew((Page) page)); + /* Ensure that the cksum pointer is in the page range on this page */ + Assert(cksum >= page && cksum <= (page + BLCKSZ - sizeof(uint16))); + + ptr = (uint16*)cksum; + + /* + * Save the existing checksum locations and temporarily set it to zero, so + * that the checksum calculation isn't affected by the old checksum stored + * on the page. Restore it after, because actually updating the checksum + * is NOT part of the API of this function. + */ + + save_ext = *ptr; + save_pd = cpage->phdr.pd_checksum; + *ptr = 0; + cpage->phdr.pd_checksum = 0; + + checksum = pg_checksum_block(cpage); + + /* restore */ + *ptr = save_ext; + cpage->phdr.pd_checksum = save_pd; + + /* Mix in the block number to detect transposed pages */ + checksum ^= blkno; + + /* ensure we have non-zero return value here; this does double-up on our + * coset for group 1 here, but it's a nice property to preserve */ + return (checksum == 0 ? 1 : checksum); +} + +/* + * 64-bit block checksum algorithm. The page must be adequately aligned + * (at least on 4-byte boundary). + */ + +static uint64 +pg_checksum64_block(const PGChecksummablePage *page) +{ + /* ensure that the size is compatible with the algorithm */ + Assert(sizeof(PGChecksummablePage) == BLCKSZ); + + return (uint64)komihash(page, BLCKSZ, 0); +} + +/* temporary struct for ease of accessing memory */ +typedef union { + uint64 u64; + uint8 bytes[8]; +} Checksum56; + +StaticAssertDecl(sizeof(Checksum56) == sizeof(uint64), "Can't make combined checksum56 struct"); + +/* + * Compute and return a 64-bit checksum for a Postgres page. + * + * Beware that the 64-bit portion of the page that cksum points to is + * transiently zeroed, though it is restored. 
+ *
+ * The checksum includes the block number (to detect the case where a page is
+ * somehow moved to a different location), the page header (excluding the
+ * checksum itself), and the page data.
+ */
+uint64
+pg_checksum64_page(char *page, BlockNumber blkno, uint64 *cksumloc)
+{
+	PGChecksummablePage *cpage = (PGChecksummablePage *) page;
+	uint64 saved;
+	uint64 checksum;
+
+	/* We only calculate the checksum for properly-initialized pages */
+	Assert(!PageIsNew((Page) page));
+	/* Ensure that the uint64 at cksumloc lies entirely within this page */
+	Assert((char*)cksumloc >= page && (char*)cksumloc <= (page + BLCKSZ - sizeof(uint64)));
+
+	saved = *cksumloc;
+	*cksumloc = 0;
+
+	checksum = pg_checksum64_block(cpage);
+
+	/* restore the word that was zeroed for the hash computation */
+	*cksumloc = saved;
+
+	/* Mix in the block number to detect transposed pages */
+	checksum ^= blkno;
+
+	/* In the extremely unlikely case that the result is zero, return 1
+	 * instead so the return value is always non-zero; two computed values
+	 * then share the result 1, but non-zero is a nice property to preserve */
+	return (checksum == 0 ? 1 : checksum);
+}
+
+/*
+ * Compute and return a 56-bit checksum for a Postgres page.
+ *
+ * Beware that the 7 checksum bytes of the word that cksumloc points to are
+ * transiently zeroed, though they are restored.  The eighth byte in memory
+ * (bytes[7]) is not checksum storage, so it stays on the page and is hashed.
+ *
+ * The checksum includes the block number (to detect the case where a page is
+ * somehow moved to a different location), the page header (excluding the
+ * checksum itself), and the page data.
+ *
+ */
+uint64
+pg_checksum56_page(char *page, BlockNumber blkno, uint64 *cksumloc)
+{
+	PGChecksummablePage *cpage = (PGChecksummablePage *) page;
+	Checksum56 saved, checksum;
+
+	/* We only calculate the checksum for properly-initialized pages */
+	Assert(!PageIsNew((Page) page));
+	/* Ensure that the uint64 at cksumloc lies entirely within this page */
+	Assert((char*)cksumloc >= page && (char*)cksumloc <= (page + BLCKSZ - sizeof(uint64)));
+
+	saved = *(Checksum56*)cksumloc;
+	((Checksum56*)cksumloc)->u64 = 0;
+	((Checksum56*)cksumloc)->bytes[7] = saved.bytes[7];
+
+	checksum.u64 = pg_checksum64_block(cpage);
+	checksum.bytes[7] = saved.bytes[7];
+	/* put the original word back; bytes[7] was left as it was throughout */
+	*cksumloc = saved.u64;
+
+	/* Mix in the block number, widened first so the shift keeps its top 8 bits */
+	checksum.u64 ^= ((uint64) blkno) << 8;
+
+	/* NOTE(review): a zero result is not remapped to 1 here, unlike the 32/64-bit variants */
+	return checksum.u64;
+}
+
+/*
+ * Set a 56-bit checksum onto a Postgres page.
+ *
+ * Given a uint64*, store the checksum value into it while leaving the word's
+ * original last byte in memory (bytes[7]) in place.
+ */
+void
+pg_set_checksum56_page(char *page, uint64 checksum, uint64 *cksumloc)
+{
+	uint8 byte;
+	/* Can only set the checksum for properly-initialized pages */
+	Assert(!PageIsNew((Page) page));
+
+	/* Ensure that the uint64 at cksumloc lies entirely within this page */
+	Assert((char*)cksumloc >= page && (char*)cksumloc <= (page + BLCKSZ - sizeof(uint64)));
+
+	/* preserve the page's existing bytes[7]; only the other 7 bytes change */
+	byte = ((Checksum56*)cksumloc)->bytes[7];
+	((Checksum56*)cksumloc)->u64 = checksum;
+	((Checksum56*)cksumloc)->bytes[7] = byte;
+}
+
+/*
+ * Get the 56-bit checksum from a Postgres page, given a pointer to the
+ * containing uint64.  The whole word, including bytes[7], is returned.
+ */
+uint64
+pg_get_checksum56_page(char *page, uint64 *cksumloc)
+{
+	/* Can only read the checksum of properly-initialized pages */
+	Assert(!PageIsNew((Page) page));
+
+	/* Ensure that cksumloc lies within this page and is maximally aligned */
+	Assert((char*)cksumloc >= page && (char*)cksumloc <= (page + BLCKSZ - sizeof(uint64)));
+	Assert(MAXALIGN((uintptr_t) cksumloc) == (uintptr_t) cksumloc);
+	/* hand back the stored 64-bit word unmodified (all 8 bytes) */
+	return *cksumloc;
+}
+
-- 
2.31.1