*** a/src/backend/storage/page/Makefile --- b/src/backend/storage/page/Makefile *************** *** 12,17 **** subdir = src/backend/storage/page top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global ! OBJS = bufpage.o itemptr.o include $(top_srcdir)/src/backend/common.mk --- 12,17 ---- top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global ! OBJS = bufpage.o checksum.o itemptr.o include $(top_srcdir)/src/backend/common.mk *** a/src/backend/storage/page/bufpage.c --- b/src/backend/storage/page/bufpage.c *************** *** 16,21 **** --- 16,22 ---- #include "access/htup_details.h" #include "access/xlog.h" + #include "storage/checksum.h" bool ignore_checksum_failure = false; *************** *** 948,980 **** PageSetChecksumInplace(Page page, BlockNumber blkno) static uint16 PageCalcChecksum16(Page page, BlockNumber blkno) { ! pg_crc32 crc; ! PageHeader p = (PageHeader) page; /* only calculate the checksum for properly-initialized pages */ Assert(!PageIsNew(page)); - INIT_CRC32(crc); - /* ! * Initialize the checksum calculation with the block number. This helps ! * catch corruption from whole blocks being transposed with other whole ! * blocks. */ ! COMP_CRC32(crc, &blkno, sizeof(blkno)); ! /* ! * Now add in the LSN, which is always the first field on the page. ! */ ! COMP_CRC32(crc, page, sizeof(p->pd_lsn)); /* ! * Now add the rest of the page, skipping the pd_checksum field. */ ! COMP_CRC32(crc, page + sizeof(p->pd_lsn) + sizeof(p->pd_checksum), ! BLCKSZ - sizeof(p->pd_lsn) - sizeof(p->pd_checksum)); ! ! FIN_CRC32(crc); ! ! return (uint16) crc; } --- 949,976 ---- static uint16 PageCalcChecksum16(Page page, BlockNumber blkno) { ! PageHeader phdr = (PageHeader) page; ! uint16 save_checksum; ! uint32 checksum; /* only calculate the checksum for properly-initialized pages */ Assert(!PageIsNew(page)); /* ! * Save pd_checksum and set it to zero, so that the checksum calculation ! * isn't affected by the checksum stored on the page. */ ! 
save_checksum = phdr->pd_checksum; ! phdr->pd_checksum = 0; ! checksum = checksum_block(page, BLCKSZ); ! phdr->pd_checksum = save_checksum; ! /* mix in the block number to detect transposed pages */ ! checksum ^= blkno; /* ! * Reduce to a uint16 (to fit in the pd_checksum field) with an offset of ! * one. That avoids checksums of zero, which seems like a good idea. */ ! return (checksum % 65535) + 1; } *** /dev/null --- b/src/backend/storage/page/checksum.c *************** *** 0 **** --- 1,121 ---- + /*------------------------------------------------------------------------- + * + * checksum.c + * Checksum implementation for data pages. + * + * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/page/checksum.c + * + *------------------------------------------------------------------------- + * + * Checksum algorithm + * + * The algorithm used to checksum pages is chosen for very fast calculation. + * Workloads where the database working set fits into OS file cache but not + * into shared buffers can read in pages at a very fast pace and the checksum + * algorithm itself can become the largest bottleneck. + * + * The checksum algorithm itself is based on the FNV-1a hash (FNV is shorthand + * for Fowler/Noll/Vo). The primitive of a plain FNV-1a hash folds in data 4 + * bytes at a time according to the formula: + * + * hash = (hash ^ value) * FNV_PRIME + * + * PostgreSQL doesn't use FNV-1a hash directly because it has bad mixing of + * high bits - high order bits in input data only affect high order bits in + * output data. To resolve this we xor in the value prior to multiplication + * shifted right by 3 bits. The number 3 was chosen as it is a small, odd + * prime, and experimentally provides enough mixing for the high order bits to + * avalanche into lower positions. 
The actual hash formula used as the basis + * is: + * + * hash = (hash ^ value) * FNV_PRIME ^ ((hash ^ value) >> 3) + * + * The main bottleneck in this calculation is the multiplication latency. To + * hide the latency and to make use of SIMD parallelism, multiple hash values + * are calculated in parallel. Each hash function uses a different initial + * value (offset basis in FNV terminology). The initial values actually used + * were chosen randomly, as the values themselves don't matter as much as that + * they are different and don't match anything in real data. The page is then + * treated as a 32-wide array of 32-bit values and each column is aggregated + * according to the above formula. Finally one more iteration of the formula is + * performed with value 0 to mix the bits of the last value added. + * + * The partial checksums are then aggregated together using xor to form a + * 32-bit checksum. The caller can safely reduce the value to 16 bits + * using modulo 2^16-1. That will cause a very slight bias towards lower + * values but this is not significant for the performance of the + * checksum. + * + * Vectorization of the algorithm requires a 32bit x 32bit -> 32bit integer + * multiplication instruction. As of 2013 the corresponding instruction is + * available on x86 SSE4.1 extensions (pmulld) and ARM NEON (vmul.i32). + * Vectorization requires a compiler to do the vectorization for us. For recent + * GCC versions the flags -msse4.1 -funroll-loops -ftree-vectorize are enough + * to achieve vectorization. + */ + #include "postgres.h" + + #include "storage/checksum.h" + + /* number of checksums to calculate in parallel */ + #define N_SUMS 32 + /* prime multiplier of FNV-1a hash */ + #define FNV_PRIME 16777619 + + /* + * Base offsets to initialize each of the parallel FNV hashes into a + * different initial state. 
+ */ + static const uint32 checksumBaseOffsets[N_SUMS] = { + 0x5B1F36E9, 0xB8525960, 0x02AB50AA, 0x1DE66D2A, + 0x79FF467A, 0x9BB9F8A3, 0x217E7CD2, 0x83E13D2C, + 0xF8D4474F, 0xE39EB970, 0x42C6AE16, 0x993216FA, + 0x7B093B5D, 0x98DAFF3C, 0xF718902A, 0x0B1C9CDB, + 0xE58F764B, 0x187636BC, 0x5D7B3BB1, 0xE73DE7DE, + 0x92BEC979, 0xCCA6C0B2, 0x304A0979, 0x85AA43D4, + 0x783125BB, 0x6CA8EAA2, 0xE407EAC6, 0x4B5CFC3E, + 0x9FBF8C76, 0x15CA20BE, 0xF2CA9FD3, 0x959BD756 + }; + + /* + * Calculate one round of the checksum, folding value into checksum in place. + */ + #define CHECKSUM_COMP(checksum, value) do {\ + uint32 __tmp = (checksum) ^ (value);\ + (checksum) = __tmp * FNV_PRIME ^ (__tmp >> 3);\ + } while (0) + + uint32 + checksum_block(char *data, uint32 size) + { + uint32 sums[N_SUMS]; + uint32 (*dataArr)[N_SUMS] = (uint32 (*)[N_SUMS]) data; + uint32 result = 0; + int i, j; + + /* ensure that the size is compatible with the algorithm */ + Assert((size % (sizeof(uint32)*N_SUMS)) == 0); + + /* initialize partial checksums to their corresponding offsets */ + memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets)); + + /* main checksum calculation */ + for (i = 0; i < size/sizeof(uint32)/N_SUMS; i++) + for (j = 0; j < N_SUMS; j++) + CHECKSUM_COMP(sums[j], dataArr[i][j]); + + /* finally add in one round of zeroes for one more layer of mixing */ + for (j = 0; j < N_SUMS; j++) + CHECKSUM_COMP(sums[j], 0); + + /* xor fold partial checksums together */ + for (i = 0; i < N_SUMS; i++) + result ^= sums[i]; + + return result; + } *** /dev/null --- b/src/include/storage/checksum.h *************** *** 0 **** --- 1,23 ---- + /*------------------------------------------------------------------------- + * + * checksum.h + * Checksum implementation for data pages. 
+ * + * + * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/checksum.h + * + *------------------------------------------------------------------------- + */ + #ifndef CHECKSUM_H + #define CHECKSUM_H + + /* + * Fowler-Noll-Vo 1a block checksum algorithm. The data argument should be + * aligned on a 4-byte boundary, and size must be a multiple of 128 bytes. + */ + extern uint32 checksum_block(char *data, uint32 size); + + #endif /* CHECKSUM_H */