From 040c7082670d3f90069576a7438c61971c616141 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Tue, 25 Nov 2014 14:24:26 +0900 Subject: [PATCH 2/2] Support compression for full-page writes in WAL full_page_writes has a new setting value called 'compress' allowing a user to reduce the amount of data written to WAL for full-page writes. --- contrib/pg_xlogdump/pg_xlogdump.c | 5 +- doc/src/sgml/config.sgml | 41 +++++--- src/backend/access/rmgrdesc/xlogdesc.c | 8 +- src/backend/access/transam/xlog.c | 56 ++++++---- src/backend/access/transam/xloginsert.c | 145 +++++++++++++++++++++----- src/backend/access/transam/xlogreader.c | 39 ++++++- src/backend/utils/misc/guc.c | 44 +++++--- src/backend/utils/misc/postgresql.conf.sample | 3 +- src/bin/pg_controldata/pg_controldata.c | 2 +- src/bin/pg_resetxlog/pg_resetxlog.c | 4 +- src/include/access/xlog.h | 22 +++- src/include/access/xlogreader.h | 1 + src/include/access/xlogrecord.h | 10 +- src/include/catalog/pg_control.h | 2 +- 14 files changed, 290 insertions(+), 92 deletions(-) diff --git a/contrib/pg_xlogdump/pg_xlogdump.c b/contrib/pg_xlogdump/pg_xlogdump.c index 26556dc..ad335e7 100644 --- a/contrib/pg_xlogdump/pg_xlogdump.c +++ b/contrib/pg_xlogdump/pg_xlogdump.c @@ -450,9 +450,10 @@ XLogDumpDisplayRecord(XLogDumpConfig *config, XLogReaderState *record) blk); if (XLogRecHasBlockImage(record, block_id)) { - printf(" (FPW); hole: offset: %u, length: %u\n", + printf(" (FPW); hole: offset: %u, length: %u, compressed: %u\n", record->blocks[block_id].hole_offset, - record->blocks[block_id].hole_length); + record->blocks[block_id].hole_length, + record->blocks[block_id].compress_len); } putchar('\n'); } diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index ab8c263..ee490bf 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -2181,28 +2181,32 @@ include_dir 'conf.d' - full_page_writes (boolean) + full_page_writes (enum) full_page_writes configuration parameter - When this parameter is 
on, the PostgreSQL server - writes the entire content of each disk page to WAL during the - first modification of that page after a checkpoint. - This is needed because - a page write that is in process during an operating system crash might - be only partially completed, leading to an on-disk page - that contains a mix of old and new data. The row-level change data - normally stored in WAL will not be enough to completely restore - such a page during post-crash recovery. Storing the full page image - guarantees that the page can be correctly restored, but at the price - of increasing the amount of data that must be written to WAL. - (Because WAL replay always starts from a checkpoint, it is sufficient - to do this during the first change of each page after a checkpoint. - Therefore, one way to reduce the cost of full-page writes is to - increase the checkpoint interval parameters.) + When this parameter is on or compress, + the PostgreSQL server writes the entire content + of each disk page to WAL during the first modification of that + page after a checkpoint. This is needed because a page write that + is in process during an operating system crash might be only partially + completed, leading to an on-disk page that contains a mix of old and + new data. The row-level change data normally stored in WAL will not + be enough to completely restore such a page during post-crash + recovery. Storing the full page image guarantees that the page can + be correctly restored, but at the price of increasing the amount of + data that must be written to WAL. (Because WAL replay always starts + from a checkpoint, it is sufficient to do this during the first change + of each page after a checkpoint. Therefore, one way to reduce the cost + of full-page writes is to increase the checkpoint interval parameters.) + + + + Valid values are on, compress, and + off. The default is on. 
@@ -2220,6 +2224,11 @@ include_dir 'conf.d' + Setting this parameter to compress compresses + the full page image to reduce the amount of WAL data. + + + This parameter can only be set in the postgresql.conf file or on the server command line. The default is on. diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c index eba046d..918cc5a 100644 --- a/src/backend/access/rmgrdesc/xlogdesc.c +++ b/src/backend/access/rmgrdesc/xlogdesc.c @@ -49,7 +49,7 @@ xlog_desc(StringInfo buf, XLogReaderState *record) (uint32) (checkpoint->redo >> 32), (uint32) checkpoint->redo, checkpoint->ThisTimeLineID, checkpoint->PrevTimeLineID, - checkpoint->fullPageWrites ? "true" : "false", + FullPageWritesStr(checkpoint->fullPageWrites), checkpoint->nextXidEpoch, checkpoint->nextXid, checkpoint->nextOid, checkpoint->nextMulti, @@ -114,10 +114,10 @@ xlog_desc(StringInfo buf, XLogReaderState *record) } else if (info == XLOG_FPW_CHANGE) { - bool fpw; + int fpw; - memcpy(&fpw, rec, sizeof(bool)); - appendStringInfo(buf, "%s", fpw ? "true" : "false"); + memcpy(&fpw, rec, sizeof(int)); + appendStringInfo(buf, "fpw: %s", FullPageWritesStr(fpw)); } else if (info == XLOG_END_OF_RECOVERY) { diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 0f661f5..0f0f1ca 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -85,7 +85,7 @@ int XLogArchiveTimeout = 0; bool XLogArchiveMode = false; char *XLogArchiveCommand = NULL; bool EnableHotStandby = false; -bool fullPageWrites = true; +int fullPageWrites = FULL_PAGE_WRITES_ON; bool wal_log_hints = false; bool log_checkpoints = false; int sync_method = DEFAULT_SYNC_METHOD; @@ -179,7 +179,7 @@ static TimeLineID receiveTLI = 0; * that the recovery starting checkpoint record indicates, and then updated * each time XLOG_FPW_CHANGE record is replayed. 
  */
-static bool lastFullPageWrites;
+static int	lastFullPageWrites;
 
 /*
  * Local copy of SharedRecoveryInProgress variable. True actually means "not
@@ -316,6 +316,13 @@ static XLogRecPtr RedoRecPtr;
 static bool doPageWrites;
 
 /*
+ * doPageCompression is this backend's local copy of
+ * (fullPageWrites == FULL_PAGE_WRITES_COMPRESS). It is used to check if
+ * a full page write can be compressed.
+ */
+static int doPageCompression;
+
+/*
  * RedoStartLSN points to the checkpoint's REDO location which is specified
  * in a backup label file, backup history file or control file. In standby
  * mode, XLOG streaming usually starts from the position where an invalid
@@ -464,7 +471,7 @@ typedef struct XLogCtlInsert
 	 */
 	XLogRecPtr	RedoRecPtr;		/* current redo point for insertions */
 	bool		forcePageWrites;	/* forcing full-page writes for PITR? */
-	bool		fullPageWrites;
+	int			fullPageWrites;
 
 	/*
 	 * exclusiveBackup is true if a backup started with pg_start_backup() is
@@ -915,10 +922,11 @@ XLogInsertRecord(XLogRecData *rdata, XLogRecPtr fpw_lsn)
 	WALInsertLockAcquire();
 
 	/*
-	 * Check to see if my copy of RedoRecPtr or doPageWrites is out of date.
-	 * If so, may have to go back and have the caller recompute everything.
-	 * This can only happen just after a checkpoint, so it's better to be
-	 * slow in this case and fast otherwise.
+	 * Check to see if my copy of RedoRecPtr, doPageWrites or
+	 * doPageCompression is out of date. If so, may have to go back and
+	 * have the caller recompute everything. This can only happen just
+	 * after a checkpoint, so it's better to be slow in this case and
+	 * fast otherwise.
* * If we aren't doing full-page writes then RedoRecPtr doesn't actually * affect the contents of the XLOG record, so we'll update our local copy @@ -932,6 +940,7 @@ XLogInsertRecord(XLogRecData *rdata, XLogRecPtr fpw_lsn) RedoRecPtr = Insert->RedoRecPtr; } doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites); + doPageCompression = (Insert->fullPageWrites == FULL_PAGE_WRITES_COMPRESS); if (fpw_lsn != InvalidXLogRecPtr && fpw_lsn <= RedoRecPtr && doPageWrites) { @@ -5990,6 +5999,7 @@ StartupXLOG(void) RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo; doPageWrites = lastFullPageWrites; + doPageCompression = (lastFullPageWrites == FULL_PAGE_WRITES_COMPRESS); if (RecPtr < checkPoint.redo) ereport(PANIC, @@ -7308,17 +7318,23 @@ GetRedoRecPtr(void) /* * Return information needed to decide whether a modified block needs a - * full-page image to be included in the WAL record. + * full-page image to be included in the WAL record, compressed or not. * * The returned values are cached copies from backend-private memory, and * possibly out-of-date. XLogInsertRecord will re-check them against * up-to-date values, while holding the WAL insert lock. */ void -GetFullPageWriteInfo(XLogRecPtr *RedoRecPtr_p, bool *doPageWrites_p) -{ - *RedoRecPtr_p = RedoRecPtr; - *doPageWrites_p = doPageWrites; +GetFullPageWriteInfo(XLogRecPtr *RedoRecPtr_p, + bool *doPageWrites_p, + bool *doPageCompression_p) +{ + if (RedoRecPtr_p) + *RedoRecPtr_p = RedoRecPtr; + if (doPageWrites_p) + *doPageWrites_p = doPageWrites; + if (doPageCompression_p) + *doPageCompression_p = doPageCompression; } /* @@ -8534,10 +8550,10 @@ UpdateFullPageWrites(void) * setting it to false, first write the WAL record and then set the global * flag. 
*/ - if (fullPageWrites) + if (fullPageWrites != FULL_PAGE_WRITES_OFF) { WALInsertLockAcquireExclusive(); - Insert->fullPageWrites = true; + Insert->fullPageWrites = fullPageWrites; WALInsertLockRelease(); } @@ -8548,7 +8564,7 @@ UpdateFullPageWrites(void) if (XLogStandbyInfoActive() && !RecoveryInProgress()) { XLogBeginInsert(); - XLogRegisterData((char *) (&fullPageWrites), sizeof(bool)); + XLogRegisterData((char *) (&fullPageWrites), sizeof(int)); XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE); } @@ -8556,7 +8572,7 @@ UpdateFullPageWrites(void) if (!fullPageWrites) { WALInsertLockAcquireExclusive(); - Insert->fullPageWrites = false; + Insert->fullPageWrites = fullPageWrites; WALInsertLockRelease(); } END_CRIT_SECTION(); @@ -8900,16 +8916,16 @@ xlog_redo(XLogReaderState *record) } else if (info == XLOG_FPW_CHANGE) { - bool fpw; + int fpw; - memcpy(&fpw, XLogRecGetData(record), sizeof(bool)); + memcpy(&fpw, XLogRecGetData(record), sizeof(int)); /* * Update the LSN of the last replayed XLOG_FPW_CHANGE record so that * do_pg_start_backup() and do_pg_stop_backup() can check whether * full_page_writes has been disabled during online backup. 
*/ - if (!fpw) + if (fpw == FULL_PAGE_WRITES_OFF) { SpinLockAcquire(&XLogCtl->info_lck); if (XLogCtl->lastFpwDisableRecPtr < ReadRecPtr) @@ -9322,7 +9338,7 @@ do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p, recptr = XLogCtl->lastFpwDisableRecPtr; SpinLockRelease(&XLogCtl->info_lck); - if (!checkpointfpw || startpoint <= recptr) + if (checkpointfpw == FULL_PAGE_WRITES_OFF || startpoint <= recptr) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("WAL generated with full_page_writes=off was replayed " diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index 34e44e4..98c6534 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -27,6 +27,7 @@ #include "miscadmin.h" #include "storage/bufmgr.h" #include "storage/proc.h" +#include "utils/pg_lzcompress.h" #include "utils/memutils.h" #include "pg_trace.h" @@ -50,6 +51,8 @@ typedef struct XLogRecData bkp_rdatas[2]; /* temporary rdatas used to hold references to * backup block data in XLogRecordAssemble() */ + char *compressed_page; /* recipient for compressed page, NULL + * if compression is not activated */ } registered_buffer; static registered_buffer *registered_buffers; @@ -95,7 +98,10 @@ static MemoryContext xloginsert_cxt; static XLogRecData *XLogRecordAssemble(RmgrId rmid, uint8 info, XLogRecPtr RedoRecPtr, bool doPageWrites, - XLogRecPtr *fpw_lsn); + bool doPageCompression, XLogRecPtr *fpw_lsn); + +static bool XLogCompressBackupBlock(char *page, uint32 orig_len, + char *dest, uint16 *len); /* * Begin constructing a WAL record. 
 This must be called before the
@@ -150,6 +156,7 @@ XLogEnsureRecordSpace(int max_block_id, int ndatas)
 
 	if (nbuffers > max_registered_buffers)
 	{
+		int i;
 		registered_buffers = (registered_buffer *)
 			repalloc(registered_buffers, sizeof(registered_buffer) * nbuffers);
 
@@ -159,6 +166,9 @@ XLogEnsureRecordSpace(int max_block_id, int ndatas)
 		 */
 		MemSet(&registered_buffers[max_registered_buffers], 0,
 			   (nbuffers - max_registered_buffers) * sizeof(registered_buffer));
+		for (i = max_registered_buffers; i < nbuffers; i++)
+			registered_buffers[i].compressed_page = (char *)
+				MemoryContextAllocZero(xloginsert_cxt, PGLZ_MAX_OUTPUT(BLCKSZ));
 		max_registered_buffers = nbuffers;
 	}
 
@@ -409,18 +419,20 @@ XLogInsert(RmgrId rmid, uint8 info)
 	{
 		XLogRecPtr	RedoRecPtr;
 		bool		doPageWrites;
+		bool		doPageCompression;
 		XLogRecPtr	fpw_lsn;
 		XLogRecData *rdt;
 
 		/*
-		 * Get values needed to decide whether to do full-page writes. Since
-		 * we don't yet have an insertion lock, these could change under us,
-		 * but XLogInsertRecData will recheck them once it has a lock.
+		 * Get values needed to decide whether to do full-page writes and if yes
+		 * under what conditions. Since we don't yet have an insertion lock, these
+		 * could change under us, but XLogInsertRecord will recheck them once it
+		 * has a lock.
		 */
-		GetFullPageWriteInfo(&RedoRecPtr, &doPageWrites);
+		GetFullPageWriteInfo(&RedoRecPtr, &doPageWrites, &doPageCompression);
 
 		rdt = XLogRecordAssemble(rmid, info, RedoRecPtr, doPageWrites,
-								 &fpw_lsn);
+								 doPageCompression, &fpw_lsn);
 
 		EndPos = XLogInsertRecord(rdt, fpw_lsn);
 	} while (EndPos == InvalidXLogRecPtr);
@@ -445,7 +457,7 @@ XLogInsert(RmgrId rmid, uint8 info)
 static XLogRecData *
 XLogRecordAssemble(RmgrId rmid, uint8 info,
 				   XLogRecPtr RedoRecPtr, bool doPageWrites,
-				   XLogRecPtr *fpw_lsn)
+				   bool doPageCompression, XLogRecPtr *fpw_lsn)
 {
 	XLogRecData *rdt;
 	uint32		total_len = 0;
@@ -472,7 +484,11 @@ XLogRecordAssemble(RmgrId rmid, uint8 info,
 	/*
 	 * Make an rdata chain containing all the data portions of all block
 	 * references. This includes the data for full-page images. Also append
-	 * the headers for the block references in the scratch buffer.
+	 * the headers for the block references in the scratch buffer. If
+	 * compression of full-page writes is activated, save all the images in
+	 * the dedicated buffer and assemble the unique record made of the
+	 * compressed data once all the registered blocks are completely
+	 * scanned.
	 */
 	*fpw_lsn = InvalidXLogRecPtr;
 	for (block_id = 0; block_id < max_registered_block_id; block_id++)
@@ -529,6 +545,7 @@ XLogRecordAssemble(RmgrId rmid, uint8 info,
 		if (needs_backup)
 		{
 			Page		page = regbuf->page;
+			int			compression_done = false;
 
 			/*
 			 * The page needs to be backed up, so set up *bimg
@@ -563,29 +580,76 @@ XLogRecordAssemble(RmgrId rmid, uint8 info,
 			/* Fill in the remaining fields in the XLogRecordBlockData struct */
 			bkpb.fork_flags |= BKPBLOCK_HAS_IMAGE;
 
-			total_len += BLCKSZ - bimg.hole_length;
-
 			/*
-			 * Construct XLogRecData entries for the page content.
+			 * Construct XLogRecData entries for the page content. If page
+			 * compression is active, instead of creating a new entry, store
+			 * the data in a dedicated buffer to prepare for the compression.
+ * If page has a hole pre-cook for compression the */ - rdt_datas_last->next = ®buf->bkp_rdatas[0]; - rdt_datas_last = rdt_datas_last->next; - if (bimg.hole_length == 0) + if (doPageCompression) { - rdt_datas_last->data = page; - rdt_datas_last->len = BLCKSZ; + int page_len = BLCKSZ - bimg.hole_length; + char uncompressed_page[PGLZ_MAX_OUTPUT(BLCKSZ)]; + uint16 compression_len; + + /* shape block image for compression and skip hole if any */ + if (bimg.hole_length == 0) + memcpy(uncompressed_page, page, BLCKSZ); + else + { + /* Copy page content without hole */ + memcpy(uncompressed_page, page, bimg.hole_offset); + memcpy(uncompressed_page + bimg.hole_offset, + page + bimg.hole_offset + bimg.hole_length, + BLCKSZ - (bimg.hole_offset + bimg.hole_length)); + } + + /* Perform compression of block */ + if (XLogCompressBackupBlock(uncompressed_page, + page_len, + regbuf->compressed_page, + &compression_len)) + { + /* compression is done, add record */ + compression_done = true; + bimg.compress_len = compression_len; + + rdt_datas_last->next = ®buf->bkp_rdatas[0]; + rdt_datas_last = rdt_datas_last->next; + rdt_datas_last->data = regbuf->compressed_page; + rdt_datas_last->len = compression_len; + total_len += compression_len; + } } - else + + /* + * If compression has not been done store normally this + * block image. 
+ */ + if (!compression_done) { - /* must skip the hole */ - rdt_datas_last->data = page; - rdt_datas_last->len = bimg.hole_offset; + total_len += BLCKSZ - bimg.hole_length; - rdt_datas_last->next = ®buf->bkp_rdatas[1]; + rdt_datas_last->next = ®buf->bkp_rdatas[0]; rdt_datas_last = rdt_datas_last->next; + if (bimg.hole_length == 0) + { + rdt_datas_last->data = page; + rdt_datas_last->len = BLCKSZ; + } + else + { + /* must skip the hole */ + rdt_datas_last->data = page; + rdt_datas_last->len = bimg.hole_offset; + + rdt_datas_last->next = ®buf->bkp_rdatas[1]; + rdt_datas_last = rdt_datas_last->next; - rdt_datas_last->data = page + (bimg.hole_offset + bimg.hole_length); - rdt_datas_last->len = BLCKSZ - (bimg.hole_offset + bimg.hole_length); + rdt_datas_last->data = page + (bimg.hole_offset + bimg.hole_length); + rdt_datas_last->len = BLCKSZ - (bimg.hole_offset + bimg.hole_length); + } + bimg.compress_len = 0; } } @@ -681,6 +745,35 @@ XLogRecordAssemble(RmgrId rmid, uint8 info, } /* + * Create a compressed version of a backup block. If successful, return + * true and set 'len' to its length. If block cannot be compressed or if + * compression failed return false. + */ +static bool +XLogCompressBackupBlock(char *page, uint32 orig_len, char *dest, uint16 *len) +{ + /* leave if data can not be compressed */ + if (pglz_compress(page, orig_len, (PGLZ_Header *) dest, + PGLZ_strategy_default) != PGLZ_OK) + return false; + + /* + * We recheck the actual size even if pglz_compress() report success, + * because it might be satisfied with having saved as little as one byte + * in the compressed data --- which could turn into a net loss once you + * consider header and alignment padding. Worst case, the compressed + * format might require three padding bytes (plus header, which is + * included in VARSIZE(buf)), whereas the uncompressed format would take + * only one header byte and no padding if the value is short enough. 
So + * we insist on a savings of more than 2 bytes to ensure we have a gain. + */ + *len = VARSIZE((struct varlena *) dest); + if (*len >= orig_len - 2) + return false; + return true; +} + +/* * Determine whether the buffer referenced has to be backed up. * * Since we don't yet have the insert lock, fullPageWrites and forcePageWrites @@ -694,7 +787,7 @@ XLogCheckBufferNeedsBackup(Buffer buffer) bool doPageWrites; Page page; - GetFullPageWriteInfo(&RedoRecPtr, &doPageWrites); + GetFullPageWriteInfo(&RedoRecPtr, &doPageWrites, NULL); page = BufferGetPage(buffer); @@ -875,9 +968,13 @@ InitXLogInsert(void) if (registered_buffers == NULL) { + int i; registered_buffers = (registered_buffer *) MemoryContextAllocZero(xloginsert_cxt, sizeof(registered_buffer) * (XLR_NORMAL_MAX_BLOCK_ID + 1)); + for (i = 0; i < XLR_NORMAL_MAX_BLOCK_ID + 1; i++) + registered_buffers[i].compressed_page = (char *) + MemoryContextAllocZero(xloginsert_cxt, PGLZ_MAX_OUTPUT(BLCKSZ)); max_registered_buffers = XLR_NORMAL_MAX_BLOCK_ID + 1; } if (rdatas == NULL) diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c index 67d6223..66446b0 100644 --- a/src/backend/access/transam/xlogreader.c +++ b/src/backend/access/transam/xlogreader.c @@ -20,6 +20,7 @@ #include "access/xlog_internal.h" #include "access/xlogreader.h" #include "catalog/pg_control.h" +#include "utils/pg_lzcompress.h" static bool allocate_recordbuf(XLogReaderState *state, uint32 reclength); @@ -1034,7 +1035,13 @@ DecodeXLogRecord(XLogReaderState *state, XLogRecord *record, char **errormsg) { COPY_HEADER_FIELD(&blk->hole_offset, sizeof(uint16)); COPY_HEADER_FIELD(&blk->hole_length, sizeof(uint16)); - datatotal += BLCKSZ - blk->hole_length; + COPY_HEADER_FIELD(&blk->compress_len, sizeof(uint16)); + + /* adapt depending on presence of compressed image */ + if (blk->compress_len != 0) + datatotal += blk->compress_len; + else + datatotal += BLCKSZ - blk->hole_length; } if (!(fork_flags & 
BKPBLOCK_SAME_REL)) { @@ -1089,7 +1096,12 @@ DecodeXLogRecord(XLogReaderState *state, XLogRecord *record, char **errormsg) if (blk->has_image) { blk->bkp_image = ptr; - ptr += BLCKSZ - blk->hole_length; + + /* adapt depending on presence of compressed image */ + if (blk->compress_len != 0) + ptr += blk->compress_len; + else + ptr += BLCKSZ - blk->hole_length; } if (blk->has_data) { @@ -1195,6 +1207,8 @@ bool RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page) { DecodedBkpBlock *bkpb; + char *uncompressed_page = NULL; + char *block_image; if (!record->blocks[block_id].in_use) return false; @@ -1202,20 +1216,35 @@ RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page) return false; bkpb = &record->blocks[block_id]; + block_image = bkpb->bkp_image; + + /* decompress block if it is compressed before processing */ + if (bkpb->compress_len != 0) + { + PGLZ_Header *header = (PGLZ_Header *) block_image; + uncompressed_page = (char *) + palloc(PGLZ_RAW_SIZE(header)); + /* XXX: should check for status code here */ + pglz_decompress(header, uncompressed_page); + block_image = uncompressed_page; + } + /* generate page, taking into account hole if necessary */ if (bkpb->hole_length == 0) { - memcpy(page, bkpb->bkp_image, BLCKSZ); + memcpy(page, block_image, BLCKSZ); } else { - memcpy(page, bkpb->bkp_image, bkpb->hole_offset); + memcpy(page, block_image, bkpb->hole_offset); /* must zero-fill the hole */ MemSet(page + bkpb->hole_offset, 0, bkpb->hole_length); memcpy(page + (bkpb->hole_offset + bkpb->hole_length), - bkpb->bkp_image + bkpb->hole_offset, + block_image + bkpb->hole_offset, BLCKSZ - (bkpb->hole_offset + bkpb->hole_length)); } + if (uncompressed_page) + pfree(uncompressed_page); return true; } diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index d4d74ba..f1bf578 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -420,6 +420,23 @@ static const struct config_enum_entry 
row_security_options[] = { }; /* + * Although only "on", "off", and "compress" are documented, we + * accept all the likely variants of "on" and "off". + */ +static const struct config_enum_entry full_page_writes_options[] = { + {"compress", FULL_PAGE_WRITES_COMPRESS, false}, + {"on", FULL_PAGE_WRITES_ON, false}, + {"off", FULL_PAGE_WRITES_OFF, false}, + {"true", FULL_PAGE_WRITES_ON, true}, + {"false", FULL_PAGE_WRITES_OFF, true}, + {"yes", FULL_PAGE_WRITES_ON, true}, + {"no", FULL_PAGE_WRITES_OFF, true}, + {"1", FULL_PAGE_WRITES_ON, true}, + {"0", FULL_PAGE_WRITES_OFF, true}, + {NULL, 0, false} +}; + +/* * Options for enum values stored in other modules */ extern const struct config_enum_entry wal_level_options[]; @@ -894,19 +911,6 @@ static struct config_bool ConfigureNamesBool[] = false, NULL, NULL, NULL }, - { - {"full_page_writes", PGC_SIGHUP, WAL_SETTINGS, - gettext_noop("Writes full pages to WAL when first modified after a checkpoint."), - gettext_noop("A page write in process during an operating system crash might be " - "only partially written to disk. During recovery, the row changes " - "stored in WAL are not enough to recover. This option writes " - "pages when first modified after a checkpoint to WAL so full recovery " - "is possible.") - }, - &fullPageWrites, - true, - NULL, NULL, NULL - }, { {"wal_log_hints", PGC_POSTMASTER, WAL_SETTINGS, @@ -3436,6 +3440,20 @@ static struct config_enum ConfigureNamesEnum[] = }, { + {"full_page_writes", PGC_SIGHUP, WAL_SETTINGS, + gettext_noop("Writes full pages to WAL when first modified after a checkpoint."), + gettext_noop("A page write in process during an operating system crash might be " + "only partially written to disk. During recovery, the row changes " + "stored in WAL are not enough to recover. 
This option writes " + "pages when first modified after a checkpoint to WAL so full recovery " + "is possible.") + }, + &fullPageWrites, + FULL_PAGE_WRITES_ON, full_page_writes_options, + NULL, NULL, NULL + }, + + { {"trace_recovery_messages", PGC_SIGHUP, DEVELOPER_OPTIONS, gettext_noop("Enables logging of recovery-related debugging information."), gettext_noop("Each level includes all the levels that follow it. The later" diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 4a89cb7..8a1fb9e 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -185,7 +185,8 @@ # fsync # fsync_writethrough # open_sync -#full_page_writes = on # recover from partial page writes +#full_page_writes = on # recover from partial page writes; + # off, compress, or on #wal_log_hints = off # also do full page writes of non-critical updates #wal_buffers = -1 # min 32kB, -1 sets based on shared_buffers # (change requires restart) diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c index b2e0793..e250ee0 100644 --- a/src/bin/pg_controldata/pg_controldata.c +++ b/src/bin/pg_controldata/pg_controldata.c @@ -250,7 +250,7 @@ main(int argc, char *argv[]) printf(_("Latest checkpoint's PrevTimeLineID: %u\n"), ControlFile.checkPointCopy.PrevTimeLineID); printf(_("Latest checkpoint's full_page_writes: %s\n"), - ControlFile.checkPointCopy.fullPageWrites ? 
_("on") : _("off")); + FullPageWritesStr(ControlFile.checkPointCopy.fullPageWrites)); printf(_("Latest checkpoint's NextXID: %u/%u\n"), ControlFile.checkPointCopy.nextXidEpoch, ControlFile.checkPointCopy.nextXid); diff --git a/src/bin/pg_resetxlog/pg_resetxlog.c b/src/bin/pg_resetxlog/pg_resetxlog.c index 666e8db..178c43c 100644 --- a/src/bin/pg_resetxlog/pg_resetxlog.c +++ b/src/bin/pg_resetxlog/pg_resetxlog.c @@ -517,7 +517,7 @@ GuessControlValues(void) ControlFile.checkPointCopy.redo = SizeOfXLogLongPHD; ControlFile.checkPointCopy.ThisTimeLineID = 1; ControlFile.checkPointCopy.PrevTimeLineID = 1; - ControlFile.checkPointCopy.fullPageWrites = false; + ControlFile.checkPointCopy.fullPageWrites = FULL_PAGE_WRITES_OFF; ControlFile.checkPointCopy.nextXidEpoch = 0; ControlFile.checkPointCopy.nextXid = FirstNormalTransactionId; ControlFile.checkPointCopy.nextOid = FirstBootstrapObjectId; @@ -601,7 +601,7 @@ PrintControlValues(bool guessed) printf(_("Latest checkpoint's TimeLineID: %u\n"), ControlFile.checkPointCopy.ThisTimeLineID); printf(_("Latest checkpoint's full_page_writes: %s\n"), - ControlFile.checkPointCopy.fullPageWrites ? 
_("on") : _("off")); + FullPageWritesStr(ControlFile.checkPointCopy.fullPageWrites)); printf(_("Latest checkpoint's NextXID: %u/%u\n"), ControlFile.checkPointCopy.nextXidEpoch, ControlFile.checkPointCopy.nextXid); diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index d06fbc0..c34f49c 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -96,7 +96,6 @@ extern int XLogArchiveTimeout; extern bool XLogArchiveMode; extern char *XLogArchiveCommand; extern bool EnableHotStandby; -extern bool fullPageWrites; extern bool wal_log_hints; extern bool log_checkpoints; @@ -113,6 +112,23 @@ extern int wal_level; #define XLogArchivingActive() (XLogArchiveMode && wal_level >= WAL_LEVEL_ARCHIVE) #define XLogArchiveCommandSet() (XLogArchiveCommand[0] != '\0') +/* full-page writes */ +typedef enum FullPageWritesLevel +{ + FULL_PAGE_WRITES_OFF = 0, + FULL_PAGE_WRITES_COMPRESS, + FULL_PAGE_WRITES_ON +} FullPageWritesLevel; +extern int fullPageWrites; + +/* + * Convert full-page write parameter into a readable string. + */ +#define FullPageWritesStr(fpw) \ + (fpw == FULL_PAGE_WRITES_ON ? _("on") : \ + (fpw == FULL_PAGE_WRITES_COMPRESS ? _("compress") : \ + (fpw == FULL_PAGE_WRITES_OFF ? _("off") : _("unrecognized")))) + /* * Is WAL-logging necessary for archival or log-shipping, or can we skip * WAL-logging if we fsync() the data before committing instead? 
@@ -235,7 +251,9 @@ extern bool CreateRestartPoint(int flags); extern void XLogPutNextOid(Oid nextOid); extern XLogRecPtr XLogRestorePoint(const char *rpName); extern void UpdateFullPageWrites(void); -extern void GetFullPageWriteInfo(XLogRecPtr *RedoRecPtr_p, bool *doPageWrites_p); +extern void GetFullPageWriteInfo(XLogRecPtr *RedoRecPtr_p, + bool *doPageWrites_p, + bool *doPageCompression_p); extern XLogRecPtr GetRedoRecPtr(void); extern XLogRecPtr GetInsertRecPtr(void); extern XLogRecPtr GetFlushRecPtr(void); diff --git a/src/include/access/xlogreader.h b/src/include/access/xlogreader.h index eb6cc89..84c107e 100644 --- a/src/include/access/xlogreader.h +++ b/src/include/access/xlogreader.h @@ -55,6 +55,7 @@ typedef struct char *bkp_image; uint16 hole_offset; uint16 hole_length; + uint16 compress_len; /* Buffer holding the rmgr-specific data associated with this block */ bool has_data; diff --git a/src/include/access/xlogrecord.h b/src/include/access/xlogrecord.h index 11ddfac..0a111c1 100644 --- a/src/include/access/xlogrecord.h +++ b/src/include/access/xlogrecord.h @@ -29,6 +29,9 @@ * ... * main data * + * If compression of full-page write is activated, all the data blocks are + * compressed as a single record in the record chain. + * * There can be zero or more XLogRecordBlockHeaders, and 0 or more bytes of * rmgr-specific data not associated with a block. XLogRecord structs * always start on MAXALIGN boundaries in the WAL files, but the rest of @@ -103,11 +106,16 @@ typedef struct XLogRecordBlockHeader * such a "hole" from the stored data (and it's not counted in the * XLOG record's CRC, either). Hence, the amount of block data actually * present is BLCKSZ - hole_length bytes. + * + * compress_len indicates the length of this block when compressed. A length + * of 0 means that this block is not compressed. If the block image has a hole + * the block image is compressed without the hole. 
  */
 typedef struct XLogRecordBlockImageHeader
 {
 	uint16		hole_offset;	/* number of bytes before "hole" */
 	uint16		hole_length;	/* number of bytes in "hole" */
+	uint16		compress_len;	/* size of compressed block */
 } XLogRecordBlockImageHeader;
 
 #define SizeOfXLogRecordBlockImageHeader sizeof(XLogRecordBlockImageHeader)
diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h
index 15f81e4..97d4a6d 100644
--- a/src/include/catalog/pg_control.h
+++ b/src/include/catalog/pg_control.h
@@ -35,7 +35,7 @@ typedef struct CheckPoint
 	TimeLineID	ThisTimeLineID; /* current TLI */
 	TimeLineID	PrevTimeLineID; /* previous TLI, if this record begins a new
 								 * timeline (equals ThisTimeLineID otherwise) */
-	bool		fullPageWrites; /* current full_page_writes */
+	int			fullPageWrites; /* current full_page_writes */
 	uint32		nextXidEpoch;	/* higher-order bits of nextXid */
 	TransactionId nextXid;		/* next free XID */
 	Oid			nextOid;		/* next free OID */
-- 
2.1.3