From b113dfb9e876a84c2121b64ab94a9d10c3c670b4 Mon Sep 17 00:00:00 2001 From: Bharath Rupireddy Date: Tue, 19 Jul 2022 10:35:29 +0000 Subject: [PATCH v10] Add time taken for processing logical decoding files to checkpoint log At times, there can be many snapshot and mapping files under pg_logical dir that the checkpoint might have to delete/fsync based on the cutoff LSN which can increase the checkpoint time. Add stats related to these files to better understand the delays or time spent by the checkpointer processing them. Also, add total number of WAL files processed during checkpoint to the log message. This will be useful for debugging issues like total time taken by checkpoint (if there are many WAL files) and estimate the disk space occupied by pg_wal at the time of checkpoint. --- src/backend/access/transam/xlog.c | 30 +++++++++++++++++++++++------- src/include/access/xlog.h | 5 +++++ 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index b809a2152c..20c1689ed2 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -3603,6 +3603,8 @@ RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr lastredoptr, XLogRecPtr endptr, insertTLI); } } + + CheckpointStats.ckpt_segs_processed++; } FreeDir(xldir); @@ -6092,7 +6094,8 @@ LogCheckpointEnd(bool restartpoint) sync_msecs, total_msecs, longest_msecs, - average_msecs; + average_msecs, + l_dec_ops_msecs; uint64 average_sync_time; CheckpointStats.ckpt_end_t = GetCurrentTimestamp(); @@ -6129,6 +6132,9 @@ LogCheckpointEnd(bool restartpoint) CheckpointStats.ckpt_sync_rels; average_msecs = (long) ((average_sync_time + 999) / 1000); + l_dec_ops_msecs = TimestampDifferenceMilliseconds(CheckpointStats.l_dec_ops_start_t, + CheckpointStats.l_dec_ops_end_t); + /* * ControlFileLock is not required to see ControlFile->checkPoint and * ->checkPointCopy here as we are the only updator of those variables at @@ -6137,16 +6143,18 @@ LogCheckpointEnd(bool restartpoint) if (restartpoint) ereport(LOG, (errmsg("restartpoint complete: wrote %d buffers (%.1f%%); " - "%d WAL file(s) added, %d removed, %d recycled; " + "%d WAL file(s) added, %d removed, %d recycled, %d processed; " "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; " "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; " "distance=%d kB, estimate=%d kB; " - "lsn=%X/%X, redo lsn=%X/%X", + "lsn=%X/%X, redo lsn=%X/%X; " + "logical decoding file(s) processing=%ld.%03d s", CheckpointStats.ckpt_bufs_written, (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers, CheckpointStats.ckpt_segs_added, CheckpointStats.ckpt_segs_removed, CheckpointStats.ckpt_segs_recycled, + CheckpointStats.ckpt_segs_processed, write_msecs / 1000, (int) (write_msecs % 1000), sync_msecs / 1000, (int) (sync_msecs % 1000), total_msecs / 1000, (int) (total_msecs % 1000), @@ -6156,20 +6164,23 @@ LogCheckpointEnd(bool restartpoint) (int) (PrevCheckPointDistance / 1024.0), (int) (CheckPointDistanceEstimate / 1024.0), LSN_FORMAT_ARGS(ControlFile->checkPoint), - LSN_FORMAT_ARGS(ControlFile->checkPointCopy.redo)))); + LSN_FORMAT_ARGS(ControlFile->checkPointCopy.redo), + l_dec_ops_msecs / 1000, (int) (l_dec_ops_msecs % 1000)))); else ereport(LOG, (errmsg("checkpoint complete: wrote %d buffers (%.1f%%); " - "%d WAL file(s) added, %d removed, %d recycled; " + "%d WAL file(s) added, %d removed, %d recycled, %d processed; " "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; " "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; " "distance=%d kB, estimate=%d kB; " - "lsn=%X/%X, redo lsn=%X/%X", + "lsn=%X/%X, redo lsn=%X/%X; " + "logical decoding file(s) processing=%ld.%03d s", CheckpointStats.ckpt_bufs_written, (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers, CheckpointStats.ckpt_segs_added, CheckpointStats.ckpt_segs_removed, CheckpointStats.ckpt_segs_recycled, + CheckpointStats.ckpt_segs_processed, write_msecs / 1000, (int) (write_msecs % 1000), sync_msecs / 1000, (int) (sync_msecs % 1000), total_msecs / 1000, (int) (total_msecs % 1000), @@ -6179,7 +6190,8 @@ LogCheckpointEnd(bool restartpoint) (int) (PrevCheckPointDistance / 1024.0), (int) (CheckPointDistanceEstimate / 1024.0), LSN_FORMAT_ARGS(ControlFile->checkPoint), - LSN_FORMAT_ARGS(ControlFile->checkPointCopy.redo)))); + LSN_FORMAT_ARGS(ControlFile->checkPointCopy.redo), + l_dec_ops_msecs / 1000, (int) (l_dec_ops_msecs % 1000)))); } /* @@ -6848,8 +6860,12 @@ CheckPointGuts(XLogRecPtr checkPointRedo, int flags) { CheckPointRelationMap(); CheckPointReplicationSlots(); + + CheckpointStats.l_dec_ops_start_t = GetCurrentTimestamp(); CheckPointSnapBuild(); CheckPointLogicalRewriteHeap(); + CheckpointStats.l_dec_ops_end_t = GetCurrentTimestamp(); + CheckPointReplicationOrigin(); /* Write out all dirty data in SLRUs and the main buffer pool */ diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index cd674c3c23..75730254ed 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -166,6 +166,7 @@ typedef struct CheckpointStatsData int ckpt_segs_added; /* # of new xlog segments created */ int ckpt_segs_removed; /* # of xlog segments deleted */ int ckpt_segs_recycled; /* # of xlog segments recycled */ + int ckpt_segs_processed; /* # of xlog segments processed */ int ckpt_sync_rels; /* # of relations synced */ uint64 ckpt_longest_sync; /* Longest sync for one relation */ @@ -173,6 +174,10 @@ typedef struct CheckpointStatsData * times, which is not necessarily the * same as the total elapsed time for the * entire sync phase. */ + + /* start and end timestamps of logical decoding file processing */ + TimestampTz l_dec_ops_start_t; + TimestampTz l_dec_ops_end_t; } CheckpointStatsData; extern PGDLLIMPORT CheckpointStatsData CheckpointStats; -- 2.25.1