From 3ae6f0082dd6ae5ba9c8b53e242d111811f12516 Mon Sep 17 00:00:00 2001 From: Bharath Rupireddy Date: Sun, 13 Mar 2022 08:19:07 +0000 Subject: [PATCH v2] add checkpoint stats of snapshot and mapping files of pg_logical dir At times, there can be many snapshot and mapping files under pg_logical dir that the checkpoint might have to delete/fsync based on the cutoff LSN which can increase the checkpoint time. Add stats related to these files to better understand the delays or time spent by the checkpointer processing them. Also, emitting snapshot and mapping files stats in case of restartpoint doesn't make sense for now as logical decoding isn't supported on standbys. --- src/backend/access/heap/rewriteheap.c | 8 ++++ src/backend/access/transam/xlog.c | 41 +++++++++++++++++++-- src/backend/replication/logical/snapbuild.c | 5 +++ src/include/access/xlog.h | 16 ++++++++ 4 files changed, 67 insertions(+), 3 deletions(-) diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c index 2a53826736..9f3c2f510b 100644 --- a/src/backend/access/heap/rewriteheap.c +++ b/src/backend/access/heap/rewriteheap.c @@ -1210,6 +1210,8 @@ CheckPointLogicalRewriteHeap(void) if (cutoff != InvalidXLogRecPtr && redo < cutoff) cutoff = redo; + CheckpointStats.repl_map_cutoff_lsn = cutoff; + mappings_dir = AllocateDir("pg_logical/mappings"); while ((mapping_de = ReadDir(mappings_dir, "pg_logical/mappings")) != NULL) { @@ -1247,6 +1249,9 @@ CheckPointLogicalRewriteHeap(void) ereport(ERROR, (errcode_for_file_access(), errmsg("could not remove file \"%s\": %m", path))); + + CheckpointStats.repl_map_files_rmvd_cnt++; + CheckpointStats.repl_map_files_rmvd_sz += statbuf.st_size; } else { @@ -1280,6 +1285,9 @@ CheckPointLogicalRewriteHeap(void) (errcode_for_file_access(), errmsg("could not close file \"%s\": %m", path))); } + + CheckpointStats.repl_map_files_syncd_cnt++; + CheckpointStats.repl_map_files_syncd_sz += statbuf.st_size; } FreeDir(mappings_dir); diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 0d2bd7a357..70e1971b21 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -6077,7 +6077,9 @@ LogCheckpointEnd(bool restartpoint) sync_msecs, total_msecs, longest_msecs, - average_msecs; + average_msecs, + repl_snap_msecs, + repl_map_msecs; uint64 average_sync_time; CheckpointStats.ckpt_end_t = GetCurrentTimestamp(); @@ -6114,6 +6116,20 @@ LogCheckpointEnd(bool restartpoint) CheckpointStats.ckpt_sync_rels; average_msecs = (long) ((average_sync_time + 999) / 1000); + /* + * XXX let's not emit snapshot and mapping files stats in case of + * restartpoint as logical decoding isn't supported on standbys yet, + * however it needs to be revisited when it is supported. + */ + if (!restartpoint) + { + repl_snap_msecs = TimestampDifferenceMilliseconds(CheckpointStats.repl_snap_start_t, + CheckpointStats.repl_snap_end_t); + + repl_map_msecs = TimestampDifferenceMilliseconds(CheckpointStats.repl_map_start_t, + CheckpointStats.repl_map_end_t); + } + if (restartpoint) ereport(LOG, (errmsg("restartpoint complete: wrote %d buffers (%.1f%%); " @@ -6140,7 +6156,9 @@ LogCheckpointEnd(bool restartpoint) "%d WAL file(s) added, %d removed, %d recycled; " "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; " "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; " - "distance=%d kB, estimate=%d kB", + "distance=%d kB, estimate=%d kB; " + "snapbuild snapshot file(s) removed=" UINT64_FORMAT ", size=%zu bytes, time=%ld.%03d s, cutoff LSN=%X/%X; " + "logical rewrite mapping file(s) removed=" UINT64_FORMAT ", size=%zu bytes, synced=" UINT64_FORMAT ", size=%zu bytes, time=%ld.%03d s, cutoff LSN=%X/%X", CheckpointStats.ckpt_bufs_written, (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers, CheckpointStats.ckpt_segs_added, @@ -6153,7 +6171,17 @@ LogCheckpointEnd(bool restartpoint) longest_msecs / 1000, (int) (longest_msecs % 1000), average_msecs / 1000, (int) (average_msecs % 1000), (int) (PrevCheckPointDistance / 1024.0), - (int) (CheckPointDistanceEstimate / 1024.0)))); + (int) (CheckPointDistanceEstimate / 1024.0), + CheckpointStats.repl_snap_files_rmvd_cnt, + CheckpointStats.repl_snap_files_rmvd_sz, + repl_snap_msecs / 1000, (int) (repl_snap_msecs % 1000), + LSN_FORMAT_ARGS(CheckpointStats.repl_snap_cutoff_lsn), + CheckpointStats.repl_map_files_rmvd_cnt, + CheckpointStats.repl_map_files_rmvd_sz, + CheckpointStats.repl_map_files_syncd_cnt, + CheckpointStats.repl_map_files_syncd_sz, + repl_map_msecs / 1000, (int) (repl_map_msecs % 1000), + LSN_FORMAT_ARGS(CheckpointStats.repl_map_cutoff_lsn)))); } /* @@ -6809,8 +6837,15 @@ CheckPointGuts(XLogRecPtr checkPointRedo, int flags) { CheckPointRelationMap(); CheckPointReplicationSlots(); + + CheckpointStats.repl_snap_start_t = GetCurrentTimestamp(); CheckPointSnapBuild(); + CheckpointStats.repl_snap_end_t = GetCurrentTimestamp(); + + CheckpointStats.repl_map_start_t = GetCurrentTimestamp(); CheckPointLogicalRewriteHeap(); + CheckpointStats.repl_map_end_t = GetCurrentTimestamp(); + CheckPointReplicationOrigin(); /* Write out all dirty data in SLRUs and the main buffer pool */ diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c index 83fca8a77d..5e7d8d9dea 100644 --- a/src/backend/replication/logical/snapbuild.c +++ b/src/backend/replication/logical/snapbuild.c @@ -1941,6 +1941,8 @@ CheckPointSnapBuild(void) if (redo < cutoff) cutoff = redo; + CheckpointStats.repl_snap_cutoff_lsn = cutoff; + snap_dir = AllocateDir("pg_logical/snapshots"); while ((snap_de = ReadDir(snap_dir, "pg_logical/snapshots")) != NULL) { @@ -1997,6 +1999,9 @@ CheckPointSnapBuild(void) path))); continue; } + + CheckpointStats.repl_snap_files_rmvd_cnt++; + CheckpointStats.repl_snap_files_rmvd_sz += statbuf.st_size; } } FreeDir(snap_dir); diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 09f6464331..40cda7f4bb 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -172,6 +172,22 @@ typedef struct CheckpointStatsData * times, which is not necessarily the * same as the total elapsed time for the * entire sync phase. */ + + /* Statistics of snapshot files under "pg_logical/snapshots" */ + Size repl_snap_files_rmvd_sz; + uint64 repl_snap_files_rmvd_cnt; + XLogRecPtr repl_snap_cutoff_lsn; + TimestampTz repl_snap_start_t; + TimestampTz repl_snap_end_t; + + /* Statistics of map files under "pg_logical/mappings" */ + Size repl_map_files_syncd_sz; + uint64 repl_map_files_syncd_cnt; + Size repl_map_files_rmvd_sz; + uint64 repl_map_files_rmvd_cnt; + XLogRecPtr repl_map_cutoff_lsn; + TimestampTz repl_map_start_t; + TimestampTz repl_map_end_t; } CheckpointStatsData; extern CheckpointStatsData CheckpointStats; -- 2.25.1