From 752fd6cc636eea08c06b25d8898e091835442387 Mon Sep 17 00:00:00 2001 From: Melanie Plageman Date: Thu, 5 Aug 2021 15:47:50 -0400 Subject: [PATCH v2 2/2] Add IO wait time stat and add guc for BHS prefetch - Add an IO wait time measurement which can be seen in EXPLAIN output (with explain buffers option and track_io_timing on) - TODO: add the wait time to database statistics - Also, add a GUC to control the BHS pre-fetch max window size --- src/backend/access/heap/heapam_handler.c | 2 +- src/backend/commands/explain.c | 9 ++++++++- src/backend/executor/instrument.c | 4 ++++ src/backend/postmaster/pgstat.c | 6 ++++++ src/backend/storage/aio/aio.c | 16 ++++++++++++++++ src/backend/storage/aio/aio_util.c | 2 -- src/backend/storage/buffer/bufmgr.c | 2 ++ src/backend/utils/adt/pgstatfuncs.c | 2 ++ src/backend/utils/misc/guc.c | 10 ++++++++++ src/include/executor/instrument.h | 1 + src/include/pgstat.h | 6 ++++++ src/include/storage/bufmgr.h | 1 + 12 files changed, 57 insertions(+), 4 deletions(-) diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index a8bd8050dc..7409dd2fb3 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -2221,7 +2221,7 @@ void bitmapheap_pgsr_alloc(BitmapHeapScanState *scanstate) HeapScanDesc hscan = (HeapScanDesc ) scanstate->ss.ss_currentScanDesc; if (!hscan->rs_inited) { - int iodepth = Max(Min(128, NBuffers / 128), 1); + int iodepth = io_bitmap_prefetch_depth; hscan->pgsr = pg_streaming_read_alloc(iodepth, (uintptr_t) scanstate, bitmapheapscan_pgsr_next_single, bitmapheapscan_pgsr_release); diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index c66d39a5c7..d1e89f1e1b 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -3339,7 +3339,8 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage, bool planning) bool has_temp = (usage->temp_blks_read > 0 || usage->temp_blks_written > 0); bool has_timing = (!INSTR_TIME_IS_ZERO(usage->blk_read_time) || - !INSTR_TIME_IS_ZERO(usage->blk_write_time)); + !INSTR_TIME_IS_ZERO(usage->blk_write_time || + !INSTR_TIME_IS_ZERO(usage->io_wait_time))); bool show_planning = (planning && (has_shared || has_local || has_temp || has_timing)); @@ -3416,6 +3417,9 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage, bool planning) if (!INSTR_TIME_IS_ZERO(usage->blk_write_time)) appendStringInfo(es->str, " write=%0.3f", INSTR_TIME_GET_MILLISEC(usage->blk_write_time)); + if (!INSTR_TIME_IS_ZERO(usage->io_wait_time)) + appendStringInfo(es->str, " wait=%0.3f", + INSTR_TIME_GET_MILLISEC(usage->io_wait_time)); appendStringInfoChar(es->str, '\n'); } @@ -3452,6 +3456,9 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage, bool planning) ExplainPropertyFloat("I/O Write Time", "ms", INSTR_TIME_GET_MILLISEC(usage->blk_write_time), 3, es); + ExplainPropertyFloat("I/O Wait Time", "ms", + INSTR_TIME_GET_MILLISEC(usage->io_wait_time), + 5, es); } } } diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c index 96af2a2673..85e7321d63 100644 --- a/src/backend/executor/instrument.c +++ b/src/backend/executor/instrument.c @@ -218,6 +218,8 @@ BufferUsageAdd(BufferUsage *dst, const BufferUsage *add) dst->temp_blks_written += add->temp_blks_written; INSTR_TIME_ADD(dst->blk_read_time, add->blk_read_time); INSTR_TIME_ADD(dst->blk_write_time, add->blk_write_time); + INSTR_TIME_ADD(dst->io_wait_time, add->io_wait_time); + } /* dst += add - sub */ @@ -240,6 +242,8 @@ BufferUsageAccumDiff(BufferUsage *dst, add->blk_read_time, sub->blk_read_time); INSTR_TIME_ACCUM_DIFF(dst->blk_write_time, add->blk_write_time, sub->blk_write_time); + INSTR_TIME_ACCUM_DIFF(dst->io_wait_time, + add->io_wait_time, sub->io_wait_time); } /* helper functions for WAL usage accumulation */ diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index 9d335b8507..028ca14aa8 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -258,6 +258,7 @@ static int pgStatXactCommit = 0; static int pgStatXactRollback = 0; PgStat_Counter pgStatBlockReadTime = 0; PgStat_Counter pgStatBlockWriteTime = 0; +PgStat_Counter pgStatIOWaitTime = 0; static PgStat_Counter pgStatActiveTime = 0; static PgStat_Counter pgStatTransactionIdleTime = 0; SessionEndType pgStatSessionEndCause = DISCONNECT_NORMAL; @@ -1004,10 +1005,12 @@ pgstat_send_tabstat(PgStat_MsgTabstat *tsmsg) tsmsg->m_xact_rollback = pgStatXactRollback; tsmsg->m_block_read_time = pgStatBlockReadTime; tsmsg->m_block_write_time = pgStatBlockWriteTime; + tsmsg->m_io_wait_time = pgStatIOWaitTime; pgStatXactCommit = 0; pgStatXactRollback = 0; pgStatBlockReadTime = 0; pgStatBlockWriteTime = 0; + pgStatIOWaitTime = 0; } else { @@ -1015,6 +1018,7 @@ pgstat_send_tabstat(PgStat_MsgTabstat *tsmsg) tsmsg->m_xact_rollback = 0; tsmsg->m_block_read_time = 0; tsmsg->m_block_write_time = 0; + tsmsg->m_io_wait_time = 0; } n = tsmsg->m_nentries; @@ -5122,6 +5126,7 @@ reset_dbentry_counters(PgStat_StatDBEntry *dbentry) dbentry->last_checksum_failure = 0; dbentry->n_block_read_time = 0; dbentry->n_block_write_time = 0; + dbentry->n_io_wait_time = 0; dbentry->n_sessions = 0; dbentry->total_session_time = 0; dbentry->total_active_time = 0; @@ -6442,6 +6447,7 @@ pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len) dbentry->n_xact_rollback += (PgStat_Counter) (msg->m_xact_rollback); dbentry->n_block_read_time += msg->m_block_read_time; dbentry->n_block_write_time += msg->m_block_write_time; + dbentry->n_io_wait_time += msg->m_io_wait_time; /* * Process all table entries in the message. diff --git a/src/backend/storage/aio/aio.c b/src/backend/storage/aio/aio.c index 726d50f698..c5aacbcf9d 100644 --- a/src/backend/storage/aio/aio.c +++ b/src/backend/storage/aio/aio.c @@ -36,6 +36,8 @@ #include "utils/guc.h" #include "utils/memutils.h" #include "utils/resowner_private.h" +#include "executor/instrument.h" +#include "storage/bufmgr.h" #define PGAIO_VERBOSE @@ -1729,6 +1731,12 @@ wait_ref_again: } else if (io_method != IOMETHOD_WORKER && (flags & PGAIOIP_INFLIGHT)) { + int io_wait_start, io_wait_time; + if (track_io_timing) + INSTR_TIME_SET_CURRENT(io_wait_start); + else + INSTR_TIME_SET_ZERO(io_wait_start); + /* note that this is allowed to spuriously return */ if (io_method == IOMETHOD_WORKER) ConditionVariableSleep(&io->cv, WAIT_EVENT_AIO_IO_COMPLETE_ONE); @@ -1741,6 +1749,14 @@ wait_ref_again: else if (io_method == IOMETHOD_POSIX) pgaio_posix_wait_one(io, ref_generation); #endif + + if (track_io_timing) + { + INSTR_TIME_SET_CURRENT(io_wait_time); + INSTR_TIME_SUBTRACT(io_wait_time, io_wait_start); + pgstat_count_io_wait_time(INSTR_TIME_GET_MICROSEC(io_wait_time)); + INSTR_TIME_ADD(pgBufferUsage.io_wait_time, io_wait_time); + } } #ifdef USE_POSIX_AIO /* XXX untangle this */ diff --git a/src/backend/storage/aio/aio_util.c b/src/backend/storage/aio/aio_util.c index 35436dfcd3..a79db4d747 100644 --- a/src/backend/storage/aio/aio_util.c +++ b/src/backend/storage/aio/aio_util.c @@ -417,8 +417,6 @@ pg_streaming_read_alloc(uint32 iodepth, uintptr_t pgsr_private, { PgStreamingRead *pgsr; - iodepth = Max(Min(iodepth, NBuffers / 128), 1); - pgsr = palloc0(offsetof(PgStreamingRead, all_items) + sizeof(PgStreamingReadItem) * iodepth * 2); diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index f5df1b78f2..69175699b0 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -145,6 +145,8 @@ bool track_io_timing = false; */ int effective_io_concurrency = 0; +int io_bitmap_prefetch_depth = 128; + /* * Like effective_io_concurrency, but used by maintenance code paths that might * benefit from a higher setting because they work on behalf of many sessions. diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c index 2fca05f7af..a6d4f121e1 100644 --- a/src/backend/utils/adt/pgstatfuncs.c +++ b/src/backend/utils/adt/pgstatfuncs.c @@ -1615,6 +1615,8 @@ pg_stat_get_db_blk_read_time(PG_FUNCTION_ARGS) PG_RETURN_FLOAT8(result); } +// TODO: add one for io wait time + Datum pg_stat_get_db_blk_write_time(PG_FUNCTION_ARGS) { diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index d143783f22..f4f02fb9ad 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -3029,6 +3029,16 @@ static struct config_int ConfigureNamesInt[] = 0, MAX_IO_CONCURRENCY, check_maintenance_io_concurrency, NULL, NULL }, + { + {"io_bitmap_prefetch_depth", PGC_USERSET, RESOURCES_ASYNCHRONOUS, + gettext_noop("Maximum pre-fetch distance for bitmapheapscan"), + NULL, + GUC_EXPLAIN + }, + &io_bitmap_prefetch_depth, + 128, 0, MAX_IO_CONCURRENCY, + NULL, NULL, NULL + }, { {"io_wal_concurrency", PGC_POSTMASTER, RESOURCES_ASYNCHRONOUS, diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index 62ca398e9d..950414cc0b 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -30,6 +30,7 @@ typedef struct BufferUsage long temp_blks_written; /* # of temp blocks written */ instr_time blk_read_time; /* time spent reading */ instr_time blk_write_time; /* time spent writing */ + instr_time io_wait_time; } BufferUsage; typedef struct WalUsage diff --git a/src/include/pgstat.h b/src/include/pgstat.h index b77dcbc58b..a02813f02f 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -291,6 +291,7 @@ typedef struct PgStat_MsgTabstat int m_xact_rollback; PgStat_Counter m_block_read_time; /* times in microseconds */ PgStat_Counter m_block_write_time; + PgStat_Counter m_io_wait_time; PgStat_TableEntry m_entry[PGSTAT_NUM_TABENTRIES]; } PgStat_MsgTabstat; @@ -732,6 +733,7 @@ typedef struct PgStat_StatDBEntry TimestampTz last_checksum_failure; PgStat_Counter n_block_read_time; /* times in microseconds */ PgStat_Counter n_block_write_time; + PgStat_Counter n_io_wait_time; PgStat_Counter n_sessions; PgStat_Counter total_session_time; PgStat_Counter total_active_time; @@ -1417,6 +1419,8 @@ extern PgStat_MsgWal WalStats; extern PgStat_Counter pgStatBlockReadTime; extern PgStat_Counter pgStatBlockWriteTime; +extern PgStat_Counter pgStatIOWaitTime; + /* * Updated by the traffic cop and in errfinish() */ @@ -1593,6 +1597,8 @@ pgstat_report_wait_end(void) (pgStatBlockReadTime += (n)) #define pgstat_count_buffer_write_time(n) \ (pgStatBlockWriteTime += (n)) +#define pgstat_count_io_wait_time(n) \ + (pgStatIOWaitTime += (n)) extern void pgstat_count_heap_insert(Relation rel, PgStat_Counter n); extern void pgstat_count_heap_update(Relation rel, bool hot); diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index 07401f8493..8c21bb6e56 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -70,6 +70,7 @@ extern int bgwriter_lru_maxpages; extern double bgwriter_lru_multiplier; extern bool track_io_timing; extern int effective_io_concurrency; +extern int io_bitmap_prefetch_depth; extern int maintenance_io_concurrency; extern int checkpoint_flush_after; -- 2.27.0