From e03c96cbd3079c03ae63b6427937b79edaa9562b Mon Sep 17 00:00:00 2001 From: Lukas Fittl Date: Tue, 9 Sep 2025 02:16:59 -0700 Subject: [PATCH v4 4/7] Optimize measuring WAL/buffer usage through stack-based instrumentation Previously, in order to determine the buffer/WAL usage of a given code section, we utilized continuously incrementing global counters that get updated when the actual activity (e.g. shared block read) occurred, and then calculated a diff when the code section ended. This resulted in a bottleneck for executor node instrumentation specifically, with the function BufferUsageAccumDiff showing up in profiles and in some cases adding up to 10% overhead to an EXPLAIN (ANALYZE, BUFFERS) run. Instead, introduce a stack-based mechanism, where the actual activity writes into the current stack entry. In the case of executor nodes, this means that each node gets its own stack entry that is pushed at InstrStartNode, and popped at InstrEndNode. Stack entries are zero initialized (avoiding the diff mechanism) and get added to their parent entry when they are finalized, i.e. no more modifications can occur. To correctly handle abort situations, any use of instrumentation stacks must involve either a top-level Instrumentation struct, and its associated InstrStart/ InstrStop helpers (which use resource owners to handle aborts), or dedicated PG_TRY/PG_FINALLY calls that ensure the stack is in a consistent state after an abort. --- .../pg_stat_statements/pg_stat_statements.c | 105 ++++----- src/backend/commands/explain.c | 8 +- src/backend/executor/execMain.c | 28 ++- src/backend/executor/execProcnode.c | 31 +++ src/backend/executor/instrument.c | 218 ++++++++++++++---- src/include/executor/executor.h | 1 + src/include/executor/instrument.h | 64 ++++- src/include/utils/resowner.h | 1 + 8 files changed, 333 insertions(+), 123 deletions(-) diff --git a/contrib/pg_stat_statements/pg_stat_statements.c b/contrib/pg_stat_statements/pg_stat_statements.c index 7f56592f536..1ed3660cf9b 100644 --- a/contrib/pg_stat_statements/pg_stat_statements.c +++ b/contrib/pg_stat_statements/pg_stat_statements.c @@ -911,21 +911,13 @@ pgss_planner(Query *parse, { instr_time start; instr_time duration; - BufferUsage bufusage_start, - bufusage; - WalUsage walusage_start, - walusage; + InstrStack stack = {0}; - /* We need to track buffer usage as the planner can access them. */ - bufusage_start = pgBufferUsage; - - /* - * Similarly the planner could write some WAL records in some cases - * (e.g. setting a hint bit with those being WAL-logged) - */ - walusage_start = pgWalUsage; INSTR_TIME_SET_CURRENT(start); + /* We need to track buffer/WAL usage as the planner can access them. */ + InstrPushStack(&stack); + nesting_level++; PG_TRY(); { @@ -938,6 +930,7 @@ pgss_planner(Query *parse, } PG_FINALLY(); { + InstrPopStack(&stack, true); nesting_level--; } PG_END_TRY(); @@ -945,14 +938,6 @@ pgss_planner(Query *parse, INSTR_TIME_SET_CURRENT(duration); INSTR_TIME_SUBTRACT(duration, start); - /* calc differences of buffer counters. */ - memset(&bufusage, 0, sizeof(BufferUsage)); - BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start); - - /* calc differences of WAL counters. */ - memset(&walusage, 0, sizeof(WalUsage)); - WalUsageAccumDiff(&walusage, &pgWalUsage, &walusage_start); - pgss_store(query_string, parse->queryId, parse->stmt_location, @@ -960,8 +945,8 @@ pgss_planner(Query *parse, PGSS_PLAN, INSTR_TIME_GET_MILLISEC(duration), 0, - &bufusage, - &walusage, + &stack.bufusage, + &stack.walusage, NULL, NULL, 0, @@ -1089,8 +1074,13 @@ pgss_ExecutorEnd(QueryDesc *queryDesc) PGSS_EXEC, INSTR_TIME_GET_MILLISEC(queryDesc->totaltime->total), queryDesc->estate->es_total_processed, - &queryDesc->totaltime->bufusage, - &queryDesc->totaltime->walusage, + + /* + * Check if stack is initialized - it is not when ExecutorRun wasn't + * called + */ + queryDesc->totaltime->stack ? &queryDesc->totaltime->stack->bufusage : NULL, + queryDesc->totaltime->stack ? &queryDesc->totaltime->stack->walusage : NULL, queryDesc->estate->es_jit ? &queryDesc->estate->es_jit->instr : NULL, NULL, queryDesc->estate->es_parallel_workers_to_launch, @@ -1157,14 +1147,10 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString, instr_time start; instr_time duration; uint64 rows; - BufferUsage bufusage_start, - bufusage; - WalUsage walusage_start, - walusage; + InstrStack stack = {0}; - bufusage_start = pgBufferUsage; - walusage_start = pgWalUsage; INSTR_TIME_SET_CURRENT(start); + InstrPushStack(&stack); nesting_level++; PG_TRY(); @@ -1180,6 +1166,7 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString, } PG_FINALLY(); { + InstrPopStack(&stack, true); nesting_level--; } PG_END_TRY(); @@ -1208,14 +1195,6 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString, qc->commandTag == CMDTAG_REFRESH_MATERIALIZED_VIEW)) ? qc->nprocessed : 0; - /* calc differences of buffer counters. */ - memset(&bufusage, 0, sizeof(BufferUsage)); - BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start); - - /* calc differences of WAL counters. */ - memset(&walusage, 0, sizeof(WalUsage)); - WalUsageAccumDiff(&walusage, &pgWalUsage, &walusage_start); - pgss_store(queryString, saved_queryId, saved_stmt_location, @@ -1223,8 +1202,8 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString, PGSS_EXEC, INSTR_TIME_GET_MILLISEC(duration), rows, - &bufusage, - &walusage, + &stack.bufusage, + &stack.walusage, NULL, NULL, 0, @@ -1454,27 +1433,33 @@ pgss_store(const char *query, int64 queryId, } } entry->counters.rows += rows; - entry->counters.shared_blks_hit += bufusage->shared_blks_hit; - entry->counters.shared_blks_read += bufusage->shared_blks_read; - entry->counters.shared_blks_dirtied += bufusage->shared_blks_dirtied; - entry->counters.shared_blks_written += bufusage->shared_blks_written; - entry->counters.local_blks_hit += bufusage->local_blks_hit; - entry->counters.local_blks_read += bufusage->local_blks_read; - entry->counters.local_blks_dirtied += bufusage->local_blks_dirtied; - entry->counters.local_blks_written += bufusage->local_blks_written; - entry->counters.temp_blks_read += bufusage->temp_blks_read; - entry->counters.temp_blks_written += bufusage->temp_blks_written; - entry->counters.shared_blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->shared_blk_read_time); - entry->counters.shared_blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->shared_blk_write_time); - entry->counters.local_blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->local_blk_read_time); - entry->counters.local_blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->local_blk_write_time); - entry->counters.temp_blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->temp_blk_read_time); - entry->counters.temp_blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->temp_blk_write_time); + if (bufusage) + { + entry->counters.shared_blks_hit += bufusage->shared_blks_hit; + entry->counters.shared_blks_read += bufusage->shared_blks_read; + entry->counters.shared_blks_dirtied += bufusage->shared_blks_dirtied; + entry->counters.shared_blks_written += bufusage->shared_blks_written; + entry->counters.local_blks_hit += bufusage->local_blks_hit; + entry->counters.local_blks_read += bufusage->local_blks_read; + entry->counters.local_blks_dirtied += bufusage->local_blks_dirtied; + entry->counters.local_blks_written += bufusage->local_blks_written; + entry->counters.temp_blks_read += bufusage->temp_blks_read; + entry->counters.temp_blks_written += bufusage->temp_blks_written; + entry->counters.shared_blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->shared_blk_read_time); + entry->counters.shared_blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->shared_blk_write_time); + entry->counters.local_blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->local_blk_read_time); + entry->counters.local_blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->local_blk_write_time); + entry->counters.temp_blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->temp_blk_read_time); + entry->counters.temp_blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->temp_blk_write_time); + } entry->counters.usage += USAGE_EXEC(total_time); - entry->counters.wal_records += walusage->wal_records; - entry->counters.wal_fpi += walusage->wal_fpi; - entry->counters.wal_bytes += walusage->wal_bytes; - entry->counters.wal_buffers_full += walusage->wal_buffers_full; + if (walusage) + { + entry->counters.wal_records += walusage->wal_records; + entry->counters.wal_fpi += walusage->wal_fpi; + entry->counters.wal_bytes += walusage->wal_bytes; + entry->counters.wal_buffers_full += walusage->wal_buffers_full; + } if (jitusage) { entry->counters.jit_functions += jitusage->created_functions; diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index fee782d1c55..545185148ac 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -2286,9 +2286,9 @@ ExplainNode(PlanState *planstate, List *ancestors, /* Show buffer/WAL usage */ if (es->buffers && planstate->instrument) - show_buffer_usage(es, &planstate->instrument->bufusage); + show_buffer_usage(es, &planstate->instrument->stack.bufusage); if (es->wal && planstate->instrument) - show_wal_usage(es, &planstate->instrument->walusage); + show_wal_usage(es, &planstate->instrument->stack.walusage); /* Prepare per-worker buffer/WAL usage */ if (es->workers_state && (es->buffers || es->wal) && es->verbose) @@ -2305,9 +2305,9 @@ ExplainNode(PlanState *planstate, List *ancestors, ExplainOpenWorker(n, es); if (es->buffers) - show_buffer_usage(es, &instrument->bufusage); + show_buffer_usage(es, &instrument->stack.bufusage); if (es->wal) - show_wal_usage(es, &instrument->walusage); + show_wal_usage(es, &instrument->stack.walusage); ExplainCloseWorker(n, es); } } diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index a97977d988a..baf2bef17d6 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -329,6 +329,13 @@ standard_ExecutorRun(QueryDesc *queryDesc, */ oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + /* + * Start up required top-level instrumentation stack for WAL/buffer + * tracking + */ + if (!queryDesc->totaltime && (estate->es_instrument & (INSTRUMENT_BUFFERS | INSTRUMENT_WAL))) + queryDesc->totaltime = InstrAlloc(1, estate->es_instrument); + /* Allow instrumentation of Executor overall runtime */ if (queryDesc->totaltime) InstrStart(queryDesc->totaltime); @@ -383,7 +390,7 @@ standard_ExecutorRun(QueryDesc *queryDesc, dest->rShutdown(dest); if (queryDesc->totaltime) - InstrStop(queryDesc->totaltime); + InstrStop(queryDesc->totaltime, false); MemoryContextSwitchTo(oldcontext); } @@ -442,8 +449,15 @@ standard_ExecutorFinish(QueryDesc *queryDesc) if (!(estate->es_top_eflags & EXEC_FLAG_SKIP_TRIGGERS)) AfterTriggerEndQuery(estate); + /* + * Accumulate per node statistics, and then shut down instrumentation + * stack + */ + if (queryDesc->totaltime && estate->es_instrument) + ExecAccumNodeInstrumentation(queryDesc->planstate); + if (queryDesc->totaltime) - InstrStop(queryDesc->totaltime); + InstrStop(queryDesc->totaltime, true); MemoryContextSwitchTo(oldcontext); @@ -1266,7 +1280,15 @@ InitResultRelInfo(ResultRelInfo *resultRelInfo, resultRelInfo->ri_TrigWhenExprs = (ExprState **) palloc0(n * sizeof(ExprState *)); if (instrument_options) - resultRelInfo->ri_TrigInstrument = InstrAllocTrigger(n, instrument_options); + { + /* + * Triggers do not individually track buffer/WAL usage, even if + * otherwise tracked + */ + int opts = (instrument_options & INSTRUMENT_TIMER) != 0 ? INSTRUMENT_TIMER : INSTRUMENT_ROWS; + + resultRelInfo->ri_TrigInstrument = InstrAllocTrigger(n, opts); + } } else { diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c index d286471254b..d00cf820a27 100644 --- a/src/backend/executor/execProcnode.c +++ b/src/backend/executor/execProcnode.c @@ -122,6 +122,7 @@ static TupleTableSlot *ExecProcNodeFirst(PlanState *node); static TupleTableSlot *ExecProcNodeInstr(PlanState *node); static bool ExecShutdownNode_walker(PlanState *node, void *context); +static bool ExecAccumNodeInstrumentation_walker(PlanState *node, void *context); /* ------------------------------------------------------------------------ @@ -828,6 +829,36 @@ ExecShutdownNode_walker(PlanState *node, void *context) return false; } +/* + * ExecAccumNodeInstrumentation + * + * Accumulate instrumentation stats from all execution nodes to their respective + * parents (or the original parent instrumentation stack). + * + * This must run after the cleanup done by ExecShutdownNode, and not rely on any + * resources cleaned up by it. We also expect shutdown actions to have occurred, + * e.g. parallel worker instrumentation to have been added to the leader. + */ +void +ExecAccumNodeInstrumentation(PlanState *node) +{ + (void) ExecAccumNodeInstrumentation_walker(node, NULL); +} + +static bool +ExecAccumNodeInstrumentation_walker(PlanState *node, void *context) +{ + if (node == NULL) + return false; + + planstate_tree_walker(node, ExecAccumNodeInstrumentation_walker, context); + + if (node->instrument && node->instrument->stack.previous) + InstrStackAdd(node->instrument->stack.previous, &node->instrument->stack); + + return false; +} + /* * ExecSetTupleBound * diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c index 41a342cab7f..37055d01f61 100644 --- a/src/backend/executor/instrument.c +++ b/src/backend/executor/instrument.c @@ -16,37 +16,103 @@ #include #include "executor/instrument.h" +#include "utils/memutils.h" +#include "utils/resowner.h" BufferUsage pgBufferUsage; static BufferUsage save_pgBufferUsage; WalUsage pgWalUsage; +InstrStack *pgInstrStack = NULL; static WalUsage save_pgWalUsage; static void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add); static void WalUsageAdd(WalUsage *dst, WalUsage *add); +/* + * Use ResourceOwner mechanism to correctly reset pgInstrStack on abort. + */ +static void ResOwnerReleaseInstrumentation(Datum res); +static const ResourceOwnerDesc instrumentation_resowner_desc = +{ + .name = "instrumentation", + .release_phase = RESOURCE_RELEASE_AFTER_LOCKS, + .release_priority = RELEASE_PRIO_INSTRUMENTATION, + .ReleaseResource = ResOwnerReleaseInstrumentation, + .DebugPrint = NULL, /* default message is fine */ +}; + +static inline void +ResourceOwnerRememberInstrStack(ResourceOwner owner, InstrStack * stack) +{ + ResourceOwnerRemember(owner, PointerGetDatum(stack), &instrumentation_resowner_desc); +} + +static inline void +ResourceOwnerForgetInstrStack(ResourceOwner owner, InstrStack * stack) +{ + ResourceOwnerForget(owner, PointerGetDatum(stack), &instrumentation_resowner_desc); +} + +static bool +StackIsParent(InstrStack * stack, InstrStack * entry) +{ + if (entry->previous == NULL) + return false; + + if (entry->previous == stack) + return true; + + return StackIsParent(stack, entry->previous); +} + +static void +ResOwnerReleaseInstrumentation(Datum res) +{ + InstrStack *stack = (InstrStack *) DatumGetPointer(res); + + if (pgInstrStack) + { + /* + * Because registered resources are *not* cleaned up in a guaranteed + * order, we may get a child context after we've processed the parent. + * Thus, we only change the stack if its not already a parent of the + * stack being released. + * + * If we already walked up the stack with an earlier resource, simply + * accumulate all collected stats before the abort to the current + * stack. + * + * Note that StackIsParent will recurse as needed, so it is + * inadvisible to use deeply nested stacks. + */ + if (!StackIsParent(pgInstrStack, stack)) + InstrPopStack(stack, true); + else + InstrStackAdd(pgInstrStack, stack); + } + + /* + * Ensure long-lived memory is freed now, as we don't expect InstrStop to + * be called + */ + pfree(stack); +} /* General purpose instrumentation handling */ Instrumentation * InstrAlloc(int n, int instrument_options) { - Instrumentation *instr; + Instrumentation *instr = palloc0(n * sizeof(Instrumentation)); + bool need_buffers = (instrument_options & INSTRUMENT_BUFFERS) != 0; + bool need_wal = (instrument_options & INSTRUMENT_WAL) != 0; + bool need_timer = (instrument_options & INSTRUMENT_TIMER) != 0; + int i; - /* initialize all fields to zeroes, then modify as needed */ - instr = palloc0(n * sizeof(Instrumentation)); - if (instrument_options & (INSTRUMENT_BUFFERS | INSTRUMENT_TIMER | INSTRUMENT_WAL)) + for (i = 0; i < n; i++) { - bool need_buffers = (instrument_options & INSTRUMENT_BUFFERS) != 0; - bool need_wal = (instrument_options & INSTRUMENT_WAL) != 0; - bool need_timer = (instrument_options & INSTRUMENT_TIMER) != 0; - int i; - - for (i = 0; i < n; i++) - { - instr[i].need_bufusage = need_buffers; - instr[i].need_walusage = need_wal; - instr[i].need_timer = need_timer; - } + instr[i].need_bufusage = need_buffers; + instr[i].need_walusage = need_wal; + instr[i].need_timer = need_timer; } return instr; @@ -59,15 +125,31 @@ InstrStart(Instrumentation *instr) !INSTR_TIME_SET_CURRENT_LAZY(instr->starttime)) elog(ERROR, "InstrStart called twice in a row"); - if (instr->need_bufusage) - instr->bufusage_start = pgBufferUsage; + if (instr->need_bufusage || instr->need_walusage) + { + Assert(CurrentResourceOwner != NULL); + instr->owner = CurrentResourceOwner; + + /* + * Allocate the stack resource in a memory context that survives + * during an abort. This will be freed by InstrStop (regular + * execution) or ResOwnerReleaseInstrumentation (abort). + * + * We don't do this in InstrAlloc to avoid leaking when InstrStart + + * InstrStop isn't called. + */ + if (instr->stack == NULL) + instr->stack = MemoryContextAllocZero(CurTransactionContext, sizeof(InstrStack)); - if (instr->need_walusage) - instr->walusage_start = pgWalUsage; + ResourceOwnerEnlarge(instr->owner); + ResourceOwnerRememberInstrStack(instr->owner, instr->stack); + + InstrPushStack(instr->stack); + } } void -InstrStop(Instrumentation *instr) +InstrStop(Instrumentation *instr, bool finalize) { instr_time endtime; @@ -83,14 +165,28 @@ InstrStop(Instrumentation *instr) INSTR_TIME_SET_ZERO(instr->starttime); } - /* Add delta of buffer usage since entry to node's totals */ - if (instr->need_bufusage) - BufferUsageAccumDiff(&instr->bufusage, - &pgBufferUsage, &instr->bufusage_start); + if (instr->need_bufusage || instr->need_walusage) + { + InstrPopStack(instr->stack, finalize); - if (instr->need_walusage) - WalUsageAccumDiff(&instr->walusage, - &pgWalUsage, &instr->walusage_start); + Assert(instr->owner != NULL); + ResourceOwnerForgetInstrStack(instr->owner, instr->stack); + instr->owner = NULL; + + if (finalize) + { + /* + * To avoid keeping memory allocated beyond when its needed, copy + * the result to the current memory context, and free it in the + * transaction context. + */ + InstrStack *stack = palloc(sizeof(InstrStack)); + + memcpy(stack, instr->stack, sizeof(InstrStack)); + pfree(instr->stack); + instr->stack = stack; + } + } } /* Trigger instrumentation handling */ @@ -98,15 +194,20 @@ TriggerInstrumentation * InstrAllocTrigger(int n, int instrument_options) { TriggerInstrumentation *tginstr = palloc0(n * sizeof(TriggerInstrumentation)); - bool need_buffers = (instrument_options & INSTRUMENT_BUFFERS) != 0; - bool need_wal = (instrument_options & INSTRUMENT_WAL) != 0; bool need_timer = (instrument_options & INSTRUMENT_TIMER) != 0; int i; + /* + * To avoid having to determine when the last trigger fired, we never + * track WAL/buffer usage for now + */ + Assert((instrument_options & INSTRUMENT_BUFFERS) == 0); + Assert((instrument_options & INSTRUMENT_WAL) == 0); + for (i = 0; i < n; i++) { - tginstr[i].instr.need_bufusage = need_buffers; - tginstr[i].instr.need_walusage = need_wal; + tginstr[i].instr.need_bufusage = false; + tginstr[i].instr.need_walusage = false; tginstr[i].instr.need_timer = need_timer; } @@ -122,7 +223,11 @@ InstrStartTrigger(TriggerInstrumentation * tginstr) void InstrStopTrigger(TriggerInstrumentation * tginstr, int firings) { - InstrStop(&tginstr->instr); + /* + * trigger instrumentation does not track WAL/buffer usage, so its okay to + * never finalize + */ + InstrStop(&tginstr->instr, false); tginstr->firings += firings; } @@ -173,12 +278,13 @@ InstrStartNode(NodeInstrumentation * instr) !INSTR_TIME_SET_CURRENT_LAZY(instr->starttime)) elog(ERROR, "InstrStartNode called twice in a row"); - /* save buffer usage totals at node entry, if needed */ - if (instr->need_bufusage) - instr->bufusage_start = pgBufferUsage; + if (instr->need_bufusage || instr->need_walusage) + { + /* Ensure that we always have a parent, even at the top most node */ + Assert(pgInstrStack != NULL); - if (instr->need_walusage) - instr->walusage_start = pgWalUsage; + InstrPushStack(&instr->stack); + } } /* Exit from a plan node */ @@ -203,14 +309,14 @@ InstrStopNode(NodeInstrumentation * instr, double nTuples) INSTR_TIME_SET_ZERO(instr->starttime); } - /* Add delta of buffer usage since entry to node's totals */ - if (instr->need_bufusage) - BufferUsageAccumDiff(&instr->bufusage, - &pgBufferUsage, &instr->bufusage_start); + if (instr->need_bufusage || instr->need_walusage) + { + /* Ensure that we always have a parent, even at the top most node */ + Assert(instr->stack.previous != NULL); - if (instr->need_walusage) - WalUsageAccumDiff(&instr->walusage, - &pgWalUsage, &instr->walusage_start); + /* Adding to parent is handled by ExecAccumNodeInstrumentation */ + InstrPopStack(&instr->stack, false); + } /* Is this the first tuple of this cycle? */ if (!instr->running) @@ -287,13 +393,13 @@ InstrAggNode(NodeInstrumentation * dst, NodeInstrumentation * add) /* Add delta of buffer usage since entry to node's totals */ if (dst->need_bufusage) - BufferUsageAdd(&dst->bufusage, &add->bufusage); + BufferUsageAdd(&dst->stack.bufusage, &add->stack.bufusage); if (dst->need_walusage) - WalUsageAdd(&dst->walusage, &add->walusage); + WalUsageAdd(&dst->stack.walusage, &add->stack.walusage); } -/* note current values during parallel executor startup */ +/* start instrumentation during parallel executor startup */ void InstrStartParallelQuery(void) { @@ -315,10 +421,28 @@ InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage) void InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage) { + if (pgInstrStack != NULL) + { + InstrStack *dst = pgInstrStack; + + BufferUsageAdd(&dst->bufusage, bufusage); + WalUsageAdd(&dst->walusage, walusage); + } + BufferUsageAdd(&pgBufferUsage, bufusage); WalUsageAdd(&pgWalUsage, walusage); } +void +InstrStackAdd(InstrStack * dst, InstrStack * add) +{ + Assert(dst != NULL); + Assert(add != NULL); + + BufferUsageAdd(&dst->bufusage, &add->bufusage); + WalUsageAdd(&dst->walusage, &add->walusage); +} + /* dst += add */ static void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add) diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index 8e7a5453064..692e3182e62 100644 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -297,6 +297,7 @@ extern void ExecSetExecProcNode(PlanState *node, ExecProcNodeMtd function); extern Node *MultiExecProcNode(PlanState *node); extern void ExecEndNode(PlanState *node); extern void ExecShutdownNode(PlanState *node); +extern void ExecAccumNodeInstrumentation(PlanState *node); extern void ExecSetTupleBound(int64 tuples_needed, PlanState *child_node); diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index 8e435e1f92c..30d81fceaaa 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -67,12 +67,25 @@ typedef enum InstrumentOption INSTRUMENT_ALL = PG_INT32_MAX } InstrumentOption; +/* Stack of WAL/buffer usage used for per-node instrumentation */ +typedef struct InstrStack +{ + struct InstrStack *previous; + BufferUsage bufusage; + WalUsage walusage; +} InstrStack; + /* * General purpose instrumentation that can capture time and WAL/buffer usage * * Initialized through InstrAlloc, followed by one or more calls to a pair of * InstrStart/InstrStop (activity is measured inbetween). + * + * Uses resource owner mechanism for handling aborts, as such, the caller *must* not exit out of + * top level transaction between InstrStart/InstrStop calls in regular execution. If this is needed, + * directly use InstrPushStack/InstrPopStack in a PG_TRY/PG_FINALLY block instead. */ +struct ResourceOwnerData; typedef struct Instrumentation { /* Parameters set at creation: */ @@ -81,12 +94,10 @@ typedef struct Instrumentation bool need_walusage; /* true if we need WAL usage data */ /* Internal state keeping: */ instr_time starttime; /* start time of last InstrStart */ - BufferUsage bufusage_start; /* buffer usage at start */ - WalUsage walusage_start; /* WAL usage at start */ /* Accumulated statistics: */ instr_time total; /* total runtime */ - BufferUsage bufusage; /* total buffer usage */ - WalUsage walusage; /* total WAL usage */ + InstrStack *stack; /* stack tracking buffer/WAL usage */ + struct ResourceOwnerData *owner; } Instrumentation; /* Trigger instrumentation */ @@ -99,6 +110,10 @@ typedef struct TriggerInstrumentation /* * Specialized instrumentation for per-node execution statistics + * + * Requires use of an outer InstrStart/InstrStop to handle the stack used for WAL/buffer + * usage statistics, and relies on it for managing aborts. Solely intended for + * the executor and anyone reporting about its activities (e.g. EXPLAIN ANALYZE). */ typedef struct NodeInstrumentation { @@ -113,8 +128,6 @@ typedef struct NodeInstrumentation instr_time counter; /* accumulated runtime for this node */ instr_time firsttuple; /* time for first tuple of this cycle */ double tuplecount; /* # of tuples emitted so far this cycle */ - BufferUsage bufusage_start; /* buffer usage at start */ - WalUsage walusage_start; /* WAL usage at start */ /* Accumulated statistics across all completed cycles: */ instr_time startup; /* total startup time */ instr_time total; /* total time */ @@ -123,8 +136,7 @@ typedef struct NodeInstrumentation double nloops; /* # of run cycles for this node */ double nfiltered1; /* # of tuples removed by scanqual or joinqual */ double nfiltered2; /* # of tuples removed by "other" quals */ - BufferUsage bufusage; /* total buffer usage */ - WalUsage walusage; /* total WAL usage */ + InstrStack stack; /* stack tracking buffer/WAL usage */ } NodeInstrumentation; typedef struct WorkerInstrumentation @@ -135,10 +147,31 @@ typedef struct WorkerInstrumentation extern PGDLLIMPORT BufferUsage pgBufferUsage; extern PGDLLIMPORT WalUsage pgWalUsage; +extern PGDLLIMPORT InstrStack * pgInstrStack; + +extern void InstrStackAdd(InstrStack * dst, InstrStack * add); + +static inline void +InstrPushStack(InstrStack * stack) +{ + stack->previous = pgInstrStack; + pgInstrStack = stack; +} + +static inline void +InstrPopStack(InstrStack * stack, bool add_to_parent) +{ + Assert(stack != NULL); + + pgInstrStack = stack->previous; + + if (pgInstrStack && add_to_parent) + InstrStackAdd(pgInstrStack, stack); +} extern Instrumentation *InstrAlloc(int n, int instrument_options); extern void InstrStart(Instrumentation *instr); -extern void InstrStop(Instrumentation *instr); +extern void InstrStop(Instrumentation *instr, bool finalize); extern TriggerInstrumentation * InstrAllocTrigger(int n, int instrument_options); extern void InstrStartTrigger(TriggerInstrumentation * tginstr); @@ -163,21 +196,34 @@ extern void WalUsageAccumDiff(WalUsage *dst, const WalUsage *add, #define INSTR_BUFUSAGE_INCR(fld) do { \ pgBufferUsage.fld++; \ + if (pgInstrStack) \ + pgInstrStack->bufusage.fld++; \ } while(0) #define INSTR_BUFUSAGE_ADD(fld,val) do { \ pgBufferUsage.fld += val; \ + if (pgInstrStack) \ + pgInstrStack->bufusage.fld += val; \ } while(0) #define INSTR_BUFUSAGE_TIME_ADD(fld,val) do { \ INSTR_TIME_ADD(pgBufferUsage.fld, val); \ + if (pgInstrStack) \ + INSTR_TIME_ADD(pgInstrStack->bufusage.fld, val); \ } while (0) #define INSTR_BUFUSAGE_TIME_ACCUM_DIFF(fld,endval,startval) do { \ INSTR_TIME_ACCUM_DIFF(pgBufferUsage.fld, endval, startval); \ + if (pgInstrStack) \ + INSTR_TIME_ACCUM_DIFF(pgInstrStack->bufusage.fld, endval, startval); \ } while (0) + #define INSTR_WALUSAGE_INCR(fld) do { \ pgWalUsage.fld++; \ + if (pgInstrStack) \ + pgInstrStack->walusage.fld++; \ } while(0) #define INSTR_WALUSAGE_ADD(fld,val) do { \ pgWalUsage.fld += val; \ + if (pgInstrStack) \ + pgInstrStack->walusage.fld += val; \ } while(0) #endif /* INSTRUMENT_H */ diff --git a/src/include/utils/resowner.h b/src/include/utils/resowner.h index aede4bfc820..c02b75480ff 100644 --- a/src/include/utils/resowner.h +++ b/src/include/utils/resowner.h @@ -75,6 +75,7 @@ typedef uint32 ResourceReleasePriority; #define RELEASE_PRIO_SNAPSHOT_REFS 500 #define RELEASE_PRIO_FILES 600 #define RELEASE_PRIO_WAITEVENTSETS 700 +#define RELEASE_PRIO_INSTRUMENTATION 800 /* 0 is considered invalid */ #define RELEASE_PRIO_FIRST 1 -- 2.47.1