From 39e4a7cae8edc26bfcc9ec99756ad01cd2f587b9 Mon Sep 17 00:00:00 2001 From: Lukas Fittl Date: Tue, 9 Sep 2025 02:16:59 -0700 Subject: [PATCH v2 4/7] Introduce stack for tracking per-node WAL/buffer usage --- .../pg_stat_statements/pg_stat_statements.c | 4 +- src/backend/commands/explain.c | 8 +- src/backend/commands/trigger.c | 4 +- src/backend/executor/execMain.c | 25 ++- src/backend/executor/execProcnode.c | 29 +++ src/backend/executor/instrument.c | 199 ++++++++++++++---- src/include/executor/executor.h | 1 + src/include/executor/instrument.h | 53 ++++- 8 files changed, 260 insertions(+), 63 deletions(-) diff --git a/contrib/pg_stat_statements/pg_stat_statements.c b/contrib/pg_stat_statements/pg_stat_statements.c index 4ec33fbf470..deb6d43a47f 100644 --- a/contrib/pg_stat_statements/pg_stat_statements.c +++ b/contrib/pg_stat_statements/pg_stat_statements.c @@ -1087,8 +1087,8 @@ pgss_ExecutorEnd(QueryDesc *queryDesc) PGSS_EXEC, INSTR_TIME_GET_MILLISEC(queryDesc->totaltime->total), queryDesc->estate->es_total_processed, - &queryDesc->totaltime->bufusage, - &queryDesc->totaltime->walusage, + &INSTR_GET_BUFUSAGE(queryDesc->totaltime), + &INSTR_GET_WALUSAGE(queryDesc->totaltime), queryDesc->estate->es_jit ? &queryDesc->estate->es_jit->instr : NULL, NULL, queryDesc->estate->es_parallel_workers_to_launch, diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index d005fcdbc98..1c4d0e14334 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -2280,9 +2280,9 @@ ExplainNode(PlanState *planstate, List *ancestors, /* Show buffer/WAL usage */ if (es->buffers && planstate->instrument) - show_buffer_usage(es, &planstate->instrument->bufusage); + show_buffer_usage(es, &planstate->instrument->stack.bufusage); if (es->wal && planstate->instrument) - show_wal_usage(es, &planstate->instrument->walusage); + show_wal_usage(es, &planstate->instrument->stack.walusage); /* Prepare per-worker buffer/WAL usage */ if (es->workers_state && (es->buffers || es->wal) && es->verbose) @@ -2299,9 +2299,9 @@ ExplainNode(PlanState *planstate, List *ancestors, ExplainOpenWorker(n, es); if (es->buffers) - show_buffer_usage(es, &instrument->bufusage); + show_buffer_usage(es, &instrument->stack.bufusage); if (es->wal) - show_wal_usage(es, &instrument->walusage); + show_wal_usage(es, &instrument->stack.walusage); ExplainCloseWorker(n, es); } } diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c index 9b53dd99e99..67a2fdd034a 100644 --- a/src/backend/commands/trigger.c +++ b/src/backend/commands/trigger.c @@ -2392,7 +2392,7 @@ ExecCallTriggerFunc(TriggerData *trigdata, * one "tuple returned" (really the number of firings). */ if (instr) - InstrStop(instr + tgindx, 1); + InstrStop(instr + tgindx, 1, false); return (HeapTuple) DatumGetPointer(result); } @@ -4607,7 +4607,7 @@ AfterTriggerExecute(EState *estate, * one "tuple returned" (really the number of firings). */ if (instr) - InstrStop(instr + tgindx, 1); + InstrStop(instr + tgindx, 1, false); } diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index e459b3aa797..37f00026dc6 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -329,6 +329,13 @@ standard_ExecutorRun(QueryDesc *queryDesc, */ oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + /* + * Start up required top-level instrumentation stack for WAL/buffer + * tracking + */ + if (!queryDesc->totaltime && (estate->es_instrument & (INSTRUMENT_BUFFERS | INSTRUMENT_WAL))) + queryDesc->totaltime = InstrAlloc(1, estate->es_instrument); + /* Allow instrumentation of Executor overall runtime */ if (queryDesc->totaltime) InstrStart(queryDesc->totaltime); @@ -383,7 +390,7 @@ standard_ExecutorRun(QueryDesc *queryDesc, dest->rShutdown(dest); if (queryDesc->totaltime) - InstrStop(queryDesc->totaltime, estate->es_processed); + InstrStop(queryDesc->totaltime, estate->es_processed, false); MemoryContextSwitchTo(oldcontext); } @@ -442,8 +449,15 @@ standard_ExecutorFinish(QueryDesc *queryDesc) if (!(estate->es_top_eflags & EXEC_FLAG_SKIP_TRIGGERS)) AfterTriggerEndQuery(estate); + /* + * Accumulate per node statistics, and then shut down instrumentation + * stack + */ + if (queryDesc->totaltime && estate->es_instrument) + ExecAccumNodeInstrumentation(queryDesc->planstate); + if (queryDesc->totaltime) - InstrStop(queryDesc->totaltime, 0); + InstrStop(queryDesc->totaltime, 0, true); MemoryContextSwitchTo(oldcontext); @@ -1266,7 +1280,12 @@ InitResultRelInfo(ResultRelInfo *resultRelInfo, resultRelInfo->ri_TrigWhenExprs = (ExprState **) palloc0(n * sizeof(ExprState *)); if (instrument_options) - resultRelInfo->ri_TrigInstrument = InstrAlloc(n, instrument_options); + { + if ((instrument_options & INSTRUMENT_TIMER) != 0) + resultRelInfo->ri_TrigInstrument = InstrAlloc(n, INSTRUMENT_TIMER); + else + resultRelInfo->ri_TrigInstrument = InstrAlloc(n, INSTRUMENT_ROWS); + } } else { diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c index d286471254b..1b3b39222a9 100644 --- a/src/backend/executor/execProcnode.c +++ b/src/backend/executor/execProcnode.c @@ -122,6 +122,7 @@ static TupleTableSlot *ExecProcNodeFirst(PlanState *node); static TupleTableSlot *ExecProcNodeInstr(PlanState *node); static bool ExecShutdownNode_walker(PlanState *node, void *context); +static bool ExecAccumNodeInstrumentation_walker(PlanState *node, void *context); /* ------------------------------------------------------------------------ @@ -828,6 +829,34 @@ ExecShutdownNode_walker(PlanState *node, void *context) return false; } +/* + * ExecAccumNodeInstrumentation + * + * Accumulate instrumentation stats from all execution nodes to their respective + * parents (or the original parent instrumentation stack). + */ +void +ExecAccumNodeInstrumentation(PlanState *node) +{ + (void) ExecAccumNodeInstrumentation_walker(node, NULL); +} + +static bool +ExecAccumNodeInstrumentation_walker(PlanState *node, void *context) +{ + if (node == NULL) + return false; + + check_stack_depth(); + + planstate_tree_walker(node, ExecAccumNodeInstrumentation_walker, context); + + if (node->instrument && node->instrument->stack.previous) + InstrStackAdd(node->instrument->stack.previous, &node->instrument->stack); + + return false; +} + /* * ExecSetTupleBound * diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c index 1fe0f4204e5..8ef626721f3 100644 --- a/src/backend/executor/instrument.c +++ b/src/backend/executor/instrument.c @@ -16,56 +16,150 @@ #include #include "executor/instrument.h" +#include "utils/memutils.h" BufferUsage pgBufferUsage; static BufferUsage save_pgBufferUsage; WalUsage pgWalUsage; +InstrStack *pgInstrStack = NULL; static WalUsage save_pgWalUsage; static void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add); static void WalUsageAdd(WalUsage *dst, WalUsage *add); +/* + * Use ResourceOwner mechanism to correctly reset pgInstrStack on abort. + */ +static void ResOwnerReleaseInstrumentation(Datum res); +static const ResourceOwnerDesc instrumentation_resowner_desc = +{ + .name = "instrumentation", + .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS, + .release_priority = RELEASE_PRIO_FIRST, + .ReleaseResource = ResOwnerReleaseInstrumentation, + .DebugPrint = NULL, /* default message is fine */ +}; + +static inline void +ResourceOwnerRememberInstrStack(ResourceOwner owner, Instrumentation *instr) +{ + ResourceOwnerRemember(owner, PointerGetDatum(instr), &instrumentation_resowner_desc); +} + +static inline void +ResourceOwnerForgetInstrStack(ResourceOwner owner, Instrumentation *instr) +{ + ResourceOwnerForget(owner, PointerGetDatum(instr), &instrumentation_resowner_desc); +} + +static void +InstrPushStackResource(Instrumentation *res) +{ + ResourceOwner owner = CurrentResourceOwner; + + Assert(owner != NULL); + + res->owner = owner; + + ResourceOwnerEnlarge(owner); + ResourceOwnerRememberInstrStack(owner, res); + + res->stack.previous = pgInstrStack; + pgInstrStack = &res->stack; +} + +static void +InstrPopStackResource(Instrumentation *res) +{ + Assert(res != NULL); + Assert(res->owner != NULL); + + pgInstrStack = res->stack.previous; + + ResourceOwnerForgetInstrStack(res->owner, res); +} + +static bool +StackIsParent(InstrStack * stack, InstrStack * entry) +{ + if (entry->previous == NULL) + return false; + + if (entry->previous == stack) + return true; + + return StackIsParent(stack, entry->previous); +} + +static void +ResOwnerReleaseInstrumentation(Datum res) +{ + Instrumentation *instr = (Instrumentation *) DatumGetPointer(res); + + /* + * Because registered resources are *not* called in reverse order, we'll + * get what was first registered first at shutdown. Thus, on any later + * resources we need to not change the stack, which was already set to the + * correct previous entry. + */ + if (pgInstrStack && !StackIsParent(pgInstrStack, &instr->stack)) + pgInstrStack = instr->stack.previous; + + /* + * Always accumulate all collected stats before the abort, even if we + * already walked up the stack with an earlier resource. + */ + if (pgInstrStack) + InstrStackAdd(pgInstrStack, &instr->stack); + + instr->finalized = true; +} /* General purpose instrumentation handling */ Instrumentation * InstrAlloc(int n, int instrument_options) { - Instrumentation *instr; + Instrumentation *instr = NULL; + bool need_buffers = (instrument_options & INSTRUMENT_BUFFERS) != 0; + bool need_wal = (instrument_options & INSTRUMENT_WAL) != 0; + bool need_timer = (instrument_options & INSTRUMENT_TIMER) != 0; + int i; + + /* + * If resource owner will be used, we must allocate in the transaction + * context (not the calling context, usually a lower context), because the + * memory might otherwise be freed too early in an abort situation. + */ + if (need_buffers || need_wal) + instr = MemoryContextAllocZero(CurTransactionContext, n * sizeof(Instrumentation)); + else + instr = palloc0(n * sizeof(Instrumentation)); - /* initialize all fields to zeroes, then modify as needed */ - instr = palloc0(n * sizeof(Instrumentation)); - if (instrument_options & (INSTRUMENT_BUFFERS | INSTRUMENT_TIMER | INSTRUMENT_WAL)) + for (i = 0; i < n; i++) { - bool need_buffers = (instrument_options & INSTRUMENT_BUFFERS) != 0; - bool need_wal = (instrument_options & INSTRUMENT_WAL) != 0; - bool need_timer = (instrument_options & INSTRUMENT_TIMER) != 0; - int i; - - for (i = 0; i < n; i++) - { - instr[i].need_bufusage = need_buffers; - instr[i].need_walusage = need_wal; - instr[i].need_timer = need_timer; - } + instr[i].need_bufusage = need_buffers; + instr[i].need_walusage = need_wal; + instr[i].need_timer = need_timer; } return instr; } + void InstrStart(Instrumentation *instr) { + Assert(!instr->finalized); + if (instr->need_timer && !INSTR_TIME_SET_CURRENT_LAZY(instr->starttime)) elog(ERROR, "InstrStart called twice in a row"); - if (instr->need_bufusage) - instr->bufusage_start = pgBufferUsage; - - if (instr->need_walusage) - instr->walusage_start = pgWalUsage; + if (instr->need_bufusage || instr->need_walusage) + InstrPushStackResource(instr); } + void -InstrStop(Instrumentation *instr, double nTuples) +InstrStop(Instrumentation *instr, double nTuples, bool finalize) { instr_time endtime; @@ -84,14 +178,15 @@ InstrStop(Instrumentation *instr, double nTuples) INSTR_TIME_SET_ZERO(instr->starttime); } - /* Add delta of buffer usage since entry to node's totals */ - if (instr->need_bufusage) - BufferUsageAccumDiff(&instr->bufusage, - &pgBufferUsage, &instr->bufusage_start); + if (instr->need_bufusage || instr->need_walusage) + InstrPopStackResource(instr); - if (instr->need_walusage) - WalUsageAccumDiff(&instr->walusage, - &pgWalUsage, &instr->walusage_start); + if (finalize) + { + instr->finalized = true; + if (pgInstrStack) + InstrStackAdd(pgInstrStack, &instr->stack); + } } /* Allocate new node instrumentation structure(s) */ @@ -139,12 +234,14 @@ InstrStartNode(NodeInstrumentation * instr) !INSTR_TIME_SET_CURRENT_LAZY(instr->starttime)) elog(ERROR, "InstrStartNode called twice in a row"); - /* save buffer usage totals at node entry, if needed */ - if (instr->need_bufusage) - instr->bufusage_start = pgBufferUsage; + if (instr->need_bufusage || instr->need_walusage) + { + /* Ensure that we always have a parent, even at the top most node */ + Assert(pgInstrStack != NULL); - if (instr->need_walusage) - instr->walusage_start = pgWalUsage; + instr->stack.previous = pgInstrStack; + pgInstrStack = &instr->stack; + } } /* Exit from a plan node */ @@ -169,14 +266,12 @@ InstrStopNode(NodeInstrumentation * instr, double nTuples) INSTR_TIME_SET_ZERO(instr->starttime); } - /* Add delta of buffer usage since entry to node's totals */ - if (instr->need_bufusage) - BufferUsageAccumDiff(&instr->bufusage, - &pgBufferUsage, &instr->bufusage_start); - - if (instr->need_walusage) - WalUsageAccumDiff(&instr->walusage, - &pgWalUsage, &instr->walusage_start); + if (instr->need_bufusage || instr->need_walusage) + { + /* Ensure that we always have a parent, even at the top most node */ + Assert(instr->stack.previous != NULL); + pgInstrStack = instr->stack.previous; + } /* Is this the first tuple of this cycle? */ if (!instr->running) @@ -253,10 +348,20 @@ InstrAggNode(NodeInstrumentation * dst, NodeInstrumentation * add) /* Add delta of buffer usage since entry to node's totals */ if (dst->need_bufusage) - BufferUsageAdd(&dst->bufusage, &add->bufusage); + BufferUsageAdd(&dst->stack.bufusage, &add->stack.bufusage); if (dst->need_walusage) - WalUsageAdd(&dst->walusage, &add->walusage); + WalUsageAdd(&dst->stack.walusage, &add->stack.walusage); +} + +void +InstrStackAdd(InstrStack * dst, InstrStack * add) +{ + Assert(dst != NULL); + Assert(add != NULL); + + BufferUsageAdd(&dst->bufusage, &add->bufusage); + WalUsageAdd(&dst->walusage, &add->walusage); } /* note current values during parallel executor startup */ @@ -281,6 +386,14 @@ InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage) void InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage) { + if (pgInstrStack != NULL) + { + InstrStack *dst = pgInstrStack; + + BufferUsageAdd(&dst->bufusage, bufusage); + WalUsageAdd(&dst->walusage, walusage); + } + BufferUsageAdd(&pgBufferUsage, bufusage); WalUsageAdd(&pgWalUsage, walusage); } diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index 31133514e84..ba76a370d3f 100644 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -297,6 +297,7 @@ extern void ExecSetExecProcNode(PlanState *node, ExecProcNodeMtd function); extern Node *MultiExecProcNode(PlanState *node); extern void ExecEndNode(PlanState *node); extern void ExecShutdownNode(PlanState *node); +extern void ExecAccumNodeInstrumentation(PlanState *node); extern void ExecSetTupleBound(int64 tuples_needed, PlanState *child_node); diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index 78d3653997b..d04607ce40c 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -14,6 +14,7 @@ #define INSTRUMENT_H #include "portability/instr_time.h" +#include "utils/resowner.h" /* @@ -66,11 +67,23 @@ typedef enum InstrumentOption INSTRUMENT_ALL = PG_INT32_MAX } InstrumentOption; +/* Stack of WAL/buffer usage used for per-node instrumentation */ +typedef struct InstrStack +{ + struct InstrStack *previous; + BufferUsage bufusage; + WalUsage walusage; +} InstrStack; + /* * General purpose instrumentation that can capture time, WAL/buffer usage and tuples * * Initialized through InstrAlloc, followed by one or more calls to a pair of * InstrStart/InstrStop (activity is measured inbetween). + * + * Uses resource owner mechanism for handling aborts, as such, the caller *must* not exit out of + * top level transaction between InstrStart/InstrStop calls in regular execution. If this is needed, + * directly use InstrPushStack/InstrPopStack in a PG_TRY/PG_FINALLY block instead. */ typedef struct Instrumentation { @@ -79,18 +92,22 @@ typedef struct Instrumentation bool need_bufusage; /* true if we need buffer usage data */ bool need_walusage; /* true if we need WAL usage data */ /* Internal state keeping: */ + bool finalized; /* true if no more InstrStart calls are + * allowed */ instr_time starttime; /* start time of last InstrStart */ - BufferUsage bufusage_start; /* buffer usage at start */ - WalUsage walusage_start; /* WAL usage at start */ /* Accumulated statistics: */ instr_time total; /* total runtime */ double ntuples; /* total tuples counted in InstrStop */ - BufferUsage bufusage; /* total buffer usage */ - WalUsage walusage; /* total WAL usage */ + InstrStack stack; /* stack tracking buffer/WAL usage */ + ResourceOwner owner; } Instrumentation; /* * Specialized instrumentation for per-node execution statistics + * + * Requires use of an outer InstrStart/InstrStop to handle the stack used for WAL/buffer + * usage statistics, and relies on it for managing aborts. Solely intended for + * the executor and anyone reporting about its activities (e.g. EXPLAIN ANALYZE). */ typedef struct NodeInstrumentation { @@ -105,8 +122,6 @@ typedef struct NodeInstrumentation instr_time counter; /* accumulated runtime for this node */ instr_time firsttuple; /* time for first tuple of this cycle */ double tuplecount; /* # of tuples emitted so far this cycle */ - BufferUsage bufusage_start; /* buffer usage at start */ - WalUsage walusage_start; /* WAL usage at start */ /* Accumulated statistics across all completed cycles: */ instr_time startup; /* total startup time */ instr_time total; /* total time */ @@ -115,8 +130,7 @@ typedef struct NodeInstrumentation double nloops; /* # of run cycles for this node */ double nfiltered1; /* # of tuples removed by scanqual or joinqual */ double nfiltered2; /* # of tuples removed by "other" quals */ - BufferUsage bufusage; /* total buffer usage */ - WalUsage walusage; /* total WAL usage */ + InstrStack stack; /* stack tracking buffer/WAL usage */ } NodeInstrumentation; typedef struct WorkerInstrumentation @@ -127,10 +141,11 @@ typedef struct WorkerInstrumentation extern PGDLLIMPORT BufferUsage pgBufferUsage; extern PGDLLIMPORT WalUsage pgWalUsage; +extern PGDLLIMPORT InstrStack * pgInstrStack; extern Instrumentation *InstrAlloc(int n, int instrument_options); extern void InstrStart(Instrumentation *instr); -extern void InstrStop(Instrumentation *instr, double nTuples); +extern void InstrStop(Instrumentation *instr, double nTuples, bool finalize); extern NodeInstrumentation * InstrAllocNode(int n, int instrument_options, bool async_mode); @@ -146,26 +161,46 @@ extern void InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage); extern void InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage); extern void BufferUsageAccumDiff(BufferUsage *dst, const BufferUsage *add, const BufferUsage *sub); +extern void InstrStackAdd(InstrStack * dst, InstrStack * add); extern void WalUsageAccumDiff(WalUsage *dst, const WalUsage *add, const WalUsage *sub); +#define INSTR_GET_BUFUSAGE(instr) \ + instr->stack.bufusage + +#define INSTR_GET_WALUSAGE(instr) \ + instr->stack.walusage + #define INSTR_BUFUSAGE_INCR(fld) do { \ pgBufferUsage.fld++; \ + if (pgInstrStack) \ + pgInstrStack->bufusage.fld++; \ } while(0) #define INSTR_BUFUSAGE_ADD(fld,val) do { \ pgBufferUsage.fld += val; \ + if (pgInstrStack) \ + pgInstrStack->bufusage.fld += val; \ } while(0) #define INSTR_BUFUSAGE_TIME_ADD(fld,val) do { \ INSTR_TIME_ADD(pgBufferUsage.fld, val); \ + if (pgInstrStack) \ + INSTR_TIME_ADD(pgInstrStack->bufusage.fld, val); \ } while (0) #define INSTR_BUFUSAGE_TIME_ACCUM_DIFF(fld,endval,startval) do { \ INSTR_TIME_ACCUM_DIFF(pgBufferUsage.fld, endval, startval); \ + if (pgInstrStack) \ + INSTR_TIME_ACCUM_DIFF(pgInstrStack->bufusage.fld, endval, startval); \ } while (0) + #define INSTR_WALUSAGE_INCR(fld) do { \ pgWalUsage.fld++; \ + if (pgInstrStack) \ + pgInstrStack->walusage.fld++; \ } while(0) #define INSTR_WALUSAGE_ADD(fld,val) do { \ pgWalUsage.fld += val; \ + if (pgInstrStack) \ + pgInstrStack->walusage.fld += val; \ } while(0) #endif /* INSTRUMENT_H */ -- 2.47.1