diff --git a/contrib/Makefile b/contrib/Makefile index da4e2316a3..721a58fcde 100644 --- a/contrib/Makefile +++ b/contrib/Makefile @@ -36,6 +36,7 @@ SUBDIRS = \ pg_prewarm \ pg_stat_statements \ pg_surgery \ + pg_tracing \ pg_trgm \ pgrowlocks \ pgstattuple \ diff --git a/contrib/meson.build b/contrib/meson.build index 84d4e18561..f73caf95f5 100644 --- a/contrib/meson.build +++ b/contrib/meson.build @@ -47,6 +47,7 @@ subdir('pgrowlocks') subdir('pg_stat_statements') subdir('pgstattuple') subdir('pg_surgery') +subdir('pg_tracing') subdir('pg_trgm') subdir('pg_visibility') subdir('pg_walinspect') diff --git a/contrib/pg_tracing/Makefile b/contrib/pg_tracing/Makefile new file mode 100644 index 0000000000..44398a7c44 --- /dev/null +++ b/contrib/pg_tracing/Makefile @@ -0,0 +1,28 @@ +# contrib/pg_tracing/Makefile + +MODULE_big = pg_tracing +OBJS = \ + $(WIN32RES) \ + pg_tracing.o \ + query_process.o \ + pg_tracing_explain.o \ + span.o + +EXTENSION = pg_tracing +DATA = pg_tracing--1.0.sql +TAP_TESTS = 0 +PGFILEDESC = "pg_tracing - Distributed tracing for postgres" + +REGRESS_OPTS = --temp-config $(top_srcdir)/contrib/pg_tracing/pg_tracing.conf +REGRESS = create_table utility select sample subxact reset nested wal parallel cleanup + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = contrib/pg_tracing +top_builddir = ../.. 
+include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/contrib/pg_tracing/expected/cleanup.out b/contrib/pg_tracing/expected/cleanup.out new file mode 100644 index 0000000000..ab11333d4e --- /dev/null +++ b/contrib/pg_tracing/expected/cleanup.out @@ -0,0 +1,3 @@ +DROP TABLE pg_tracing_test; +DROP function test_function; +DROP EXTENSION pg_tracing; diff --git a/contrib/pg_tracing/expected/create_table.out b/contrib/pg_tracing/expected/create_table.out new file mode 100644 index 0000000000..87875a113f --- /dev/null +++ b/contrib/pg_tracing/expected/create_table.out @@ -0,0 +1 @@ +CREATE TABLE pg_tracing_test (a int, b char(20)); diff --git a/contrib/pg_tracing/expected/nested.out b/contrib/pg_tracing/expected/nested.out new file mode 100644 index 0000000000..0a51c50938 --- /dev/null +++ b/contrib/pg_tracing/expected/nested.out @@ -0,0 +1,122 @@ +-- Create test functions +CREATE OR REPLACE FUNCTION test_function(a int) RETURNS SETOF oid AS +$BODY$ +BEGIN + RETURN QUERY SELECT oid from pg_class where oid = a; +END; +$BODY$ +LANGUAGE plpgsql; +CREATE OR REPLACE PROCEDURE sum_one(i int) AS $$ +DECLARE + r int; +BEGIN + SELECT (i + i)::int INTO r; +END; $$ LANGUAGE plpgsql; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000051-0000000000000051-01'*/ select test_function(1); + test_function +--------------- +(0 rows) + +SELECT span_id AS top_span, + extract(epoch from span_start) as top_start, + round(extract(epoch from span_start) + duration / 1000000000.0, 6) as top_end + from pg_tracing_spans(false) where parent_id=81 and name!='Parse' \gset +SELECT span_id AS top_run_span, + extract(epoch from span_start) as top_run_start, + round(extract(epoch from span_start) + duration / 1000000000.0, 6) as top_run_end + from pg_tracing_spans(false) where parent_id=:top_span and name='Executor' and resource='Run' \gset +SELECT span_id AS top_project, + extract(epoch from span_start) as top_project_start, + 
round(extract(epoch from span_start) + duration / 1000000000.0, 6) as top_project_end + from pg_tracing_spans(false) where parent_id=:top_run_span and name='ProjectSet' \gset +SELECT span_id AS top_result, + extract(epoch from span_start) as top_result_start, + round(extract(epoch from span_start) + duration / 1000000000.0, 6) as top_result_end + from pg_tracing_spans(false) where parent_id=:top_project and name='Result' \gset +SELECT span_id AS nested_select, + extract(epoch from span_start) as select_start, + round(extract(epoch from span_start) + duration / 1000000000.0, 6) as select_end + from pg_tracing_spans(false) where parent_id=:top_result and name='Select' \gset +SELECT span_id AS nested_run, + extract(epoch from span_start) as run_start, + round(extract(epoch from span_start) + duration / 1000000000.0, 6) as run_end + from pg_tracing_spans(false) where parent_id=:nested_select and resource='Run' \gset +SELECT :top_start < :top_run_start AS top_query_before_run, + :top_end >= :top_run_end AS top_end_after_run_end, + :top_run_start <= :top_project_start AS top_run_start_before_project, + :top_run_end >= :top_project_end AS top_run_end_after_project_end, + :top_run_end >= :select_end AS top_run_end_before_select_end, + :top_run_end >= :run_end AS top_run_end_after_nested_run_end, + :run_end >= :select_end AS run_end_after_select_end; + top_query_before_run | top_end_after_run_end | top_run_start_before_project | top_run_end_after_project_end | top_run_end_before_select_end | top_run_end_after_nested_run_end | run_end_after_select_end +----------------------+-----------------------+------------------------------+-------------------------------+-------------------------------+----------------------------------+-------------------------- + t | t | t | t | t | t | t +(1 row) + +WITH max_duration AS (select max(duration) from pg_tracing_spans(false)) +SELECT duration = max_duration.max from pg_tracing_spans(false), max_duration + where span_id = :top_span; + 
?column? +---------- + t +(1 row) + +SELECT resource from pg_tracing_spans(false) where parent_id=:nested_run order by resource; + resource +--------------------------------------------------------------------------------- + IndexOnlyScan using pg_class_oid_index on pg_class|Index Cond: (oid = '1'::oid) +(1 row) + +-- Check tracking option +set pg_tracing.track = 'top'; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000052-0000000000000052-01'*/ select test_function(1); + test_function +--------------- +(0 rows) + +SELECT count(*) from pg_tracing_spans where trace_id=82; + count +------- + 10 +(1 row) + +set pg_tracing.track = 'none'; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000053-0000000000000053-01'*/ select test_function(1); + test_function +--------------- +(0 rows) + +SELECT count(*) from pg_tracing_spans where trace_id=83; + count +------- + 0 +(1 row) + +set pg_tracing.track = 'all'; +/*traceparent='00-00000000000000000000000000000054-0000000000000054-01'*/ CALL sum_one(3); +SELECT resource from pg_tracing_spans order by span_start, duration desc, resource; + resource +--------------------- + CALL sum_one(3); + Parse + Post Parse + CALL sum_one(3); + SELECT (i + i)::int + Parse + Post Parse + Planner + Planner + Start + Run + Result + Finish + End +(14 rows) + +set pg_tracing.track_utility=off; +/*traceparent='00-00000000000000000000000000000055-0000000000000055-01'*/ CALL sum_one(10); +SELECT resource from pg_tracing_spans order by span_start, duration desc, resource; + resource +---------- +(0 rows) + diff --git a/contrib/pg_tracing/expected/parallel.out b/contrib/pg_tracing/expected/parallel.out new file mode 100644 index 0000000000..2f12f65be0 --- /dev/null +++ b/contrib/pg_tracing/expected/parallel.out @@ -0,0 +1,91 @@ +begin; +-- encourage use of parallel plans +set parallel_setup_cost=0; +set parallel_tuple_cost=0; +set min_parallel_table_scan_size=0; +set max_parallel_workers_per_gather=4; 
+/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ select 1 from pg_class limit 1; + ?column? +---------- + 1 +(1 row) + +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000002-0000000000000002-01'*/ select 2 from pg_class limit 2; + ?column? +---------- + 2 + 2 +(2 rows) + +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000003-0000000000000003-00'*/ select 3 from pg_class limit 3; + ?column? +---------- + 3 + 3 + 3 +(3 rows) + +commit; +SELECT resource from pg_tracing_spans order by resource; + resource +----------------------------------- + End + End + End + End + End + End + End + End + Finish + Finish + Finish + Finish + Finish + Finish + Finish + Finish + Gather + Gather + Limit + Limit + Parallel SeqScan on pg_class + Parallel SeqScan on pg_class + Parallel SeqScan on pg_class + Parallel SeqScan on pg_class + Parallel SeqScan on pg_class + Parallel SeqScan on pg_class + Parallel SeqScan on pg_class + Parallel SeqScan on pg_class + Parse + Parse + Planner + Planner + Post Parse + Post Parse + Run + Run + Run + Run + Run + Run + Run + Run + Start + Start + Start + Start + Start + Start + Start + Start + Worker 0 + Worker 0 + Worker 1 + Worker 1 + Worker 2 + Worker 2 + select $1 from pg_class limit $2; + select $1 from pg_class limit $2; +(58 rows) + diff --git a/contrib/pg_tracing/expected/reset.out b/contrib/pg_tracing/expected/reset.out new file mode 100644 index 0000000000..2e0c903121 --- /dev/null +++ b/contrib/pg_tracing/expected/reset.out @@ -0,0 +1,13 @@ +-- Check reset is working +SELECT pg_tracing_reset(); + pg_tracing_reset +------------------ + +(1 row) + +SELECT traces from pg_tracing_info; + traces +-------- + 0 +(1 row) + diff --git a/contrib/pg_tracing/expected/sample.out b/contrib/pg_tracing/expected/sample.out new file mode 100644 index 0000000000..7275407e5d --- /dev/null +++ b/contrib/pg_tracing/expected/sample.out @@ -0,0 +1,215 @@ +-- Trace nothing +SET 
pg_tracing.sample_rate = 0.0; +SET pg_tracing.caller_sample_rate = 0.0; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000021-0000000000000021-01'*/ SELECT 1; + ?column? +---------- + 1 +(1 row) + +select count(distinct(trace_id)) from pg_tracing_spans(false); + count +------- + 0 +(1 row) + +select resource, parameters from pg_tracing_spans order by span_start, duration desc; + resource | parameters +----------+------------ +(0 rows) + +-- Trace everything +SET pg_tracing.sample_rate = 1.0; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000022-0000000000000022-01'*/ SELECT 1; + ?column? +---------- + 1 +(1 row) + +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000023-0000000000000023-00'*/ SELECT 2; + ?column? +---------- + 2 +(1 row) + +SELECT 3; + ?column? +---------- + 3 +(1 row) + +SELECT 4; + ?column? +---------- + 4 +(1 row) + +select count(distinct(trace_id)) from pg_tracing_spans(false); + count +------- + 4 +(1 row) + +select resource, parameters from pg_tracing_spans(false) order by span_start, duration desc; + resource | parameters +------------+------------ + SELECT $1; | $1 = 1 + Parse | + Post Parse | + Planner | + Start | + Run | + Result | + Finish | + End | + SELECT $1; | $1 = 2 + Parse | + Post Parse | + Planner | + Start | + Run | + Result | + Finish | + End | + SELECT $1; | $1 = 3 + Parse | + Post Parse | + Planner | + Start | + Run | + Result | + Finish | + End | + SELECT $1; | $1 = 4 + Parse | + Post Parse | + Planner | + Start | + Run | + Result | + Finish | + End | +(36 rows) + +-- Top spans should reuse generated ids +select resource, parameters from pg_tracing_spans(false) where trace_id = parent_id and parent_id = span_id order by span_start, duration desc; + resource | parameters +------------+------------ + SELECT $1; | $1 = 3 + SELECT $1; | $1 = 4 +(2 rows) + +select resource, parameters from pg_tracing_spans order by span_start, duration desc; + resource | parameters 
+------------+------------ + SELECT $1; | $1 = 1 + Parse | + Post Parse | + Planner | + Start | + Run | + Result | + Finish | + End | + SELECT $1; | $1 = 2 + Parse | + Post Parse | + Planner | + Start | + Run | + Result | + Finish | + End | + SELECT $1; | $1 = 3 + Parse | + Post Parse | + Planner | + Start | + Run | + Result | + Finish | + End | + SELECT $1; | $1 = 4 + Parse | + Post Parse | + Planner | + Start | + Run | + Result | + Finish | + End | +(36 rows) + +-- Cleanup +SET plan_cache_mode='auto'; +SET pg_tracing.sample_rate = 0.0; +-- Only trace query with sampled flag +SET pg_tracing.caller_sample_rate = 1.0; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000024-0000000000000024-01'*/ SELECT 1; + ?column? +---------- + 1 +(1 row) + +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000025-0000000000000025-00'*/ SELECT 2; + ?column? +---------- + 2 +(1 row) + +SELECT 1; + ?column? +---------- + 1 +(1 row) + +select count(distinct(trace_id)) from pg_tracing_spans(false); + count +------- + 3 +(1 row) + +select resource, parameters from pg_tracing_spans(false) order by span_start, duration desc; + resource | parameters +-----------------------------------+------------ + SET plan_cache_mode='auto'; | + Parse | + Post Parse | + SET plan_cache_mode='auto'; | + SET pg_tracing.sample_rate = 0.0; | + Parse | + Post Parse | + SET pg_tracing.sample_rate = 0.0; | + SELECT $1; | $1 = 1 + Parse | + Post Parse | + Planner | + Start | + Run | + Result | + Finish | + End | +(17 rows) + +select resource, parameters from pg_tracing_spans order by span_start; + resource | parameters +-----------------------------------+------------ + SET plan_cache_mode='auto'; | + Parse | + Post Parse | + SET plan_cache_mode='auto'; | + SET pg_tracing.sample_rate = 0.0; | + Parse | + Post Parse | + SET pg_tracing.sample_rate = 0.0; | + SELECT $1; | $1 = 1 + Parse | + Post Parse | + Planner | + Start | + Run | + Result | + Finish | + End | +(17 rows) + diff 
--git a/contrib/pg_tracing/expected/select.out b/contrib/pg_tracing/expected/select.out new file mode 100644 index 0000000000..206f0c03ed --- /dev/null +++ b/contrib/pg_tracing/expected/select.out @@ -0,0 +1,225 @@ +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ SELECT 1; + ?column? +---------- + 1 +(1 row) + +-- Get top span id +SELECT span_id AS top_span_id from pg_tracing_spans(false) where parent_id=1 and name!='Parse' \gset +-- Check parameters +SELECT parameters from pg_tracing_spans(false) where span_id=:top_span_id; + parameters +------------ + $1 = 1 +(1 row) + +-- Check the number of children +SELECT count(*) from pg_tracing_spans(false) where parent_id=:'top_span_id'; + count +------- + 7 +(1 row) + +-- Check resource and query id +SELECT resource, query_id from pg_tracing_spans(false) where trace_id=1 order by span_start, duration desc, resource; + resource | query_id +------------+--------------------- + SELECT $1; | 6865378226349601843 + Parse | 6865378226349601843 + Post Parse | 6865378226349601843 + Planner | 6865378226349601843 + Start | 6865378226349601843 + Run | 6865378226349601843 + Result | 6865378226349601843 + Finish | 6865378226349601843 + End | 6865378226349601843 +(9 rows) + +-- Check reported number of trace +SELECT traces from pg_tracing_info; + traces +-------- + 2 +(1 row) + +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000003-0000000000000003-01'*/ SELECT * from current_database(); + current_database +----------------------- + regression_pg_tracing +(1 row) + +SELECT resource from pg_tracing_spans where trace_id=3 order by resource; + resource +----------------------------------- + End + Finish + FunctionScan on current_database + Parse + Planner + Post Parse + Run + SELECT * from current_database(); + Start +(9 rows) + +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000004-0000000000000004-01'*/ SELECT s.relation_size + s.index_size +FROM (SELECT + 
pg_relation_size(C.oid) as relation_size, + pg_indexes_size(C.oid) as index_size + FROM pg_class C) as s limit 1; + ?column? +---------- + 0 +(1 row) + +SELECT resource from pg_tracing_spans where trace_id=4 order by resource; + resource +------------------------------------------------------------------------------------------------------------------------------------------------------------------ + End + Finish + Limit + Parse + Planner + Post Parse + Run + SELECT s.relation_size + s.index_sizeFROM (SELECT pg_relation_size(C.oid) as relation_size, pg_indexes_size(C.oid) as index_size FROM pg_class C) as s limit $1; + SeqScan on pg_class c + Start + SubqueryScan on s +(11 rows) + +-- Check that we're in a correct state after a timeout +set statement_timeout=200; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000007-0000000000000007-01'*/ select * from pg_sleep(10); +ERROR: canceling statement due to statement timeout +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000008-0000000000000008-01'*/ select 1; + ?column? +---------- + 1 +(1 row) + +SELECT trace_id, resource, sql_error_code from pg_tracing_spans order by span_start, duration desc, resource; + trace_id | resource | sql_error_code +----------+-----------------------------+---------------- + 7 | select * from pg_sleep($1); | 57014 + 7 | Parse | 00000 + 7 | Post Parse | 00000 + 7 | Planner | 00000 + 7 | Start | 00000 + 7 | Run | 57014 + 7 | FunctionScan on pg_sleep | 57014 + 8 | select $1; | 00000 + 8 | Parse | 00000 + 8 | Post Parse | 00000 + 8 | Planner | 00000 + 8 | Start | 00000 + 8 | Run | 00000 + 8 | Result | 00000 + 8 | Finish | 00000 + 8 | End | 00000 +(16 rows) + +set statement_timeout=0; +-- Test prepared statement +PREPARE test_prepared (text, integer) AS /*$1*/ SELECT $2; +EXECUTE test_prepared('dddbs=''postgres.db'',traceparent=''00-00000000000000000000000000000009-0000000000000009-01''', 1); + ?column? 
+---------- + 1 +(1 row) + +SELECT trace_id, resource, parameters from pg_tracing_spans order by span_start, duration desc, resource; + trace_id | resource | parameters +----------+-----------------------------------------------------+---------------------------------------------------------------------------------------------------------------- + 9 | PREPARE test_prepared (text, integer) AS SELECT $2; | $1 = 'dddbs=''postgres.db'',traceparent=''00-00000000000000000000000000000009-0000000000000009-01''', $2 = '1' + 9 | Planner | + 9 | Start | + 9 | Run | + 9 | Result | + 9 | Finish | + 9 | End | +(7 rows) + +-- Check explain is untracked +explain (costs off) +select * from pg_class where oid < 0 limit 100; + QUERY PLAN +------------------------------------------------------- + Limit + -> Index Scan using pg_class_oid_index on pg_class + Index Cond: (oid < '0'::oid) +(3 rows) + +SELECT trace_id, resource, parameters from pg_tracing_spans order by span_start, duration desc, resource; + trace_id | resource | parameters +----------+----------+------------ +(0 rows) + +-- Never executed node +/*dddbs='postgres.db',traceparent='00-0000000000000000000000000000000a-000000000000000a-01'*/ select 1 limit 0; + ?column? +---------- +(0 rows) + +SELECT trace_id, resource, parameters from pg_tracing_spans order by span_start, duration desc, resource; + trace_id | resource | parameters +----------+---------------------+---------------- + 10 | select $1 limit $2; | $1 = 1, $2 = 0 + 10 | Parse | + 10 | Post Parse | + 10 | Planner | + 10 | Start | + 10 | Run | + 10 | Limit | + 10 | Finish | + 10 | End | +(9 rows) + +-- Test prepared statement with generic plan +SET plan_cache_mode='force_generic_plan'; +EXECUTE test_prepared('dddbs=''postgres.db'',traceparent=''00-00000000000000000000000000000010-0000000000000010-01''', 10); + ?column? 
+---------- + 10 +(1 row) + +SELECT trace_id, resource, parameters from pg_tracing_spans order by span_start, duration desc, resource; + trace_id | resource | parameters +----------+-----------------------------------------------------+------------ + 16 | PREPARE test_prepared (text, integer) AS SELECT $2; | + 16 | Start | + 16 | Run | + 16 | Result | + 16 | Finish | + 16 | End | +(6 rows) + +-- Test multiple statements +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000011-0000000000000012-01'*/ select 1; select 2; + ?column? +---------- + 1 +(1 row) + + ?column? +---------- + 2 +(1 row) + +select resource, parameters from pg_tracing_spans order by span_start, duration desc; + resource | parameters +------------+------------ + select $1; | $1 = 1 + Parse | + Post Parse | + Planner | + Start | + Run | + Result | + Finish | + End | +(9 rows) + +-- Cleanup +SET plan_cache_mode='auto'; +DEALLOCATE test_prepared; diff --git a/contrib/pg_tracing/expected/subxact.out b/contrib/pg_tracing/expected/subxact.out new file mode 100644 index 0000000000..69fe117a4f --- /dev/null +++ b/contrib/pg_tracing/expected/subxact.out @@ -0,0 +1,70 @@ +SET pg_tracing.sample_rate = 1.0; +BEGIN; +SAVEPOINT s1; +INSERT INTO pg_tracing_test VALUES(generate_series(1, 2), 'aaa'); +SAVEPOINT s2; +INSERT INTO pg_tracing_test VALUES(generate_series(1, 2), 'aaa'); +SAVEPOINT s3; +SELECT 1; + ?column? 
+---------- + 1 +(1 row) + +COMMIT; +select resource, parameters, subxact_count from pg_tracing_spans order by span_start, duration desc; + resource | parameters | subxact_count +------------------------------------------------------------------+----------------------------+--------------- + BEGIN; | | 0 + Parse | | 0 + Post Parse | | 0 + BEGIN; | | 0 + SAVEPOINT $1; | $1 = s1 | 0 + Parse | | 0 + Post Parse | | 0 + SAVEPOINT s1; | | 0 + INSERT INTO pg_tracing_test VALUES(generate_series($1, $2), $3); | $1 = 1, $2 = 2, $3 = 'aaa' | 0 + Parse | | 0 + Post Parse | | 0 + Planner | | 0 + Start | | 0 + Run | | 0 + Insert on pg_tracing_test | | 1 + ProjectSet | | 1 + Result | | 1 + Finish | | 1 + End | | 1 + SAVEPOINT $1; | $1 = s2 | 1 + Parse | | 1 + Post Parse | | 1 + SAVEPOINT s2; | | 1 + INSERT INTO pg_tracing_test VALUES(generate_series($1, $2), $3); | $1 = 1, $2 = 2, $3 = 'aaa' | 1 + Parse | | 1 + Post Parse | | 1 + Planner | | 1 + Start | | 1 + Run | | 1 + Insert on pg_tracing_test | | 2 + ProjectSet | | 2 + Result | | 2 + Finish | | 2 + End | | 2 + SAVEPOINT $1; | $1 = s3 | 2 + Parse | | 2 + Post Parse | | 2 + SAVEPOINT s3; | | 2 + SELECT $1; | $1 = 1 | 2 + Parse | | 2 + Post Parse | | 2 + Planner | | 2 + Start | | 2 + Run | | 2 + Result | | 2 + Finish | | 2 + End | | 2 + COMMIT; | | 2 + Parse | | 2 + Post Parse | | 2 + COMMIT; | | 2 +(51 rows) + diff --git a/contrib/pg_tracing/expected/utility.out b/contrib/pg_tracing/expected/utility.out new file mode 100644 index 0000000000..4d7095eaac --- /dev/null +++ b/contrib/pg_tracing/expected/utility.out @@ -0,0 +1,118 @@ +-- Track statements in create extension +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ CREATE EXTENSION pg_tracing; +SELECT span_id AS top_utility, + extract(epoch from span_start) as top_utility_start, + round(extract(epoch from span_start) + duration / 1000000000.0, 6) as top_utility_end + from pg_tracing_spans(false) where parent_id=1 and 
name='Utility' \gset +SELECT span_id AS top_process_utility, + extract(epoch from span_start) as top_process_utility_start, + round(extract(epoch from span_start) + duration / 1000000000.0, 6) as top_process_utility_end + from pg_tracing_spans(false) where parent_id=:top_utility and name='ProcessUtility' \gset +SELECT span_id AS first_utility, + extract(epoch from span_start) as first_utility_start, + round(extract(epoch from span_start) + duration / 1000000000.0, 6) as first_utility_end + from pg_tracing_spans(false) where parent_id=:top_process_utility and name='Utility' limit 1 \gset +SELECT span_id AS first_process_utility, + extract(epoch from span_start) as first_process_utility_start, + round(extract(epoch from span_start) + duration / 1000000000.0, 6) as first_process_utility_end + from pg_tracing_spans(false) where parent_id=:first_utility and name='ProcessUtility' \gset +SELECT span_id AS second_utility, + extract(epoch from span_start) as second_utility_start, + round(extract(epoch from span_start) + duration / 1000000000.0, 6) as second_utility_end + from pg_tracing_spans(false) where parent_id=:top_process_utility and name='Utility' limit 1 offset 1 \gset +SELECT :top_utility_start < :top_process_utility_start AS top_utility_start_before_process_utility, + :top_utility_end >= :top_process_utility_end AS top_end_after_process_utility, + :first_process_utility_end < :first_utility_end AS nested_process_utility_ends_before, + :first_utility_end < :second_utility_start AS second_utility_start_after_first_one; + top_utility_start_before_process_utility | top_end_after_process_utility | nested_process_utility_ends_before | second_utility_start_after_first_one +------------------------------------------+-------------------------------+------------------------------------+-------------------------------------- + t | t | t | t +(1 row) + +-- Clean current spans +select count(*) from pg_tracing_spans; + count +------- + 38 +(1 row) + +-- Test utility off +SET 
pg_tracing.track_utility = off; +-- Test prepared statement +PREPARE test_prepared_one_param (integer) AS SELECT $1; +EXECUTE test_prepared_one_param(100); + ?column? +---------- + 100 +(1 row) + +select count(*) from pg_tracing_spans; + count +------- + 0 +(1 row) + +-- Test prepared statement with generic plan +SET plan_cache_mode='force_generic_plan'; +EXECUTE test_prepared_one_param(200); + ?column? +---------- + 200 +(1 row) + +select count(*) from pg_tracing_spans; + count +------- + 0 +(1 row) + +-- With utility on +SET pg_tracing.track_utility = on; +-- Test prepared statement +PREPARE test_prepared_one_param_2 (integer) AS SELECT $1; +EXECUTE test_prepared_one_param_2(100); + ?column? +---------- + 100 +(1 row) + +select count(distinct(trace_id)) from pg_tracing_spans(false); + count +------- + 0 +(1 row) + +select resource, parameters from pg_tracing_spans(false) order by span_start, duration desc; + resource | parameters +----------+------------ +(0 rows) + +select resource, parameters from pg_tracing_spans where trace_id = parent_id and parent_id = span_id order by span_start; + resource | parameters +----------+------------ +(0 rows) + +-- Test prepared statement with generic plan +SET plan_cache_mode='force_generic_plan'; +EXECUTE test_prepared_one_param(200); + ?column? 
+---------- + 200 +(1 row) + +select count(distinct(trace_id)) from pg_tracing_spans(false); + count +------- + 0 +(1 row) + +select resource, parameters from pg_tracing_spans(false) order by span_start, duration desc; + resource | parameters +----------+------------ +(0 rows) + +select resource, parameters from pg_tracing_spans where trace_id = parent_id and parent_id = span_id order by span_start; + resource | parameters +----------+------------ +(0 rows) + diff --git a/contrib/pg_tracing/expected/wal.out b/contrib/pg_tracing/expected/wal.out new file mode 100644 index 0000000000..4d74aab625 --- /dev/null +++ b/contrib/pg_tracing/expected/wal.out @@ -0,0 +1,89 @@ +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ INSERT INTO pg_tracing_test VALUES(generate_series(1, 10), 'aaa'); +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000002-0000000000000002-01'*/ UPDATE pg_tracing_test SET b = 'bbb' WHERE a > 7; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000003-0000000000000003-01'*/ DELETE FROM pg_tracing_test WHERE a > 9; +-- Check WAL is generated for the above statements +SELECT trace_id, name, resource, +wal_records > 0 as wal_records, +wal_bytes > 0 as wal_bytes +FROM pg_tracing_spans order by span_start, duration desc, resource; + trace_id | name | resource | wal_records | wal_bytes +----------+------------+------------------------------------------------------------------+-------------+----------- + 1 | Insert | INSERT INTO pg_tracing_test VALUES(generate_series($1, $2), $3); | t | t + 1 | Parse | Parse | | + 1 | Post Parse | Post Parse | | + 1 | Planner | Planner | f | f + 1 | Executor | Start | | + 1 | Executor | Run | | + 1 | Insert | Insert on pg_tracing_test | t | t + 1 | ProjectSet | ProjectSet | f | f + 1 | Result | Result | f | f + 1 | Executor | Finish | | + 1 | Executor | End | | + 2 | Update | UPDATE pg_tracing_test SET b = $1 WHERE a > $2; | t | t + 2 | Parse | Parse | 
| + 2 | Post Parse | Post Parse | | + 2 | Planner | Planner | f | f + 2 | Executor | Start | | + 2 | Executor | Run | | + 2 | Update | Update on pg_tracing_test | t | t + 2 | SeqScan | SeqScan on pg_tracing_test|Filter : (a > 7) | f | f + 2 | Executor | Finish | | + 2 | Executor | End | | + 3 | Delete | DELETE FROM pg_tracing_test WHERE a > $1; | t | t + 3 | Parse | Parse | | + 3 | Post Parse | Post Parse | | + 3 | Planner | Planner | f | f + 3 | Executor | Start | | + 3 | Executor | Run | | + 3 | Delete | Delete on pg_tracing_test | t | t + 3 | SeqScan | SeqScan on pg_tracing_test|Filter : (a > 9) | f | f + 3 | Executor | Finish | | + 3 | Executor | End | | +(31 rows) + +-- Redo the same but without wal instrumentation +set pg_tracing.instrument_wal = false; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ INSERT INTO pg_tracing_test VALUES(generate_series(1, 10), 'aaa'); +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000002-0000000000000002-01'*/ UPDATE pg_tracing_test SET b = 'bbb' WHERE a > 7; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000003-0000000000000003-01'*/ DELETE FROM pg_tracing_test WHERE a > 9; +SELECT trace_id, name, resource, +wal_records = 0 as wal_records, +wal_bytes = 0 as wal_bytes +FROM pg_tracing_spans order by span_start, duration desc, resource; + trace_id | name | resource | wal_records | wal_bytes +----------+------------+------------------------------------------------------------------+-------------+----------- + 1 | Insert | INSERT INTO pg_tracing_test VALUES(generate_series($1, $2), $3); | t | t + 1 | Parse | Parse | | + 1 | Post Parse | Post Parse | | + 1 | Planner | Planner | t | t + 1 | Executor | Start | | + 1 | Executor | Run | | + 1 | Insert | Insert on pg_tracing_test | t | t + 1 | ProjectSet | ProjectSet | t | t + 1 | Result | Result | t | t + 1 | Executor | Finish | | + 1 | Executor | End | | + 2 | Update | UPDATE pg_tracing_test SET b = $1 
WHERE a > $2; | t | t + 2 | Parse | Parse | | + 2 | Post Parse | Post Parse | | + 2 | Planner | Planner | t | t + 2 | Executor | Start | | + 2 | Executor | Run | | + 2 | Update | Update on pg_tracing_test | t | t + 2 | SeqScan | SeqScan on pg_tracing_test|Filter : (a > 7) | t | t + 2 | Executor | Finish | | + 2 | Executor | End | | + 3 | Delete | DELETE FROM pg_tracing_test WHERE a > $1; | t | t + 3 | Parse | Parse | | + 3 | Post Parse | Post Parse | | + 3 | Planner | Planner | t | t + 3 | Executor | Start | | + 3 | Executor | Run | | + 3 | Delete | Delete on pg_tracing_test | t | t + 3 | SeqScan | SeqScan on pg_tracing_test|Filter : (a > 9) | t | t + 3 | Executor | Finish | | + 3 | Executor | End | | +(31 rows) + +-- Cleanup +set pg_tracing.instrument_wal = true; diff --git a/contrib/pg_tracing/meson.build b/contrib/pg_tracing/meson.build new file mode 100644 index 0000000000..c3afd256fe --- /dev/null +++ b/contrib/pg_tracing/meson.build @@ -0,0 +1,47 @@ +# Copyright (c) 2022-2023, PostgreSQL Global Development Group + +pg_tracing_sources = files( + 'pg_tracing.c', + 'query_process.c', + 'pg_tracing_explain.c', + 'span.c', +) + +if host_system == 'windows' + pg_tracing_sources += rc_lib_gen.process(win32ver_rc, extra_args: [ + '--NAME', 'pg_tracing', + '--FILEDESC', 'pg_tracing - Add support for distributed tracing',]) +endif + +pg_tracing = shared_module('pg_tracing', + pg_tracing_sources, + kwargs: contrib_mod_args, +) +contrib_targets += pg_tracing + +install_data( + 'pg_tracing.control', + 'pg_tracing--1.0.sql', + kwargs: contrib_data_args, +) + +tests += { + 'name': 'pg_tracing', + 'sd': meson.current_source_dir(), + 'bd': meson.current_build_dir(), + 'regress': { + 'sql': [ + 'create_table', + 'utility', + 'select', + 'sample', + 'subxact', + 'reset', + 'nested', + 'wal', + 'parallel', + 'cleanup' + ], + 'regress_args': ['--temp-config', files('pg_tracing.conf')], + }, +} diff --git a/contrib/pg_tracing/pg_tracing--1.0.sql 
b/contrib/pg_tracing/pg_tracing--1.0.sql new file mode 100644 index 0000000000..401fd75f39 --- /dev/null +++ b/contrib/pg_tracing/pg_tracing--1.0.sql @@ -0,0 +1,94 @@ +/* contrib/pg_tracing/pg_tracing--1.0.sql */ + +-- complain if script is sourced in psql, rather than via ALTER EXTENSION +\echo Use "CREATE EXTENSION pg_tracing" to load this file. \quit + +--- Define pg_tracing_info +CREATE FUNCTION pg_tracing_info( + OUT traces bigint, + OUT spans bigint, + OUT dropped_spans bigint, + OUT failed_truncates bigint, + OUT last_consume timestamp with time zone, + OUT stats_reset timestamp with time zone, + OUT sampling_rate double precision +) +RETURNS record +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT VOLATILE PARALLEL SAFE; + +CREATE FUNCTION pg_tracing_reset() +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C PARALLEL SAFE; + +CREATE FUNCTION pg_tracing_spans( + IN consume bool, + OUT trace_id bigint, + OUT parent_id bigint, + OUT span_id bigint, + OUT query_id bigint, + OUT name text, + OUT resource text, + OUT span_start timestamp with time zone, + OUT duration bigint, + OUT sql_error_code character(5), + OUT pid int4, + OUT nested_level smallint, + OUT subxact_count smallint, + OUT top_span bool, + +-- Plan counters + OUT startup_cost float8, + OUT total_cost float8, + OUT plan_rows float8, + OUT plan_width int, + +-- Node Counters + OUT rows int8, + OUT nloops int8, + + OUT shared_blks_hit int8, + OUT shared_blks_read int8, + OUT shared_blks_dirtied int8, + OUT shared_blks_written int8, + + OUT local_blks_hit int8, + OUT local_blks_read int8, + OUT local_blks_dirtied int8, + OUT local_blks_written int8, + + OUT blk_read_time float8, + OUT blk_write_time float8, + + OUT temp_blks_read int8, + OUT temp_blks_written int8, + OUT temp_blk_read_time float8, + OUT temp_blk_write_time float8, + + OUT wal_records int8, + OUT wal_fpi int8, + OUT wal_bytes numeric, + + OUT jit_functions int8, + OUT jit_generation_time float8, + OUT jit_inlining_time float8, + OUT 
jit_optimization_time float8, + OUT jit_emission_time float8, + +-- SpanNode specific data + OUT startup bigint, -- First tuple + OUT parameters text +) +RETURNS SETOF record +AS 'MODULE_PATHNAME' +LANGUAGE C PARALLEL SAFE; + +CREATE VIEW pg_tracing_info AS + SELECT * FROM pg_tracing_info(); + +CREATE VIEW pg_tracing_spans AS + SELECT * FROM pg_tracing_spans(true); + +GRANT SELECT ON pg_tracing_info TO PUBLIC; +GRANT SELECT ON pg_tracing_spans TO PUBLIC; diff --git a/contrib/pg_tracing/pg_tracing.c b/contrib/pg_tracing/pg_tracing.c new file mode 100644 index 0000000000..22b8ca3041 --- /dev/null +++ b/contrib/pg_tracing/pg_tracing.c @@ -0,0 +1,2326 @@ +/*------------------------------------------------------------------------- + * + * pg_tracing.c + * Generate spans for distributed tracing from SQL query + * + * We rely on the caller to know whether the query needs to be traced or not. + * The information is propagated through https://google.github.io/sqlcommenter/. + * + * A query with sqlcommenter will look like: /\*dddbs='postgres.db',traceparent='00-00000000000000000000000000000009-0000000000000005-01'*\/ select 1; + * The traceparent fields are detailed in https://www.w3.org/TR/trace-context/#traceparent-header-field-values + * 00000000000000000000000000000009: trace id + * 0000000000000005: parent id + * 01: trace flags (01 == sampled) + * + * If sampled is set in the trace flags, we will generate spans for the ongoing query. + * A span represents an operation with a start and a duration. + * We will track the following operations: + * - Top Span: The top span for a query. They are created after extracting the traceid from traceparent or to represent a nested query. + * - Planner: We track the time spent in the planner and report the planner counters + * - Node Span: Created from planstate. 
The name is extracted from the node type (IndexScan, SeqScan), + * - Executor: We trace the different steps of the Executor: Start, Run, Finish and End + * + * A typical traced query will generate the following spans: + * +------------------------------------------------------------------------------------------------------------------------------------------------------+ + * | Name: Select | + * | Operation: Select * pgbench_accounts WHERE aid=$1; | + * +---+------------------------+-+------------------+--+------------------------------------------------------+-+--------------------+-+----------------++ + * | Name: Planner | | Name: Executor | |Name: Executor | | Name: Executor | | Name: Executor | + * | Operation: Planner | | Operation: Start | |Operation: Run | | Operation: Finish | | Operation: End | + * +------------------------+ +------------------+ +--+--------------------------------------------------++ +--------------------+ +----------------+ + * | Name: IndexScan | + * | Operation: IndexScan using pgbench_accounts_pkey | + * | on pgbench_accounts | + * +--------------------------------------------------+ + * + * IDENTIFICATION + * contrib/pg_tracing/pg_tracing.c + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "pg_tracing_explain.h" +#include "pg_tracing.h" +#include "query_process.h" +#include "span.h" + +#include "utils/ruleutils.h" +#include "access/xact.h" +#include "common/pg_prng.h" +#include "funcapi.h" +#include "nodes/extensible.h" +#include "optimizer/planner.h" +#include "parser/analyze.h" +#include "storage/ipc.h" +#include "tcop/utility.h" +#include "utils/builtins.h" +#include "utils/lsyscache.h" +#include "utils/xid8.h" + +PG_MODULE_MAGIC; + +typedef enum +{ + PG_TRACING_TRACK_NONE, /* track no statements */ + PG_TRACING_TRACK_TOP, /* only top level statements */ + PG_TRACING_TRACK_ALL /* all statements, including nested ones */ +} pgTracingTrackLevel; + +/* + * Structure 
to store flexible array of spans + */ +typedef struct pgTracingSpans +{ + int end; /* Index of last element */ + int max; /* Maximum number of element */ + Span spans[FLEXIBLE_ARRAY_MEMBER]; +} pgTracingSpans; + +/* + * A trace context for a specific parallel context + */ +typedef struct pgTracingParallelContext +{ + BackendId leader_backend_id; /* Backend id of the leader, set to + * InvalidBackendId if unused */ + uint64 trace_id; /* Trace id of the sampled query */ + uint64 parent_id; /* Id of the parent Gather span */ + uint64 query_id; /* Query id from the parent worker */ + pgTracingStartTime start_trace; + +} pgTracingParallelContext; + +/* + * Store context for parallel workers + */ +typedef struct pgTracingParallelWorkers +{ + slock_t mutex; + pgTracingParallelContext trace_contexts[FLEXIBLE_ARRAY_MEMBER]; +} pgTracingParallelWorkers; + +/* GUC variables */ +static int pg_tracing_max_span; /* Maximum number of spans to store */ +static int pg_tracing_max_parameter_str; /* Maximum number of spans to + * store */ +static int pg_tracing_max_traced_parallel_workers; /* Maximum number of + * parallel workers traced */ +static int pg_tracing_initial_allocated_spans = 25; /* Number of spans + * allocated at the + * start of a trace. 
*/ +static bool pg_tracing_instrument_buffers; /* Enable buffers instrumentation */ +static bool pg_tracing_instrument_wal; /* Enable WAL instrumentation */ +static double pg_tracing_sample_rate = 0; /* Sample rate applied to queries + * without SQLCommenter */ +static double pg_tracing_caller_sample_rate = 1; /* Sample rate applied to + * queries with + * SQLCommenter */ +static int pg_tracing_track = PG_TRACING_TRACK_ALL; /* tracking level */ +static bool pg_tracing_track_utility = true; /* whether to track utility + * commands */ +static bool pg_tracing_drop_full_buffer = true; /* whether to drop buffered + * spans when full */ + + +int64 nested_query_start_ns = 0; + +static const struct config_enum_entry track_options[] = +{ + {"none", PG_TRACING_TRACK_NONE, false}, + {"top", PG_TRACING_TRACK_TOP, false}, + {"all", PG_TRACING_TRACK_ALL, false}, + {NULL, 0, false} +}; + +#define pg_tracing_enabled(level) \ + ((pg_tracing_track == PG_TRACING_TRACK_ALL || \ + (pg_tracing_track == PG_TRACING_TRACK_TOP && (level) == 0))) + +PG_FUNCTION_INFO_V1(pg_tracing_info); +PG_FUNCTION_INFO_V1(pg_tracing_spans); +PG_FUNCTION_INFO_V1(pg_tracing_reset); + +/* + * Global variables + */ + +/* Memory context for pg_tracing. */ +static MemoryContext pg_tracing_mem_ctx; + +/* Current trace state */ +struct pgTracingTrace current_trace = {}; + +/* Shared state with stats and file external state */ +static pgTracingSharedState * pg_tracing = NULL; + +/* + * Shared buffer storing spans. Query with sampled flag will add new spans to + * the shared state. Those spans will be consumed during calls to + * pg_tracing_spans + */ +static pgTracingSpans * shared_spans = NULL; + +/* + * Shared buffer storing trace context for parallel workers. 
+ */ +static pgTracingParallelWorkers * pg_tracing_parallel = NULL; + +/* Index of the parallel worker context shared buffer if any */ +static int parallel_context_index = -1; + +/* +* Text for spans are buffered in this +* stringinfo and written at the end of the tracing in a single write. +*/ +static StringInfo current_trace_text; + +/* Start timestamp of the last executor finish */ +static TimestampTz executor_finish_start_ts; + +/* + * Maximum nested level for a query to know how many top spans we need to + * copy in shared_spans + */ +static int max_nested_level = -1; + +/* Current nesting depth of planner calls */ +static int plan_nested_level = 0; + +/* Current nesting depth of ExecutorRun+ProcessUtility calls */ +static int exec_nested_level = 0; + +/* Number of allocated levels. */ +static int allocated_nested_level = 0; + +/* + * Per nested level buffers + */ +typedef struct pgTracingPerLevelBuffer +{ + int top_span_index; /* Index of the top span in the + * current_trace_spans buffer */ + int parse_span_index; /* Index of the latest parse span in the + * current_trace_spans */ + uint64 executor_id; /* Span id of executor run spans by nested + * level Executor run is used as parent for + * spans generated from planstate */ + uint64_t query_id; /* Query id by nested level when available */ +} pgTracingPerLevelBuffer; + +static pgTracingPerLevelBuffer * per_level_buffers; + +/* + * Store all spans for the curent trace except top spans and parse spans. + * They will be added to the shared buffer at the end of query tracing. + * + * This buffer can be dynamically resized so care needs to be taken when using span pointers. + */ +static pgTracingSpans * current_trace_spans; + +/* + * Timestamp of the latest statement checked for sampling. 
+ */ +static TimestampTz last_statement_check_for_sampling = 0; + +static void pg_tracing_shmem_request(void); +static void pg_tracing_shmem_startup(void); + +/* Saved hook values in case of unload */ +static shmem_request_hook_type prev_shmem_request_hook = NULL; +static shmem_startup_hook_type prev_shmem_startup_hook = NULL; +static post_parse_analyze_hook_type prev_post_parse_analyze_hook = NULL; +static planner_hook_type prev_planner_hook = NULL; +static ExecutorStart_hook_type prev_ExecutorStart = NULL; +static ExecutorRun_hook_type prev_ExecutorRun = NULL; +static ExecutorFinish_hook_type prev_ExecutorFinish = NULL; +static ExecutorEnd_hook_type prev_ExecutorEnd = NULL; +static ProcessUtility_hook_type prev_ProcessUtility = NULL; + +static void pg_tracing_post_parse_analyze(ParseState *pstate, Query *query, JumbleState *jstate); +static PlannedStmt *pg_tracing_planner_hook(Query *parse, + const char *query_string, + int cursorOptions, + ParamListInfo params); +static void pg_tracing_ExecutorStart(QueryDesc *queryDesc, int eflags); +static void pg_tracing_ExecutorRun(QueryDesc *queryDesc, + ScanDirection direction, + uint64 count, bool execute_once); +static void pg_tracing_ExecutorFinish(QueryDesc *queryDesc); +static void pg_tracing_ExecutorEnd(QueryDesc *queryDesc); +static void pg_tracing_ProcessUtility(PlannedStmt *pstmt, const char *queryString, + bool readOnlyTree, + ProcessUtilityContext context, ParamListInfo params, + QueryEnvironment *queryEnv, + DestReceiver *dest, QueryCompletion *qc); + +static void generate_member_nodes(PlanState **planstates, int nplans, planstateTraceContext * planstateTraceContext, uint64 parent_id); +static void generate_span_from_planstate(PlanState *planstate, planstateTraceContext * planstateTraceContext, uint64 parent_id); + +#define PG_TRACING_INFO_COLS 6 +#define PG_TRACING_TRACES_COLS 43 + +/* + * Module load callback + */ +void +_PG_init(void) +{ + if (!process_shared_preload_libraries_in_progress) + return; + + 
DefineCustomIntVariable("pg_tracing.max_span", + "Maximum number of spans stored in shared memory.", + NULL, + &pg_tracing_max_span, + 1000, + 0, + 500000, + PGC_SUSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomIntVariable("pg_tracing.max_parameter_size", + "Maximum size of parameters. -1 to disable parameter in top span.", + NULL, + &pg_tracing_max_parameter_str, + 1024, + 0, + 10000, + PGC_SUSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomBoolVariable("pg_tracing.instrument_buffers", + "Instrument and add buffers usage in spans.", + NULL, + &pg_tracing_instrument_buffers, + true, + PGC_SUSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomBoolVariable("pg_tracing.instrument_wal", + "Instrument and add WAL usage in spans.", + NULL, + &pg_tracing_instrument_wal, + true, + PGC_SUSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomIntVariable("pg_tracing.max_traced_parallel_workers", + "Maximum number of parallel workers traced. -1 to disable tracing of parallel workers.", + NULL, + &pg_tracing_max_traced_parallel_workers, + 100, + 0, + 10000, + PGC_POSTMASTER, + 0, + NULL, + NULL, + NULL); + + DefineCustomIntVariable("pg_tracing.initial_allocated_spans", + "The number of allocated spans at the start of a trace. 
A higher number can help avoid reallocation when extending the buffer.", + NULL, + &pg_tracing_initial_allocated_spans, + 25, + 0, + 500, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomEnumVariable("pg_tracing.track", + "Selects which statements are tracked by pg_tracing.", + NULL, + &pg_tracing_track, + PG_TRACING_TRACK_ALL, + track_options, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomBoolVariable("pg_tracing.track_utility", + "Selects whether utility commands are tracked by pg_tracing.", + NULL, + &pg_tracing_track_utility, + true, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomBoolVariable("pg_tracing.drop_on_full_buffer", + "Selects whether buffered spans should be dropped when buffer is full.", + NULL, + &pg_tracing_drop_full_buffer, + true, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomRealVariable("pg_tracing.sample_rate", + "Fraction of queries to process.", + NULL, + &pg_tracing_sample_rate, + 0.0, + 0.0, + 1.0, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomRealVariable("pg_tracing.caller_sample_rate", + "Fraction of queries with sampled flag to process.", + NULL, + &pg_tracing_caller_sample_rate, + 1.0, + 0.0, + 1.0, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); + + MarkGUCPrefixReserved("pg_tracing"); + + /* For jumble state */ + EnableQueryId(); + + /* Install hooks. 
*/ + prev_shmem_request_hook = shmem_request_hook; + shmem_request_hook = pg_tracing_shmem_request; + + prev_shmem_startup_hook = shmem_startup_hook; + shmem_startup_hook = pg_tracing_shmem_startup; + + prev_post_parse_analyze_hook = post_parse_analyze_hook; + post_parse_analyze_hook = pg_tracing_post_parse_analyze; + + prev_planner_hook = planner_hook; + planner_hook = pg_tracing_planner_hook; + + prev_ExecutorStart = ExecutorStart_hook; + ExecutorStart_hook = pg_tracing_ExecutorStart; + prev_ExecutorRun = ExecutorRun_hook; + ExecutorRun_hook = pg_tracing_ExecutorRun; + prev_ExecutorFinish = ExecutorFinish_hook; + ExecutorFinish_hook = pg_tracing_ExecutorFinish; + prev_ExecutorEnd = ExecutorEnd_hook; + ExecutorEnd_hook = pg_tracing_ExecutorEnd; + + prev_ProcessUtility = ProcessUtility_hook; + ProcessUtility_hook = pg_tracing_ProcessUtility; +} + +/* + * Estimate shared memory space needed. + */ +static Size +pg_tracing_memsize(void) +{ + Size size; + + /* pg_tracing shared state */ + size = sizeof(pgTracingSharedState); + /* span struct */ + size = add_size(size, sizeof(pgTracingSpans)); + /* the span variable array */ + size = add_size(size, mul_size(pg_tracing_max_span, sizeof(Span))); + /* and the parallel workers context */ + size = add_size(size, mul_size(pg_tracing_max_traced_parallel_workers, sizeof(pgTracingParallelContext))); + + return size; +} + +/* + * shmem_request hook: request additional shared resources. We'll allocate + * or attach to the shared resources in pgss_shmem_startup(). + */ +static void +pg_tracing_shmem_request(void) +{ + if (prev_shmem_request_hook) + prev_shmem_request_hook(); + RequestAddinShmemSpace(pg_tracing_memsize()); +} + +/* + * Add current trace context to the parallel workers shared buffer. 
+ */ +static void +add_parallel_context(uint64 parent_id, uint64 query_id) +{ + volatile pgTracingParallelWorkers *p = (volatile pgTracingParallelWorkers *) pg_tracing_parallel; + + if (pg_tracing_max_traced_parallel_workers <= 0) + /* No tracing of parallel workers */ + return; + + Assert(parallel_context_index == -1); + SpinLockAcquire(&p->mutex); + for (int i = 0; i < pg_tracing_max_traced_parallel_workers; i++) + { + volatile pgTracingParallelContext *ctx = p->trace_contexts + i; + + Assert(ctx->leader_backend_id != MyBackendId); + if (ctx->leader_backend_id != InvalidBackendId) + continue; + /* Slot is available */ + parallel_context_index = i; + ctx->leader_backend_id = MyBackendId; + ctx->trace_id = current_trace.trace_id; + ctx->parent_id = parent_id; + ctx->query_id = query_id; + ctx->start_trace = current_trace.start_trace; + break; + } + SpinLockRelease(&p->mutex); +} + +/* + * Return a slot from the current_trace_spans buffer. + * current_trace_spans may be extended if needed. + */ +static int +get_index_from_trace_spans(void) +{ + if (current_trace_spans->end + 1 >= current_trace_spans->max) + { + MemoryContext oldcxt; + + /* Need to extend. */ + /* TODO: Have a configurable limit? 
*/ + current_trace_spans->max *= 2; + oldcxt = MemoryContextSwitchTo(pg_tracing_mem_ctx); + current_trace_spans = repalloc(current_trace_spans, current_trace_spans->max * sizeof(Span)); + MemoryContextSwitchTo(oldcxt); + } + return current_trace_spans->end++; +} + +/* + * Get current_span from provided index + */ +static Span * +get_span_from_index(int span_index) +{ + Assert(span_index < current_trace_spans->end); + return ¤t_trace_spans->spans[span_index]; +} + +/* + * Get top span for the provided nested level + */ +static Span * +get_top_span_for_nested_level(int nested_level) +{ + Span *top_span; + + Assert(nested_level <= max_nested_level); + if (per_level_buffers[nested_level].top_span_index == -1) + per_level_buffers[nested_level].top_span_index = get_index_from_trace_spans(); + top_span = get_span_from_index(per_level_buffers[nested_level].top_span_index); + return top_span; +} + +/* + * Set top span to the provided span index + */ +static void +set_top_span(int nested_level, int top_span_index) +{ + Assert(nested_level <= max_nested_level); + Assert(top_span_index < current_trace_spans->max); + per_level_buffers[nested_level].top_span_index = top_span_index; +} + +/* + * Get parse span for the provided nested level + */ +static Span * +get_parse_span_from_nested_level(int nested_level, bool force_allocation) +{ + Span *parse_span; + + if (force_allocation && per_level_buffers[nested_level].parse_span_index != -1) + { + parse_span = get_span_from_index(per_level_buffers[nested_level].parse_span_index); + Assert(parse_span->ended == true); + } + if (force_allocation || per_level_buffers[nested_level].parse_span_index == -1) + per_level_buffers[nested_level].parse_span_index = get_index_from_trace_spans(); + parse_span = get_span_from_index(per_level_buffers[nested_level].parse_span_index); + return parse_span; +} + +/* + * End the trace of the span at the provided index + */ +static void +end_span_index(int span_index, const int64 *end_span_input) +{ + Span 
*span; + + if (span_index < 0) + return; + span = get_span_from_index(span_index); + end_span(span, end_span_input); +} + +/* + * End top span for the provided nested level + */ +static void +end_top_span(int nested_level, const int64 *end_span_ns) +{ + Assert(nested_level <= max_nested_level); + Assert(per_level_buffers[nested_level].top_span_index < current_trace_spans->max); + Assert(per_level_buffers[nested_level].top_span_index != -1); + end_span_index(per_level_buffers[nested_level].top_span_index, end_span_ns); +} + +/* + * Remove parallel context for the current leader from the shared memory. + */ +static void +remove_parallel_context(void) +{ + if (parallel_context_index < 0) + /* No tracing of parallel workers */ + return; + { + volatile pgTracingParallelWorkers *p = (volatile pgTracingParallelWorkers *) pg_tracing_parallel; + + SpinLockAcquire(&p->mutex); + p->trace_contexts[parallel_context_index].leader_backend_id = InvalidBackendId; + SpinLockRelease(&p->mutex); + } + parallel_context_index = -1; +} + +/* + * If we're inside a parallel worker, check if the trace context is stored in shared memory. + * If a trace context exists, it means that the query is sampled and worker tracing is enabled. 
+ */ +static void +fetch_parallel_context(pgTracingTrace * trace) +{ + volatile pgTracingParallelWorkers *p = (volatile pgTracingParallelWorkers *) pg_tracing_parallel; + + if (pg_tracing_max_traced_parallel_workers <= 0) + { + /* No tracing of parallel workers */ + return; + } + SpinLockAcquire(&p->mutex); + for (int i = 0; i < pg_tracing_max_traced_parallel_workers; i++) + { + if (p->trace_contexts[i].leader_backend_id != ParallelLeaderBackendId) + continue; + /* Found a matching a trace context, fetch it */ + trace->trace_id = p->trace_contexts[i].trace_id; + trace->parent_id = p->trace_contexts[i].parent_id; + trace->start_trace = p->trace_contexts[i].start_trace; + trace->query_id = p->trace_contexts[i].query_id; + trace->sampled = true; + } + SpinLockRelease(&p->mutex); +} + +/* + * shmem_startup hook: allocate or attach to shared memory, Also create and + * load the query-texts file, which is expected to exist (even if empty) + * while the module is enabled. + */ +static void +pg_tracing_shmem_startup(void) +{ + bool found_pg_tracing; + bool found_shared_spans; + bool found_parallel; + + if (prev_shmem_startup_hook) + prev_shmem_startup_hook(); + + /* reset in case this is a restart within the postmaster */ + pg_tracing = NULL; + + /* Create or attach to the shared memory state */ + LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); + + pg_tracing = ShmemInitStruct("PgTracing Shared", sizeof(pgTracingSharedState), &found_pg_tracing); + shared_spans = ShmemInitStruct("PgTracing Spans", + sizeof(pgTracingSpans) + pg_tracing_max_span * sizeof(Span), + &found_shared_spans); + pg_tracing_parallel = ShmemInitStruct("PgTracing Parallel Workers Context", + sizeof(pgTracingParallelWorkers) + pg_tracing_max_traced_parallel_workers * sizeof(pgTracingParallelContext), + &found_parallel); + + /* Initialize pg_tracing memory context */ + pg_tracing_mem_ctx = AllocSetContextCreate(TopMemoryContext, + "pg_tracing memory context", + ALLOCSET_DEFAULT_SIZES); + + /* First time, 
let's init shared state */ + if (!found_pg_tracing) + { + pg_tracing->stats.traces = 0; + pg_tracing->stats.stats_reset = GetCurrentTimestamp(); + SpinLockInit(&pg_tracing->mutex); + SpinLockInit(&pg_tracing_parallel->mutex); + } + if (!found_shared_spans) + { + shared_spans->end = 0; + shared_spans->max = pg_tracing_max_span; + } + if (!found_parallel) + { + for (int i = 0; i < pg_tracing_max_traced_parallel_workers; i++) + pg_tracing_parallel->trace_contexts[i].leader_backend_id = InvalidBackendId; + } + LWLockRelease(AddinShmemInitLock); +} + +/* + * Add span to the shared memory. Mutex should already have been acquired. + */ +static void +add_span_to_shared_buffer_locked(const Span * span) +{ + /* Spans must be ended before adding them to the shared buffer */ + Assert(span->ended); + if (shared_spans->end + 1 >= shared_spans->max) + pg_tracing->stats.dropped_spans++; + else + { + pg_tracing->stats.spans++; + shared_spans->spans[shared_spans->end++] = *span; + } +} + +/* + * Check if we still have available space in the shared spans. + * + * Between the moment we check and the moment we want to insert, the buffer + * may be full but we redo a check before appending the span. This is done + * early when starting a top span to bail out early if the buffer is + * already full since we don't immediately add the span in the shared buffer. 
+ */ +static bool +check_full_shared_spans() +{ + if (shared_spans->end + 1 >= shared_spans->max) + { + volatile pgTracingSharedState *s = (volatile pgTracingSharedState *) pg_tracing; + bool full_buffer = false; + + SpinLockAcquire(&s->mutex); + if (pg_tracing_drop_full_buffer) + { + s->stats.dropped_spans += shared_spans->end; + shared_spans->end = 0; + s->extent = 0; + } + else + { + full_buffer = true; + s->stats.dropped_spans++; + } + SpinLockRelease(&s->mutex); + return full_buffer; + } + return false; +} + +/* + * Process a query descriptor: Gather all query instrumentation in the query + * span counters and generate span nodes from queryDesc planstate + */ +static void +process_query_desc(QueryDesc *queryDesc, int sql_error_code) +{ + MemoryContext oldcxt; + uint64 parent_id = per_level_buffers[exec_nested_level].executor_id; + NodeCounters *node_counters = &get_top_span_for_nested_level(exec_nested_level)->node_counters; + + oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt); + /* Process total counters */ + if (queryDesc->totaltime) + { + InstrEndLoop(queryDesc->totaltime); + node_counters->buffer_usage = queryDesc->totaltime->bufusage; + node_counters->wal_usage = queryDesc->totaltime->walusage; + } + if (queryDesc->estate->es_jit) + node_counters->jit_usage = queryDesc->estate->es_jit->instr; + node_counters->rows = queryDesc->estate->es_total_processed; + + /* Process planstate */ + if (queryDesc->planstate) + { + Bitmapset *rels_used = NULL; + planstateTraceContext planstateTraceContext; + + planstateTraceContext.rtable_names = select_rtable_names_for_explain(queryDesc->plannedstmt->rtable, rels_used); + planstateTraceContext.trace = ¤t_trace; + planstateTraceContext.ancestors = NULL; + planstateTraceContext.sql_error_code = sql_error_code; + planstateTraceContext.deparse_ctx = deparse_context_for_plan_tree(queryDesc->plannedstmt, planstateTraceContext.rtable_names); + + generate_span_from_planstate(queryDesc->planstate, 
&planstateTraceContext, parent_id); + } + MemoryContextSwitchTo(oldcxt); +} + +/* + * If the query was started as a prepared statement, we won't be able to + * extract traceparent during query parsing since parsing was skipped. + * + * We assume that SQLCommenter content can be passed as a text in the + * first parameter + */ +static void +check_traceparameter_in_parameter(ParamListInfo params) +{ + Oid typoutput; + bool typisvarlena; + char *pstring; + ParamExternData param; + + if (params == NULL || params->numParams == 0) + return; + + param = params->params[0]; + /* Check first parameter type */ + if (param.ptype != TEXTOID) + return; + + /* Get the text of the first string parameter */ + getTypeOutputInfo(param.ptype, &typoutput, &typisvarlena); + pstring = OidOutputFunctionCall(typoutput, param.value); + current_trace = extract_traceparent(pstring, true); +} + +/* + * If trace was started from the global sample rate without + * a parent trace, we need to generate a random trace id + */ +static void +set_traceparent_trace_id(struct pgTracingTrace *traceparent_parameter) +{ + if (traceparent_parameter->trace_id != 0) + return; + + /* + * We leave parent_id to 0 as a way to indicate that this is a standalone + * trace. 
+ */ + Assert(traceparent_parameter->parent_id == 0); + traceparent_parameter->trace_id = pg_prng_uint64(&pg_global_prng_state); +} + +/* + * Decide whether a query should be sampled depending on the traceparent sampled + * and the provided sample rate configurations + */ +static bool +apply_sample_rate(struct pgTracingTrace const *traceparent_parameter) +{ + double rand; + bool sampled; + + /* Everything is sampled */ + if (pg_tracing_sample_rate >= 1.0) + return true; + + /* No SQLCommenter sampled and no global sample rate */ + if (traceparent_parameter->sampled == 0 && pg_tracing_sample_rate == 0.0) + return false; + + /* SQLCommenter sampled caller sample */ + if (traceparent_parameter->sampled == 1 && pg_tracing_caller_sample_rate >= 1.0) + return true; + + /* + * We have a non zero global sample rate or a sampled flag. Either way, we + * need a rand value + */ + rand = pg_prng_double(&pg_global_prng_state); + if (traceparent_parameter->sampled == 1) + /* Sampled flag case */ + sampled = (rand < pg_tracing_caller_sample_rate); + else + /* Global rate case */ + sampled = (rand < pg_tracing_sample_rate); + return sampled; +} + +/* + * Check whether the current query should be sampled or not + */ +static void +check_query_sampled(ParseState *pstate, const ParamListInfo params) +{ + TimestampTz statement_start_ts; + + /* Safety check... 
*/ + if (!pg_tracing || !pg_tracing_enabled(exec_nested_level)) + return; + + /* sampling already started */ + if (current_trace.sampled > 0) + return; + + /* Don't start tracing if we're not at the top */ + if (exec_nested_level > 0) + return; + + /* + * In a parallel worker, check the parallel context shared buffer to see + * if the leader left a trace context + */ + if (IsParallelWorker()) + { + fetch_parallel_context(¤t_trace); + return; + } + + Assert(max_nested_level = -1); + + current_trace.trace_id = 0; + current_trace.parent_id = 0; + if (params != NULL) + check_traceparameter_in_parameter(params); + else if (pstate != NULL) + current_trace = extract_traceparent(pstate->p_sourcetext, false); + + /* + * A statement can go through this function 3 times: post parsing, planner + * and executor run. If no sampling flag was extracted from SQLCommenter + * and we've already seen this statement (using statement_start ts), don't + * try to apply sample rate and exit. + */ + statement_start_ts = GetCurrentStatementStartTimestamp(); + if (current_trace.sampled == 0 && last_statement_check_for_sampling == statement_start_ts) + return; + last_statement_check_for_sampling = statement_start_ts; + + current_trace.sampled = apply_sample_rate(¤t_trace); + + if (current_trace.sampled > 0 && check_full_shared_spans()) + /* Buffer is full, abort sampling */ + current_trace.sampled = 0; + + if (current_trace.sampled > 0) + set_traceparent_trace_id(¤t_trace); +} + +/* + * Adjust spans' offsets to the provided file offset + */ +static void +adjust_file_offset(Span * span, Size file_position) +{ + if (span->name_offset != -1) + span->name_offset += file_position; + if (span->operation_name_offset != -1) + span->operation_name_offset += file_position; + if (span->parameter_offset != -1) + span->parameter_offset += file_position; +} + +/* + * Get the start time for a new top span + */ +static int64 +get_top_span_start(void) +{ + /* For the top span of parallel worker, we get the 
current ns */ + if (exec_nested_level == 0 && IsParallelWorker()) + return get_current_ns(); + /* For the root top span, use the trace start */ + if (exec_nested_level == 0) + { + Assert(current_trace.start_trace.ns > 0); + return current_trace.start_trace.ns; + } + + /* + * We're in a nested query, use set nested start. This is only set if + * we're inside a ProcessUtility. Other nested queries (like function + * call) will have their start/duration adjusted when queryDesc is + * processed. + */ + return nested_query_start_ns; +} + +/* + * Reset pg_tracing memory context and global state. + */ +static void +cleanup_tracing(void) +{ + if (current_trace.sampled == 0) + /* No need for cleaning */ + return; + remove_parallel_context(); + MemoryContextReset(pg_tracing_mem_ctx); + current_trace.sampled = 0; + current_trace.trace_id = 0; + current_trace.parent_id = 0; + max_nested_level = -1; + current_trace_spans = NULL; + per_level_buffers = NULL; +} + +/* + * End of the query tracing, dump all of top spans in the shared + * memory. This may happen either when query is finished or on a caught error. 
+ */ +static void +end_tracing(const int64 *end_span_ns) +{ + Size file_position = 0; + + /* We're still a nested query, tracing is not finished */ + if (exec_nested_level > 0) + return; + + /* Dump all buffered texts in file */ + text_store_file(pg_tracing, current_trace_text->data, current_trace_text->len, &file_position); + + for (int i = 0; i < current_trace_spans->end; i++) + adjust_file_offset(current_trace_spans->spans + i, file_position); + + { + volatile pgTracingSharedState *s = (volatile pgTracingSharedState *) pg_tracing; + + SpinLockAcquire(&s->mutex); + /* We're at the end, add all spans to the shared memory */ + for (int i = 0; i < current_trace_spans->end; i++) + add_span_to_shared_buffer_locked(¤t_trace_spans->spans[i]); + SpinLockRelease(&s->mutex); + } + + /* We can reset the memory context here */ + cleanup_tracing(); +} + +/* + * When we catch an error (timeout, cancel query), we need to flag the ongoing + * span with an error, send current spans in the shared buffer and clean + * our memory context + */ +static void +handle_pg_error(int span_index, QueryDesc *queryDesc) +{ + Span *span; + Span *top_span; + int64 end_span_ns; + int sql_error_code; + + /* If we're not sampling the query, bail out */ + if (current_trace.sampled == 0 && !pg_tracing_enabled(exec_nested_level)) + return; + + if (span_index < 0) + return; + + sql_error_code = geterrcode(); + /* Assign the error code to the latest span */ + span = get_span_from_index(span_index); + span->sql_error_code = sql_error_code; + + /* Same for top span */ + top_span = get_top_span_for_nested_level(exec_nested_level); + top_span->sql_error_code = sql_error_code; + + /* + * Order matters there. 
We want to process query desc before getting the + * end time otherwise, the span nodes will have a higher duration than + * their parents + */ + if (queryDesc != NULL) + process_query_desc(queryDesc, sql_error_code); + + /* End current span and ongoing top spans */ + end_span_ns = get_current_ns(); + for (int i = 0; i <= exec_nested_level; i++) + end_top_span(i, &end_span_ns); + end_span(span, &end_span_ns); + end_tracing(&end_span_ns); +} + +/* + * Get the parent id for the given nested level + * + * nested_level can be negative when creating the top top span. + * In this case, we want to use the trace parent id. + */ +static uint64 +get_parent_id(int nested_level) +{ + Span *top_span; + + if (nested_level < 0) + return current_trace.parent_id; + Assert(nested_level <= allocated_nested_level); + top_span = get_top_span_for_nested_level(nested_level); + Assert(top_span->span_id > 0); + return top_span->span_id; +} + +/* + * If we enter a new nested level, initialize all necessary buffers + * and start_trace timer. + * + * The start of a top span can vary: prepared statement will skip parsing, the use of cached + * plans will skip the planner hook. + * Thus, a top span can start in either post parse, planner hook or executor run. + * + * Since this is called after the we've detected the start of a trace, we check for available + * space in the buffer. If the buffer is full, we abort tracing by setting sampled to false. + * Callers need to check that tracing was aborted. + */ +static bool +initialize_trace_level(void) +{ + /* Check if we've already created a top span for this nested level */ + if (exec_nested_level <= max_nested_level) + return false; + + /* First time */ + if (max_nested_level == -1) + { + MemoryContext oldcxt; + + Assert(pg_tracing_mem_ctx->isReset); + + /* + * We need to be able to pass informations that depend on the nested + * level. 
+ * + * executor span ids: Since an executor run becomes the parent span, + * we need subsequent created node spans to have the correct parent. + * + * query ids: used to propagate queryId to all spans in the same level + * + * top spans: We create one top span per nested level and those are + * only inserted in the shared buffer at the end + * + * parse spans: We don't know the parent of nested parse span until we + * process the plan node. We keep them in this buffer to be able to + * change their parent. + */ + oldcxt = MemoryContextSwitchTo(pg_tracing_mem_ctx); + /* initial allocation */ + allocated_nested_level = 1; + per_level_buffers = palloc0(allocated_nested_level * sizeof(pgTracingPerLevelBuffer)); + /* 0 is a valid index, initialise all indexes to -1 */ + for (int i = 0; i < allocated_nested_level; i++) + { + per_level_buffers[i].top_span_index = -1; + per_level_buffers[i].parse_span_index = -1; + } + current_trace_spans = palloc0(pg_tracing_initial_allocated_spans * sizeof(Span)); + current_trace_spans->max = pg_tracing_initial_allocated_spans; + current_trace_text = makeStringInfo(); + + MemoryContextSwitchTo(oldcxt); + + /* + * Parallel workers will get the referential timers from the leader + * parallel context + */ + if (!IsParallelWorker()) + { + /* Start referential timers */ + current_trace.start_trace.ts = GetCurrentStatementStartTimestamp(); + + /* + * We can't get the monotonic clock at the start of the statement + * so let's estimate it from here. 
+ */ + current_trace.start_trace.ns = get_current_ns() - (GetCurrentTimestamp() - current_trace.start_trace.ts) * NS_PER_US; + } + } + else if (exec_nested_level >= allocated_nested_level) + { + /* New nested level, allocate more memory */ + MemoryContext oldcxt; + int old_allocated_nested_level = allocated_nested_level; + + oldcxt = MemoryContextSwitchTo(pg_tracing_mem_ctx); + allocated_nested_level++; + per_level_buffers = repalloc0(per_level_buffers, old_allocated_nested_level * sizeof(pgTracingPerLevelBuffer), allocated_nested_level * sizeof(pgTracingPerLevelBuffer)); + for (int i = old_allocated_nested_level; i < allocated_nested_level; i++) + { + per_level_buffers[i].top_span_index = -1; + per_level_buffers[i].parse_span_index = -1; + } + MemoryContextSwitchTo(oldcxt); + } + + max_nested_level = exec_nested_level; + if (exec_nested_level == 0) + { + /* Start of a new trace */ + volatile pgTracingSharedState *s = (volatile pgTracingSharedState *) pg_tracing; + + SpinLockAcquire(&s->mutex); + s->stats.traces++; + SpinLockRelease(&s->mutex); + } + return true; +} + +/* + * Add a new string to the current_trace_text stringinfo + */ +static int +add_str_to_trace_buffer(const char *str, int str_len) +{ + int position = current_trace_text->cursor; + + appendBinaryStringInfo(current_trace_text, str, str_len); + appendStringInfoChar(current_trace_text, '\0'); + current_trace_text->cursor = current_trace_text->len; + return position; +} + +/* + * Add the worker name to the current_trace_text stringinfo + */ +static int +add_worker_name_to_trace_buffer() +{ + int position = current_trace_text->cursor; + + appendStringInfo(current_trace_text, "Worker %d", ParallelWorkerNumber); + appendStringInfoChar(current_trace_text, '\0'); + current_trace_text->cursor = current_trace_text->len; + return position; +} + +/* + * Start a new top span if we've entered a new nested level + */ +static void +begin_top_span(Span * top_span, CmdType commandType, + const Query *query, const 
JumbleState *jstate, + const char *query_text, int64 start_time_ns) +{ + int query_len; + const char *normalised_query; + + /* in case of a cached plan, query might be unavailable */ + if (query != NULL) + per_level_buffers[exec_nested_level].query_id = query->queryId; + else if (current_trace.query_id > 0) + per_level_buffers[exec_nested_level].query_id = current_trace.query_id; + + begin_span(top_span, command_type_to_span_type(commandType), + ¤t_trace, get_parent_id(exec_nested_level - 1), per_level_buffers[exec_nested_level].query_id, &start_time_ns, exec_nested_level); + top_span->top_span = true; + + if (IsParallelWorker()) + { + /* + * In a parallel worker, we use the worker name as the span's + * operation + */ + top_span->operation_name_offset = add_worker_name_to_trace_buffer(); + return; + } + + if (jstate && jstate->clocations_count > 0 && query != NULL) + { + /* jstate is available, normalise query and extract parameters' values */ + char *paramStr; + + query_len = query->stmt_len; + normalised_query = normalise_query_parameters(jstate, query_text, + query->stmt_location, + &query_len, ¶mStr); + top_span->parameter_offset = add_str_to_trace_buffer(paramStr, strlen(paramStr)); + } + else + { + /* + * No jstate available, normalise query but we won't be able to + * extract parameters + */ + int stmt_location; + + if (query != NULL && query->stmt_len > 0) + { + query_len = query->stmt_len; + stmt_location = query->stmt_location; + } + else + { + query_len = strlen(query_text); + stmt_location = 0; + } + normalised_query = normalise_query(query_text, stmt_location, &query_len); + } + top_span->operation_name_offset = add_str_to_trace_buffer(normalised_query, query_len); +} + +/* + * Initialise buffer if we are in a new nested level and start top span. 
+ */ +static int64 +initialize_trace_level_and_top_span(CmdType commandType, Query *query, JumbleState *jstate, const char *query_text) +{ + bool new_nested_level = initialize_trace_level(); + int64 top_span_start_ns = get_top_span_start(); + Span *current_top = get_top_span_for_nested_level(exec_nested_level); + + /* current_trace_spans buffer should have been allocated */ + Assert(current_trace_spans != NULL); + if (new_nested_level) + { + begin_top_span(current_top, commandType, query, jstate, query_text, top_span_start_ns); + } + else if (current_top->ended) + { + /* + * It's possible to have multiple top spans within a nested query. If + * the previous top span was closed, start a new one + */ + int span_index = get_index_from_trace_spans(); + Span *span = get_span_from_index(span_index); + + set_top_span(exec_nested_level, span_index); + begin_top_span(span, commandType, query, jstate, query_text, top_span_start_ns); + } + return top_span_start_ns; +} + +/* + * Post-parse-analyze hook + * + * Tracing can be started here if: + * - The query has a SQLCommenter with traceparent parameter with sampled flag on and passes the caller_sample_rate + * - The query passes the sample_rate + * + * If query is sampled, create a post parse span and start the top span. + */ +static void +pg_tracing_post_parse_analyze(ParseState *pstate, Query *query, JumbleState *jstate) +{ + Span *parse_span = NULL; + Span *post_parse_span; + int post_parse_index = -1; + int64 start_post_parse_ns; + int64 end_parse_ns; + uint64 parent_id; + int64 top_span_start_ns; + bool leave_parse_span_open = false; + + if (prev_post_parse_analyze_hook) + prev_post_parse_analyze_hook(pstate, query, jstate); + + /* + * In case there's an multi statement transaction using extended protocol, + * we will have the PREPARE of the next statement before we go through the + * ExecutorEnd of the previous statement. 
+ * + * For now, we skip parse span and will only start the next statement + * tracing during planner hook. + * + * TODO: Find a way to keep parse span for the next statement + */ + if (!pg_tracing_mem_ctx->isReset && exec_nested_level == 0) + { + return; + } + + /* If disabled, don't trace utility statement */ + if (query->utilityStmt && !pg_tracing_track_utility) + return; + + check_query_sampled(pstate, NULL); + + if (current_trace.sampled == 0) + /* Query is not sampled, nothing to do */ + return; + + /* + * We want to avoid calling get_ns at the start of post parse as it will + * impact all queries and we will only use it when the query is sampled. + */ + start_post_parse_ns = get_current_ns(); + end_parse_ns = start_post_parse_ns - 1000; + + /* + * Either we're inside a nested sampled query or we've parsed a query with + * the sampled flag, start a new level with a top span + */ + top_span_start_ns = initialize_trace_level_and_top_span(query->commandType, query, jstate, pstate->p_sourcetext); + + parent_id = get_parent_id(exec_nested_level); + Assert(parent_id > 0); + + if (top_span_start_ns == 0) + { + /* + * We are inside a nested query that wasn't started from + * ProcessUtility. We will rely on reparent span at the executor end + * to set the parse start and duration will be used to store the parse + * span end. + */ + top_span_start_ns = end_parse_ns; + leave_parse_span_open = true; + /* TODO: Handle this? */ + } + + if (!leave_parse_span_open) + { + /* + * We need to keep track of the parse span. Within nested queries, and + * outside of processUtility, we only know of the start of the parse + * span when we process planstate nodes at the end of the Executor. 
In + * this case, top_span_start_ns is set to 0 + */ + parse_span = get_parse_span_from_nested_level(exec_nested_level, true); + begin_span(parse_span, SPAN_PARSE, + ¤t_trace, parent_id, per_level_buffers[exec_nested_level].query_id, + &top_span_start_ns, exec_nested_level); + + /* + * In case of queries nested in a result, we don't have the parse + * start. We will need to update this span when processing the + * queryDesc. + */ + end_span(parse_span, &end_parse_ns); + } + + /* Post parse span */ + post_parse_index = get_index_from_trace_spans(); + post_parse_span = get_span_from_index(post_parse_index); + begin_span(post_parse_span, SPAN_POST_PARSE, + ¤t_trace, + parent_id, per_level_buffers[exec_nested_level].query_id, + &start_post_parse_ns, exec_nested_level); + if (parse_span != NULL && parse_span->start + parse_span->duration_ns / NS_PER_US >= post_parse_span->start) + + /* + * We want to avoid parse spans ending at the exact same microsend as + * the post parse span. Artificially increase post parse time if it's + * the case + */ + post_parse_span->start++; + end_span(post_parse_span, NULL); +} + +/* + * Planner hook + * + * Tracing can start here if the query skipped parsing (prepared query) and passes the random sample_rate. + * + * If query is sampled, create a plan span and start the top span if not already started. 
+ */ +static PlannedStmt * +pg_tracing_planner_hook(Query *query, + const char *query_string, + int cursorOptions, + ParamListInfo params) +{ + PlannedStmt *result; + int span_planner_index = -1; + Span *span_planner; + + check_query_sampled(NULL, params); + + /* No sampling */ + if (current_trace.sampled == 0 || !pg_tracing_enabled(plan_nested_level + exec_nested_level)) + { + if (prev_planner_hook) + result = prev_planner_hook(query, query_string, cursorOptions, + params); + else + result = standard_planner(query, query_string, cursorOptions, + params); + return result; + } + + /* + * We may have skipped parsing if statement was prepared, start a new top + * span if we don't have one + */ + if (current_trace.sampled == 1 && pg_tracing_enabled(plan_nested_level + exec_nested_level)) + initialize_trace_level_and_top_span(query->commandType, query, NULL, query_string); + + /* Start planner span */ + span_planner_index = get_index_from_trace_spans(); + span_planner = get_span_from_index(span_planner_index); + begin_span(span_planner, SPAN_PLANNER, ¤t_trace, + get_parent_id(exec_nested_level), per_level_buffers[exec_nested_level].query_id, NULL, exec_nested_level); + + plan_nested_level++; + PG_TRY(); + { + if (prev_planner_hook) + result = prev_planner_hook(query, query_string, cursorOptions, + params); + else + result = standard_planner(query, query_string, cursorOptions, + params); + } + PG_CATCH(); + { + plan_nested_level--; + handle_pg_error(span_planner_index, NULL); + PG_RE_THROW(); + } + PG_END_TRY(); + plan_nested_level--; + + /* End planner span */ + end_span_index(span_planner_index, NULL); + + /* If we have a prepared statement, add bound parameters to the top span */ + if (params != NULL) + { + char *paramStr = BuildParamLogString(params, + NULL, pg_tracing_max_parameter_str); + Span *current_top = get_top_span_for_nested_level(exec_nested_level); + + Assert(current_top->parameter_offset == -1); + if (paramStr != NULL) + current_top->parameter_offset = 
add_str_to_trace_buffer(paramStr, strlen(paramStr)); + } + return result; +} + +/* + * ExecutorStart hook: Activate query instrumentation if query is sampled + * + * Tracing can be started here if the query used a cached plan and passes the random sample_rate + * If query is sampled, start the top span if not already started. + */ +static void +pg_tracing_ExecutorStart(QueryDesc *queryDesc, int eflags) +{ + int executor_start_span_index = -1; + Span *executor_span; + int instrument_options = INSTRUMENT_TIMER | INSTRUMENT_ROWS; + + check_query_sampled(NULL, queryDesc->params); + + if (pg_tracing_instrument_buffers) + instrument_options |= INSTRUMENT_BUFFERS; + if (pg_tracing_instrument_wal) + instrument_options |= INSTRUMENT_WAL; + + if (current_trace.sampled > 0 && pg_tracing_enabled(exec_nested_level)) + { + /* + * In case of a cached plan, we haven't gone through neither parsing + * nor planner hook. + * + * Parallel workers will also start tracing from there. + */ + initialize_trace_level_and_top_span(queryDesc->operation, NULL, NULL, queryDesc->sourceText); + executor_start_span_index = get_index_from_trace_spans(); + executor_span = get_span_from_index(executor_start_span_index); + begin_span(executor_span, SPAN_EXECUTOR_START, ¤t_trace, get_parent_id(exec_nested_level), per_level_buffers[exec_nested_level].query_id, NULL, exec_nested_level); + + /* + * Activate query instrumentation. Timer and rows instrumentation are + * mandatory. 
+ */ + queryDesc->instrument_options = instrument_options; + } + + if (prev_ExecutorStart) + prev_ExecutorStart(queryDesc, eflags); + else + standard_ExecutorStart(queryDesc, eflags); + + if (executor_start_span_index != -1) + { + /* Allocate instrumentation */ + if (queryDesc->totaltime == NULL) + { + MemoryContext oldcxt; + + oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt); + queryDesc->totaltime = InstrAlloc(1, instrument_options, false); + queryDesc->planstate->instrument = InstrAlloc(1, instrument_options, false); + MemoryContextSwitchTo(oldcxt); + } + end_span_index(executor_start_span_index, NULL); + } +} + +/* + * ExecutorRun hook: track nesting depth and create executor run span. + * If the plan needs to create parallel workers, push the trace context in parallel shared buffer. + */ +static void +pg_tracing_ExecutorRun(QueryDesc *queryDesc, ScanDirection direction, + uint64 count, bool execute_once) +{ + int span_index = -1; + + if (current_trace.sampled > 0 && pg_tracing_enabled(exec_nested_level)) + { + Span *span; + + span_index = get_index_from_trace_spans(); + span = get_span_from_index(span_index); + begin_span(span, SPAN_EXECUTOR_RUN, ¤t_trace, get_parent_id(exec_nested_level), per_level_buffers[exec_nested_level].query_id, NULL, exec_nested_level); + + /* + * Executor run is used as the parent's span for nodes, store it's id + * in the per level buffer. 
+ */ + per_level_buffers[exec_nested_level].executor_id = span->span_id; + if (queryDesc->plannedstmt->parallelModeNeeded) + add_parallel_context(span->span_id, per_level_buffers[exec_nested_level].query_id); + } + + exec_nested_level++; + PG_TRY(); + { + if (prev_ExecutorRun) + prev_ExecutorRun(queryDesc, direction, count, execute_once); + else + standard_ExecutorRun(queryDesc, direction, count, execute_once); + } + PG_CATCH(); + { + exec_nested_level--; + handle_pg_error(span_index, queryDesc); + PG_RE_THROW(); + } + PG_END_TRY(); + exec_nested_level--; + + /* + * When a query is executing the pg_tracing_spans function, sampling will + * be disabled while we have an ongoing trace. Check for + * current_trace_spans + */ + if (current_trace_spans != NULL) + end_span_index(span_index, NULL); +} + +/* + * ExecutorFinish hook: create executor finish span and track nesting depth + */ +static void +pg_tracing_ExecutorFinish(QueryDesc *queryDesc) +{ + int span_index = -1; + Span *span = NULL; + + if (current_trace.sampled > 0 && pg_tracing_enabled(exec_nested_level)) + { + span_index = get_index_from_trace_spans(); + span = get_span_from_index(span_index); + begin_span(span, SPAN_EXECUTOR_FINISH, ¤t_trace, get_parent_id(exec_nested_level), per_level_buffers[exec_nested_level].query_id, NULL, exec_nested_level); + executor_finish_start_ts = span->start; + } + + exec_nested_level++; + PG_TRY(); + { + if (prev_ExecutorFinish) + prev_ExecutorFinish(queryDesc); + else + standard_ExecutorFinish(queryDesc); + } + PG_CATCH(); + { + exec_nested_level--; + handle_pg_error(span_index, queryDesc); + PG_RE_THROW(); + } + PG_END_TRY(); + exec_nested_level--; + + end_span_index(span_index, NULL); +} + +/* + * ExecutorEnd hook: process queryDesc and end tracing if we're at nested + * level 0 + */ +static void +pg_tracing_ExecutorEnd(QueryDesc *queryDesc) +{ + int span_index = -1; + + if (current_trace.sampled > 0 && pg_tracing_enabled(exec_nested_level)) + { + Span *span; + + span_index 
= get_index_from_trace_spans(); + span = get_span_from_index(span_index); + begin_span(span, SPAN_EXECUTOR_END, ¤t_trace, get_parent_id(exec_nested_level), per_level_buffers[exec_nested_level].query_id, + NULL, exec_nested_level); + + /* + * Executor finish can have a sub microsencond duration leading to + * both finish and end to start at the same timestamp. Artificially + * inscreate the start of the finish span if it's the case. + */ + if (span->start == executor_finish_start_ts) + span->start++; + + /* Query finished normally, send 0 as error code */ + process_query_desc(queryDesc, 0); + } + + if (prev_ExecutorEnd) + prev_ExecutorEnd(queryDesc); + else + standard_ExecutorEnd(queryDesc); + + if (span_index > -1) + { + int64 end_span_ns = get_current_ns(); + int64 end_child_span_ns = end_span_ns - 1000; + + if (exec_nested_level == 0) + end_top_span(exec_nested_level, &end_span_ns); + /* End the matching top span */ + end_span_index(span_index, &end_child_span_ns); + end_tracing(&end_span_ns); + } +} + +/* + * With nested queries, the post parse hook will be called before we know + * the top span and parse span parent. We will only know and create their + * parent when we generate span node from planstate. + * + * When those nodes are created, we attach the parse spans and top spans to them and + * adjust their start/duration. 
+ */ +static void +reparent_child_query(Span const *parent_span, int64 start_span_ns) +{ + Span *child_parse_span; + Span *child_top_span; + + if (exec_nested_level + 1 > max_nested_level) + /* No child to reparent */ + return; + + /* Use the same start/duration as the parent */ + child_top_span = get_top_span_for_nested_level(exec_nested_level + 1); + Assert(child_top_span->ended == false); + child_top_span->parent_id = parent_span->span_id; + child_top_span->start = parent_span->start; + child_top_span->duration_ns = parent_span->duration_ns; + child_top_span->ended = true; + + if (per_level_buffers[exec_nested_level + 1].parse_span_index == -1) + /* No parse span started */ + return; + + child_parse_span = get_parse_span_from_nested_level(exec_nested_level + 1, false); + if (child_parse_span->trace_id > 0) + { + int64 end_parse_span_ns; + + /* + * We've stored the end ns of the parse span in the duration_ns field. + * We can match the parse start with it's parent and compute the + * duration with the elements we have. + */ + end_parse_span_ns = child_parse_span->duration_ns; + child_parse_span->start = parent_span->start; + child_parse_span->duration_ns = start_span_ns; + end_span(child_parse_span, &end_parse_span_ns); + Assert(child_parse_span->duration_ns > 0); + } +} + +/* + * ProcessUtility hook + * + * Trace utility query if utility tracking is enabled and sampling was enabled + * during parse step. 
+ * + * Explain statements are skipped + */ +static void +pg_tracing_ProcessUtility(PlannedStmt *pstmt, const char *queryString, + bool readOnlyTree, + ProcessUtilityContext context, + ParamListInfo params, QueryEnvironment *queryEnv, + DestReceiver *dest, QueryCompletion *qc) +{ + Span *process_utility_span; + int process_utility_span_index = -1; + int old_top_span_index = -1; + int64 end_span_ns; + int64 end_child_span_ns; + int64 start_span_ns; + + if (!pg_tracing_track_utility || current_trace.sampled == 0 || !pg_tracing_enabled(exec_nested_level)) + { + if (prev_ProcessUtility) + prev_ProcessUtility(pstmt, queryString, readOnlyTree, + context, params, queryEnv, + dest, qc); + else + standard_ProcessUtility(pstmt, queryString, readOnlyTree, + context, params, queryEnv, + dest, qc); + + /* + * Tracing may have been started within the utility call without going + * through ExecutorEnd (ex: prepare statement). Check and end tracing + * here in this case. + */ + if (!pg_tracing_mem_ctx->isReset) + { + Assert(current_trace.sampled); + Assert(exec_nested_level == 0); + end_tracing(NULL); + } + return; + } + + start_span_ns = get_current_ns(); + + /* + * Set a possible start for nested query. ProcessUtility may start a + * nested query and we can't use the statement start like we usually do. + * We artificially start nested a bit later to avoid overlapping spans due + * to rounding + */ + nested_query_start_ns = start_span_ns + 1000; + + /* Build the process utility span. 
*/ + process_utility_span_index = get_index_from_trace_spans(); + process_utility_span = get_span_from_index(process_utility_span_index); + begin_span(process_utility_span, SPAN_PROCESS_UTILITY, ¤t_trace, + get_parent_id(exec_nested_level), per_level_buffers[exec_nested_level].query_id, &start_span_ns, exec_nested_level); + /* Extract and normalise the query */ + if (queryString != NULL && pstmt->stmt_location != -1 && pstmt->stmt_len > 0) + { + int query_len = pstmt->stmt_len; + const char *normalised_query = normalise_query(queryString, pstmt->stmt_location, &query_len); + + process_utility_span->operation_name_offset = add_str_to_trace_buffer(normalised_query, query_len); + } + + /* + * Set ProcessUtility span as the new top span just in case we have nested + * query. We need to keep the old top span index to be able to close it. + */ + old_top_span_index = per_level_buffers[exec_nested_level].top_span_index; + set_top_span(exec_nested_level, process_utility_span_index); + + exec_nested_level++; + PG_TRY(); + { + if (prev_ProcessUtility) + prev_ProcessUtility(pstmt, queryString, readOnlyTree, + context, params, queryEnv, + dest, qc); + else + standard_ProcessUtility(pstmt, queryString, readOnlyTree, + context, params, queryEnv, + dest, qc); + } + PG_CATCH(); + { + exec_nested_level--; + handle_pg_error(process_utility_span_index, NULL); + PG_RE_THROW(); + } + PG_END_TRY(); + end_span_ns = get_current_ns(); + + /* ProcessUtility may have created a top span, close it here. 
*/ + if (exec_nested_level <= max_nested_level && per_level_buffers[exec_nested_level].top_span_index != -1) + { + Span *top_span = get_top_span_for_nested_level(exec_nested_level); + + if (top_span->ended == false) + end_top_span(exec_nested_level, &end_span_ns); + } + exec_nested_level--; + + /* buffer may have been repalloced */ + process_utility_span = get_span_from_index(process_utility_span_index); + if (qc != NULL) + process_utility_span->node_counters.rows = qc->nprocessed; + /* ProcessUtility is the current top span, no need to call end_top_span */ + end_child_span_ns = end_span_ns - 1000; + end_span(process_utility_span, &end_child_span_ns); + + /* Also end the old top span */ + end_span_index(old_top_span_index, &end_span_ns); + end_tracing(&end_span_ns); + + /* + * We may iterate through a list of nested ProcessUtility calls, update + * the nested query start. + */ + nested_query_start_ns = end_span_ns + 1000; +} + +/* + * Iterate over a list of planstate to generate span node + */ +static void +generate_member_nodes(PlanState **planstates, int nplans, planstateTraceContext * planstateTraceContext, uint64 parent_id) +{ + int j; + + for (j = 0; j < nplans; j++) + generate_span_from_planstate(planstates[j], planstateTraceContext, parent_id); +} + +/* + * Iterate over custom scan planstates to generate span node + */ +static void +generate_span_from_custom_scan(CustomScanState *css, planstateTraceContext * planstateTraceContext, uint64 parent_id) +{ + ListCell *cell; + + foreach(cell, css->custom_ps) + generate_span_from_planstate((PlanState *) lfirst(cell), planstateTraceContext, parent_id); +} + +/* + * Create span node for the provided planstate + */ +static uint64 +create_span_node(PlanState *planstate, planstateTraceContext * planstateTraceContext, uint64 parent_id, + SpanType span_type, char *subplan_name) +{ + Span *span; + int span_index = -1; + char const *span_name; + char const *operation_name; + Plan const *plan = planstate->plan; + int64 
node_duration_ns; + int64 start_span_ns; + + /* + * Make sure stats accumulation is done. Function is a no-op if if was + * already done. + */ + InstrEndLoop(planstate->instrument); + + /* We only create span node on node that were executed */ + Assert(!INSTR_TIME_IS_ZERO(planstate->instrument->firsttime)); + + start_span_ns = INSTR_TIME_GET_NANOSEC(planstate->instrument->firsttime); + span_index = get_index_from_trace_spans(); + + span = get_span_from_index(span_index); + begin_span(span, span_type, planstateTraceContext->trace, parent_id, per_level_buffers[exec_nested_level].query_id, + &start_span_ns, exec_nested_level); + + /* first tuple time */ + span->startup = planstate->instrument->startup * NS_PER_S; + + if (span_type == SPAN_NODE) + { + /* Generate names and store them */ + span_name = plan_to_span_name(plan); + operation_name = plan_to_operation(planstateTraceContext, planstate, span_name); + + span->name_offset = add_str_to_trace_buffer(span_name, strlen(span_name)); + span->operation_name_offset = add_str_to_trace_buffer(operation_name, strlen(operation_name)); + } + else if (subplan_name != NULL) + { + span->operation_name_offset = add_str_to_trace_buffer(subplan_name, strlen(subplan_name)); + } + + span->node_counters.rows = (int64) planstate->instrument->ntuples / planstate->instrument->nloops; + span->node_counters.nloops = (int64) planstate->instrument->nloops; + span->node_counters.buffer_usage = planstate->instrument->bufusage; + span->node_counters.wal_usage = planstate->instrument->walusage; + + span->plan_counters.startup_cost = plan->startup_cost; + span->plan_counters.total_cost = plan->total_cost; + span->plan_counters.plan_rows = plan->plan_rows; + span->plan_counters.plan_width = plan->plan_width; + + if (!planstate->state->es_finished) + { + /* + * We're processing this node in an error handler, stop the node + * instrumentation to get it's current state + */ + InstrStopNode(planstate->instrument, planstate->state->es_processed); + 
InstrEndLoop(planstate->instrument); + span->sql_error_code = planstateTraceContext->sql_error_code; + } + node_duration_ns = INSTR_TIME_GET_NANOSEC(planstate->instrument->firsttime); + Assert(planstate->instrument->total > 0); + node_duration_ns += planstate->instrument->total * NS_PER_S; + end_span(span, &node_duration_ns); + + /* + * If we have a Result node, make it the span parent of the next query + * span if we have any + * + * TODO: Do we have other node that could be the base for nested queries? + */ + if (nodeTag(plan) == T_Result) + reparent_child_query(span, start_span_ns); + return span->span_id; +} + +/* + * Walk through the planstate tree generating a node span for each node. + * We pass possible error code to tag unfinished node with it + */ +static void +generate_span_from_planstate(PlanState *planstate, planstateTraceContext * planstateTraceContext, uint64 parent_id) +{ + ListCell *l; + Plan const *plan = planstate->plan; + uint64 span_id; + + /* The node was never executed, skip it */ + if (INSTR_TIME_IS_ZERO(planstate->instrument->firsttime)) + return; + + span_id = create_span_node(planstate, planstateTraceContext, parent_id, SPAN_NODE, NULL); + planstateTraceContext->ancestors = lcons(planstate->plan, planstateTraceContext->ancestors); + + /* Walk the planstate tree */ + if (planstate->lefttree) + generate_span_from_planstate(planstate->lefttree, planstateTraceContext, span_id); + if (planstate->righttree) + generate_span_from_planstate(planstate->righttree, planstateTraceContext, span_id); + + /* Handle init plans and subplans */ + foreach(l, planstate->initPlan) + { + SubPlanState *sstate = (SubPlanState *) lfirst(l); + PlanState *splan = sstate->planstate; + uint64_t init_span_id; + + if (INSTR_TIME_IS_ZERO(splan->instrument->firsttime)) + continue; + init_span_id = create_span_node(splan, planstateTraceContext, parent_id, SPAN_NODE_INIT_PLAN, sstate->subplan->plan_name); + generate_span_from_planstate(splan, planstateTraceContext, 
init_span_id); + } + + foreach(l, planstate->subPlan) + { + SubPlanState *sstate = (SubPlanState *) lfirst(l); + PlanState *splan = sstate->planstate; + uint64_t subplan_span_id; + + if (INSTR_TIME_IS_ZERO(splan->instrument->firsttime)) + continue; + subplan_span_id = create_span_node(splan, planstateTraceContext, parent_id, SPAN_NODE_SUBPLAN, sstate->subplan->plan_name); + generate_span_from_planstate(splan, planstateTraceContext, subplan_span_id); + } + + /* Handle special nodes with children nodes */ + switch (nodeTag(plan)) + { + case T_Append: + generate_member_nodes(((AppendState *) planstate)->appendplans, + ((AppendState *) planstate)->as_nplans, planstateTraceContext, span_id); + break; + case T_MergeAppend: + generate_member_nodes(((MergeAppendState *) planstate)->mergeplans, + ((MergeAppendState *) planstate)->ms_nplans, planstateTraceContext, span_id); + break; + case T_BitmapAnd: + generate_member_nodes(((BitmapAndState *) planstate)->bitmapplans, + ((BitmapAndState *) planstate)->nplans, planstateTraceContext, span_id); + break; + case T_BitmapOr: + generate_member_nodes(((BitmapOrState *) planstate)->bitmapplans, + ((BitmapOrState *) planstate)->nplans, planstateTraceContext, span_id); + break; + case T_SubqueryScan: + generate_span_from_planstate(((SubqueryScanState *) planstate)->subplan, planstateTraceContext, span_id); + break; + case T_CustomScan: + generate_span_from_custom_scan((CustomScanState *) planstate, planstateTraceContext, span_id); + break; + default: + break; + } + planstateTraceContext->ancestors = list_delete_first(planstateTraceContext->ancestors); +} + +/* + * Add plan counters to Datum output + */ +static int +add_plan_counters(const PlanCounters * plan_counters, int i, Datum *values) +{ + values[i++] = Float8GetDatumFast(plan_counters->startup_cost); + values[i++] = Float8GetDatumFast(plan_counters->total_cost); + values[i++] = Float8GetDatumFast(plan_counters->plan_rows); + values[i++] = 
Int32GetDatum(plan_counters->plan_width); + return i; +} + +/* + * Add node counters to Datum output + */ +static int +add_node_counters(const NodeCounters * node_counters, int i, Datum *values) +{ + Datum wal_bytes; + char buf[256]; + + values[i++] = Int64GetDatumFast(node_counters->rows); + values[i++] = Int64GetDatumFast(node_counters->nloops); + + /* Buffer usage */ + values[i++] = Int64GetDatumFast(node_counters->buffer_usage.shared_blks_hit); + values[i++] = Int64GetDatumFast(node_counters->buffer_usage.shared_blks_read); + values[i++] = Int64GetDatumFast(node_counters->buffer_usage.shared_blks_dirtied); + values[i++] = Int64GetDatumFast(node_counters->buffer_usage.shared_blks_written); + + values[i++] = Int64GetDatumFast(node_counters->buffer_usage.local_blks_hit); + values[i++] = Int64GetDatumFast(node_counters->buffer_usage.local_blks_read); + values[i++] = Int64GetDatumFast(node_counters->buffer_usage.local_blks_dirtied); + values[i++] = Int64GetDatumFast(node_counters->buffer_usage.local_blks_written); + + values[i++] = Float8GetDatumFast(INSTR_TIME_GET_MILLISEC(node_counters->buffer_usage.blk_read_time)); + values[i++] = Float8GetDatumFast(INSTR_TIME_GET_MILLISEC(node_counters->buffer_usage.blk_write_time)); + + values[i++] = Int64GetDatumFast(node_counters->buffer_usage.temp_blks_read); + values[i++] = Int64GetDatumFast(node_counters->buffer_usage.temp_blks_written); + values[i++] = Float8GetDatumFast(INSTR_TIME_GET_MILLISEC(node_counters->buffer_usage.temp_blk_read_time)); + values[i++] = Float8GetDatumFast(INSTR_TIME_GET_MILLISEC(node_counters->buffer_usage.temp_blk_write_time)); + + /* WAL usage */ + values[i++] = Int64GetDatumFast(node_counters->wal_usage.wal_records); + values[i++] = Int64GetDatumFast(node_counters->wal_usage.wal_fpi); + snprintf(buf, sizeof buf, UINT64_FORMAT, node_counters->wal_usage.wal_bytes); + + /* Convert to numeric. 
*/ + wal_bytes = DirectFunctionCall3(numeric_in, + CStringGetDatum(buf), + ObjectIdGetDatum(0), + Int32GetDatum(-1)); + values[i++] = wal_bytes; + + /* JIT usage */ + values[i++] = Int8GetDatum(node_counters->jit_usage.created_functions); + values[i++] = Float8GetDatumFast(INSTR_TIME_GET_MILLISEC(node_counters->jit_usage.generation_counter)); + values[i++] = Float8GetDatumFast(INSTR_TIME_GET_MILLISEC(node_counters->jit_usage.inlining_counter)); + values[i++] = Float8GetDatumFast(INSTR_TIME_GET_MILLISEC(node_counters->jit_usage.optimization_counter)); + values[i++] = Float8GetDatumFast(INSTR_TIME_GET_MILLISEC(node_counters->jit_usage.emission_counter)); + + return i; +} + +/* + * Build the tuple for a Span and add it to the output + */ +static void +add_result_span(ReturnSetInfo *rsinfo, Span * span, + const char *qbuffer, Size qbuffer_size, Relation rel) +{ + Datum values[PG_TRACING_TRACES_COLS] = {0}; + bool nulls[PG_TRACING_TRACES_COLS] = {0}; + + int i = 0; + + values[i++] = UInt64GetDatum(span->trace_id); + values[i++] = UInt64GetDatum(span->parent_id); + values[i++] = UInt64GetDatum(span->span_id); + values[i++] = UInt64GetDatum(span->query_id); + values[i++] = CStringGetTextDatum(get_span_name(span, qbuffer, qbuffer_size)); + values[i++] = CStringGetTextDatum(get_operation_name(span, qbuffer, qbuffer_size)); + values[i++] = Int64GetDatum(span->start); + + values[i++] = Int64GetDatum(span->duration_ns); + values[i++] = CStringGetTextDatum(unpack_sql_state(span->sql_error_code)); + values[i++] = UInt32GetDatum(span->be_pid); + values[i++] = UInt8GetDatum(span->nested_level); + values[i++] = UInt8GetDatum(span->subxact_count); + values[i++] = BoolGetDatum(span->top_span); + + if ((span->type >= SPAN_NODE && span->type <= SPAN_NODE_UNKNOWN) + || span->type == SPAN_PLANNER) + { + i = add_plan_counters(&span->plan_counters, i, values); + i = add_node_counters(&span->node_counters, i, values); + + values[i++] = Int64GetDatum(span->startup); + if 
(span->parameter_offset != -1) + values[i++] = CStringGetTextDatum(qbuffer + span->parameter_offset); + } + + for (int j = i; j < PG_TRACING_TRACES_COLS; j++) + nulls[j] = 1; + + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); +} + +/* + * Return spans as a result set. + * + * Accept a consume parameter. When consume is set, + * we empty the shared buffer and query text. + */ +Datum +pg_tracing_spans(PG_FUNCTION_ARGS) +{ + bool consume; + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + Span *span; + const char *qbuffer; + Size qbuffer_size; + + consume = PG_GETARG_BOOL(0); + if (!pg_tracing) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("pg_tracing must be loaded via shared_preload_libraries"))); + InitMaterializedSRF(fcinfo, 0); + + /* + * If this query was sampled and we're consuming tracing_spans buffer, the + * spans will target a query string that doesn't exist anymore in the + * query file. + * + * Better abort the sampling, clean ongoing traces and remove any possible + * parallel context pushed + * + * Since this will be called within an ExecutorRun, we will need to check + * for current_trace_spans later. 
+ */ + cleanup_tracing(); + + qbuffer = qtext_load_file(&qbuffer_size); + if (!qbuffer) + { + qbuffer = palloc(0); + qbuffer_size = 0; + } + + { + volatile pgTracingSharedState *s = (volatile pgTracingSharedState *) pg_tracing; + + SpinLockAcquire(&s->mutex); + for (int i = 0; i < shared_spans->end; i++) + { + span = shared_spans->spans + i; + add_result_span(rsinfo, span, qbuffer, qbuffer_size, NULL); + } + + /* Consume is set, remove spans from the shared buffer */ + if (consume) + { + shared_spans->end = 0; + /* We only truncate query file if there's no active writers */ + if (s->n_writers == 0) + { + s->extent = 0; + pg_truncate(PG_TRACING_TEXT_FILE, 0); + } + else + s->stats.failed_truncates++; + } + s->stats.last_consume = GetCurrentTimestamp(); + SpinLockRelease(&s->mutex); + } + return (Datum) 0; +} + +/* + * Return statistics of pg_tracing. + */ +Datum +pg_tracing_info(PG_FUNCTION_ARGS) +{ + pgTracingGlobalStats stats; + TupleDesc tupdesc; + Datum values[PG_TRACING_INFO_COLS] = {0}; + bool nulls[PG_TRACING_INFO_COLS] = {0}; + int i = 0; + + if (!pg_tracing) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("pg_tracing must be loaded via shared_preload_libraries"))); + + /* Build a tuple descriptor for our result type */ + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + /* Get a copy of the pg_tracing stats */ + { + volatile pgTracingSharedState *s = (volatile pgTracingSharedState *) pg_tracing; + + SpinLockAcquire(&s->mutex); + stats = s->stats; + SpinLockRelease(&s->mutex); + } + + values[i++] = Int64GetDatum(stats.traces); + values[i++] = Int64GetDatum(stats.spans); + values[i++] = Int64GetDatum(stats.dropped_spans); + values[i++] = TimestampTzGetDatum(stats.last_consume); + values[i++] = TimestampTzGetDatum(stats.stats_reset); + values[i++] = Float8GetDatum(pg_tracing_sample_rate); + + PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, 
values, nulls))); +} + +/* + * Reset pg_tracing statistics. + */ +Datum +pg_tracing_reset(PG_FUNCTION_ARGS) +{ + /* + * Reset global statistics for pg_tracing since all entries are removed. + */ + { + volatile pgTracingSharedState *s = (volatile pgTracingSharedState *) pg_tracing; + + SpinLockAcquire(&s->mutex); + s->stats.traces = 0; + s->stats.spans = 0; + s->stats.dropped_spans = 0; + s->stats.failed_truncates = 0; + s->stats.last_consume = 0; + s->stats.stats_reset = GetCurrentTimestamp(); + SpinLockRelease(&s->mutex); + } + PG_RETURN_VOID(); +} diff --git a/contrib/pg_tracing/pg_tracing.conf b/contrib/pg_tracing/pg_tracing.conf new file mode 100644 index 0000000000..ad6b5e7c1d --- /dev/null +++ b/contrib/pg_tracing/pg_tracing.conf @@ -0,0 +1 @@ +shared_preload_libraries = 'pg_tracing' diff --git a/contrib/pg_tracing/pg_tracing.control b/contrib/pg_tracing/pg_tracing.control new file mode 100644 index 0000000000..efa78a6610 --- /dev/null +++ b/contrib/pg_tracing/pg_tracing.control @@ -0,0 +1,5 @@ +# pg_tracing extension +comment = 'Distributed tracing for postgres' +default_version = '1.0' +module_pathname = '$libdir/pg_tracing' +relocatable = true diff --git a/contrib/pg_tracing/pg_tracing.h b/contrib/pg_tracing/pg_tracing.h new file mode 100644 index 0000000000..4a7b041df4 --- /dev/null +++ b/contrib/pg_tracing/pg_tracing.h @@ -0,0 +1,43 @@ +/*------------------------------------------------------------------------- + * pg_tracing.h + * + * IDENTIFICATION + * contrib/pg_tracing/pg_tracing.h + *------------------------------------------------------------------------- + */ +#ifndef _PG_TRACING_H_ +#define _PG_TRACING_H_ + +#include "postgres.h" +#include "span.h" + +#include "storage/s_lock.h" + +/* + * Global statistics for pg_tracing + */ +typedef struct pgTracingGlobalStats +{ + int64 traces; /* number of traces processed */ + int64 spans; /* number of spans processed */ + int64 dropped_spans; /* number of dropped spans due to full buffer */ + int64 
failed_truncates; /* number of failed query file truncates + * due to active writer */ + TimestampTz last_consume; /* Last time the shared spans buffer was + * consumed */ + TimestampTz stats_reset; /* Last time stats were reset */ +} pgTracingGlobalStats; + +/* + * Global shared state + */ +typedef struct pgTracingSharedState +{ + slock_t mutex; /* protects shared stats fields and + * shared_spans buffer */ + Size extent; /* current extent of query file */ + int n_writers; /* number of active writers to query file */ + pgTracingGlobalStats stats; /* global statistics for pg_tracing */ +} pgTracingSharedState; + +#endif diff --git a/contrib/pg_tracing/pg_tracing_explain.c b/contrib/pg_tracing/pg_tracing_explain.c new file mode 100644 index 0000000000..7f7155d2d3 --- /dev/null +++ b/contrib/pg_tracing/pg_tracing_explain.c @@ -0,0 +1,527 @@ +/*------------------------------------------------------------------------- + * + * pg_tracing_explain.c + * + * + * Copyright (c) 2023, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/pg_tracing/pg_tracing_explain.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "pg_tracing_explain.h" + +#include "nodes/makefuncs.h" +#include "utils/builtins.h" +#include "parser/parsetree.h" +#include "nodes/extensible.h" +#include "utils/lsyscache.h" +#include "utils/ruleutils.h" + + +static void +ExplainTargetRel(const planstateTraceContext * planstateTraceContext, const PlanState *planstate, Index rti, StringInfo str) +{ + char *objectname = NULL; + char *namespace = NULL; + RangeTblEntry *rte; + char *refname; + + List *rtable; + + rtable = planstate->state->es_range_table; + rte = rt_fetch(rti, rtable); + + refname = (char *) list_nth(planstateTraceContext->rtable_names, rti - 1); + if (refname == NULL) + refname = rte->eref->aliasname; + + switch (nodeTag(planstate->plan)) + { + case T_SeqScan: + case T_SampleScan: + case T_IndexScan: + case 
T_IndexOnlyScan: + case T_BitmapHeapScan: + case T_TidScan: + case T_TidRangeScan: + case T_ForeignScan: + case T_CustomScan: + case T_ModifyTable: + /* Assert it's on a real relation */ + Assert(rte->rtekind == RTE_RELATION); + objectname = get_rel_name(rte->relid); + break; + case T_FunctionScan: + { + FunctionScan *fscan = (FunctionScan *) planstate->plan; + + /* Assert it's on a RangeFunction */ + Assert(rte->rtekind == RTE_FUNCTION); + + /* + * If the expression is still a function call of a single + * function, we can get the real name of the function. + * Otherwise, punt. (Even if it was a single function call + * originally, the optimizer could have simplified it away.) + */ + if (list_length(fscan->functions) == 1) + { + RangeTblFunction *rtfunc = (RangeTblFunction *) linitial(fscan->functions); + + if (IsA(rtfunc->funcexpr, FuncExpr)) + { + FuncExpr *funcexpr = (FuncExpr *) rtfunc->funcexpr; + Oid funcid = funcexpr->funcid; + + objectname = get_func_name(funcid); + } + } + } + break; + case T_TableFuncScan: + Assert(rte->rtekind == RTE_TABLEFUNC); + objectname = "xmltable"; + break; + case T_ValuesScan: + Assert(rte->rtekind == RTE_VALUES); + break; + case T_CteScan: + /* Assert it's on a non-self-reference CTE */ + Assert(rte->rtekind == RTE_CTE); + Assert(!rte->self_reference); + objectname = rte->ctename; + break; + case T_NamedTuplestoreScan: + Assert(rte->rtekind == RTE_NAMEDTUPLESTORE); + objectname = rte->enrname; + break; + case T_WorkTableScan: + /* Assert it's on a self-reference CTE */ + Assert(rte->rtekind == RTE_CTE); + Assert(rte->self_reference); + objectname = rte->ctename; + break; + default: + break; + } + + appendStringInfoString(str, " on"); + if (namespace != NULL) + appendStringInfo(str, " %s.%s", quote_identifier(namespace), + quote_identifier(objectname)); + else if (objectname != NULL) + appendStringInfo(str, " %s", quote_identifier(objectname)); + if (objectname == NULL || strcmp(refname, objectname) != 0) + appendStringInfo(str, 
" %s", quote_identifier(refname)); +} + + +static void +ExplainScanTarget(const planstateTraceContext * planstateTraceContext, const PlanState *planstate, const Scan *plan, StringInfo str) +{ + ExplainTargetRel(planstateTraceContext, planstate, plan->scanrelid, str); +} + +static void +ExplainModifyTarget(const planstateTraceContext * planstateTraceContext, const PlanState *planstate, const ModifyTable *plan, StringInfo str) +{ + ExplainTargetRel(planstateTraceContext, planstate, plan->nominalRelation, str); +} + +static void +ExplainIndexScanDetails(Oid indexId, ScanDirection indexorderdir, + StringInfo str) +{ + const char *indexname = get_rel_name(indexId); + + if (ScanDirectionIsBackward(indexorderdir)) + appendStringInfoString(str, " Backward"); + appendStringInfo(str, " using %s", quote_identifier(indexname)); +} + +static void +plan_to_rel_name(const PlanState *planstate, const planstateTraceContext * planstateTraceContext, StringInfo str) +{ + const Plan *plan = planstate->plan; + + switch (nodeTag(plan)) + { + case T_SeqScan: + case T_SampleScan: + case T_BitmapHeapScan: + case T_TidScan: + case T_TidRangeScan: + case T_SubqueryScan: + case T_FunctionScan: + case T_TableFuncScan: + case T_ValuesScan: + case T_CteScan: + case T_WorkTableScan: + ExplainScanTarget(planstateTraceContext, planstate, (Scan *) plan, str); + break; + case T_ForeignScan: + case T_CustomScan: + if (((Scan *) plan)->scanrelid > 0) + ExplainScanTarget(planstateTraceContext, planstate, (Scan *) plan, str); + break; + case T_IndexScan: + { + IndexScan *indexscan = (IndexScan *) plan; + + ExplainIndexScanDetails(indexscan->indexid, + indexscan->indexorderdir, + str); + ExplainScanTarget(planstateTraceContext, planstate, (Scan *) indexscan, str); + } + break; + case T_IndexOnlyScan: + { + IndexOnlyScan *indexonlyscan = (IndexOnlyScan *) plan; + + ExplainIndexScanDetails(indexonlyscan->indexid, + indexonlyscan->indexorderdir, + str); + ExplainScanTarget(planstateTraceContext, planstate, 
(Scan *) indexonlyscan, str); + } + break; + case T_BitmapIndexScan: + { + BitmapIndexScan *bitmapindexscan = (BitmapIndexScan *) plan; + const char *indexname = get_rel_name(bitmapindexscan->indexid); + + appendStringInfo(str, " on %s", + quote_identifier(indexname)); + } + break; + case T_ModifyTable: + ExplainModifyTarget(planstateTraceContext, planstate, (ModifyTable *) plan, str); + break; + case T_NestLoop: + case T_MergeJoin: + case T_HashJoin: + { + const char *jointype; + + switch (((Join *) plan)->jointype) + { + case JOIN_INNER: + jointype = "Inner"; + break; + case JOIN_LEFT: + jointype = "Left"; + break; + case JOIN_FULL: + jointype = "Full"; + break; + case JOIN_RIGHT: + jointype = "Right"; + break; + case JOIN_SEMI: + jointype = "Semi"; + break; + case JOIN_ANTI: + jointype = "Anti"; + break; + case JOIN_RIGHT_ANTI: + jointype = "Right Anti"; + break; + default: + jointype = "???"; + break; + } + if (((Join *) plan)->jointype != JOIN_INNER) + appendStringInfo(str, " %s Join", jointype); + else if (!IsA(plan, NestLoop)) + appendStringInfoString(str, " Join"); + } + break; + case T_SetOp: + { + const char *setopcmd; + + switch (((SetOp *) plan)->cmd) + { + case SETOPCMD_INTERSECT: + setopcmd = "Intersect"; + break; + case SETOPCMD_INTERSECT_ALL: + setopcmd = "Intersect All"; + break; + case SETOPCMD_EXCEPT: + setopcmd = "Except"; + break; + case SETOPCMD_EXCEPT_ALL: + setopcmd = "Except All"; + break; + default: + setopcmd = "???"; + break; + } + appendStringInfo(str, " %s", setopcmd); + } + break; + default: + break; + } +} + +static void +add_scan_qual(StringInfo str, const PlanState *planstate, List *qual, + const char *qlabel, List *ancestors, List *deparse_ctx, + bool useprefix) +{ + List *context; + char *exprstr; + Node *node; + + if (qual == NIL) + return; + + node = (Node *) make_ands_explicit(qual); + + /* Set up deparsing context */ + context = set_deparse_context_plan(deparse_ctx, + planstate->plan, + ancestors); + + /* Deparse the 
expression */ + appendStringInfoString(str, "|"); + exprstr = deparse_expression(node, context, useprefix, false); + appendStringInfoString(str, qlabel); + appendStringInfoString(str, exprstr); +} + +char const * +plan_to_operation(const planstateTraceContext * planstateTraceContext, const PlanState *planstate, const char *spanName) +{ + StringInfo operation_name = makeStringInfo(); + Plan const *plan = planstate->plan; + List *ancestors = planstateTraceContext->ancestors; + List *deparse_ctx = planstateTraceContext->deparse_ctx; + + if (plan->parallel_aware) + appendStringInfoString(operation_name, "Parallel "); + if (plan->async_capable) + appendStringInfoString(operation_name, "Async "); + appendStringInfoString(operation_name, spanName); + + plan_to_rel_name(planstate, planstateTraceContext, operation_name); + + switch (nodeTag(plan)) + { + case T_IndexScan: + add_scan_qual(operation_name, planstate, ((IndexScan *) plan)->indexqualorig, "Index Cond: ", + ancestors, deparse_ctx, false); + break; + case T_IndexOnlyScan: + add_scan_qual(operation_name, planstate, ((IndexOnlyScan *) plan)->indexqual, "Index Cond: ", + ancestors, deparse_ctx, false); + break; + case T_BitmapIndexScan: + add_scan_qual(operation_name, planstate, ((BitmapIndexScan *) plan)->indexqualorig, "Index Cond: ", + ancestors, deparse_ctx, false); + break; + case T_BitmapHeapScan: + add_scan_qual(operation_name, planstate, ((BitmapHeapScan *) plan)->bitmapqualorig, "Recheck Cond: ", + ancestors, deparse_ctx, false); + break; + case T_SeqScan: + case T_ValuesScan: + case T_CteScan: + case T_NamedTuplestoreScan: + case T_WorkTableScan: + case T_SubqueryScan: + case T_Gather: + case T_GatherMerge: + case T_FunctionScan: + case T_TableFuncScan: + case T_TidRangeScan: + case T_ForeignScan: + case T_CustomScan: + case T_Agg: + case T_WindowAgg: + case T_Group: + case T_Result: + add_scan_qual(operation_name, planstate, plan->qual, "Filter : ", + ancestors, deparse_ctx, false); + break; + case 
T_NestLoop: + add_scan_qual(operation_name, planstate, ((NestLoop *) plan)->join.joinqual, "Join Filter : ", + ancestors, deparse_ctx, false); + add_scan_qual(operation_name, planstate, plan->qual, "Filter : ", + ancestors, deparse_ctx, false); + break; + case T_MergeJoin: + add_scan_qual(operation_name, planstate, ((MergeJoin *) plan)->mergeclauses, "Merge Cond: ", + ancestors, deparse_ctx, false); + add_scan_qual(operation_name, planstate, ((MergeJoin *) plan)->join.joinqual, "Join Filter: ", + ancestors, deparse_ctx, false); + add_scan_qual(operation_name, planstate, plan->qual, "Filter : ", + ancestors, deparse_ctx, false); + break; + case T_HashJoin: + add_scan_qual(operation_name, planstate, ((HashJoin *) plan)->hashclauses, "Hash Cond: ", + ancestors, deparse_ctx, false); + add_scan_qual(operation_name, planstate, ((MergeJoin *) plan)->join.joinqual, "Join Filter: ", + ancestors, deparse_ctx, false); + add_scan_qual(operation_name, planstate, plan->qual, "Filter : ", + ancestors, deparse_ctx, false); + break; + default: + break; + } + + return operation_name->data; +} + +const char * +plan_to_span_name(const Plan *plan) +{ + const char *custom_name; + + switch (nodeTag(plan)) + { + case T_Result: + return "Result"; + case T_ProjectSet: + return "ProjectSet"; + case T_ModifyTable: + switch (((ModifyTable *) plan)->operation) + { + case CMD_INSERT: + return "Insert"; + case CMD_UPDATE: + return "Update"; + case CMD_DELETE: + return "Delete"; + case CMD_MERGE: + return "Merge"; + default: + return "???"; + } + case T_Append: + return "Append"; + case T_MergeAppend: + return "MergeAppend"; + case T_RecursiveUnion: + return "RecursiveUnion"; + case T_BitmapAnd: + return "BitmapAnd"; + case T_BitmapOr: + return "BitmapOr"; + case T_NestLoop: + return "NestedLoop"; + case T_MergeJoin: + return "Merge"; /* "Join" gets added by jointype switch */ + case T_HashJoin: + return "Hash"; /* "Join" gets added by jointype switch */ + case T_SeqScan: + return "SeqScan"; + 
case T_SampleScan: + return "SampleScan"; + case T_Gather: + return "Gather"; + case T_GatherMerge: + return "GatherMerge"; + case T_IndexScan: + return "IndexScan"; + case T_IndexOnlyScan: + return "IndexOnlyScan"; + case T_BitmapIndexScan: + return "BitmapIndexScan"; + case T_BitmapHeapScan: + return "BitmapHeapScan"; + case T_TidScan: + return "TidScan"; + case T_TidRangeScan: + return "TidRangeScan"; + case T_SubqueryScan: + return "SubqueryScan"; + case T_FunctionScan: + return "FunctionScan"; + case T_TableFuncScan: + return "TableFunctionScan"; + case T_ValuesScan: + return "ValuesScan"; + case T_CteScan: + return "CTEScan"; + case T_NamedTuplestoreScan: + return "NamedTuplestoreScan"; + case T_WorkTableScan: + return "WorkTableScan"; + case T_ForeignScan: + switch (((ForeignScan *) plan)->operation) + { + case CMD_SELECT: + return "ForeignScan"; + case CMD_INSERT: + return "ForeignInsert"; + case CMD_UPDATE: + return "ForeignUpdate"; + case CMD_DELETE: + return "ForeignDelete"; + default: + return "???"; + } + case T_CustomScan: + custom_name = ((CustomScan *) plan)->methods->CustomName; + if (custom_name) + return psprintf("CustomScan (%s)", custom_name); + else + return "CustomScan"; + case T_Material: + return "Materialize"; + case T_Memoize: + return "Memoize"; + case T_Sort: + return "Sort"; + case T_IncrementalSort: + return "IncrementalSort"; + case T_Group: + return "Group"; + case T_Agg: + { + Agg *agg = (Agg *) plan; + + switch (agg->aggstrategy) + { + case AGG_PLAIN: + return "Aggregate"; + case AGG_SORTED: + return "GroupAggregate"; + case AGG_HASHED: + return "HashAggregate"; + case AGG_MIXED: + return "MixedAggregate"; + default: + return "Aggregate ???"; + } + } + case T_WindowAgg: + return "WindowAgg"; + case T_Unique: + return "Unique"; + case T_SetOp: + switch (((SetOp *) plan)->strategy) + { + case SETOP_SORTED: + return "SetOp"; + case SETOP_HASHED: + return "HashSetOp"; + default: + return "SetOp ???"; + } + case T_LockRows: + return 
"LockRows"; + case T_Limit: + return "Limit"; + case T_Hash: + return "Hash"; + default: + return "???"; + } +} diff --git a/contrib/pg_tracing/pg_tracing_explain.h b/contrib/pg_tracing/pg_tracing_explain.h new file mode 100644 index 0000000000..6551e15d61 --- /dev/null +++ b/contrib/pg_tracing/pg_tracing_explain.h @@ -0,0 +1,20 @@ +#ifndef _PG_TRACING_EXPLAIN_H_ +#define _PG_TRACING_EXPLAIN_H_ + +#include "span.h" +#include "executor/execdesc.h" + +/* Context needed when generating spans from planstate */ +typedef struct planstateTraceContext +{ + pgTracingTrace *trace; + int sql_error_code; + List *ancestors; + List *deparse_ctx; + List *rtable_names; +} planstateTraceContext; + +extern const char *plan_to_span_name(const Plan *plan); +extern const char *plan_to_operation(const planstateTraceContext * planstateTraceContext, const PlanState *planstate, const char *spanName); + +#endif diff --git a/contrib/pg_tracing/query_process.c b/contrib/pg_tracing/query_process.c new file mode 100644 index 0000000000..37a8c2b22c --- /dev/null +++ b/contrib/pg_tracing/query_process.c @@ -0,0 +1,475 @@ +#include "query_process.h" +#include "pg_tracing.h" + +#include +#include +#include "parser/scanner.h" +#include "nodes/extensible.h" + +/* + * Initialize traceparent parameter fields + */ +void +initialize_traceparent(pgTracingTrace * traceparent_parameter) +{ + traceparent_parameter->sampled = 0; + traceparent_parameter->trace_id = 0; + traceparent_parameter->parent_id = 0; +} + +/* + * Extract traceparent parameters from SQLCommenter + * + * We're expecting the query to start with a SQLComment containing the + * traceparent parameter + * "/\*traceparent='00-00000000000000000000000000000009-0000000000000005-01'*\/ + * SELECT 1;" + * Traceparent has the following format: version-traceid-parentid-sampled + * We also accept SQLCommenter as a parameter. In this case, we don't have + * the start and end of comments. 
+ * + * if is_parameter is true, the SQLComment is passed as a query parameter and + * won't have the comment start and end. + */ +pgTracingTrace +extract_traceparent(const char *sqlcomment_str, bool is_parameter) +{ + const char *expected_start = "/*"; + const char *traceparent; + const char *end_sqlcomment; + char *endptr; + pgTracingTrace pgTracingTrace = {}; + + if (!is_parameter) + { + /* + * Look for the start of a comment We're expecting the comment to be + * at the very start of the query + */ + for (size_t i = 0; i < strlen(expected_start); i++) + { + if (sqlcomment_str[i] != expected_start[i]) + return pgTracingTrace; + } + + /* + * Check that we have an end + */ + end_sqlcomment = strstr(sqlcomment_str, "*/"); + if (end_sqlcomment == NULL) + return pgTracingTrace; + } + else + end_sqlcomment = sqlcomment_str + strlen(sqlcomment_str); + + /* + * Locate traceparent parameter and make sure it has the expected size + * "traceparent" + "=" + '' -> 13 characters + * 00-00000000000000000000000000000009-0000000000000005-01 -> 55 + * characters + */ + traceparent = strstr(sqlcomment_str, "traceparent='"); + if (traceparent == NULL || traceparent > end_sqlcomment || end_sqlcomment - traceparent < 55 + 13) + return pgTracingTrace; + + /* + * Move to the start of the traceparent values + */ + traceparent = traceparent + 13; + + /* + * Check that '-' are at the expected places + */ + if (traceparent[2] != '-' || traceparent[35] != '-' || traceparent[52] != '-') + return pgTracingTrace; + + /* + * Parse traceparent parameters + */ + errno = 0; + pgTracingTrace.trace_id = strtol(&traceparent[3], &endptr, 16); + if (endptr != traceparent + 35 || errno) + return pgTracingTrace; + pgTracingTrace.parent_id = strtol(&traceparent[36], &endptr, 16); + if (endptr != traceparent + 52 || errno) + return pgTracingTrace; + pgTracingTrace.sampled = strtol(&traceparent[53], &endptr, 16); + if (endptr != traceparent + 55 || errno) + + /* + * Just to be sure, reset sampled on error + 
 */ + pgTracingTrace.sampled = 0; + return pgTracingTrace; +} + + +/* + * comp_location: comparator for qsorting LocationLen structs by location + */ +static int +comp_location(const void *a, const void *b) +{ + int l = ((const LocationLen *) a)->location; + int r = ((const LocationLen *) b)->location; + + if (l < r) + return -1; + else if (l > r) + return +1; + else + return 0; +} + + +/* + * Normalise query and fill paramStr. + * Normalised query will separate tokens with a single space and + * parameters are replaced by $1, $2... + * Parameters are put in the paramStr which will contain all parameters values + * using the format: "$1 = 0, $2 = 'v'" + */ +const char * +normalise_query_parameters(const JumbleState *jstate, const char *query, + int query_loc, int *query_len_p, char **paramStr) +{ + char *norm_query; + int query_len = *query_len_p; + int norm_query_buflen, /* Space allowed for norm_query */ + n_quer_loc = 0; + LocationLen *locs; + core_yyscan_t yyscanner; + core_yy_extra_type yyextra; + core_YYSTYPE yylval; + YYLTYPE yylloc; + int current_loc = 0; + StringInfoData buf; + + initStringInfo(&buf); + + norm_query_buflen = query_len + jstate->clocations_count * 10; + Assert(norm_query_buflen > 0); + locs = jstate->clocations; + + /* Allocate result buffer */ + norm_query = palloc(norm_query_buflen + 1); + + if (jstate->clocations_count > 1) + qsort(jstate->clocations, jstate->clocations_count, + sizeof(LocationLen), comp_location); + + /* initialize the flex scanner --- should match raw_parser() */ + yyscanner = scanner_init(query + query_loc, + &yyextra, + &ScanKeywords, + ScanKeywordTokens); + + for (;;) + { + int loc = locs[current_loc].location; + int tok; + + loc -= query_loc; + + tok = core_yylex(&yylval, &yylloc, yyscanner); + + /* + * We should not hit end-of-string, but if we do, behave sanely + */ + if (tok == 0) + break; /* out of inner for-loop */ + if (yylloc > query_len) + break; + + /* + * We should find the token position exactly, but if we 
somehow run + * past it, work with that. + */ + if (current_loc < jstate->clocations_count && yylloc >= loc) + { + appendStringInfo(&buf, + "%s$%d = ", + current_loc > 0 ? ", " : "", + current_loc + 1); + if (query[loc] == '-') + { + /* + * It's a negative value - this is the one and only case where + * we replace more than a single token. + * + * Do not compensate for the core system's special-case + * adjustment of location to that of the leading '-' operator + * in the event of a negative constant. It is also useful for + * our purposes to start from the minus symbol. In this way, + * queries like "select * from foo where bar = 1" and "select * + * from foo where bar = -2" will have identical normalized + * query strings. + */ + appendStringInfoChar(&buf, '-'); + tok = core_yylex(&yylval, &yylloc, yyscanner); + if (tok == 0) + break; /* out of inner for-loop */ + } + if (yylloc > 0 && yyextra.scanbuf[yylloc - 1] == ' ' && n_quer_loc > 0) + { + norm_query[n_quer_loc++] = ' '; + } + + /* + * Append the current parameter $x in the normalised query + */ + n_quer_loc += sprintf(norm_query + n_quer_loc, "$%d", + current_loc + 1 + jstate->highest_extern_param_id); + + appendStringInfoString(&buf, yyextra.scanbuf + yylloc); + + current_loc++; + } + else + { + int to_copy; + + if (yylloc > 0 && yyextra.scanbuf[yylloc - 1] == ' ' && n_quer_loc > 0) + { + norm_query[n_quer_loc++] = ' '; + } + to_copy = strlen(yyextra.scanbuf + yylloc); + Assert(n_quer_loc + to_copy < norm_query_buflen + 1); + memcpy(norm_query + n_quer_loc, yyextra.scanbuf + yylloc, to_copy); + n_quer_loc += to_copy; + } + } + scanner_finish(yyscanner); + + *query_len_p = n_quer_loc; + norm_query[n_quer_loc] = '\0'; + *paramStr = buf.data; + return norm_query; +} + +/* + * Normalise query: tokens will be separated by a single space + */ +const char * +normalise_query(const char *query, int query_loc, int *query_len_p) +{ + char *norm_query; + int query_len = *query_len_p; + int norm_query_buflen = 
query_len;
+	int			n_quer_loc = 0;
+	core_yyscan_t yyscanner;
+	core_yy_extra_type yyextra;
+	core_YYSTYPE yylval;
+	YYLTYPE		yylloc;
+
+	/* Allocate result buffer */
+	norm_query = palloc(norm_query_buflen + 2);
+
+	/* initialize the flex scanner --- should match raw_parser() */
+	yyscanner = scanner_init(query + query_loc, &yyextra, &ScanKeywords, ScanKeywordTokens);
+	for (;;)
+	{
+		int			tok;
+		int			to_copy;
+
+		tok = core_yylex(&yylval, &yylloc, yyscanner);
+
+		if (tok == 0)
+			break;				/* out of inner for-loop */
+		if (yylloc > query_len)
+			break;
+
+		/* Emit a single separating space iff the original query had one here */
+		if (yylloc > 0 && yyextra.scanbuf[yylloc - 1] == ' ' && n_quer_loc > 0)
+		{
+			norm_query[n_quer_loc++] = ' ';
+		}
+		to_copy = strlen(yyextra.scanbuf + yylloc);
+		Assert(n_quer_loc + to_copy < norm_query_buflen + 2);
+		memcpy(norm_query + n_quer_loc, yyextra.scanbuf + yylloc, to_copy);
+		n_quer_loc += to_copy;
+	}
+	scanner_finish(yyscanner);
+
+	*query_len_p = n_quer_loc;
+	norm_query[n_quer_loc] = '\0';
+	return norm_query;
+}
+
+/*
+ * Store text in the pg_tracing stat file.
+ *
+ * Reserves text_len + 1 bytes at the current extent under the spinlock,
+ * then writes the text plus a NUL terminator at that reserved offset.
+ * On success, *query_offset receives the offset of the stored text and
+ * true is returned; on any failure false is returned and a LOG message
+ * is emitted.
+ */
+bool
+text_store_file(pgTracingSharedState * pg_tracing, const char *text, int text_len,
+				Size *query_offset)
+{
+	Size		off;
+	int			fd;
+
+	/*
+	 * Reserve our slot and advertise ourselves as an in-progress writer.
+	 *
+	 * NOTE(review): n_writers presumably lets a concurrent reader/garbage
+	 * collector detect in-flight writes, as in pg_stat_statements --
+	 * confirm against the callers that decrement/observe it.
+	 */
+	{
+		volatile pgTracingSharedState *s = (volatile pgTracingSharedState *) pg_tracing;
+
+		SpinLockAcquire(&s->mutex);
+		off = s->extent;
+		s->extent += text_len + 1;
+		s->n_writers++;
+		SpinLockRelease(&s->mutex);
+	}
+
+	/*
+	 * Don't allow the file to grow larger than what qtext_load_file can
+	 * (theoretically) handle. This has been seen to be reachable on 32-bit
+	 * platforms.
+	 */
+	if (unlikely(text_len >= MaxAllocHugeSize - off))
+	{
+		errno = EFBIG;			/* not quite right, but it'll do */
+		fd = -1;
+		goto error;
+	}
+
+	/* Now write the data into the successfully-reserved part of the file */
+	fd = OpenTransientFile(PG_TRACING_TEXT_FILE, O_RDWR | O_CREAT | PG_BINARY);
+	if (fd < 0)
+		goto error;
+
+	if (pg_pwrite(fd, text, text_len, off) != text_len)
+		goto error;
+	if (pg_pwrite(fd, "\0", 1, off + text_len) != 1)
+		goto error;
+
+	CloseTransientFile(fd);
+
+	/* Mark our write complete */
+	{
+		volatile pgTracingSharedState *s = (volatile pgTracingSharedState *) pg_tracing;
+
+		SpinLockAcquire(&s->mutex);
+		s->n_writers--;
+		SpinLockRelease(&s->mutex);
+	}
+
+	/*
+	 * Set offset once write was successful
+	 */
+	*query_offset = off;
+
+	return true;
+
+error:
+	ereport(LOG,
+			(errcode_for_file_access(),
+			 errmsg("could not write file \"%s\": %m",
+					PG_TRACING_TEXT_FILE)));
+
+	if (fd >= 0)
+		CloseTransientFile(fd);
+
+	/* Mark our write complete */
+	{
+		volatile pgTracingSharedState *s = (volatile pgTracingSharedState *) pg_tracing;
+
+		SpinLockAcquire(&s->mutex);
+		s->n_writers--;
+		SpinLockRelease(&s->mutex);
+	}
+
+	return false;
+}
+
+/*
+ * Read the external query text file into a malloc'd buffer.
+ *
+ * Returns NULL (without throwing an error) if unable to read, eg file not
+ * there or insufficient memory.
+ *
+ * On success, the buffer size is also returned into *buffer_size.
+ *
+ * This can be called without any lock on pgss->lock, but in that case the
+ * caller is responsible for verifying that the result is sane.
+ */ +const char * +qtext_load_file(Size *buffer_size) +{ + char *buf; + int fd; + struct stat stat; + Size nread; + + fd = OpenTransientFile(PG_TRACING_TEXT_FILE, O_RDONLY | PG_BINARY); + if (fd < 0) + { + if (errno != ENOENT) + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + PG_TRACING_TEXT_FILE))); + return NULL; + } + + /* Get file length */ + if (fstat(fd, &stat)) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", + PG_TRACING_TEXT_FILE))); + CloseTransientFile(fd); + return NULL; + } + + /* Allocate buffer; beware that off_t might be wider than size_t */ + if (stat.st_size <= MaxAllocHugeSize) + buf = (char *) malloc(stat.st_size); + else + buf = NULL; + if (buf == NULL) + { + ereport(LOG, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"), + errdetail("Could not allocate enough memory to read file \"%s\".", + PG_TRACING_TEXT_FILE))); + CloseTransientFile(fd); + return NULL; + } + + /* + * OK, slurp in the file. Windows fails if we try to read more than + * INT_MAX bytes at once, and other platforms might not like that either, + * so read a very large file in 1GB segments. + */ + nread = 0; + while (nread < stat.st_size) + { + int toread = Min(1024 * 1024 * 1024, stat.st_size - nread); + + /* + * If we get a short read and errno doesn't get set, the reason is + * probably that garbage collection truncated the file since we did + * the fstat(), so we don't log a complaint --- but we don't return + * the data, either, since it's most likely corrupt due to concurrent + * writes from garbage collection. 
+ */ + errno = 0; + if (read(fd, buf + nread, toread) != toread) + { + if (errno) + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + PG_TRACING_TEXT_FILE))); + free(buf); + CloseTransientFile(fd); + return NULL; + } + nread += toread; + } + + if (CloseTransientFile(fd) != 0) + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", PG_TRACING_TEXT_FILE))); + + *buffer_size = nread; + return buf; +} diff --git a/contrib/pg_tracing/query_process.h b/contrib/pg_tracing/query_process.h new file mode 100644 index 0000000000..ce37b2e0d4 --- /dev/null +++ b/contrib/pg_tracing/query_process.h @@ -0,0 +1,30 @@ +#ifndef _QUERY_PROCESS_H_ +#define _QUERY_PROCESS_H_ + +#include "pg_tracing.h" +#include "nodes/queryjumble.h" +#include "parser/parse_node.h" + +/* + * Normalise query: - Comments are removed - Constants are replaced by $x - + * All tokens are separated by a single space + */ +extern const char *normalise_query_parameters(const JumbleState *jstate, const char *query, + int query_loc, int *query_len_p, char **paramStr); + +extern pgTracingTrace extract_traceparent(const char *query_str, bool is_parameter); + +/* + * Normalise simple query + */ +extern const char *normalise_query(const char *query, int query_loc, int *query_len_p); +bool text_store_file(pgTracingSharedState * pg_tracing, const char *query, + int query_len, Size *query_offset); +extern const char *qtext_load_file(Size *buffer_size); +extern const char *qtext_load_file(Size *buffer_size); +extern void initialize_traceparent(pgTracingTrace * traceparent_parameter); + +/* Location of external text file */ +#define PG_TRACING_TEXT_FILE PG_STAT_TMP_DIR "/pg_tracing.stat" + +#endif diff --git a/contrib/pg_tracing/span.c b/contrib/pg_tracing/span.c new file mode 100644 index 0000000000..d2528cb3e7 --- /dev/null +++ b/contrib/pg_tracing/span.c @@ -0,0 +1,311 @@ +#include "span.h" + +#include "nodes/extensible.h" +#include "common/pg_prng.h" 
+ + +/* + * Convert a node CmdType to the matching SpanType + */ +SpanType +command_type_to_span_type(CmdType cmd_type) +{ + switch (cmd_type) + { + case CMD_SELECT: + return SPAN_NODE_SELECT; + case CMD_INSERT: + return SPAN_NODE_INSERT; + case CMD_UPDATE: + return SPAN_NODE_UPDATE; + case CMD_DELETE: + return SPAN_NODE_DELETE; + case CMD_MERGE: + return SPAN_NODE_MERGE; + case CMD_UTILITY: + return SPAN_NODE_UTILITY; + case CMD_NOTHING: + return SPAN_NODE_NOTHING; + case CMD_UNKNOWN: + return SPAN_NODE_UNKNOWN; + } + return SPAN_NODE_UNKNOWN; +} + +/* + * Initialize span fields + */ +static void +initialize_span_fields(Span * span, SpanType type, const pgTracingTrace * trace, uint64 parent_id, uint64 query_id, int nested_level) +{ + span->trace_id = trace->trace_id; + span->type = type; + + /* + * If parent id is unset, it means that there's no propagated trace + * informations from the caller. In this case, this is the top span, + * span_id == parent_id and we reuse the generated trace id for the span + * id. + */ + if (parent_id == 0) + { + span->parent_id = trace->trace_id; + span->span_id = trace->trace_id; + } + else + { + span->parent_id = parent_id; + span->span_id = pg_prng_uint64(&pg_global_prng_state); + } + span->name_offset = -1; + span->operation_name_offset = -1; + span->parameter_offset = -1; + span->sql_error_code = 0; + span->startup = 0; + span->be_pid = MyProcPid; + span->subxact_count = MyProc->subxidStatus.count; + span->duration_ns = 0; + span->ended = false; + span->query_id = query_id; + span->nested_level = nested_level; + memset(&span->node_counters, 0, sizeof(NodeCounters)); + memset(&span->plan_counters, 0, sizeof(PlanCounters)); + + /* + * Store the starting buffer for planner and process utility spans + * + * TODO: Do we need wal buffer for planner? + */ + if (type == SPAN_PLANNER || span->type == SPAN_PROCESS_UTILITY) + { + /* TODO Do we need to track wal for planner? 
*/ + span->node_counters.buffer_usage = pgBufferUsage; + span->node_counters.wal_usage = pgWalUsage; + } +} + +/* + * Initialize span and set span starting time. + * + * A span needs a start timestamp and a duration. + * Given that spans can have a very short duration (less than 1ms), we + * need to rely on monotonic clock as much as possible to have the best precision. + * + * For that, after we established that a query was sampled, we get both the start + * timestamp and instr_time. All subsequent times will be taken instr_time and we + * deduct the starting time from the delta between the current instr_time and the + * start_instr_time. + * + * We provide multiple times: + * - start_trace_ts is the earliest timestamp we've captured for the current trace. + * - start_trace_instr_time is the instr_time taken at the same moment as the start_trace_ts. + * - start_span_time is an optional start instr_time of the provided span. It will be used + * to get the start timestamp of the span. If nothing is provided, we use the current instr_time. + * This parameter is mostly used when generating spans from planstate as we need to rely on the + * query instrumentation to find the node start. + */ +void +begin_span(Span * span, SpanType type, const pgTracingTrace * trace, uint64 parent_id, uint64 query_id, const int64 *start_span, int nested_level) +{ + int64 start_ns; + + initialize_span_fields(span, type, trace, parent_id, query_id, nested_level); + + /* + * If no start span is provided, get the current one + */ + if (start_span == NULL) + start_ns = get_current_ns(); + else + start_ns = *start_span; + + /* + * We use duration to temporarily store ns at span start + */ + span->duration_ns = start_ns; + + /* + * Compute the span timestamp start by using delta instr_time between + * start trace and start span. 
+ */ + span->start = trace->start_trace.ts + (start_ns - trace->start_trace.ns) / NS_PER_US; +} + +/* + * Set span duration and accumulated buffers + * end_span_input is optional, if NULL is passed, we use + * the current time + */ +void +end_span(Span * span, const int64 *end_span_input) +{ + BufferUsage buffer_usage; + WalUsage wal_usage; + int64 end_span_ns; + + /* + * We used span->duration as a temporary storage for the instr_time at the + * begining of the span, fetch it + */ + int64 start_span_ns = span->duration_ns; + + Assert(start_span_ns > 0); + Assert(span->trace_id > 0); + + /* Span should be ended only once */ + Assert(span->ended == false); + span->ended = true; + + /* + * Set span duration with the end time before substrating the start + */ + if (end_span_input == NULL) + end_span_ns = get_current_ns(); + else + end_span_ns = *end_span_input; + span->duration_ns = end_span_ns - start_span_ns; + + if (span->type == SPAN_PLANNER || span->type == SPAN_PROCESS_UTILITY) + { + /* calc differences of buffer counters. */ + memset(&buffer_usage, 0, sizeof(BufferUsage)); + BufferUsageAccumDiff(&buffer_usage, &pgBufferUsage, + &span->node_counters.buffer_usage); + span->node_counters.buffer_usage = buffer_usage; + + /* calc differences of WAL counters. */ + memset(&wal_usage, 0, sizeof(wal_usage)); + WalUsageAccumDiff(&wal_usage, &pgWalUsage, + &span->node_counters.wal_usage); + span->node_counters.wal_usage = wal_usage; + } +} + +/* + * Get the name of a span. + * If it is a node span, the name may be pulled from the stat file. 
+ */ +const char * +get_span_name(const Span * span, const char *qbuffer, Size qbuffer_size) +{ + if (span->name_offset != -1) + { + if (span->name_offset <= qbuffer_size && qbuffer_size > 0) + return qbuffer + span->name_offset; + else if (span->name_offset > qbuffer_size && qbuffer_size > 0) + return qbuffer + qbuffer_size; + } + + switch (span->type) + { + case SPAN_POST_PARSE: + return "Post Parse"; + case SPAN_PARSE: + return "Parse"; + case SPAN_PLANNER: + return "Planner"; + case SPAN_FUNCTION: + return "Function"; + case SPAN_PROCESS_UTILITY: + return "ProcessUtility"; + case SPAN_EXECUTOR_START: + return "Executor"; + case SPAN_EXECUTOR_RUN: + return "Executor"; + case SPAN_EXECUTOR_END: + return "Executor"; + case SPAN_EXECUTOR_FINISH: + return "Executor"; + + case SPAN_NODE_INIT_PLAN: + return "InitPlan"; + case SPAN_NODE_SUBPLAN: + return "SubPlan"; + + case SPAN_NODE_SELECT: + return "Select"; + case SPAN_NODE_INSERT: + return "Insert"; + case SPAN_NODE_UPDATE: + return "Update"; + case SPAN_NODE_DELETE: + return "Delete"; + case SPAN_NODE_MERGE: + return "Merge"; + case SPAN_NODE_UTILITY: + return "Utility"; + case SPAN_NODE_NOTHING: + return "Nothing"; + case SPAN_NODE_UNKNOWN: + return "Unknown"; + + /* Check name offset for span node */ + case SPAN_NODE: + return "Node"; + } + return "???"; +} + +/* + * Get the operation of a span. + * If it is a node span, the name may be pulled from the stat file. 
+ */ +const char * +get_operation_name(const Span * span, const char *qbuffer, Size qbuffer_size) +{ + if (span->operation_name_offset != -1) + { + if (span->operation_name_offset <= qbuffer_size && qbuffer_size > 0) + return qbuffer + span->operation_name_offset; + else if (span->operation_name_offset > qbuffer_size && qbuffer_size > 0) + return qbuffer + qbuffer_size; + } + + switch (span->type) + { + case SPAN_POST_PARSE: + return "Post Parse"; + case SPAN_PARSE: + return "Parse"; + case SPAN_PLANNER: + return "Planner"; + case SPAN_FUNCTION: + return "Function"; + case SPAN_PROCESS_UTILITY: + return "ProcessUtility"; + case SPAN_EXECUTOR_START: + return "Start"; + case SPAN_EXECUTOR_RUN: + return "Run"; + case SPAN_EXECUTOR_END: + return "End"; + case SPAN_EXECUTOR_FINISH: + return "Finish"; + /* Node should have name offset set */ + case SPAN_NODE_SELECT: + case SPAN_NODE_INSERT: + case SPAN_NODE_UPDATE: + case SPAN_NODE_DELETE: + case SPAN_NODE_MERGE: + case SPAN_NODE_UTILITY: + case SPAN_NODE_NOTHING: + case SPAN_NODE_INIT_PLAN: + case SPAN_NODE_SUBPLAN: + case SPAN_NODE_UNKNOWN: + case SPAN_NODE: + return "Node"; + } + return "Unknown type"; +} + +/* +* Get the current nanoseconds value +*/ +int64 +get_current_ns(void) +{ + instr_time t; + + INSTR_TIME_SET_CURRENT(t); + return INSTR_TIME_GET_NANOSEC(t); +} diff --git a/contrib/pg_tracing/span.h b/contrib/pg_tracing/span.h new file mode 100644 index 0000000000..109f2eb7a9 --- /dev/null +++ b/contrib/pg_tracing/span.h @@ -0,0 +1,153 @@ +/*------------------------------------------------------------------------- + * contrib/pg_tracing/span.h + * + * Header for span. 
+ *
+ * IDENTIFICATION
+ *	contrib/pg_tracing/span.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef _SPAN_H_
+#define _SPAN_H_
+
+#include "postgres.h"
+
+#include "jit/jit.h"
+#include "pgstat.h"
+#include "access/transam.h"
+
+/*
+ * SpanType: Type of the span
+ */
+typedef enum
+{
+	SPAN_PARSE,					/* Wraps query parsing */
+	SPAN_POST_PARSE,			/* Wraps post parsing */
+	SPAN_PLANNER,				/* Wraps planner execution in planner hook */
+	SPAN_FUNCTION,				/* Wraps function in fmgr hook */
+	SPAN_PROCESS_UTILITY,		/* Wraps ProcessUtility execution */
+
+	SPAN_EXECUTOR_START,		/* Executor Spans wrapping the matching hooks */
+	SPAN_EXECUTOR_RUN,
+	SPAN_EXECUTOR_END,
+	SPAN_EXECUTOR_FINISH,
+
+	SPAN_NODE_INIT_PLAN,
+	SPAN_NODE_SUBPLAN,
+
+	SPAN_NODE,					/* Represents a node execution, generated from
+								 * planstate */
+
+	SPAN_NODE_SELECT,			/* Query Span types. They are created from the
+								 * query cmdType */
+	SPAN_NODE_INSERT,
+	SPAN_NODE_UPDATE,
+	SPAN_NODE_DELETE,
+	SPAN_NODE_MERGE,
+	SPAN_NODE_UTILITY,
+	SPAN_NODE_NOTHING,
+	SPAN_NODE_UNKNOWN,
+} SpanType;
+
+
+/*
+ * Counters extracted from query instrumentation
+ */
+typedef struct NodeCounters
+{
+	int64		rows;			/* # of tuples processed */
+	int64		nloops;			/* # of cycles for this node */
+
+	BufferUsage buffer_usage;	/* total buffer usage for this node */
+	WalUsage	wal_usage;		/* total WAL usage for this node */
+	JitInstrumentation jit_usage;	/* total JIT usage for this node */
+} NodeCounters;
+
+/*
+ * Counters extracted from query's plan
+ */
+typedef struct PlanCounters
+{
+	/*
+	 * estimated execution costs for plan (see costsize.c for more info)
+	 */
+	Cost		startup_cost;	/* cost expended before fetching any tuples */
+	Cost		total_cost;		/* total cost (assuming all tuples fetched) */
+
+	/*
+	 * planner's estimate of result size of this plan step
+	 */
+	Cardinality plan_rows;		/* number of rows plan is expected to emit */
+	int			plan_width;		/* average row width in bytes */
+} PlanCounters;
+
+/* Paired wall-clock timestamp and monotonic-clock nanoseconds taken at the
+ * same instant; used to derive span timestamps from monotonic deltas. */
+typedef struct
+{
+	TimestampTz ts;
+	int64		ns;
+} pgTracingStartTime;
+
+/* Per-statement trace context, propagated through SQLCommenter's traceparent */
+typedef struct pgTracingTrace
+{
+	uint64		trace_id;		/* Id of the trace */
+	uint64		parent_id;		/* Span id of the parent */
+	int			sampled;		/* Is current statement sampled? */
+	uint64		query_id;		/* Query id of the current statement */
+	pgTracingStartTime start_trace;
+} pgTracingTrace;
+
+/*
+ * The Span data structure represents an operation with a start, a duration.
+ * It contains the minimum needed to represent a span and serves as a base
+ * for the SpanNode. It is used for simple operations that don't need
+ * significant metadata like Executor spans.
+ */
+typedef struct Span
+{
+	uint64		trace_id;		/* Trace id extracted from the SQLCommenter's
+								 * traceparent */
+	uint64		span_id;		/* Span Identifier generated from a random
+								 * uint64 */
+	uint64		parent_id;		/* Span's parent id. For the top span, it's
+								 * extracted from SQLCommenter's traceparent.
+								 * For other spans, we pass the parent's span. */
+	uint64		query_id;
+
+	TimestampTz start;			/* Start of the span. */
+	int64		duration_ns;	/* Duration of the span in nanoseconds. */
+	SpanType	type;			/* Type of the span. Used to generate the
+								 * span's name for all spans except SPAN_NODE. */
+	int			be_pid;			/* Pid of the backend process */
+	bool		top_span;
+	bool		ended;			/* Track if the span was already ended */
+	uint8		nested_level;	/* Only used for debugging */
+	uint8		subxact_count;	/* Active count of backend's subtransaction */
+
+	/*
+	 * We store variable size metadata in an external file. Those represent
+	 * the position of NULL terminated strings in the file. Set to -1 if
+	 * unused.
+	 *
+	 * NOTE(review): these fields are declared Size (unsigned) yet use -1 as
+	 * the "unset" sentinel, relying on unsigned wraparound; confirm all
+	 * comparisons against -1 behave as intended or switch to a signed type.
+	 */
+	Size		name_offset;	/* span name offset in external file */
+	Size		operation_name_offset;	/* operation name offset in external
+										 * file */
+	Size		parameter_offset;	/* parameters offset in external file */
+
+	PlanCounters plan_counters; /* Counters with plan costs */
+	NodeCounters node_counters; /* Counters with node costs (jit, wal,
+								 * buffers) */
+	int64		startup;		/* Time to the first tuple */
+	int			sql_error_code; /* query error code extracted from ErrorData,
+								 * 0 if query was successful */
+} Span;
+
+extern void begin_span(Span * span, SpanType type, const pgTracingTrace * trace, uint64 parent_id, uint64 query_id, const int64 *start_span, int nested_level);
+extern void end_span(Span * span, const int64 *end_time);
+
+extern SpanType command_type_to_span_type(CmdType cmd_type);
+extern const char *get_span_name(const Span * span, const char *qbuffer, Size qbuffer_size);
+extern const char *get_operation_name(const Span * span, const char *qbuffer, Size qbuffer_size);
+extern int64 get_current_ns(void);
+
+#endif
diff --git a/contrib/pg_tracing/sql/cleanup.sql b/contrib/pg_tracing/sql/cleanup.sql
new file mode 100644
index 0000000000..ab11333d4e
--- /dev/null
+++ b/contrib/pg_tracing/sql/cleanup.sql
@@ -0,0 +1,3 @@
+DROP TABLE pg_tracing_test;
+DROP function test_function;
+DROP EXTENSION pg_tracing;
diff --git a/contrib/pg_tracing/sql/create_table.sql b/contrib/pg_tracing/sql/create_table.sql
new file mode 100644
index 0000000000..87875a113f
--- /dev/null
+++ b/contrib/pg_tracing/sql/create_table.sql
@@ -0,0 +1 @@
+CREATE TABLE pg_tracing_test (a int, b char(20));
diff --git a/contrib/pg_tracing/sql/nested.sql b/contrib/pg_tracing/sql/nested.sql
new file mode 100644
index 0000000000..59d52b319b
--- /dev/null
+++ b/contrib/pg_tracing/sql/nested.sql
@@ -0,0 +1,78 @@
+-- Create test functions
+CREATE OR REPLACE FUNCTION test_function(a int) RETURNS SETOF oid AS
+$BODY$
+BEGIN
+	RETURN QUERY SELECT oid from pg_class where oid = a;
+END;
+$BODY$
+LANGUAGE
plpgsql; + +CREATE OR REPLACE PROCEDURE sum_one(i int) AS $$ +DECLARE + r int; +BEGIN + SELECT (i + i)::int INTO r; +END; $$ LANGUAGE plpgsql; + +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000051-0000000000000051-01'*/ select test_function(1); + +SELECT span_id AS top_span, + extract(epoch from span_start) as top_start, + round(extract(epoch from span_start) + duration / 1000000000.0, 6) as top_end + from pg_tracing_spans(false) where parent_id=81 and name!='Parse' \gset +SELECT span_id AS top_run_span, + extract(epoch from span_start) as top_run_start, + round(extract(epoch from span_start) + duration / 1000000000.0, 6) as top_run_end + from pg_tracing_spans(false) where parent_id=:top_span and name='Executor' and resource='Run' \gset +SELECT span_id AS top_project, + extract(epoch from span_start) as top_project_start, + round(extract(epoch from span_start) + duration / 1000000000.0, 6) as top_project_end + from pg_tracing_spans(false) where parent_id=:top_run_span and name='ProjectSet' \gset +SELECT span_id AS top_result, + extract(epoch from span_start) as top_result_start, + round(extract(epoch from span_start) + duration / 1000000000.0, 6) as top_result_end + from pg_tracing_spans(false) where parent_id=:top_project and name='Result' \gset +SELECT span_id AS nested_select, + extract(epoch from span_start) as select_start, + round(extract(epoch from span_start) + duration / 1000000000.0, 6) as select_end + from pg_tracing_spans(false) where parent_id=:top_result and name='Select' \gset +SELECT span_id AS nested_run, + extract(epoch from span_start) as run_start, + round(extract(epoch from span_start) + duration / 1000000000.0, 6) as run_end + from pg_tracing_spans(false) where parent_id=:nested_select and resource='Run' \gset + +SELECT :top_start < :top_run_start AS top_query_before_run, + :top_end >= :top_run_end AS top_end_after_run_end, + + :top_run_start <= :top_project_start AS top_run_start_before_project, + + :top_run_end >= 
:top_project_end AS top_run_end_after_project_end, + :top_run_end >= :select_end AS top_run_end_before_select_end, + :top_run_end >= :run_end AS top_run_end_after_nested_run_end, + + :run_end >= :select_end AS run_end_after_select_end; + +WITH max_duration AS (select max(duration) from pg_tracing_spans(false)) +SELECT duration = max_duration.max from pg_tracing_spans(false), max_duration + where span_id = :top_span; + +SELECT resource from pg_tracing_spans(false) where parent_id=:nested_run order by resource; + + +-- Check tracking option +set pg_tracing.track = 'top'; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000052-0000000000000052-01'*/ select test_function(1); +SELECT count(*) from pg_tracing_spans where trace_id=82; +set pg_tracing.track = 'none'; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000053-0000000000000053-01'*/ select test_function(1); +SELECT count(*) from pg_tracing_spans where trace_id=83; +set pg_tracing.track = 'all'; + +/*traceparent='00-00000000000000000000000000000054-0000000000000054-01'*/ CALL sum_one(3); +SELECT resource from pg_tracing_spans order by span_start, duration desc, resource; + + +set pg_tracing.track_utility=off; + +/*traceparent='00-00000000000000000000000000000055-0000000000000055-01'*/ CALL sum_one(10); +SELECT resource from pg_tracing_spans order by span_start, duration desc, resource; diff --git a/contrib/pg_tracing/sql/parallel.sql b/contrib/pg_tracing/sql/parallel.sql new file mode 100644 index 0000000000..15e05b26bc --- /dev/null +++ b/contrib/pg_tracing/sql/parallel.sql @@ -0,0 +1,12 @@ +begin; +-- encourage use of parallel plans +set parallel_setup_cost=0; +set parallel_tuple_cost=0; +set min_parallel_table_scan_size=0; +set max_parallel_workers_per_gather=4; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ select 1 from pg_class limit 1; 
+/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000002-0000000000000002-01'*/ select 2 from pg_class limit 2; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000003-0000000000000003-00'*/ select 3 from pg_class limit 3; +commit; + +SELECT resource from pg_tracing_spans order by resource; diff --git a/contrib/pg_tracing/sql/reset.sql b/contrib/pg_tracing/sql/reset.sql new file mode 100644 index 0000000000..531db2172d --- /dev/null +++ b/contrib/pg_tracing/sql/reset.sql @@ -0,0 +1,3 @@ +-- Check reset is working +SELECT pg_tracing_reset(); +SELECT traces from pg_tracing_info; diff --git a/contrib/pg_tracing/sql/sample.sql b/contrib/pg_tracing/sql/sample.sql new file mode 100644 index 0000000000..6c9f8d7aa7 --- /dev/null +++ b/contrib/pg_tracing/sql/sample.sql @@ -0,0 +1,33 @@ +-- Trace nothing +SET pg_tracing.sample_rate = 0.0; +SET pg_tracing.caller_sample_rate = 0.0; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000021-0000000000000021-01'*/ SELECT 1; + +select count(distinct(trace_id)) from pg_tracing_spans(false); +select resource, parameters from pg_tracing_spans order by span_start, duration desc; + +-- Trace everything +SET pg_tracing.sample_rate = 1.0; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000022-0000000000000022-01'*/ SELECT 1; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000023-0000000000000023-00'*/ SELECT 2; +SELECT 3; +SELECT 4; + +select count(distinct(trace_id)) from pg_tracing_spans(false); +select resource, parameters from pg_tracing_spans(false) order by span_start, duration desc; +-- Top spans should reuse generated ids +select resource, parameters from pg_tracing_spans(false) where trace_id = parent_id and parent_id = span_id order by span_start, duration desc; +select resource, parameters from pg_tracing_spans order by span_start, duration desc; + +-- Cleanup +SET plan_cache_mode='auto'; +SET pg_tracing.sample_rate = 0.0; + +-- Only trace query 
with sampled flag +SET pg_tracing.caller_sample_rate = 1.0; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000024-0000000000000024-01'*/ SELECT 1; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000025-0000000000000025-00'*/ SELECT 2; +SELECT 1; +select count(distinct(trace_id)) from pg_tracing_spans(false); +select resource, parameters from pg_tracing_spans(false) order by span_start, duration desc; +select resource, parameters from pg_tracing_spans order by span_start; diff --git a/contrib/pg_tracing/sql/select.sql b/contrib/pg_tracing/sql/select.sql new file mode 100644 index 0000000000..2c92df7b13 --- /dev/null +++ b/contrib/pg_tracing/sql/select.sql @@ -0,0 +1,60 @@ +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ SELECT 1; + +-- Get top span id +SELECT span_id AS top_span_id from pg_tracing_spans(false) where parent_id=1 and name!='Parse' \gset + +-- Check parameters +SELECT parameters from pg_tracing_spans(false) where span_id=:top_span_id; + +-- Check the number of children +SELECT count(*) from pg_tracing_spans(false) where parent_id=:'top_span_id'; + +-- Check resource and query id +SELECT resource, query_id from pg_tracing_spans(false) where trace_id=1 order by span_start, duration desc, resource; + +-- Check reported number of trace +SELECT traces from pg_tracing_info; + +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000003-0000000000000003-01'*/ SELECT * from current_database(); +SELECT resource from pg_tracing_spans where trace_id=3 order by resource; + +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000004-0000000000000004-01'*/ SELECT s.relation_size + s.index_size +FROM (SELECT + pg_relation_size(C.oid) as relation_size, + pg_indexes_size(C.oid) as index_size + FROM pg_class C) as s limit 1; +SELECT resource from pg_tracing_spans where trace_id=4 order by resource; + +-- Check that we're in a correct state after a timeout +set 
statement_timeout=200; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000007-0000000000000007-01'*/ select * from pg_sleep(10); +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000008-0000000000000008-01'*/ select 1; +SELECT trace_id, resource, sql_error_code from pg_tracing_spans order by span_start, duration desc, resource; +set statement_timeout=0; + +-- Test prepared statement +PREPARE test_prepared (text, integer) AS /*$1*/ SELECT $2; +EXECUTE test_prepared('dddbs=''postgres.db'',traceparent=''00-00000000000000000000000000000009-0000000000000009-01''', 1); +SELECT trace_id, resource, parameters from pg_tracing_spans order by span_start, duration desc, resource; + +-- Check explain is untracked +explain (costs off) +select * from pg_class where oid < 0 limit 100; +SELECT trace_id, resource, parameters from pg_tracing_spans order by span_start, duration desc, resource; + +-- Never executed node +/*dddbs='postgres.db',traceparent='00-0000000000000000000000000000000a-000000000000000a-01'*/ select 1 limit 0; +SELECT trace_id, resource, parameters from pg_tracing_spans order by span_start, duration desc, resource; + +-- Test prepared statement with generic plan +SET plan_cache_mode='force_generic_plan'; +EXECUTE test_prepared('dddbs=''postgres.db'',traceparent=''00-00000000000000000000000000000010-0000000000000010-01''', 10); +SELECT trace_id, resource, parameters from pg_tracing_spans order by span_start, duration desc, resource; + +-- Test multiple statements +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000011-0000000000000012-01'*/ select 1; select 2; +select resource, parameters from pg_tracing_spans order by span_start, duration desc; + +-- Cleanup +SET plan_cache_mode='auto'; +DEALLOCATE test_prepared; diff --git a/contrib/pg_tracing/sql/subxact.sql b/contrib/pg_tracing/sql/subxact.sql new file mode 100644 index 0000000000..95408e6348 --- /dev/null +++ b/contrib/pg_tracing/sql/subxact.sql @@ -0,0 +1,12 @@ 
+SET pg_tracing.sample_rate = 1.0; +BEGIN; +SAVEPOINT s1; +INSERT INTO pg_tracing_test VALUES(generate_series(1, 2), 'aaa'); +SAVEPOINT s2; +INSERT INTO pg_tracing_test VALUES(generate_series(1, 2), 'aaa'); +SAVEPOINT s3; +SELECT 1; +COMMIT; + +select resource, parameters, subxact_count from pg_tracing_spans order by span_start, duration desc; + diff --git a/contrib/pg_tracing/sql/utility.sql b/contrib/pg_tracing/sql/utility.sql new file mode 100644 index 0000000000..4d08a27c1c --- /dev/null +++ b/contrib/pg_tracing/sql/utility.sql @@ -0,0 +1,72 @@ +-- Track statements in create extension +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ CREATE EXTENSION pg_tracing; + +SELECT span_id AS top_utility, + extract(epoch from span_start) as top_utility_start, + round(extract(epoch from span_start) + duration / 1000000000.0, 6) as top_utility_end + from pg_tracing_spans(false) where parent_id=1 and name='Utility' \gset + +SELECT span_id AS top_process_utility, + extract(epoch from span_start) as top_process_utility_start, + round(extract(epoch from span_start) + duration / 1000000000.0, 6) as top_process_utility_end + from pg_tracing_spans(false) where parent_id=:top_utility and name='ProcessUtility' \gset + +SELECT span_id AS first_utility, + extract(epoch from span_start) as first_utility_start, + round(extract(epoch from span_start) + duration / 1000000000.0, 6) as first_utility_end + from pg_tracing_spans(false) where parent_id=:top_process_utility and name='Utility' limit 1 \gset + +SELECT span_id AS first_process_utility, + extract(epoch from span_start) as first_process_utility_start, + round(extract(epoch from span_start) + duration / 1000000000.0, 6) as first_process_utility_end + from pg_tracing_spans(false) where parent_id=:first_utility and name='ProcessUtility' \gset + +SELECT span_id AS second_utility, + extract(epoch from span_start) as second_utility_start, + round(extract(epoch from span_start) + duration / 
1000000000.0, 6) as second_utility_end + from pg_tracing_spans(false) where parent_id=:top_process_utility and name='Utility' limit 1 offset 1 \gset + +SELECT :top_utility_start < :top_process_utility_start AS top_utility_start_before_process_utility, + :top_utility_end >= :top_process_utility_end AS top_end_after_process_utility, + + :first_process_utility_end < :first_utility_end AS nested_process_utility_ends_before, + :first_utility_end < :second_utility_start AS second_utility_start_after_first_one; + +-- Clean current spans +select count(*) from pg_tracing_spans; + +-- Test utility off +SET pg_tracing.track_utility = off; + +-- Test prepared statement +PREPARE test_prepared_one_param (integer) AS SELECT $1; +EXECUTE test_prepared_one_param(100); + +select count(*) from pg_tracing_spans; + +-- Test prepared statement with generic plan +SET plan_cache_mode='force_generic_plan'; +EXECUTE test_prepared_one_param(200); + +select count(*) from pg_tracing_spans; + + +-- With utility on +SET pg_tracing.track_utility = on; + +-- Test prepared statement +PREPARE test_prepared_one_param_2 (integer) AS SELECT $1; +EXECUTE test_prepared_one_param_2(100); + +select count(distinct(trace_id)) from pg_tracing_spans(false); +select resource, parameters from pg_tracing_spans(false) order by span_start, duration desc; +select resource, parameters from pg_tracing_spans where trace_id = parent_id and parent_id = span_id order by span_start; + +-- Test prepared statement with generic plan +SET plan_cache_mode='force_generic_plan'; +EXECUTE test_prepared_one_param(200); + +select count(distinct(trace_id)) from pg_tracing_spans(false); +select resource, parameters from pg_tracing_spans(false) order by span_start, duration desc; +select resource, parameters from pg_tracing_spans where trace_id = parent_id and parent_id = span_id order by span_start; + diff --git a/contrib/pg_tracing/sql/wal.sql b/contrib/pg_tracing/sql/wal.sql new file mode 100644 index 0000000000..10c0aa6a56 --- 
/dev/null +++ b/contrib/pg_tracing/sql/wal.sql @@ -0,0 +1,24 @@ +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ INSERT INTO pg_tracing_test VALUES(generate_series(1, 10), 'aaa'); +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000002-0000000000000002-01'*/ UPDATE pg_tracing_test SET b = 'bbb' WHERE a > 7; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000003-0000000000000003-01'*/ DELETE FROM pg_tracing_test WHERE a > 9; + +-- Check WAL is generated for the above statements +SELECT trace_id, name, resource, +wal_records > 0 as wal_records, +wal_bytes > 0 as wal_bytes +FROM pg_tracing_spans order by span_start, duration desc, resource; + +-- Redo the same but without wal instrumentation +set pg_tracing.instrument_wal = false; + +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ INSERT INTO pg_tracing_test VALUES(generate_series(1, 10), 'aaa'); +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000002-0000000000000002-01'*/ UPDATE pg_tracing_test SET b = 'bbb' WHERE a > 7; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000003-0000000000000003-01'*/ DELETE FROM pg_tracing_test WHERE a > 9; + +SELECT trace_id, name, resource, +wal_records = 0 as wal_records, +wal_bytes = 0 as wal_bytes +FROM pg_tracing_spans order by span_start, duration desc, resource; + +-- Cleanup +set pg_tracing.instrument_wal = true; diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c index ee78a5749d..1174e2450f 100644 --- a/src/backend/executor/instrument.c +++ b/src/backend/executor/instrument.c @@ -70,6 +70,7 @@ InstrStartNode(Instrumentation *instr) if (instr->need_timer && !INSTR_TIME_SET_CURRENT_LAZY(instr->starttime)) elog(ERROR, "InstrStartNode called twice in a row"); + INSTR_TIME_SET_CURRENT_LAZY(instr->firsttime); /* save buffer usage totals at node entry, if needed */ if (instr->need_bufusage) diff 
--git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index 87e5e2183b..13840be1d7 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -72,6 +72,7 @@ typedef struct Instrumentation bool async_mode; /* true if node is in async mode */ /* Info about current plan cycle: */ bool running; /* true if we've completed first tuple */ + instr_time firsttime; /* start time of this node */ instr_time starttime; /* start time of current iteration of node */ instr_time counter; /* accumulated runtime for this node */ double firsttuple; /* time for first tuple of this cycle */