From 673d4355c068172ec1a04523239d26ffbd41eef5 Mon Sep 17 00:00:00 2001 From: David Geier Date: Mon, 23 Jan 2023 18:13:55 +0100 Subject: [PATCH v7 3/3] Use RDTSC(P) instructions to measure time on x86 Linux For now this is only enabled on Linux/x86 when the system clocksource is marked tsc as well, as determined at runtime. This way we can rely on the Linux kernel to make a decision whether tsc is invariant and usable on the current CPU architecture. In all other cases we continue to use the clock_gettime() implementation like before. Note that this intentionally uses rdtsc, not rdtscp, as rdtscp waits for currently running CPU instructions to have finished, and that adds up to noticable latency for little benefit in the typical InstrStartNode() / InstrStopNode() use case. --- src/backend/access/heap/vacuumlazy.c | 3 +- src/backend/commands/explain.c | 2 +- src/backend/executor/instrument.c | 12 +- src/backend/utils/init/postinit.c | 3 + src/bin/pg_test_timing/pg_test_timing.c | 7 +- src/bin/pgbench/pgbench.c | 3 + src/bin/psql/startup.c | 4 + src/common/Makefile | 1 + src/common/instr_time.c | 157 ++++++++++++++++++++++++ src/common/meson.build | 1 + src/include/portability/instr_time.h | 130 ++++++++++++++++---- src/tools/msvc/Mkvcbuild.pm | 2 +- 12 files changed, 293 insertions(+), 32 deletions(-) create mode 100644 src/common/instr_time.c diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index 8f14cf85f3..212093568b 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -3002,8 +3002,7 @@ count_nondeletable_pages(LVRelState *vacrel, bool *lock_waiter_detected) INSTR_TIME_SET_CURRENT(currenttime); elapsed = currenttime; INSTR_TIME_SUBTRACT(elapsed, starttime); - if ((INSTR_TIME_GET_MICROSEC(elapsed) / 1000) - >= VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL) + if (INSTR_TIME_GET_MILLISEC(elapsed) >= VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL) { if (LockHasWaitersRelation(vacrel->rel, 
AccessExclusiveLock)) { diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 5212a64b1e..c4c4ccbebe 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -526,7 +526,7 @@ ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ExplainState *es, double totaltime = 0; int eflags; int instrument_option = 0; - + Assert(plannedstmt->commandType != CMD_UTILITY); if (es->analyze && es->timing) diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c index ee78a5749d..1ce3d62cff 100644 --- a/src/backend/executor/instrument.c +++ b/src/backend/executor/instrument.c @@ -67,9 +67,13 @@ InstrInit(Instrumentation *instr, int instrument_options) void InstrStartNode(Instrumentation *instr) { - if (instr->need_timer && - !INSTR_TIME_SET_CURRENT_LAZY(instr->starttime)) - elog(ERROR, "InstrStartNode called twice in a row"); + if (instr->need_timer) + { + if (!INSTR_TIME_IS_ZERO(instr->starttime)) + elog(ERROR, "InstrStartNode called twice in a row"); + else + INSTR_TIME_SET_CURRENT_FAST(instr->starttime); + } /* save buffer usage totals at node entry, if needed */ if (instr->need_bufusage) @@ -95,7 +99,7 @@ InstrStopNode(Instrumentation *instr, double nTuples) if (INSTR_TIME_IS_ZERO(instr->starttime)) elog(ERROR, "InstrStopNode called without start"); - INSTR_TIME_SET_CURRENT(endtime); + INSTR_TIME_SET_CURRENT_FAST(endtime); INSTR_TIME_ACCUM_DIFF(instr->counter, endtime, instr->starttime); INSTR_TIME_SET_ZERO(instr->starttime); diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index 2f07ca7a0e..2eef902483 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -805,6 +805,9 @@ InitPostgres(const char *in_dbname, Oid dboid, /* Initialize portal manager */ EnablePortalManager(); + /* initialize high-precision interval timing */ + INSTR_TIME_INITIALIZE(); + /* Initialize status reporting */ pgstat_beinit(); diff --git 
a/src/bin/pg_test_timing/pg_test_timing.c b/src/bin/pg_test_timing/pg_test_timing.c index e20718669a..d185c8fe2e 100644 --- a/src/bin/pg_test_timing/pg_test_timing.c +++ b/src/bin/pg_test_timing/pg_test_timing.c @@ -128,7 +128,8 @@ test_timing(unsigned int duration) end_time; instr_time cur; - INSTR_TIME_SET_CURRENT(start_time); + INSTR_TIME_INITIALIZE(); + INSTR_TIME_SET_CURRENT_FAST(start_time); /* * To reduce loop overhead, check loop condition in instr_time domain. @@ -147,7 +148,7 @@ test_timing(unsigned int duration) int32 bits = 0; prev = cur; - INSTR_TIME_SET_CURRENT(cur); + INSTR_TIME_SET_CURRENT_FAST(cur); temp = cur; INSTR_TIME_SUBTRACT(temp, prev); diff = INSTR_TIME_GET_NANOSEC(temp); @@ -179,7 +180,7 @@ test_timing(unsigned int duration) loop_count++; } - INSTR_TIME_SET_CURRENT(end_time); + INSTR_TIME_SET_CURRENT_FAST(end_time); INSTR_TIME_SUBTRACT(end_time, start_time); diff --git a/src/bin/pgbench/pgbench.c b/src/bin/pgbench/pgbench.c index 9c12ffaea9..b84dbaf478 100644 --- a/src/bin/pgbench/pgbench.c +++ b/src/bin/pgbench/pgbench.c @@ -7059,6 +7059,9 @@ main(int argc, char **argv) initRandomState(&state[i].cs_func_rs); } + /* initialize high-precision interval timing */ + INSTR_TIME_INITIALIZE(); + /* opening connection... 
*/ con = doConnect(); if (con == NULL) diff --git a/src/bin/psql/startup.c b/src/bin/psql/startup.c index 5a28b6f713..3f95e01320 100644 --- a/src/bin/psql/startup.c +++ b/src/bin/psql/startup.c @@ -24,6 +24,7 @@ #include "help.h" #include "input.h" #include "mainloop.h" +#include "portability/instr_time.h" #include "settings.h" /* @@ -322,6 +323,9 @@ main(int argc, char *argv[]) PQsetNoticeProcessor(pset.db, NoticeProcessor, NULL); + /* initialize high-precision interval timing */ + INSTR_TIME_INITIALIZE(); + SyncVariables(); if (options.list_dbs) diff --git a/src/common/Makefile b/src/common/Makefile index 2f424a5735..ac79261556 100644 --- a/src/common/Makefile +++ b/src/common/Makefile @@ -58,6 +58,7 @@ OBJS_COMMON = \ file_perm.o \ file_utils.o \ hashfn.o \ + instr_time.o \ ip.o \ jsonapi.o \ keywords.o \ diff --git a/src/common/instr_time.c b/src/common/instr_time.c new file mode 100644 index 0000000000..281416da89 --- /dev/null +++ b/src/common/instr_time.c @@ -0,0 +1,157 @@ +/*------------------------------------------------------------------------- + * + * instr_time.c + * Non-inline parts of the portable high-precision interval timing + * implementation + * + * Portions Copyright (c) 2022, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/port/instr_time.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "portability/instr_time.h" + +#if defined(__x86_64__) && defined(__linux__) +/* + * Indicates if RDTSC can be used (Linux/x86 only, when OS uses TSC clocksource) + */ +bool has_rdtsc = false; + +/* + * Indicates if RDTSCP can be used. True if RDTSC can be used and RDTSCP is available. + */ +bool has_rdtscp = false; + +/* + * Stores what the number of cycles needs to be multiplied with to end up + * with nanoseconds using integer math. See comment in pg_initialize_rdtsc() + * for more details. 
+ * + * By default assume we are using clock_gettime() as a fallback which uses + * nanoseconds as ticks. Hence, we set the multiplier to the precision scalar + * so that the division in INSTR_TIME_GET_NANOSEC() won't change the nanoseconds. + * + * When using the RDTSC instruction directly this is filled in during initialization + * based on the relevant CPUID fields. + */ +int64 ticks_per_ns_scaled = TICKS_TO_NS_PRECISION; +int64 ticks_per_sec = NS_PER_S; +int64 max_ticks_no_overflow = PG_INT64_MAX / TICKS_TO_NS_PRECISION; + +static bool is_under_hypervisor(void) +{ + uint32 r0, r1, r2, r3; + int result = __get_cpuid(0x1, &r0, &r1, &r2, &r3); + return result > 0 && ((r2 >> 31) & 1) == 1; +} + +static bool get_hypervisor_tsc_frequency_khz(uint32 *tsc_freq) +{ + uint32 r0, r1, r2; + + if (!is_under_hypervisor()) + return false; + + /* + * Use __cpuid() directly because __get_cpuid() incorrectly bails out for + * VM leaves. The problem is that leaf 0x0 returns the maximum leaf and + * __get_cpuid() for any input leaf simply ANDs with 0x80000000 to get the + * info leaf 0x0 to obtain the maximum. For the VM timing leaf the ANDing + * also results in 0x0 while it should result in 0x40000000 to get the + * maximum VM leaf. 
+ */ + __cpuid(0x40000010, *tsc_freq, r0, r1, r2); + return *tsc_freq > 0; +} + +static bool get_host_tsc_frequency_khz(uint32 *tsc_freq) +{ + uint32 r0, r1, r2; + int result = __get_cpuid(0x16, tsc_freq, &r0, &r1, &r2); + *tsc_freq *= 1000; // Convert from MHz to KHz + return result > 0 && *tsc_freq > 0; +} + +static bool get_tsc_frequency_khz(uint32 *tsc_freq) +{ + if (get_hypervisor_tsc_frequency_khz(tsc_freq)) + return true; + if (get_host_tsc_frequency_khz(tsc_freq)) + return true; + + return false; +} + +static bool is_rdtscp_available(void) +{ + uint32 r0, r1, r2, r3; + return __get_cpuid(0x80000001, &r0, &r1, &r2, &r3) > 0 && (r3 & (1 << 27)) != 0; +} + +/* + * Decide whether we use the RDTSC instruction at runtime, for Linux/x86, + * instead of incurring the overhead of a full clock_gettime() call. + * + * This can't be reliably determined at compile time, since the + * availability of an "invariant" TSC (that is not affected by CPU + * frequency changes) is dependent on the CPU architecture. Additionally, + * there are cases where TSC availability is impacted by virtualization, + * where a simple cpuid feature check would not be enough. + * + * Since Linux already does a significant amount of work to determine + * whether TSC is a viable clock source, decide based on that. + */ +void pg_initialize_rdtsc(void) +{ + FILE *fp = fopen("/sys/devices/system/clocksource/clocksource0/current_clocksource", "r"); + + if (fp) + { + char buf[128]; + + if (fgets(buf, sizeof(buf), fp) != NULL && strcmp(buf, "tsc\n") == 0) + { + /* + * Compute baseline CPU performance, determines speed at which RDTSC advances. 
*/ + uint32 tsc_freq; + + if (get_tsc_frequency_khz(&tsc_freq)) + { + /* + * Ticks to nanoseconds conversion requires floating point math + * because: + * + * sec = ticks / frequency_hz + * ns = ticks / frequency_hz * 1,000,000,000 + * ns = ticks * (1,000,000,000 / frequency_hz) + * ns = ticks * (1,000,000 / frequency_khz) <-- now in kilohertz + * + * Here, 'ns' is usually a floating number. For example for a 2.5 GHz + * CPU the scaling factor becomes 1,000,000 / 2,500,000 = 0.4. + * + * To be able to use integer math we work around the lack of precision. + * We first scale the integer up and after the multiplication by the + * number of ticks in INSTR_TIME_GET_NANOSEC() we divide again by the + * same value. We picked the scaler such that it provides enough precision + * and is a power-of-two which allows for shifting instead of doing an + * integer division. + */ + ticks_per_ns_scaled = INT64CONST(1000000) * TICKS_TO_NS_PRECISION / tsc_freq; + ticks_per_sec = tsc_freq * 1000; // KHz -> Hz + max_ticks_no_overflow = PG_INT64_MAX / ticks_per_ns_scaled; + + has_rdtsc = true; + has_rdtscp = is_rdtscp_available(); + } + } + + fclose(fp); + } +} +#endif /* defined(__x86_64__) && defined(__linux__) */ diff --git a/src/common/meson.build b/src/common/meson.build index 1caa1fed04..5a7941b9e3 100644 --- a/src/common/meson.build +++ b/src/common/meson.build @@ -10,6 +10,7 @@ common_sources = files( 'file_perm.c', 'file_utils.c', 'hashfn.c', + 'instr_time.c', 'ip.c', 'jsonapi.c', 'keywords.c', diff --git a/src/include/portability/instr_time.h b/src/include/portability/instr_time.h index aab80effb0..6fda565366 100644 --- a/src/include/portability/instr_time.h +++ b/src/include/portability/instr_time.h @@ -4,9 +4,11 @@ * portable high-precision interval timing * * This file provides an abstraction layer to hide portability issues in - * interval timing. On Unix we use clock_gettime(), and on Windows we use - * QueryPerformanceCounter(). 
These macros also give some breathing room to - * use other high-precision-timing APIs. + * interval timing. On Linux/x86 we use the rdtsc instruction when a TSC + * clocksource is also used on the host OS. Otherwise, and on other + * Unix-like systems we use clock_gettime() and on Windows we use + * QueryPerformanceCounter(). These macros also give some breathing + * room to use other high-precision-timing APIs. * * The basic data type is instr_time, which all callers should treat as an * opaque typedef. instr_time can store either an absolute time (of @@ -19,10 +21,11 @@ * * INSTR_TIME_SET_ZERO(t) set t to zero (memset is acceptable too) * - * INSTR_TIME_SET_CURRENT(t) set t to current time - * - * INSTR_TIME_SET_CURRENT_LAZY(t) set t to current time if t is zero, - * evaluates to whether t changed + * INSTR_TIME_SET_CURRENT_FAST(t) set t to current time without waiting + * for instructions in out-of-order window + * + * INSTR_TIME_SET_CURRENT(t) set t to current time while waiting for + * instructions in OOO to retire * * INSTR_TIME_SET_SECONDS(t, s) set t to s seconds * @@ -84,7 +87,15 @@ typedef struct instr_time #ifndef WIN32 +/* + * Make sure this is a power-of-two, so that the compiler can turn the + * multiplications and divisions into shifts. 
+ */ +#define TICKS_TO_NS_PRECISION (1<<14) +extern int64 ticks_per_ns_scaled; +extern int64 ticks_per_sec; +extern int64 max_ticks_no_overflow; /* Use clock_gettime() */ @@ -110,35 +121,113 @@ typedef struct instr_time #define PG_INSTR_CLOCK CLOCK_REALTIME #endif -/* helper for INSTR_TIME_SET_CURRENT */ +#if defined(__x86_64__) && defined(__linux__) +#include <cpuid.h> +#include <x86intrin.h> + +extern bool has_rdtsc; +extern bool has_rdtscp; +#endif + static inline instr_time -pg_clock_gettime_ns(void) +pg_clock_gettime(void) { instr_time now; struct timespec tmp; clock_gettime(PG_INSTR_CLOCK, &tmp); now.ticks = tmp.tv_sec * NS_PER_S + tmp.tv_nsec; - return now; } +static inline instr_time +pg_get_ticks_fast(void) +{ +#if defined(__x86_64__) && defined(__linux__) + if (has_rdtsc) + { + instr_time now; + + now.ticks = __rdtsc(); + return now; + } +#endif + + return pg_clock_gettime(); +} + +static inline instr_time +pg_get_ticks(void) +{ +#if defined(__x86_64__) && defined(__linux__) + if (has_rdtscp) + { + instr_time now; + uint32 unused; + now.ticks = __rdtscp(&unused); + return now; + } +#endif + + return pg_clock_gettime(); +} + +static inline int64 +pg_ticks_to_ns(instr_time t) +{ + /* + * Would multiplication overflow? If so perform computation in two parts. + * Check overflow without actually overflowing via: + * a * b > max <=> a > max / b + */ + int64 ns = 0; + + if (unlikely(t.ticks > max_ticks_no_overflow)) + { + /* + * Compute how often the maximum number of ticks fits completely into the + * number of elapsed ticks and convert that number into nanoseconds. + * Then multiply by the count to arrive at the final value. In a 2nd step + * we adjust the number of elapsed ticks and convert the remaining ticks. 
+ */ + int64 count = t.ticks / max_ticks_no_overflow; + int64 max_ns = max_ticks_no_overflow * ticks_per_ns_scaled / TICKS_TO_NS_PRECISION; + ns = max_ns * count; + + /* + * Subtract the ticks that we now already accounted for, so that they don't + * get counted twice. + */ + t.ticks -= count * max_ticks_no_overflow; + Assert(t.ticks >= 0); + } + + ns += t.ticks * ticks_per_ns_scaled / TICKS_TO_NS_PRECISION; + return ns; +} + +extern void pg_initialize_rdtsc(void); + +#define INSTR_TIME_INITIALIZE() \ + pg_initialize_rdtsc() + +#define INSTR_TIME_SET_CURRENT_FAST(t) \ + ((t) = pg_get_ticks_fast()) + #define INSTR_TIME_SET_CURRENT(t) \ - ((t) = pg_clock_gettime_ns()) + ((t) = pg_get_ticks()) #define INSTR_TIME_SET_SECONDS(t, s) \ - ((t).ticks = NS_PER_S * (s)) + ((t).ticks = (s) * ticks_per_sec) #define INSTR_TIME_GET_NANOSEC(t) \ - ((int64) (t).ticks) - + pg_ticks_to_ns(t) #else /* WIN32 */ /* Use QueryPerformanceCounter() */ -/* helper for INSTR_TIME_SET_CURRENT */ static inline instr_time pg_query_performance_counter(void) { @@ -160,6 +249,11 @@ GetTimerFrequency(void) return (double) f.QuadPart; } +#define INSTR_TIME_INITIALIZE() + +#define INSTR_TIME_SET_CURRENT_FAST(t) \ + ((t) = pg_query_performance_counter()) + #define INSTR_TIME_SET_CURRENT(t) \ ((t) = pg_query_performance_counter()) @@ -180,13 +274,8 @@ GetTimerFrequency(void) #define INSTR_TIME_IS_LT(x, y) ((x).ticks < (y).ticks) - #define INSTR_TIME_SET_ZERO(t) ((t).ticks = 0) -#define INSTR_TIME_SET_CURRENT_LAZY(t) \ - (INSTR_TIME_IS_ZERO(t) ? 
INSTR_TIME_SET_CURRENT(t), true : false) - - #define INSTR_TIME_ADD(x,y) \ ((x).ticks += (y).ticks) @@ -196,7 +285,6 @@ GetTimerFrequency(void) #define INSTR_TIME_ACCUM_DIFF(x,y,z) \ ((x).ticks += (y).ticks - (z).ticks) - #define INSTR_TIME_GET_DOUBLE(t) \ ((double) INSTR_TIME_GET_NANOSEC(t) / NS_PER_S) diff --git a/src/tools/msvc/Mkvcbuild.pm b/src/tools/msvc/Mkvcbuild.pm index ee49424d6f..021525f3d8 100644 --- a/src/tools/msvc/Mkvcbuild.pm +++ b/src/tools/msvc/Mkvcbuild.pm @@ -135,7 +135,7 @@ sub mkvcbuild our @pgcommonallfiles = qw( base64.c checksum_helper.c compression.c config_info.c controldata_utils.c d2s.c encnames.c exec.c - f2s.c file_perm.c file_utils.c hashfn.c ip.c jsonapi.c + f2s.c file_perm.c file_utils.c hashfn.c instr_time.c ip.c jsonapi.c keywords.c kwlookup.c link-canary.c md5_common.c percentrepl.c pg_get_line.c pg_lzcompress.c pg_prng.c pgfnames.c psprintf.c relpath.c rmtree.c saslprep.c scram-common.c string.c stringinfo.c unicode_norm.c -- 2.34.1