From ca651eb85a6656c79fee5aaabc99e4b772b1b8fe Mon Sep 17 00:00:00 2001 From: Tomas Vondra Date: Tue, 27 May 2025 23:08:48 +0200 Subject: [PATCH v2 7/7] NUMA: pin backends to NUMA nodes When initializing the backend, we pick a PGPROC entry from the right NUMA node where the backend is running. But the process can move to a different core / node, so to prevent that we pin it. --- src/backend/storage/lmgr/proc.c | 21 +++++++++++++++++++++ src/backend/utils/init/globals.c | 1 + src/backend/utils/misc/guc_tables.c | 10 ++++++++++ src/include/miscadmin.h | 1 + 4 files changed, 33 insertions(+) diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index 9d3e94a7b3a..4c9e55608b2 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -729,6 +729,27 @@ InitProcess(void) } MyProcNumber = GetNumberFromPGProc(MyProc); + /* + * Optionally, restrict the process to only run on CPUs from the same NUMA + * node as the PGPROC. We do this even if the PGPROC has a different NUMA node, + * but not for PGPROC entries without a node (i.e. aux/2PC entries). + * + * This also means we only do this with numa_procs_interleave, because + * without that we'll have numa_node=-1 for all PGPROC entries. + * + * FIXME add proper error-checking for libnuma functions + */ + if (numa_procs_pin && MyProc->numa_node != -1) + { + struct bitmask *cpumask = numa_allocate_cpumask(); + + numa_node_to_cpus(MyProc->numa_node, cpumask); + + numa_sched_setaffinity(MyProcPid, cpumask); + + numa_free_cpumask(cpumask); + } + /* * Cross-check that the PGPROC is of the type we expect; if this were not * the case, it would get returned to the wrong list. 
diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c index 6ee4684d1b8..3f88659b49f 100644 --- a/src/backend/utils/init/globals.c +++ b/src/backend/utils/init/globals.c @@ -150,6 +150,7 @@ bool numa_buffers_interleave = false; bool numa_localalloc = false; bool numa_partition_freelist = false; bool numa_procs_interleave = false; +bool numa_procs_pin = false; /* GUC parameters for vacuum */ int VacuumBufferUsageLimit = 2048; diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 7b718760248..862341e137e 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -2156,6 +2156,16 @@ struct config_bool ConfigureNamesBool[] = NULL, NULL, NULL }, + { + {"numa_procs_pin", PGC_POSTMASTER, DEVELOPER_OPTIONS, + gettext_noop("Enables pinning backends to NUMA nodes (matching the PGPROC node)."), + gettext_noop("When enabled, sets affinity to CPUs from the same NUMA node."), + }, + &numa_procs_pin, + false, + NULL, NULL, NULL + }, + { {"sync_replication_slots", PGC_SIGHUP, REPLICATION_STANDBY, gettext_noop("Enables a physical standby to synchronize logical failover replication slots from the primary server."), diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index cdeee8dccba..a97741c6707 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -182,6 +182,7 @@ extern PGDLLIMPORT bool numa_buffers_interleave; extern PGDLLIMPORT bool numa_localalloc; extern PGDLLIMPORT bool numa_partition_freelist; extern PGDLLIMPORT bool numa_procs_interleave; +extern PGDLLIMPORT bool numa_procs_pin; extern PGDLLIMPORT int commit_timestamp_buffers; extern PGDLLIMPORT int multixact_member_buffers; -- 2.49.0