From 6068f8b5c4a0eb29d684e8865221801a0c682543 Mon Sep 17 00:00:00 2001 From: Jakub Wartak Date: Fri, 21 Feb 2025 14:20:18 +0100 Subject: [PATCH v12 3/3] Add pg_shmem_numa_allocations to show NUMA zones for shared memory allocations Author: Jakub Wartak Reviewed-by: Andres Freund Reviewed-by: Bertrand Drouvot Discussion: https://postgr.es/m/CAKZiRmxh6KWo0aqRqvmcoaX2jUxZYb4kGp3N%3Dq1w%2BDiH-696Xw%40mail.gmail.com --- doc/src/sgml/system-views.sgml | 78 ++++++++++++++ src/backend/catalog/system_views.sql | 8 ++ src/backend/storage/ipc/shmem.c | 125 +++++++++++++++++++++++ src/include/catalog/pg_proc.dat | 8 ++ src/test/regress/expected/numa.out | 12 +++ src/test/regress/expected/numa_1.out | 3 + src/test/regress/expected/privileges.out | 16 ++- src/test/regress/expected/rules.out | 4 + src/test/regress/parallel_schedule | 2 +- src/test/regress/sql/numa.sql | 9 ++ src/test/regress/sql/privileges.sql | 6 +- 11 files changed, 266 insertions(+), 5 deletions(-) create mode 100644 src/test/regress/expected/numa.out create mode 100644 src/test/regress/expected/numa_1.out create mode 100644 src/test/regress/sql/numa.sql diff --git a/doc/src/sgml/system-views.sgml b/doc/src/sgml/system-views.sgml index 3f5a306247e..5164083131a 100644 --- a/doc/src/sgml/system-views.sgml +++ b/doc/src/sgml/system-views.sgml @@ -176,6 +176,11 @@ shared memory allocations + + pg_shmem_numa_allocations + NUMA mappings for shared memory allocations + + pg_stats planner statistics @@ -3746,6 +3751,79 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx + + <structname>pg_shmem_numa_allocations</structname> + + + pg_shmem_numa_allocations + + + + The pg_shmem_numa_allocations view shows NUMA nodes + assigned allocations made from the server's main shared memory segment. + This includes both memory allocated by PostgreSQL + itself and memory allocated by extensions using the mechanisms detailed in + . 
+ + + + Note that this view does not include memory allocated using the dynamic + shared memory infrastructure. + + + + <structname>pg_shmem_numa_allocations</structname> Columns + + + + + Column Type + + + Description + + + + + + + + name text + + + The name of the shared memory allocation. + + + + + + numa_zone_id int4 + + + ID of NUMA node + + + + + + numa_size int8 + + + Size of the allocation on this particular NUMA node in bytes + + + + + +
+ + + By default, the pg_shmem_numa_allocations view can be + read only by superusers or roles with privileges of the + pg_read_all_stats role. + +
+ <structname>pg_stats</structname> diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index a4d2cfdcaf5..cc014a62dc2 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -658,6 +658,14 @@ GRANT SELECT ON pg_shmem_allocations TO pg_read_all_stats; REVOKE EXECUTE ON FUNCTION pg_get_shmem_allocations() FROM PUBLIC; GRANT EXECUTE ON FUNCTION pg_get_shmem_allocations() TO pg_read_all_stats; +CREATE VIEW pg_shmem_numa_allocations AS + SELECT * FROM pg_get_shmem_numa_allocations(); + +REVOKE ALL ON pg_shmem_numa_allocations FROM PUBLIC; +GRANT SELECT ON pg_shmem_numa_allocations TO pg_read_all_stats; +REVOKE EXECUTE ON FUNCTION pg_get_shmem_numa_allocations() FROM PUBLIC; +GRANT EXECUTE ON FUNCTION pg_get_shmem_numa_allocations() TO pg_read_all_stats; + CREATE VIEW pg_backend_memory_contexts AS SELECT * FROM pg_get_backend_memory_contexts(); diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c index 895a43fb39e..9331a5760f6 100644 --- a/src/backend/storage/ipc/shmem.c +++ b/src/backend/storage/ipc/shmem.c @@ -68,6 +68,7 @@ #include "fmgr.h" #include "funcapi.h" #include "miscadmin.h" +#include "port/pg_numa.h" #include "storage/lwlock.h" #include "storage/pg_shmem.h" #include "storage/shmem.h" @@ -89,6 +90,8 @@ slock_t *ShmemLock; /* spinlock for shared memory and LWLock static HTAB *ShmemIndex = NULL; /* primary index hashtable for shmem */ +/* To get reliable results for NUMA inquiry we need to "touch pages" once */ +static bool firstUseInBackend = true; /* * InitShmemAccess() --- set up basic pointers to shared memory. 
@@ -568,3 +571,125 @@ pg_get_shmem_allocations(PG_FUNCTION_ARGS)
 
 	return (Datum) 0;
 }
+
+/* SQL SRF: one row per shmem index entry and NUMA node, with bytes on it */
+Datum
+pg_get_shmem_numa_allocations(PG_FUNCTION_ARGS)
+{
+#define PG_GET_SHMEM_NUMA_SIZES_COLS 3
+	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+	HASH_SEQ_STATUS hstat;
+	ShmemIndexEnt *ent;
+	Datum		values[PG_GET_SHMEM_NUMA_SIZES_COLS];
+	bool		nulls[PG_GET_SHMEM_NUMA_SIZES_COLS];
+	Size		os_page_size;
+	void	  **page_ptrs;
+	int		   *pages_status;
+	int			shm_total_page_count,
+				shm_ent_page_count,
+				max_zones;
+	Size	   *zones;
+
+	InitMaterializedSRF(fcinfo, 0);
+
+	if (pg_numa_init() == -1)
+	{
+		elog(NOTICE, "libnuma initialization failed or NUMA is not supported on this platform, some NUMA data might be unavailable.");
+		return (Datum) 0;
+	}
+	max_zones = pg_numa_get_max_node();
+	zones = palloc(sizeof(Size) * (max_zones + 1));
+
+	/*
+	 * This is for gathering some NUMA statistics.  We might be using various
+	 * DB block sizes (4kB, 8kB, ... 32kB) that end up being allocated in
+	 * various different OS memory page sizes, so first we need to learn the
+	 * OS memory page size before calling move_pages().
+	 */
+	os_page_size = pg_numa_get_pagesize();
+
+	/*
+	 * Preallocate the work arrays once, sized for the whole segment, rather
+	 * than finding out which shared memory entry is the biggest (technically
+	 * min s_b can be as low as 16xBLCKSZ).  They are reused for every entry.
+	 */
+	shm_total_page_count = ShmemSegHdr->totalsize / os_page_size;
+	page_ptrs = palloc(sizeof(void *) * shm_total_page_count);
+	pages_status = palloc(sizeof(int) * shm_total_page_count);
+	memset(page_ptrs, 0, sizeof(void *) * shm_total_page_count);
+
+	if (firstUseInBackend)
+		elog(DEBUG1, "NUMA: page-faulting shared memory segments for proper NUMA readouts");
+
+	LWLockAcquire(ShmemIndexLock, LW_SHARED);
+
+	hash_seq_init(&hstat, ShmemIndex);
+
+	/* output all allocated entries */
+	memset(nulls, 0, sizeof(nulls));
+	while ((ent = (ShmemIndexEnt *) hash_seq_search(&hstat)) != NULL)
+	{
+		int			i;
+
+		shm_ent_page_count = ent->allocated_size / os_page_size;
+		/* NOTE(review): truncation drops partial pages; clamp to >= 1 page */
+		shm_ent_page_count = shm_ent_page_count == 0 ? 1 : shm_ent_page_count;
+
+		/*
+		 * If we ever get 0xff back from the kernel inquiry, then we probably
+		 * have a bug in our buffers to OS page mapping code here.
+		 */
+		memset(pages_status, 0xff, sizeof(int) * shm_ent_page_count);
+
+		for (i = 0; i < shm_ent_page_count; i++)
+		{
+			/*
+			 * In order to get reliable results we also need to touch the
+			 * memory pages, so that the NUMA zone inquiry doesn't return -2.
+			 */
+			volatile uint64 touch pg_attribute_unused();
+
+			page_ptrs[i] = (char *) ent->location + (i * os_page_size);
+			if (firstUseInBackend)
+				pg_numa_touch_mem_if_required(touch, page_ptrs[i]);
+
+			CHECK_FOR_INTERRUPTS();
+		}
+
+		if (pg_numa_query_pages(0, shm_ent_page_count, page_ptrs, pages_status) == -1)
+			elog(ERROR, "failed NUMA pages inquiry status: %m");
+
+		memset(zones, 0, sizeof(Size) * (max_zones + 1));
+		/* Count the pages of this shared memory entry found on each zone */
+		for (i = 0; i < shm_ent_page_count; i++)
+		{
+			int			s = pages_status[i];
+
+			/* Skip statuses that are not a valid index into zones[] */
+			if (s >= 0 && s <= max_zones)
+				zones[s]++;
+		}
+
+		for (i = 0; i <= max_zones; i++)
+		{
+			values[0] = CStringGetTextDatum(ent->key);
+			values[1] = Int32GetDatum(i);
+			values[2] = Int64GetDatum(zones[i] * os_page_size);
+
+			tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
+								 values, nulls);
+		}
+	}
+
+	/*
+	 * XXX: Unlike pg_get_shmem_allocations(), the NUMA version does not
+	 * report the following regions:
+	 * 1. shared memory allocated but not counted via the shmem index
+	 * 2. as-of-yet unused shared memory
+	 */
+
+	LWLockRelease(ShmemIndexLock);
+	firstUseInBackend = false;
+
+	return (Datum) 0;
+}
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 85902903653..55ff305a713 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -8496,6 +8496,14 @@
   proname => 'pg_numa_available', provolatile => 'v', prorettype => 'bool',
   proargtypes => '', prosrc => 'pg_numa_available' },
 
+# shared memory usage with NUMA info
+{ oid => '9686', descr => 'NUMA mappings for the main shared memory segment',
+  proname => 'pg_get_shmem_numa_allocations', prorows => '50', proretset => 't',
+  provolatile => 'v', prorettype => 'record', proargtypes => '',
+  proallargtypes => '{text,int4,int8}', proargmodes => '{o,o,o}',
+  proargnames => '{name,numa_zone_id,numa_size}',
+  prosrc => 'pg_get_shmem_numa_allocations' },
+
 # memory context of local backend
 { oid => '2282',
   descr => 'information about all memory contexts of local backend',
diff --git a/src/test/regress/expected/numa.out b/src/test/regress/expected/numa.out
new file mode 100644
index 00000000000..fb882c5b771
--- /dev/null
+++ b/src/test/regress/expected/numa.out
@@ -0,0 +1,12 @@
+SELECT NOT(pg_numa_available()) AS skip_test \gset
+\if :skip_test
+\quit
+\endif
+-- switch to superuser
+\c -
+SELECT COUNT(*) >= 0 AS ok FROM pg_shmem_numa_allocations;
+ ok 
+----
+ t
+(1 row)
+
diff --git a/src/test/regress/expected/numa_1.out b/src/test/regress/expected/numa_1.out
new file mode 100644
index 00000000000..6dd6824b4e4
--- /dev/null
+++ b/src/test/regress/expected/numa_1.out
@@ -0,0 +1,3 @@
+SELECT NOT(pg_numa_available()) AS skip_test \gset
+\if :skip_test
+\quit
diff --git a/src/test/regress/expected/privileges.out b/src/test/regress/expected/privileges.out
index 954f549555e..d9d62470cdc 100644
--- a/src/test/regress/expected/privileges.out
+++ b/src/test/regress/expected/privileges.out
@@ -3127,8 +3127,8 @@ REVOKE MAINTAIN ON
lock_table FROM regress_locktable_user; -- clean up DROP TABLE lock_table; DROP USER regress_locktable_user; --- test to check privileges of system views pg_shmem_allocations and --- pg_backend_memory_contexts. +-- test to check privileges of system views pg_shmem_allocations, +-- pg_shmem_numa_allocations and pg_backend_memory_contexts. -- switch to superuser \c - CREATE ROLE regress_readallstats; @@ -3144,6 +3144,12 @@ SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT f (1 row) +SELECT has_table_privilege('regress_readallstats','pg_shmem_numa_allocations','SELECT'); -- no + has_table_privilege +--------------------- + f +(1 row) + GRANT pg_read_all_stats TO regress_readallstats; SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- yes has_table_privilege @@ -3157,6 +3163,12 @@ SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT t (1 row) +SELECT has_table_privilege('regress_readallstats','pg_shmem_numa_allocations','SELECT'); -- yes + has_table_privilege +--------------------- + t +(1 row) + -- run query to ensure that functions within views can be executed SET ROLE regress_readallstats; SELECT COUNT(*) >= 0 AS ok FROM pg_backend_memory_contexts; diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 62f69ac20b2..b63c6e0f744 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -1740,6 +1740,10 @@ pg_shmem_allocations| SELECT name, size, allocated_size FROM pg_get_shmem_allocations() pg_get_shmem_allocations(name, off, size, allocated_size); +pg_shmem_numa_allocations| SELECT name, + numa_zone_id, + numa_size + FROM pg_get_shmem_numa_allocations() pg_get_shmem_numa_allocations(name, numa_zone_id, numa_size); pg_stat_activity| SELECT s.datid, d.datname, s.pid, diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index 37b6d21e1f9..c07a4c7633a 100644 --- 
a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -119,7 +119,7 @@ test: plancache limit plpgsql copy2 temp domain rangefuncs prepare conversion tr # The stats test resets stats, so nothing else needing stats access can be in # this group. # ---------- -test: partition_join partition_prune reloptions hash_part indexing partition_aggregate partition_info tuplesort explain compression memoize stats predicate +test: partition_join partition_prune reloptions hash_part indexing partition_aggregate partition_info tuplesort explain compression memoize stats predicate numa # event_trigger depends on create_am and cannot run concurrently with # any test that runs DDL diff --git a/src/test/regress/sql/numa.sql b/src/test/regress/sql/numa.sql new file mode 100644 index 00000000000..fddb21a260a --- /dev/null +++ b/src/test/regress/sql/numa.sql @@ -0,0 +1,9 @@ +SELECT NOT(pg_numa_available()) AS skip_test \gset +\if :skip_test +\quit +\endif + +-- switch to superuser +\c - + +SELECT COUNT(*) >= 0 AS ok FROM pg_shmem_numa_allocations; diff --git a/src/test/regress/sql/privileges.sql b/src/test/regress/sql/privileges.sql index b81694c24f2..f93d4829702 100644 --- a/src/test/regress/sql/privileges.sql +++ b/src/test/regress/sql/privileges.sql @@ -1911,8 +1911,8 @@ REVOKE MAINTAIN ON lock_table FROM regress_locktable_user; DROP TABLE lock_table; DROP USER regress_locktable_user; --- test to check privileges of system views pg_shmem_allocations and --- pg_backend_memory_contexts. +-- test to check privileges of system views pg_shmem_allocations, +-- pg_shmem_numa_allocations and pg_backend_memory_contexts. 
-- switch to superuser \c - @@ -1921,11 +1921,13 @@ CREATE ROLE regress_readallstats; SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- no SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT'); -- no +SELECT has_table_privilege('regress_readallstats','pg_shmem_numa_allocations','SELECT'); -- no GRANT pg_read_all_stats TO regress_readallstats; SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- yes SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT'); -- yes +SELECT has_table_privilege('regress_readallstats','pg_shmem_numa_allocations','SELECT'); -- yes -- run query to ensure that functions within views can be executed SET ROLE regress_readallstats; -- 2.39.5