From ee17287bcab2178bfd473a7043ece3b54f498817 Mon Sep 17 00:00:00 2001 From: Jakub Wartak Date: Wed, 19 Mar 2025 09:34:56 +0100 Subject: [PATCH v15 3/4] Extend pg_buffercache with new view pg_buffercache_numa to show NUMA memory node for individual buffer. Author: Jakub Wartak Reviewed-by: Andres Freund Reviewed-by: Bertrand Drouvot Discussion: https://postgr.es/m/CAKZiRmxh6KWo0aqRqvmcoaX2jUxZYb4kGp3N%3Dq1w%2BDiH-696Xw%40mail.gmail.com --- contrib/pg_buffercache/Makefile | 3 +- .../expected/pg_buffercache_numa.out | 28 +++ .../expected/pg_buffercache_numa_1.out | 3 + contrib/pg_buffercache/meson.build | 2 + .../pg_buffercache--1.5--1.6.sql | 42 +++++ contrib/pg_buffercache/pg_buffercache.control | 2 +- contrib/pg_buffercache/pg_buffercache_pages.c | 172 +++++++++++++++++- .../sql/pg_buffercache_numa.sql | 20 ++ doc/src/sgml/pgbuffercache.sgml | 61 ++++++- 9 files changed, 329 insertions(+), 4 deletions(-) create mode 100644 contrib/pg_buffercache/expected/pg_buffercache_numa.out create mode 100644 contrib/pg_buffercache/expected/pg_buffercache_numa_1.out create mode 100644 contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql create mode 100644 contrib/pg_buffercache/sql/pg_buffercache_numa.sql diff --git a/contrib/pg_buffercache/Makefile b/contrib/pg_buffercache/Makefile index eae65ead9e5..2a33602537e 100644 --- a/contrib/pg_buffercache/Makefile +++ b/contrib/pg_buffercache/Makefile @@ -8,7 +8,8 @@ OBJS = \ EXTENSION = pg_buffercache DATA = pg_buffercache--1.2.sql pg_buffercache--1.2--1.3.sql \ pg_buffercache--1.1--1.2.sql pg_buffercache--1.0--1.1.sql \ - pg_buffercache--1.3--1.4.sql pg_buffercache--1.4--1.5.sql + pg_buffercache--1.3--1.4.sql pg_buffercache--1.4--1.5.sql \ + pg_buffercache--1.5--1.6.sql PGFILEDESC = "pg_buffercache - monitoring of shared buffer cache in real-time" REGRESS = pg_buffercache diff --git a/contrib/pg_buffercache/expected/pg_buffercache_numa.out b/contrib/pg_buffercache/expected/pg_buffercache_numa.out new file mode 100644 index 
00000000000..d4de5ea52fc --- /dev/null +++ b/contrib/pg_buffercache/expected/pg_buffercache_numa.out @@ -0,0 +1,28 @@ +SELECT NOT(pg_numa_available()) AS skip_test \gset +\if :skip_test +\quit +\endif +select count(*) = (select setting::bigint + from pg_settings + where name = 'shared_buffers') +from pg_buffercache_numa; + ?column? +---------- + t +(1 row) + +-- Check that the functions / views can't be accessed by default. To avoid +-- having to create a dedicated user, use the pg_database_owner pseudo-role. +SET ROLE pg_database_owner; +SELECT count(*) > 0 FROM pg_buffercache_numa; +ERROR: permission denied for view pg_buffercache_numa +RESET role; +-- Check that pg_monitor is allowed to query view / function +SET ROLE pg_monitor; +SELECT count(*) > 0 FROM pg_buffercache_numa; + ?column? +---------- + t +(1 row) + +RESET role; diff --git a/contrib/pg_buffercache/expected/pg_buffercache_numa_1.out b/contrib/pg_buffercache/expected/pg_buffercache_numa_1.out new file mode 100644 index 00000000000..6dd6824b4e4 --- /dev/null +++ b/contrib/pg_buffercache/expected/pg_buffercache_numa_1.out @@ -0,0 +1,3 @@ +SELECT NOT(pg_numa_available()) AS skip_test \gset +\if :skip_test +\quit diff --git a/contrib/pg_buffercache/meson.build b/contrib/pg_buffercache/meson.build index 12d1fe48717..7cd039a1df9 100644 --- a/contrib/pg_buffercache/meson.build +++ b/contrib/pg_buffercache/meson.build @@ -23,6 +23,7 @@ install_data( 'pg_buffercache--1.2.sql', 'pg_buffercache--1.3--1.4.sql', 'pg_buffercache--1.4--1.5.sql', + 'pg_buffercache--1.5--1.6.sql', 'pg_buffercache.control', kwargs: contrib_data_args, ) @@ -34,6 +35,7 @@ tests += { 'regress': { 'sql': [ 'pg_buffercache', + 'pg_buffercache_numa', ], }, } diff --git a/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql b/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql new file mode 100644 index 00000000000..0f4b2eaf444 --- /dev/null +++ b/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql @@ -0,0 +1,26 @@ +/*
contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql */ + +-- complain if script is sourced in psql, rather than via ALTER EXTENSION +\echo Use "ALTER EXTENSION pg_buffercache UPDATE TO '1.6'" to load this file. \quit + +-- Register the new function. The existing pg_buffercache_pages() function +-- and pg_buffercache view are deliberately left untouched, so that dependent +-- objects and privileges granted on them survive the update. +CREATE OR REPLACE FUNCTION pg_buffercache_numa_pages() +RETURNS SETOF RECORD +AS 'MODULE_PATHNAME', 'pg_buffercache_numa_pages' +LANGUAGE C PARALLEL SAFE; + +-- Create a view for convenient access. +CREATE OR REPLACE VIEW pg_buffercache_numa AS + SELECT P.* FROM pg_buffercache_numa_pages() AS P + (bufferid integer, relfilenode oid, reltablespace oid, reldatabase oid, + relforknumber int2, relblocknumber int8, isdirty bool, usagecount int2, + pinning_backends int4, node_id int4); + +-- Don't want these to be available to public.
+REVOKE ALL ON FUNCTION pg_buffercache_numa_pages() FROM PUBLIC; +REVOKE ALL ON pg_buffercache_numa FROM PUBLIC; + +GRANT EXECUTE ON FUNCTION pg_buffercache_numa_pages() TO pg_monitor; +GRANT SELECT ON pg_buffercache_numa TO pg_monitor; diff --git a/contrib/pg_buffercache/pg_buffercache.control b/contrib/pg_buffercache/pg_buffercache.control index 5ee875f77dd..b030ba3a6fa 100644 --- a/contrib/pg_buffercache/pg_buffercache.control +++ b/contrib/pg_buffercache/pg_buffercache.control @@ -1,5 +1,5 @@ # pg_buffercache extension comment = 'examine the shared buffer cache' -default_version = '1.5' +default_version = '1.6' module_pathname = '$libdir/pg_buffercache' relocatable = true diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c index f342005fd96..35b019206f5 100644 --- a/contrib/pg_buffercache/pg_buffercache_pages.c +++ b/contrib/pg_buffercache/pg_buffercache_pages.c @@ -11,11 +11,12 @@ #include "access/htup_details.h" #include "catalog/pg_type.h" #include "funcapi.h" +#include "port/pg_numa.h" #include "storage/buf_internals.h" #include "storage/bufmgr.h" #define NUM_BUFFERCACHE_PAGES_MIN_ELEM 8 -#define NUM_BUFFERCACHE_PAGES_ELEM 9 +#define NUM_BUFFERCACHE_PAGES_ELEM 10 #define NUM_BUFFERCACHE_SUMMARY_ELEM 5 #define NUM_BUFFERCACHE_USAGE_COUNTS_ELEM 4 @@ -42,6 +43,7 @@ typedef struct * because of bufmgr.c's PrivateRefCount infrastructure. */ int32 pinning_backends; + int32 numa_node_id; } BufferCachePagesRec; @@ -60,10 +62,56 @@ typedef struct * relation node/tablespace/database/blocknum and dirty indicator.
*/ PG_FUNCTION_INFO_V1(pg_buffercache_pages); +PG_FUNCTION_INFO_V1(pg_buffercache_numa_pages); PG_FUNCTION_INFO_V1(pg_buffercache_summary); PG_FUNCTION_INFO_V1(pg_buffercache_usage_counts); PG_FUNCTION_INFO_V1(pg_buffercache_evict); +/* Only need to touch memory once per backend process lifetime */ +static bool firstNumaTouch = true; + +/* + * Helper routine to map Buffers into addresses that are used by + * pg_numa_query_pages(). buffer_id is the 0-based buffer index, pages_per_blk is the number of OS pages per database block (may be fractional), and os_page_ptrs is the per-OS-page address array whose still-NULL entries get filled in. + * + * When database block size (BLCKSZ) is smaller than the OS page size (4kB), + * multiple database buffers will map to the same OS memory page. In this case, + * we only need to query the NUMA node for the first memory address of each + * unique OS page rather than for every buffer. + * + * In order to get reliable results we also need to touch memory pages, so that + * inquiry about NUMA memory node doesn't return -2 (which indicates + * unmapped/unallocated pages). + */ +static inline void +pg_buffercache_numa_prepare_ptrs(int buffer_id, float pages_per_blk, + Size os_page_size, + void **os_page_ptrs) +{ + size_t blk2page = (size_t) ((double) buffer_id * pages_per_blk); /* double: int * float is inexact past 2^24 buffers */ + + for (size_t j = 0; j < pages_per_blk; j++) + { + size_t blk2pageoff = blk2page + j; + + if (os_page_ptrs[blk2pageoff] == NULL) + { + volatile uint64 touch pg_attribute_unused(); + + /* buffer_id is 0-based, while Buffer numbers start from 1 */ + os_page_ptrs[blk2pageoff] = (char *) BufferGetBlock(buffer_id + 1) + + (os_page_size * j); + + /* Only need to touch memory once per backend process lifetime */ + if (firstNumaTouch) + pg_numa_touch_mem_if_required(touch, os_page_ptrs[blk2pageoff]); + + } + + CHECK_FOR_INTERRUPTS(); + } +} + /* * Helper routine for pg_buffercache_pages() and pg_buffercache_numa_pages().
* @@ -121,6 +169,9 @@ pg_buffercache_init_entries(FuncCallContext *funcctx, FunctionCallInfo fcinfo) if (expected_tupledesc->natts >= NUM_BUFFERCACHE_PAGES_ELEM - 1) TupleDescInitEntry(tupledesc, (AttrNumber) 9, "pinning_backends", INT4OID, -1, 0); + if (expected_tupledesc->natts == NUM_BUFFERCACHE_PAGES_ELEM) + TupleDescInitEntry(tupledesc, (AttrNumber) 10, "node_id", + INT4OID, -1, 0); fctx->tupdesc = BlessTupleDesc(tupledesc); @@ -173,6 +224,8 @@ pg_buffercache_build_tuple(int record_id, BufferCachePagesContext *fctx) else fctx->record[record_id].isvalid = false; + fctx->record[record_id].numa_node_id = -1; + UnlockBufHdr(bufHdr, buf_state); } @@ -208,6 +261,7 @@ get_buffercache_tuple(int record_id, BufferCachePagesContext *fctx) /* unused for v1.0 callers, but the array is always long enough */ nulls[8] = true; + nulls[9] = true; } else { @@ -231,6 +285,8 @@ get_buffercache_tuple(int record_id, BufferCachePagesContext *fctx) */ values[8] = Int32GetDatum(fctx->record[record_id].pinning_backends); nulls[8] = false; + values[9] = Int32GetDatum(fctx->record[record_id].numa_node_id); + nulls[9] = false; } /* Build and return the tuple. */ @@ -282,6 +338,120 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) } } +/* + * This is almost identical to the above, but performs + * NUMA inquiry about memory mappings; it raises an ERROR when pg_numa_init() fails. + */ +Datum +pg_buffercache_numa_pages(PG_FUNCTION_ARGS) +{ + FuncCallContext *funcctx; + BufferCachePagesContext *fctx; /* User function context. */ + + if (SRF_IS_FIRSTCALL()) + { + int i; + Size os_page_size = 0; + void **os_page_ptrs = NULL; + int *os_pages_status = NULL; + uint64 os_page_count = 0; + float pages_per_blk = 0; + + funcctx = SRF_FIRSTCALL_INIT(); + + if (pg_numa_init() == -1) + elog(ERROR, "libnuma initialization failed or NUMA is not supported on this platform"); + + fctx = pg_buffercache_init_entries(funcctx, fcinfo); + + /* + * Different database block sizes (4kB, 8kB, ..., 32kB) can be used, + * while the OS may have different memory page sizes.
+ * + * To correctly map between them, we need to: 1. Determine the OS + * memory page size 2. Calculate how many OS pages are used by all + * buffer blocks (rounded up, so a trailing partial page gets a slot too) 3. Calculate how many OS pages are contained within + * each database block. + * + * This information is needed before calling move_pages() for NUMA + * node id inquiry. + */ + os_page_size = pg_numa_get_pagesize(); + os_page_count = (((uint64) NBuffers * BLCKSZ) + os_page_size - 1) / os_page_size; + pages_per_blk = (float) BLCKSZ / os_page_size; + + elog(DEBUG1, "NUMA: os_page_count=%lu os_page_size=%zu pages_per_blk=%.2f", + (unsigned long) os_page_count, os_page_size, pages_per_blk); + + os_page_ptrs = palloc0(sizeof(void *) * os_page_count); + os_pages_status = palloc(sizeof(int) * os_page_count); + + /* + * If we ever get 0xff back from kernel inquiry, then we probably have a + * bug in our buffers to OS page mapping code here. + * + */ + memset(os_pages_status, 0xff, sizeof(int) * os_page_count); + + if (firstNumaTouch) + elog(DEBUG1, "NUMA: page-faulting the buffercache for proper NUMA readouts"); + + /* + * Scan through all the buffers, saving the relevant fields in the + * fctx->record structure. + * + * We don't hold the partition locks, so we don't get a consistent + * snapshot across all buffers, but we do grab the buffer header + * locks, so the information of each buffer is self-consistent. + */ + for (i = 0; i < NBuffers; i++) + { + pg_buffercache_build_tuple(i, fctx); + pg_buffercache_numa_prepare_ptrs(i, pages_per_blk, os_page_size, + os_page_ptrs); + } + firstNumaTouch = false; /* all pages were touched above, even if the caller stops fetching rows early */ + + if (pg_numa_query_pages(0, os_page_count, os_page_ptrs, os_pages_status) == -1) + elog(ERROR, "failed NUMA pages inquiry: %m"); + + for (i = 0; i < NBuffers; i++) + { + int blk2page = (int) ((double) i * pages_per_blk); + + /* + * Set the NUMA node id for this buffer based on the first OS page + * it maps to (index computed in double, as float is inexact past 2^24). + * + * Note: We could check for errors in os_pages_status and report + * them.
Also, a single DB block might span multiple NUMA nodes if + * it crosses OS pages on node boundaries, but we only record the + * node of the first page. This is a simplification but should be + * sufficient for most analyses. + */ + fctx->record[i].numa_node_id = os_pages_status[blk2page]; + } + } + + funcctx = SRF_PERCALL_SETUP(); + + /* Get the saved state */ + fctx = funcctx->user_fctx; + + if (funcctx->call_cntr < funcctx->max_calls) + { + Datum result; + uint32 i = funcctx->call_cntr; + + result = get_buffercache_tuple(i, fctx); + SRF_RETURN_NEXT(funcctx, result); + } + else + { + SRF_RETURN_DONE(funcctx); + } +} + Datum pg_buffercache_summary(PG_FUNCTION_ARGS) { diff --git a/contrib/pg_buffercache/sql/pg_buffercache_numa.sql b/contrib/pg_buffercache/sql/pg_buffercache_numa.sql new file mode 100644 index 00000000000..2225b879f58 --- /dev/null +++ b/contrib/pg_buffercache/sql/pg_buffercache_numa.sql @@ -0,0 +1,20 @@ +SELECT NOT(pg_numa_available()) AS skip_test \gset +\if :skip_test +\quit +\endif + +select count(*) = (select setting::bigint + from pg_settings + where name = 'shared_buffers') +from pg_buffercache_numa; + +-- Check that the functions / views can't be accessed by default. To avoid +-- having to create a dedicated user, use the pg_database_owner pseudo-role.
+SET ROLE pg_database_owner; +SELECT count(*) > 0 FROM pg_buffercache_numa; +RESET role; + +-- Check that pg_monitor is allowed to query view / function +SET ROLE pg_monitor; +SELECT count(*) > 0 FROM pg_buffercache_numa; +RESET role; diff --git a/doc/src/sgml/pgbuffercache.sgml b/doc/src/sgml/pgbuffercache.sgml index 802a5112d77..086e0062a17 100644 --- a/doc/src/sgml/pgbuffercache.sgml +++ b/doc/src/sgml/pgbuffercache.sgml @@ -30,7 +30,9 @@ This module provides the pg_buffercache_pages() function (wrapped in the pg_buffercache view), - the pg_buffercache_summary() function, the + the pg_buffercache_numa_pages() function (wrapped in the + pg_buffercache_numa view), the + pg_buffercache_summary() function, the pg_buffercache_usage_counts() function and the pg_buffercache_evict() function. @@ -42,6 +44,14 @@ convenient use. + + The pg_buffercache_numa_pages() function provides the same information + as pg_buffercache_pages() but is slower because it also + provides the NUMA node ID per shared buffer entry. + The pg_buffercache_numa view wraps the function for + convenient use. + + The pg_buffercache_summary() function returns a single row summarizing the state of the shared buffer cache. @@ -200,6 +210,55 @@ + + The <structname>pg_buffercache_numa</structname> View + + + The definitions of the columns exposed are identical to the + pg_buffercache view, except that this one includes + one additional node_id column as defined in + . + + + + <structname>pg_buffercache_numa</structname> Extra column + + + + + Column Type + + + Description + + + + + + + + node_id integer + + + NUMA node ID. NULL if the shared buffer + has not been used yet. On systems without NUMA support + querying this view raises an error. + + + + + +
+ + + As NUMA node ID inquiry for each page requires memory pages + to be paged-in, the first execution of this function can take a noticeable + amount of time. In all the cases (first execution or not), retrieving this + information is costly and querying the view at a high frequency is not recommended. + + +
+ The <function>pg_buffercache_summary()</function> Function -- 2.39.5