From dd36f2194986e5366e1f0800eff6b8a61611dd0f Mon Sep 17 00:00:00 2001
From: Tomas Vondra
Date: Thu, 24 Jul 2025 13:00:24 +0200
Subject: [PATCH v1] pgstattuple: analyze TIDs on btree leaf pages

---
 contrib/pgstattuple/Makefile                  |   3 +-
 contrib/pgstattuple/pgstatindex.c             | 424 ++++++++++++++++++
 contrib/pgstattuple/pgstattuple--1.5--1.6.sql |  30 ++
 contrib/pgstattuple/pgstattuple.control       |   2 +-
 4 files changed, 457 insertions(+), 2 deletions(-)
 create mode 100644 contrib/pgstattuple/pgstattuple--1.5--1.6.sql

diff --git a/contrib/pgstattuple/Makefile b/contrib/pgstattuple/Makefile
index c5b17fc703e..d5c62ba36f9 100644
--- a/contrib/pgstattuple/Makefile
+++ b/contrib/pgstattuple/Makefile
@@ -10,7 +10,8 @@ OBJS = \
 EXTENSION = pgstattuple
 DATA = pgstattuple--1.4.sql pgstattuple--1.4--1.5.sql \
 	pgstattuple--1.3--1.4.sql pgstattuple--1.2--1.3.sql \
-	pgstattuple--1.1--1.2.sql pgstattuple--1.0--1.1.sql
+	pgstattuple--1.1--1.2.sql pgstattuple--1.0--1.1.sql \
+	pgstattuple--1.5--1.6.sql
 PGFILEDESC = "pgstattuple - tuple-level statistics"
 
 REGRESS = pgstattuple
diff --git a/contrib/pgstattuple/pgstatindex.c b/contrib/pgstattuple/pgstatindex.c
index 4b9d76ec4e4..aa4431075b0 100644
--- a/contrib/pgstattuple/pgstatindex.c
+++ b/contrib/pgstattuple/pgstatindex.c
@@ -62,6 +62,9 @@ PG_FUNCTION_INFO_V1(pg_relpages_v1_5);
 PG_FUNCTION_INFO_V1(pg_relpagesbyid_v1_5);
 PG_FUNCTION_INFO_V1(pgstatginindex_v1_5);
 
+PG_FUNCTION_INFO_V1(pgstatindex_nheap_v1_6);
+PG_FUNCTION_INFO_V1(pgstatindex_runs_v1_6);
+
 Datum		pgstatginindex_internal(Oid relid, FunctionCallInfo fcinfo);
 
 #define IS_INDEX(r) ((r)->rd_rel->relkind == RELKIND_INDEX)
@@ -128,6 +131,9 @@ static Datum pgstatindex_impl(Relation rel, FunctionCallInfo fcinfo);
 static int64 pg_relpages_impl(Relation rel);
 static void GetHashPageStats(Page page, HashIndexStat *stats);
 
+static Datum pgstatindex_nheap_impl(Relation rel, int64 range_len, FunctionCallInfo fcinfo);
+static Datum pgstatindex_runs_impl(Relation rel, int64 range_len, FunctionCallInfo fcinfo);
+
 /* ------------------------------------------------------
  * pgstatindex()
  *
@@ -756,3 +762,421 @@ GetHashPageStats(Page page, HashIndexStat *stats)
 	}
 	stats->free_space += PageGetExactFreeSpace(page);
 }
+
+/*
+ * pgstatindex_nheap()
+ *
+ * For each range of "blocks" consecutive index pages, report the number of
+ * leaf pages, heap TIDs, distinct heap blocks and runs of TIDs pointing at
+ * the same heap block.
+ */
+Datum
+pgstatindex_nheap_v1_6(PG_FUNCTION_ARGS)
+{
+	text	   *relname = PG_GETARG_TEXT_PP(0);
+	int64		range_len = PG_GETARG_INT64(1);
+	Relation	rel;
+	RangeVar   *relrv;
+
+	relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname));
+	rel = relation_openrv(relrv, AccessShareLock);
+
+	PG_RETURN_DATUM(pgstatindex_nheap_impl(rel, range_len, fcinfo));
+}
+
+/*
+ * pgstatindex_runs()
+ *
+ * For each range of "blocks" consecutive index pages, report a histogram of
+ * the lengths of runs of TIDs pointing at the same heap block.
+ */
+Datum
+pgstatindex_runs_v1_6(PG_FUNCTION_ARGS)
+{
+	text	   *relname = PG_GETARG_TEXT_PP(0);
+	int64		range_len = PG_GETARG_INT64(1);
+	Relation	rel;
+	RangeVar   *relrv;
+
+	relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname));
+	rel = relation_openrv(relrv, AccessShareLock);
+
+	PG_RETURN_DATUM(pgstatindex_runs_impl(rel, range_len, fcinfo));
+}
+
+/*
+ * Count runs of consecutive TIDs pointing at the same heap block, in index
+ * order.  Returns 0 for an empty array.
+ */
+static int
+count_block_runs(int range_nblocks, BlockNumber *range_blocks)
+{
+	int			nruns;
+
+	if (range_nblocks <= 0)
+		return 0;
+
+	nruns = 1;
+	for (int i = 1; i < range_nblocks; i++)
+	{
+		if (range_blocks[i] != range_blocks[i - 1])
+			nruns++;
+	}
+
+	return nruns;
+}
+
+/*
+ * qsort comparator ordering BlockNumbers by value.
+ */
+static int
+blocknum_cmp(const void *a, const void *b)
+{
+	BlockNumber ba = *(const BlockNumber *) a;
+	BlockNumber bb = *(const BlockNumber *) b;
+
+	return (ba > bb) - (ba < bb);
+}
+
+/*
+ * Count distinct heap blocks.  This sorts range_blocks in place, so anything
+ * depending on the original (index) order has to be computed beforehand.
+ */
+static int
+count_blocks_distinct(int range_nblocks, BlockNumber *range_blocks)
+{
+	pg_qsort(range_blocks, range_nblocks, sizeof(BlockNumber), blocknum_cmp);
+
+	return count_block_runs(range_nblocks, range_blocks);
+}
+
+/*
+ * Build a histogram of run lengths: lengths[n] is incremented for each run of
+ * exactly n consecutive TIDs pointing at the same heap block.  The caller
+ * must supply a zeroed array with at least range_nblocks + 1 entries.
+ */
+static void
+count_run_lengths(int range_nblocks, BlockNumber *range_blocks, int *lengths)
+{
+	int			len = 1;
+
+	if (range_nblocks <= 0)
+		return;
+
+	/* the first TID opens a run of length 1, so start with the second one */
+	for (int i = 1; i < range_nblocks; i++)
+	{
+		if (range_blocks[i] == range_blocks[i - 1])
+			len++;
+		else
+		{
+			lengths[len]++;
+			len = 1;
+		}
+	}
+
+	/* account for the final run */
+	lengths[len]++;
+}
+
+/*
+ * Common sanity checks for pgstatindex_nheap() and pgstatindex_runs().
+ */
+static void
+check_btree_leaf_scan_args(Relation rel, int64 range_len)
+{
+	if (!IS_INDEX(rel) || !IS_BTREE(rel))
+		ereport(ERROR,
+				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
+				 errmsg("relation \"%s\" is not a btree index",
+						RelationGetRelationName(rel))));
+
+	/*
+	 * Reject attempts to read non-local temporary relations; we would be
+	 * likely to get wrong data since we have no visibility into the owning
+	 * session's local buffers.
+	 */
+	if (RELATION_IS_OTHER_TEMP(rel))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("cannot access temporary tables of other sessions")));
+
+	/*
+	 * A !indisready index could lead to ERRCODE_DATA_CORRUPTED later, so exit
+	 * early.  We're capable of assessing an indisready&&!indisvalid index,
+	 * but the results could be confusing.  For example, the index's size
+	 * could be too low for a valid index of the table.
+	 */
+	if (!rel->rd_index->indisvalid)
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("index \"%s\" is not valid",
+						RelationGetRelationName(rel))));
+
+	/* ranges have to span at least one page */
+	if (range_len <= 0)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("range length must be greater than zero")));
+}
+
+/*
+ * Read one index page and, if it is a live leaf page, append the heap block
+ * number of every TID on it to blocks[], advancing *nblocks.  Posting list
+ * tuples contribute one entry per TID.  "capacity" is the allocated size of
+ * blocks[].
+ *
+ * Returns true if the page was a leaf page that got scanned.
+ */
+static bool
+collect_leaf_page_blocks(Relation rel, BlockNumber blkno,
+						 BufferAccessStrategy bstrategy,
+						 BlockNumber *blocks, int *nblocks, int64 capacity)
+{
+	Buffer		buffer;
+	Page		page;
+	BTPageOpaque opaque;
+	bool		isleaf;
+
+	/* Read and lock buffer */
+	buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy);
+	LockBuffer(buffer, BUFFER_LOCK_SHARE);
+
+	page = BufferGetPage(buffer);
+	opaque = BTPageGetOpaque(page);
+
+	/* Ignore deleted/dead pages, and internal (non-leaf) pages. */
+	isleaf = (!P_ISDELETED(opaque) && !P_IGNORE(opaque) && P_ISLEAF(opaque));
+
+	if (isleaf)
+	{
+		OffsetNumber maxoffset = PageGetMaxOffsetNumber(page);
+
+		/* P_FIRSTDATAKEY skips the high key, the only pivot on a leaf page */
+		for (OffsetNumber offset = P_FIRSTDATAKEY(opaque);
+			 offset <= maxoffset;
+			 offset = OffsetNumberNext(offset))
+		{
+			ItemId		id = PageGetItemId(page, offset);
+			IndexTuple	itup = (IndexTuple) PageGetItem(page, id);
+
+			if (BTreeTupleIsPosting(itup))
+			{
+				int			nposting = BTreeTupleGetNPosting(itup);
+
+				Assert(*nblocks + nposting <= capacity);
+
+				for (int i = 0; i < nposting; i++)
+				{
+					ItemPointer tid = BTreeTupleGetPostingN(itup, i);
+
+					blocks[(*nblocks)++] = ItemPointerGetBlockNumber(tid);
+				}
+			}
+			else
+			{
+				Assert(*nblocks < capacity);
+				blocks[(*nblocks)++] = ItemPointerGetBlockNumber(&itup->t_tid);
+			}
+		}
+	}
+
+	/* Unlock and release buffer */
+	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+	ReleaseBuffer(buffer);
+
+	return isleaf;
+}
+
+/*
+ * Add a pgstatindex_nheap() result row for a range with at least one TID.
+ *
+ * Counting distinct blocks sorts blocks[] in place, so the runs (which
+ * depend on index order) are counted first.
+ */
+static void
+emit_nheap_range(ReturnSetInfo *rsinfo, BlockNumber range_start,
+				 int nleafs, int nblocks, BlockNumber *blocks)
+{
+	Datum		values[5];
+	bool		nulls[5] = {0};
+	int			nruns;
+	int			ndistinct;
+
+	nruns = count_block_runs(nblocks, blocks);
+	ndistinct = count_blocks_distinct(nblocks, blocks);
+
+	values[0] = Int64GetDatum(range_start);
+	values[1] = Int64GetDatum(nleafs);
+	values[2] = Int64GetDatum(nblocks);
+	values[3] = Int64GetDatum(ndistinct);
+	values[4] = Int64GetDatum(nruns);
+
+	tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
+}
+
+/*
+ * Workhorse for pgstatindex_nheap().
+ */
+static Datum
+pgstatindex_nheap_impl(Relation rel, int64 range_len, FunctionCallInfo fcinfo)
+{
+	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+	BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD);
+	BlockNumber nblocks;
+	BlockNumber range_start = 0;
+	BlockNumber *range_blocks;
+	int64		capacity;
+	int			range_nblocks = 0;
+	int			range_leafs = 0;
+
+	check_btree_leaf_scan_args(rel, range_len);
+
+	InitMaterializedSRF(fcinfo, 0);
+
+	nblocks = RelationGetNumberOfBlocks(rel);
+
+	/*
+	 * A range longer than the index is equivalent to one spanning the whole
+	 * index, so clamp it; that keeps the allocation below proportional to the
+	 * index size and the multiplication free of overflow.
+	 */
+	range_len = Min(range_len, (int64) Max(nblocks, 1));
+
+	/* room for the maximum number of TIDs we might see on range_len pages */
+	capacity = range_len * MaxTIDsPerBTreePage;
+	range_blocks = palloc_array(BlockNumber, capacity);
+
+	/* Scan all blocks except the metapage */
+	for (BlockNumber blkno = 1; blkno < nblocks; blkno++)
+	{
+		CHECK_FOR_INTERRUPTS();
+
+		/*
+		 * If we started a new range, add a row for the previous one (if it
+		 * had any TIDs) and reset the per-range state.
+		 */
+		if (range_start + range_len <= blkno)
+		{
+			if (range_nblocks > 0)
+				emit_nheap_range(rsinfo, range_start, range_leafs,
+								 range_nblocks, range_blocks);
+
+			range_nblocks = 0;
+			range_leafs = 0;
+			range_start += range_len;
+		}
+
+		if (collect_leaf_page_blocks(rel, blkno, bstrategy, range_blocks,
+									 &range_nblocks, capacity))
+			range_leafs++;
+	}
+
+	/* output the last range */
+	if (range_nblocks > 0)
+		emit_nheap_range(rsinfo, range_start, range_leafs,
+						 range_nblocks, range_blocks);
+
+	pfree(range_blocks);
+	relation_close(rel, AccessShareLock);
+
+	return (Datum) 0;
+}
+
+/*
+ * Add pgstatindex_runs() result rows for a range with at least one TID: one
+ * row per run length that occurs in the range.
+ */
+static void
+emit_runs_range(ReturnSetInfo *rsinfo, BlockNumber range_start,
+				int nblocks, BlockNumber *blocks, int *run_lengths)
+{
+	Datum		values[3];
+	bool		nulls[3] = {0};
+
+	/* no run can be longer than the number of TIDs in the range */
+	memset(run_lengths, 0, sizeof(int) * (nblocks + 1));
+	count_run_lengths(nblocks, blocks, run_lengths);
+
+	for (int len = 1; len <= nblocks; len++)
+	{
+		if (run_lengths[len] == 0)
+			continue;
+
+		values[0] = Int64GetDatum(range_start);
+		values[1] = Int32GetDatum(len);
+		values[2] = Int32GetDatum(run_lengths[len]);
+
+		tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
+	}
+}
+
+/*
+ * Workhorse for pgstatindex_runs().
+ */
+static Datum
+pgstatindex_runs_impl(Relation rel, int64 range_len, FunctionCallInfo fcinfo)
+{
+	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+	BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD);
+	BlockNumber nblocks;
+	BlockNumber range_start = 0;
+	BlockNumber *range_blocks;
+	int		   *run_lengths;
+	int64		capacity;
+	int			range_nblocks = 0;
+
+	check_btree_leaf_scan_args(rel, range_len);
+
+	InitMaterializedSRF(fcinfo, 0);
+
+	nblocks = RelationGetNumberOfBlocks(rel);
+
+	/*
+	 * A range longer than the index is equivalent to one spanning the whole
+	 * index, so clamp it; that keeps the allocations below proportional to
+	 * the index size and the multiplication free of overflow.
+	 */
+	range_len = Min(range_len, (int64) Max(nblocks, 1));
+
+	/* room for the maximum number of TIDs we might see on range_len pages */
+	capacity = range_len * MaxTIDsPerBTreePage;
+	range_blocks = palloc_array(BlockNumber, capacity);
+
+	/* lengths of runs (the range could be one long run) */
+	run_lengths = palloc_array(int, capacity + 1);
+
+	/* Scan all blocks except the metapage */
+	for (BlockNumber blkno = 1; blkno < nblocks; blkno++)
+	{
+		CHECK_FOR_INTERRUPTS();
+
+		/*
+		 * If we started a new range, add rows for the previous one (if it
+		 * had any TIDs) and reset the per-range state.
+		 */
+		if (range_start + range_len <= blkno)
+		{
+			if (range_nblocks > 0)
+				emit_runs_range(rsinfo, range_start, range_nblocks,
+								range_blocks, run_lengths);
+
+			range_nblocks = 0;
+			range_start += range_len;
+		}
+
+		(void) collect_leaf_page_blocks(rel, blkno, bstrategy, range_blocks,
+										&range_nblocks, capacity);
+	}
+
+	/* output the last range */
+	if (range_nblocks > 0)
+		emit_runs_range(rsinfo, range_start, range_nblocks,
+						range_blocks, run_lengths);
+
+	pfree(run_lengths);
+	pfree(range_blocks);
+	relation_close(rel, AccessShareLock);
+
+	return (Datum) 0;
+}
diff --git a/contrib/pgstattuple/pgstattuple--1.5--1.6.sql b/contrib/pgstattuple/pgstattuple--1.5--1.6.sql
new file mode 100644
index 00000000000..c6a450a301b
--- /dev/null
+++ b/contrib/pgstattuple/pgstattuple--1.5--1.6.sql
@@ -0,0 +1,30 @@
+/* contrib/pgstattuple/pgstattuple--1.5--1.6.sql */
+
+-- complain if script is sourced in psql, rather than via ALTER EXTENSION
+\echo Use "ALTER EXTENSION pgstattuple UPDATE TO '1.6'" to load this file. \quit
+
+CREATE OR REPLACE FUNCTION pgstatindex_nheap(IN relname text,
+    IN blocks BIGINT,          -- length of ranges to analyze (index pages)
+    OUT block BIGINT,          -- first index block of a range
+    OUT num_leafs BIGINT,      -- number of leaf pages in the range
+    OUT num_items BIGINT,      -- number of heap TIDs
+    OUT num_blocks BIGINT,     -- number of distinct heap blocks
+    OUT num_runs BIGINT)       -- number of runs of TIDs on the same heap block
+RETURNS SETOF record
+AS 'MODULE_PATHNAME', 'pgstatindex_nheap_v1_6'
+LANGUAGE C STRICT PARALLEL SAFE;
+
+REVOKE EXECUTE ON FUNCTION pgstatindex_nheap(text, bigint) FROM PUBLIC;
+GRANT EXECUTE ON FUNCTION pgstatindex_nheap(text, bigint) TO pg_stat_scan_tables;
+
+CREATE OR REPLACE FUNCTION pgstatindex_runs(IN relname text,
+    IN blocks BIGINT,          -- length of ranges to analyze (index pages)
+    OUT block BIGINT,          -- first index block of a range
+    OUT run_length INT,        -- length of a run (TIDs on the same heap block)
+    OUT run_count INT)         -- number of runs of that length in the range
+RETURNS SETOF record
+AS 'MODULE_PATHNAME', 'pgstatindex_runs_v1_6'
+LANGUAGE C STRICT PARALLEL SAFE;
+
+REVOKE EXECUTE ON FUNCTION pgstatindex_runs(text, bigint) FROM PUBLIC;
+GRANT EXECUTE ON FUNCTION pgstatindex_runs(text, bigint) TO pg_stat_scan_tables;
diff --git a/contrib/pgstattuple/pgstattuple.control b/contrib/pgstattuple/pgstattuple.control
index 6af40757b27..80d06958e90 100644
--- a/contrib/pgstattuple/pgstattuple.control
+++ b/contrib/pgstattuple/pgstattuple.control
@@ -1,5 +1,5 @@
 # pgstattuple extension
 comment = 'show tuple-level statistics'
-default_version = '1.5'
+default_version = '1.6'
 module_pathname = '$libdir/pgstattuple'
 relocatable = true
-- 
2.50.1