From 6eb8314fb3e07595e4016ade40fe8dade7fb9c37 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Fri, 7 Mar 2025 22:55:24 +0100 Subject: [PATCH v9 2/4] GIST: Fix visibility issues in IOS Previously, GIST IOS could buffer tuples from pages while VACUUM came along and cleaned up an ALL_DEAD tuple, marking the tuple's page ALL_VISIBLE again and making IOS mistakenly believe the tuple is indeed visible. With this patch, pins now conflict with GIST vacuum, and we now do preliminary visibility checks to be used by IOS so that the IOS infrastructure knows to recheck the heap page even if that page is now ALL_VISIBLE. Note: For PG17 and below, this needs some adaptations to use e.g. VM_ALL_VISIBLE, and pack its fields in places that work fine on 32-bit systems, too. Idea from Heikki Linnakangas Backpatch: 17- --- src/include/access/gist_private.h | 27 +++++-- src/backend/access/gist/gistget.c | 104 ++++++++++++++++++++++++++- src/backend/access/gist/gistscan.c | 5 ++ src/backend/access/gist/gistvacuum.c | 6 +- 4 files changed, 131 insertions(+), 11 deletions(-) diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h index 39404ec7cdb..4261565b5ad 100644 --- a/src/include/access/gist_private.h +++ b/src/include/access/gist_private.h @@ -22,6 +22,7 @@ #include "storage/buffile.h" #include "utils/hsearch.h" #include "access/genam.h" +#include "tableam.h" /* * Maximum number of "halves" a page can be split into in one operation. @@ -124,6 +125,8 @@ typedef struct GISTSearchHeapItem * index-only scans */ OffsetNumber offnum; /* track offset in page to mark tuple as * LP_DEAD */ + uint8 visrecheck; /* Cached visibility check result for this + * heap pointer. */ } GISTSearchHeapItem; /* Unvisited item, either index page or heap tuple */ @@ -170,12 +173,24 @@ typedef struct GISTScanOpaqueData BlockNumber curBlkno; /* current number of block */ GistNSN curPageLSN; /* pos in the WAL stream when page was read */ - /* In a non-ordered search, returnable heap items are stored here: */ - GISTSearchHeapItem pageData[BLCKSZ / sizeof(IndexTupleData)]; - OffsetNumber nPageData; /* number of valid items in array */ - OffsetNumber curPageData; /* next item to return */ - MemoryContext pageDataCxt; /* context holding the fetched tuples, for - * index-only scans */ + /* info used by Index-Only Scans */ + Buffer vmbuf; /* reusable buffer for IOS' vm lookups */ + + union { + struct { + /* In a non-ordered search, returnable heap items are stored here: */ + GISTSearchHeapItem pageData[BLCKSZ / sizeof(IndexTupleData)]; + OffsetNumber nPageData; /* number of valid items in array */ + OffsetNumber curPageData; /* next item to return */ + MemoryContext pageDataCxt; /* context holding the fetched tuples, + * for index-only scans */ + }; + struct { + /* In an ordered search, we use this as scratch space */ + GISTSearchHeapItem *sortData[BLCKSZ / sizeof(IndexTupleData)]; + OffsetNumber nsortData; /* number of items in sortData */ + }; + }; } GISTScanOpaqueData; typedef GISTScanOpaqueData *GISTScanOpaque; diff --git a/src/backend/access/gist/gistget.c b/src/backend/access/gist/gistget.c index cc40e928e0a..bb4caa6c310 100644 --- a/src/backend/access/gist/gistget.c +++ b/src/backend/access/gist/gistget.c @@ -24,6 +24,7 @@ #include "utils/float.h" #include "utils/memutils.h" #include "utils/rel.h" +#include "access/tableam.h" /* * gistkillitems() -- set LP_DEAD state for items an indexscan caller has @@ -394,7 +395,15 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, return; } - so->nPageData = so->curPageData = 0; + if (scan->numberOfOrderBys) + { + so->nsortData = 0; + } + else + { + so->nPageData = so->curPageData = 0; + } + scan->xs_hitup = NULL; /* might point into pageDataCxt */ if (so->pageDataCxt) MemoryContextReset(so->pageDataCxt); @@ -501,7 +510,11 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, * In an index-only scan, also fetch the data from the tuple. */ if (scan->xs_want_itup) + { item->data.heap.recontup = gistFetchTuple(giststate, r, it); + so->sortData[so->nsortData] = &item->data.heap; + so->nsortData += 1; + } } else { @@ -526,7 +539,88 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, } } - UnlockReleaseBuffer(buffer); + /* Allow writes to the buffer */ + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + + if (scan->xs_want_itup) + { + TM_IndexVisibilityCheckOp op; + op.vmbuf = &so->vmbuf; + + if (scan->numberOfOrderBys > 0) + { + op.nchecktids = so->nsortData; + + if (op.nchecktids > 0) + { + op.checktids = palloc(op.nchecktids * sizeof(TM_VisCheck)); + + for (int off = 0; off < op.nchecktids; off++) + { + op.checktids[off].vischeckresult = TMVC_Unchecked; + op.checktids[off].tid = so->sortData[off]->heapPtr; + op.checktids[off].idxoffnum = off; + Assert(ItemPointerIsValid(&op.checktids[off].tid)); + } + } + } + else + { + op.nchecktids = so->nPageData; + + if (op.nchecktids > 0) + { + op.checktids = palloc_array(TM_VisCheck, op.nchecktids); + + for (int off = 0; off < op.nchecktids; off++) + { + op.checktids[off].vischeckresult = TMVC_Unchecked; + op.checktids[off].tid = so->pageData[off].heapPtr; + op.checktids[off].idxoffnum = off; + Assert(ItemPointerIsValid(&op.checktids[off].tid)); + } + } + } + + if (op.nchecktids > 0) + { + table_index_vischeck_tuples(scan->heapRelation, &op); + + if (scan->numberOfOrderBys > 0) + { + for (int off = 0; off < op.nchecktids; off++) + { + TM_VisCheck *check = &op.checktids[off]; + GISTSearchHeapItem *item = so->sortData[check->idxoffnum]; + + Assert(check->idxoffnum < op.nchecktids); + Assert(ItemPointerEquals(&item->heapPtr, &check->tid)); + + item->visrecheck = check->vischeckresult; + } + /* reset state */ + so->nsortData = 0; + } + else + { + for (int off = 0; off < op.nchecktids; off++) + { + TM_VisCheck *check = &op.checktids[off]; + GISTSearchHeapItem *item = &so->pageData[check->idxoffnum]; + + Assert(check->idxoffnum < op.nchecktids); + Assert(ItemPointerEquals(&item->heapPtr, &check->tid)); + + item->visrecheck = check->vischeckresult; + } + } + + pfree(op.checktids); + } + } + + /* Allow VACUUM to process the buffer again */ + ReleaseBuffer(buffer); } /* @@ -588,7 +682,10 @@ getNextNearest(IndexScanDesc scan) /* in an index-only scan, also return the reconstructed tuple. */ if (scan->xs_want_itup) + { scan->xs_hitup = item->data.heap.recontup; + scan->xs_visrecheck = item->data.heap.visrecheck; + } res = true; } else @@ -673,7 +770,10 @@ gistgettuple(IndexScanDesc scan, ScanDirection dir) /* in an index-only scan, also return the reconstructed tuple */ if (scan->xs_want_itup) + { scan->xs_hitup = so->pageData[so->curPageData].recontup; + scan->xs_visrecheck = so->pageData[so->curPageData].visrecheck; + } so->curPageData++; diff --git a/src/backend/access/gist/gistscan.c b/src/backend/access/gist/gistscan.c index 700fa959d03..52f5f144ccd 100644 --- a/src/backend/access/gist/gistscan.c +++ b/src/backend/access/gist/gistscan.c @@ -347,6 +347,11 @@ void gistendscan(IndexScanDesc scan) { GISTScanOpaque so = (GISTScanOpaque) scan->opaque; + if (BufferIsValid(so->vmbuf)) + { + ReleaseBuffer(so->vmbuf); + so->vmbuf = InvalidBuffer; + } /* * freeGISTstate is enough to clean up everything made by gistbeginscan, diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c index dd0d9d5006c..5a95b93236e 100644 --- a/src/backend/access/gist/gistvacuum.c +++ b/src/backend/access/gist/gistvacuum.c @@ -289,10 +289,10 @@ restart: info->strategy); /* - * We are not going to stay here for a long time, aggressively grab an - * exclusive lock. + * We are not going to stay here for a long time, aggressively grab a + * cleanup lock. */ - LockBuffer(buffer, GIST_EXCLUSIVE); + LockBufferForCleanup(buffer); page = (Page) BufferGetPage(buffer); if (gistPageRecyclable(page)) -- 2.45.2