From 403ff258a881eda28122a4a3a42a58c9e9a4c9a0 Mon Sep 17 00:00:00 2001 From: Andrey Borodin Date: Fri, 13 Jul 2018 22:12:17 +0400 Subject: [PATCH 1/2] Delete pages during GiST VACUUM v7 --- src/backend/access/gist/README | 35 ++++++++++ src/backend/access/gist/gist.c | 18 ++++++ src/backend/access/gist/gistbuild.c | 5 -- src/backend/access/gist/gistutil.c | 3 +- src/backend/access/gist/gistvacuum.c | 122 +++++++++++++++++++++++++++++++++-- src/backend/access/gist/gistxlog.c | 63 +++++++++++++++++- src/include/access/gist.h | 11 ++++ src/include/access/gist_private.h | 24 +++++-- src/include/access/gistxlog.h | 17 ++++- src/test/regress/expected/gist.out | 4 +- src/test/regress/sql/gist.sql | 4 +- 11 files changed, 280 insertions(+), 26 deletions(-) diff --git a/src/backend/access/gist/README b/src/backend/access/gist/README index 02228662b8..9548872be8 100644 --- a/src/backend/access/gist/README +++ b/src/backend/access/gist/README @@ -413,6 +413,41 @@ emptied yet; tuples never move upwards in the tree. The final emptying loops through buffers at a given level until all buffers at that level have been emptied, and then moves down to the next level. +Bulk delete algorithm (VACUUM) +------------------------------ + +Function gistbulkdelete() is responsible for marking empty leaf pages as free +so that they can be used it allocate newly split pages. To find this pages +function can choose between two strategies: logical scan or physical scan. + +Physical scan reads the entire index from the first page to last. This scan +maintains graph structure in palloc'ed array to collect block numbers of +internal pages that need cleansing from references to empty leafs. Also, the +array contains offsets on the internal page to potentially free leaf page. This +scan method is chosen when maintenance work memory is sufficient to hold +necessary graph structure. + +The logical scan is chosen when there is not enough maintenance memory to +execute the physical scan. Logical scan traverses GiST index in DFS, looking up +into incomplete split branches. The logical scan can be slower on hard disk +drives. + +The result of both scans are the same: the stack of block numbers of internal +pages with the list of offsets potentially referencing empty leaf pages. After +the scan, for each internal pages under exclusive lock, each potentially free +leaf page is examined. gistbulkdelete() never delete last one reference from +internal page to preserve balanced tree properties. + +The physical scan can return empty leaf pages offsets unordered. Thus, before +executing PageIndexMultiDelete offsets (already locked and checked) are sorted. +This step is not necessary for the logical scan. + +Both scans hold only one lock at a time. Physical scan grabs exclusive lock +instantly, while logical scan takes shared lock and then swaps it to exclusive. +This is done because amount of work on internal page done by physical scan is +lower and amount of internal pages is relatively low compared to the amount of +leaf pages. + Authors: Teodor Sigaev diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c index 8a42effdf7..5297e1691d 100644 --- a/src/backend/access/gist/gist.c +++ b/src/backend/access/gist/gist.c @@ -700,6 +700,11 @@ gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate) GISTInsertStack *item; OffsetNumber downlinkoffnum; + /* + * Currently internal pages are not deleted during vacuum, + * so we do not need to check if page is deleted + */ + downlinkoffnum = gistchoose(state.r, stack->page, itup, giststate); iid = PageGetItemId(stack->page, downlinkoffnum); idxtuple = (IndexTuple) PageGetItem(stack->page, iid); @@ -834,6 +839,19 @@ gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate) } } + /* + * Leaf pages can be left deleted but still referenced + * until it's space is reused. Downlink to this page may be already + * removed from the internal page, but this scan can posess it. + */ + if(GistPageIsDeleted(stack->page)) + { + UnlockReleaseBuffer(stack->buffer); + xlocked = false; + state.stack = stack = stack->parent; + continue; + } + /* now state.stack->(page, buffer and blkno) points to leaf page */ gistinserttuple(&state, stack, giststate, itup, diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c index 434f15f014..f26f139a9e 100644 --- a/src/backend/access/gist/gistbuild.c +++ b/src/backend/access/gist/gistbuild.c @@ -1126,11 +1126,6 @@ gistGetMaxLevel(Relation index) * but will be added there the first time we visit them. */ -typedef struct -{ - BlockNumber childblkno; /* hash key */ - BlockNumber parentblkno; -} ParentMapEntry; static void gistInitParentMap(GISTBuildState *buildstate) diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c index 12804c321c..41978bb5e5 100644 --- a/src/backend/access/gist/gistutil.c +++ b/src/backend/access/gist/gistutil.c @@ -23,6 +23,7 @@ #include "storage/lmgr.h" #include "utils/builtins.h" #include "utils/syscache.h" +#include "utils/snapmgr.h" /* @@ -806,7 +807,7 @@ gistNewBuffer(Relation r) gistcheckpage(r, buffer); - if (GistPageIsDeleted(page)) + if (GistPageIsDeleted(page) && TransactionIdPrecedes(GistPageGetDeleteXid(page), RecentGlobalDataXmin)) return buffer; /* OK to use */ LockBuffer(buffer, GIST_UNLOCK); diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c index 5948218c77..a96d91eb1d 100644 --- a/src/backend/access/gist/gistvacuum.c +++ b/src/backend/access/gist/gistvacuum.c @@ -16,10 +16,13 @@ #include "access/genam.h" #include "access/gist_private.h" +#include "access/transam.h" #include "commands/vacuum.h" #include "miscadmin.h" #include "storage/indexfsm.h" #include "storage/lmgr.h" +#include "utils/snapmgr.h" +#include "access/xact.h" /* @@ -125,7 +128,6 @@ pushStackIfSplited(Page page, GistBDItem *stack) } } - /* * Bulk deletion of all index entries pointing to a set of heap tuples and * check invalid tuples left after upgrade. @@ -135,12 +137,14 @@ pushStackIfSplited(Page page, GistBDItem *stack) * Result: a palloc'd struct containing statistical info for VACUUM displays. */ IndexBulkDeleteResult * -gistbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, - IndexBulkDeleteCallback callback, void *callback_state) +gistbulkdelete(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkDeleteCallback callback, void* callback_state) { Relation rel = info->index; GistBDItem *stack, *ptr; + BlockNumber recentParent = InvalidBlockNumber; + List *rescanList = NULL; + ListCell *cell; /* first time through? */ if (stats == NULL) @@ -233,9 +237,16 @@ gistbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, END_CRIT_SECTION(); } + if (ntodelete == maxoff && recentParent!=InvalidBlockNumber && + (rescanList == NULL || (BlockNumber)llast_int(rescanList) != recentParent)) + { + /* This page is a candidate to be deleted. Remember it's parent to rescan it later with xlock */ + rescanList = lappend_int(rescanList, recentParent); + } } else { + recentParent = stack->blkno; /* check for split proceeded after look at parent */ pushStackIfSplited(page, stack); @@ -270,5 +281,108 @@ gistbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, vacuum_delay_point(); } + /* rescan inner pages that had empty child pages */ + foreach(cell,rescanList) + { + Buffer buffer; + Page page; + OffsetNumber i, + maxoff; + IndexTuple idxtuple; + ItemId iid; + OffsetNumber todelete[MaxOffsetNumber]; + Buffer buftodelete[MaxOffsetNumber]; + int ntodelete = 0; + + buffer = ReadBufferExtended(rel, MAIN_FORKNUM, (BlockNumber)lfirst_int(cell), + RBM_NORMAL, info->strategy); + LockBuffer(buffer, GIST_EXCLUSIVE); + gistcheckpage(rel, buffer); + page = (Page) BufferGetPage(buffer); + + Assert(!GistPageIsLeaf(page)); + + maxoff = PageGetMaxOffsetNumber(page); + + for (i = OffsetNumberNext(FirstOffsetNumber); i <= maxoff; i = OffsetNumberNext(i)) + { + Buffer leafBuffer; + Page leafPage; + + iid = PageGetItemId(page, i); + idxtuple = (IndexTuple) PageGetItem(page, iid); + + leafBuffer = ReadBufferExtended(rel, MAIN_FORKNUM, ItemPointerGetBlockNumber(&(idxtuple->t_tid)), + RBM_NORMAL, info->strategy); + LockBuffer(leafBuffer, GIST_EXCLUSIVE); + gistcheckpage(rel, leafBuffer); + leafPage = (Page) BufferGetPage(leafBuffer); + Assert(GistPageIsLeaf(leafPage)); + + if (PageGetMaxOffsetNumber(leafPage) == InvalidOffsetNumber /* Nothing left to split */ + && !(GistFollowRight(leafPage) || GistPageGetNSN(page) < GistPageGetNSN(leafPage)) /* No follow-right */ + && ntodelete < maxoff-1) /* We must keep at least one leaf page per each */ + { + buftodelete[ntodelete] = leafBuffer; + todelete[ntodelete++] = i; + } + else + UnlockReleaseBuffer(leafBuffer); + } + + + if (ntodelete) + { + /* + * Like in _bt_unlink_halfdead_page we need a upper bound on xid + * that could hold downlinks to this page. We use + * ReadNewTransactionId() to instead of GetCurrentTransactionId + * since we are in a VACUUM. + */ + TransactionId txid = ReadNewTransactionId(); + + START_CRIT_SECTION(); + + /* Mark pages as deleted dropping references from internal pages */ + for (i = 0; i < ntodelete; i++) + { + Page leafPage = (Page)BufferGetPage(buftodelete[i]); + + GistPageSetDeleteXid(leafPage,txid); + + GistPageSetDeleted(leafPage); + MarkBufferDirty(buftodelete[i]); + stats->pages_deleted++; + + MarkBufferDirty(buffer); + /* Offsets are changed as long as we delete tuples from internal page */ + PageIndexTupleDelete(page, todelete[i] - i); + + if (RelationNeedsWAL(rel)) + { + XLogRecPtr recptr = + gistXLogSetDeleted(rel->rd_node, buftodelete[i], + GistPageGetDeleteXid(leafPage), buffer, todelete[i] - i); + PageSetLSN(page, recptr); + PageSetLSN(leafPage, recptr); + } + else + { + PageSetLSN(page, gistGetFakeLSN(rel)); + PageSetLSN(leafPage, gistGetFakeLSN(rel)); + } + + UnlockReleaseBuffer(buftodelete[i]); + } + END_CRIT_SECTION(); + } + + UnlockReleaseBuffer(buffer); + + vacuum_delay_point(); + } + + list_free(rescanList); + return stats; -} +} \ No newline at end of file diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c index 1e09126978..a48a8341c0 100644 --- a/src/backend/access/gist/gistxlog.c +++ b/src/backend/access/gist/gistxlog.c @@ -60,6 +60,39 @@ gistRedoClearFollowRight(XLogReaderState *record, uint8 block_id) UnlockReleaseBuffer(buffer); } +static void +gistRedoPageSetDeleted(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + gistxlogPageDelete *xldata = (gistxlogPageDelete *) XLogRecGetData(record); + Buffer buffer; + Page page; + + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + { + page = (Page) BufferGetPage(buffer); + + GistPageSetDeleteXid(page, xldata->id); + GistPageSetDeleted(page); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + + UnlockReleaseBuffer(buffer); + } + + if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO) + { + page = (Page) BufferGetPage(buffer); + + PageIndexTupleDelete(page, xldata->downlinkOffset); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + + UnlockReleaseBuffer(buffer); + } +} /* * redo any page update (except page split) */ @@ -112,8 +145,8 @@ gistRedoPageUpdateRecord(XLogReaderState *record) data += sizeof(OffsetNumber) * xldata->ntodelete; PageIndexMultiDelete(page, todelete, xldata->ntodelete); - if (GistPageIsLeaf(page)) - GistMarkTuplesDeleted(page); + + GistMarkTuplesDeleted(page); } /* Add new tuples if any */ @@ -324,6 +357,9 @@ gist_redo(XLogReaderState *record) case XLOG_GIST_CREATE_INDEX: gistRedoCreateIndex(record); break; + case XLOG_GIST_PAGE_DELETE: + gistRedoPageSetDeleted(record); + break; default: elog(PANIC, "gist_redo: unknown op code %u", info); } @@ -442,6 +478,29 @@ gistXLogSplit(bool page_is_leaf, return recptr; } +/* + * Write XLOG record describing a page delete. This also includes removal of + * downlink from internal page. + */ +XLogRecPtr +gistXLogSetDeleted(RelFileNode node, Buffer buffer, TransactionId xid, + Buffer internalPageBuffer, OffsetNumber internalPageOffset) { + gistxlogPageDelete xlrec; + XLogRecPtr recptr; + + xlrec.id = xid; + xlrec.downlinkOffset = internalPageBuffer; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(gistxlogPageDelete)); + + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); + XLogRegisterBuffer(1, internalPageBuffer, REGBUF_STANDARD); + /* new tuples */ + recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_DELETE); + return recptr; +} + /* * Write XLOG record describing a page update. The update can include any * number of deletions and/or insertions of tuples on a single index page. diff --git a/src/include/access/gist.h b/src/include/access/gist.h index 827566dc6e..d2b1edfc81 100644 --- a/src/include/access/gist.h +++ b/src/include/access/gist.h @@ -63,7 +63,14 @@ typedef struct GISTPageOpaqueData uint16 gist_page_id; /* for identification of GiST indexes */ } GISTPageOpaqueData; +typedef struct GISTDeletedPageOpaqueData +{ + PageGistNSN nsn; /* this value must change on page split */ + TransactionId deleteXid; /* xid of transaction after deleting VACUUM */ +} GISTDeletedPageOpaqueData; + typedef GISTPageOpaqueData *GISTPageOpaque; +typedef GISTDeletedPageOpaqueData *GISTDeletedPageOpaque; /* * The page ID is for the convenience of pg_filedump and similar utilities, @@ -151,6 +158,10 @@ typedef struct GISTENTRY #define GistPageGetNSN(page) ( PageXLogRecPtrGet(GistPageGetOpaque(page)->nsn)) #define GistPageSetNSN(page, val) ( PageXLogRecPtrSet(GistPageGetOpaque(page)->nsn, val)) +#define GistPageGetDeleteOpaque(page) ( (GISTDeletedPageOpaque) PageGetSpecialPointer(page) ) +#define GistPageGetDeleteXid(page) ( GistPageGetDeleteOpaque(page)->deleteXid) +#define GistPageSetDeleteXid(page, val) ( GistPageGetDeleteOpaque(page)->deleteXid = val) + /* * Vector of GISTENTRY structs; user-defined methods union and picksplit * take it as one of their arguments diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h index 36ed7244ba..1f82695b1d 100644 --- a/src/include/access/gist_private.h +++ b/src/include/access/gist_private.h @@ -16,6 +16,7 @@ #include "access/amapi.h" #include "access/gist.h" +#include "access/gistxlog.h" #include "access/itup.h" #include "fmgr.h" #include "lib/pairingheap.h" @@ -51,6 +52,11 @@ typedef struct char tupledata[FLEXIBLE_ARRAY_MEMBER]; } GISTNodeBufferPage; +typedef struct +{ + BlockNumber childblkno; /* hash key */ + BlockNumber parentblkno; +} ParentMapEntry; #define BUFFER_PAGE_DATA_OFFSET MAXALIGN(offsetof(GISTNodeBufferPage, tupledata)) /* Returns free space in node buffer page */ #define PAGE_FREE_SPACE(nbp) (nbp->freespace) @@ -176,13 +182,6 @@ typedef struct GISTScanOpaqueData typedef GISTScanOpaqueData *GISTScanOpaque; -/* despite the name, gistxlogPage is not part of any xlog record */ -typedef struct gistxlogPage -{ - BlockNumber blkno; - int num; /* number of index tuples following */ -} gistxlogPage; - /* SplitedPageLayout - gistSplit function result */ typedef struct SplitedPageLayout { @@ -409,6 +408,17 @@ extern bool gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate, extern SplitedPageLayout *gistSplit(Relation r, Page page, IndexTuple *itup, int len, GISTSTATE *giststate); +/* gistxlog.c */ +extern void gist_redo(XLogReaderState *record); +extern void gist_desc(StringInfo buf, XLogReaderState *record); +extern const char *gist_identify(uint8 info); +extern void gist_xlog_startup(void); +extern void gist_xlog_cleanup(void); + +extern XLogRecPtr gistXLogSetDeleted(RelFileNode node, Buffer buffer, + TransactionId xid, Buffer internalPageBuffer, + OffsetNumber internalPageOffset); + extern XLogRecPtr gistXLogUpdate(Buffer buffer, OffsetNumber *todelete, int ntodelete, IndexTuple *itup, int ntup, diff --git a/src/include/access/gistxlog.h b/src/include/access/gistxlog.h index 1a2b9496d0..f93024ab25 100644 --- a/src/include/access/gistxlog.h +++ b/src/include/access/gistxlog.h @@ -17,12 +17,14 @@ #include "access/xlogreader.h" #include "lib/stringinfo.h" +/* XLog stuff */ + #define XLOG_GIST_PAGE_UPDATE 0x00 /* #define XLOG_GIST_NEW_ROOT 0x20 */ /* not used anymore */ #define XLOG_GIST_PAGE_SPLIT 0x30 /* #define XLOG_GIST_INSERT_COMPLETE 0x40 */ /* not used anymore */ #define XLOG_GIST_CREATE_INDEX 0x50 - /* #define XLOG_GIST_PAGE_DELETE 0x60 */ /* not used anymore */ +#define XLOG_GIST_PAGE_DELETE 0x60 /* * Backup Blk 0: updated page. @@ -59,6 +61,19 @@ typedef struct gistxlogPageSplit */ } gistxlogPageSplit; +typedef struct gistxlogPageDelete +{ + TransactionId id; + OffsetNumber downlinkOffset; /* Offset of the downlink referencing this page */ +} gistxlogPageDelete; + +/* despite the name, gistxlogPage is not part of any xlog record */ +typedef struct gistxlogPage +{ + BlockNumber blkno; + int num; /* number of index tuples following */ +} gistxlogPage; + extern void gist_redo(XLogReaderState *record); extern void gist_desc(StringInfo buf, XLogReaderState *record); extern const char *gist_identify(uint8 info); diff --git a/src/test/regress/expected/gist.out b/src/test/regress/expected/gist.out index f5a2993aaf..5b92f08c74 100644 --- a/src/test/regress/expected/gist.out +++ b/src/test/regress/expected/gist.out @@ -27,9 +27,7 @@ insert into gist_point_tbl (id, p) select g+100000, point(g*10+1, g*10+1) from generate_series(1, 10000) g; -- To test vacuum, delete some entries from all over the index. delete from gist_point_tbl where id % 2 = 1; --- And also delete some concentration of values. (GiST doesn't currently --- attempt to delete pages even when they become empty, but if it did, this --- would exercise it) +-- And also delete some concentration of values. delete from gist_point_tbl where id < 10000; vacuum analyze gist_point_tbl; -- rebuild the index with a different fillfactor diff --git a/src/test/regress/sql/gist.sql b/src/test/regress/sql/gist.sql index bae722fe13..e66396e851 100644 --- a/src/test/regress/sql/gist.sql +++ b/src/test/regress/sql/gist.sql @@ -28,9 +28,7 @@ select g+100000, point(g*10+1, g*10+1) from generate_series(1, 10000) g; -- To test vacuum, delete some entries from all over the index. delete from gist_point_tbl where id % 2 = 1; --- And also delete some concentration of values. (GiST doesn't currently --- attempt to delete pages even when they become empty, but if it did, this --- would exercise it) +-- And also delete some concentration of values. delete from gist_point_tbl where id < 10000; vacuum analyze gist_point_tbl; -- 2.15.2 (Apple Git-101.1)