BTree vacuum before page splitting - Mailing list pgsql-patches
From | Junji TERAMOTO |
---|---|
Subject | BTree vacuum before page splitting |
Date | |
Msg-id | 43D9F82D.6010004@lab.ntt.co.jp Whole thread Raw |
Responses |
Re: BTree vacuum before page splitting
|
List | pgsql-patches |
Hi all, This patch adds a function to remove unnecessary items before split page of BTree. When a new item is put in the page, it looks for the "LP_DELETE" item, and removes that item. No heap access is required. Moreover, the penalty is also few because it operates instead of page split. Only when the super-exclusive lock was able to obtain, it removes items because the deletion of the item might disturb the simultaneous execution of the index scanning. It is an optimistic control. Because own scan might be lost even if the super-exclusive lock can obtain, we also modify _bt_restscan(). The growth of the index frequently updated by using this patch can be suppressed. Your comments are welcome. Thanks. ---- For example(pgbench): transaction type: TPC-B (sort of) scaling factor: 100 number of clients: 50 number of transactions per client: 5000 number of transactions actually processed: 250000/250000 [before] Relation Name File Name Tuples Pages File Size public.accounts_pkey 16568 10001200 21899 175192 public.branches_pkey 16564 100 410 3280 public.tellers_pkey 16566 1000 374 2992 [after] Relation Name File Name Tuples Pages File Size public.accounts_pkey 16688 10004100 21899 175192 public.branches_pkey 16684 100 44 352 <== public.tellers_pkey 16686 1000 26 208 <== ---- We also test this patch on DBT-2. -- Junji Teramoto / teramoto.junji (a) lab.ntt.co.jp NTT Cyber Space Lab. diff -cpr pgsql-orig/src/backend/access/nbtree/nbtinsert.c pgsql/src/backend/access/nbtree/nbtinsert.c *** pgsql-orig/src/backend/access/nbtree/nbtinsert.c 2006-01-24 21:39:25.000000000 +0900 --- pgsql/src/backend/access/nbtree/nbtinsert.c 2006-01-25 05:07:56.000000000 +0900 *************** static void _bt_pgaddtup(Relation rel, P *** 62,67 **** --- 62,68 ---- OffsetNumber itup_off, const char *where); static bool _bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum, int keysz, ScanKey scankey); + static int _bt_vacuum_page(Relation rel, Buffer buf); /* *************** _bt_check_unique(Relation rel, BTItem bt *** 260,265 **** --- 261,267 ---- hbuffer) == HEAPTUPLE_DEAD) { curitemid->lp_flags |= LP_DELETE; + opaque->btpo_flags |= BTP_HAS_DEAD; if (nbuf != InvalidBuffer) SetBufferCommitInfoNeedsSave(nbuf); else *************** _bt_insertonpg(Relation rel, *** 424,433 **** */ bool movedright = false; ! while (PageGetFreeSpace(page) < itemsz && ! !P_RIGHTMOST(lpageop) && ! _bt_compare(rel, keysz, scankey, page, P_HIKEY) == 0 && ! random() > (MAX_RANDOM_VALUE / 100)) { /* * step right to next non-dead page --- 426,432 ---- */ bool movedright = false; ! while (PageGetFreeSpace(page) < itemsz) { /* * step right to next non-dead page *************** _bt_insertonpg(Relation rel, *** 440,445 **** --- 439,457 ---- */ Buffer rbuf = InvalidBuffer; + if (P_ISLEAF(lpageop) && P_HAS_DEAD(lpageop) && + !rel->rd_istemp && BufferSuperLockHeldByMe(buf)) + { + _bt_vacuum_page(rel, buf); + if (PageGetFreeSpace(page) >= itemsz) + break; + } + + if (P_RIGHTMOST(lpageop) || + _bt_compare(rel, keysz, scankey, page, P_HIKEY) != 0 || + random() <= (MAX_RANDOM_VALUE / 100)) + break; + for (;;) { BlockNumber rblkno = lpageop->btpo_next; *************** _bt_isequal(TupleDesc itupdesc, Page pag *** 1590,1592 **** --- 1602,1651 ---- /* if we get here, the keys are equal */ return true; } + + /* + * _bt_vacuum_page + * + * @param buf Must have super-exclusive-lock. + */ + static int + _bt_vacuum_page(Relation rel, Buffer buf) + { + OffsetNumber deletable[MaxOffsetNumber]; + BTPageOpaque opaque; + OffsetNumber offnum, minoff, maxoff; + int ndeletable; + Page page = BufferGetPage(buf); + + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + if (!P_ISLEAF(opaque) || !P_HAS_DEAD(opaque)) + return -1; + + minoff = P_FIRSTDATAKEY(opaque); + maxoff = PageGetMaxOffsetNumber(page); + if (minoff > maxoff) + return -1; + + ndeletable = 0; + + for (offnum = minoff; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemId = PageGetItemId(page, offnum); + if (!ItemIdIsUsed(itemId) || ItemIdDeleted(itemId)) + { + deletable[ndeletable++] = offnum; + } + } + + if (ndeletable > 0) + { + _bt_delitems(rel, buf, deletable, ndeletable); + } + + /* clear the dead tuple hint bit */ + opaque->btpo_flags &= ~BTP_HAS_DEAD; + + return ndeletable; + } diff -cpr pgsql-orig/src/backend/access/nbtree/nbtree.c pgsql/src/backend/access/nbtree/nbtree.c *** pgsql-orig/src/backend/access/nbtree/nbtree.c 2006-01-24 21:39:25.000000000 +0900 --- pgsql/src/backend/access/nbtree/nbtree.c 2006-01-25 05:30:06.000000000 +0900 *************** btgettuple(PG_FUNCTION_ARGS) *** 271,276 **** --- 271,277 ---- offnum = ItemPointerGetOffsetNumber(&(scan->currentItemData)); page = BufferGetPage(so->btso_curbuf); PageGetItemId(page, offnum)->lp_flags |= LP_DELETE; + ((BTPageOpaque) PageGetSpecialPointer(page))->btpo_flags |= BTP_HAS_DEAD; /* * Since this can be redone later if needed, it's treated the same *************** btbulkdelete(PG_FUNCTION_ARGS) *** 632,637 **** --- 633,641 ---- } } + /* clear the dead tuple hint bit */ + opaque->btpo_flags &= ~BTP_HAS_DEAD; + /* * If we need to delete anything, do it and write the buffer; else * just release the buffer. *************** _bt_restscan(IndexScanDesc scan) *** 898,903 **** --- 902,929 ---- return; } + /* Check for item on this page */ + if (offnum <= maxoff) + { + for (; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + item = (BTItem) PageGetItem(page, PageGetItemId(page, offnum)); + if (BTTidSame(item->bti_itup.t_tid, *target)) + { + /* Found it */ + current->ip_posid = offnum; + return; + } + } + + /* Move offsetnumber to first, because we may delete LP_DELETE tuple */ + maxoff = ItemPointerGetOffsetNumber(current) - 1; + } + + offnum = P_FIRSTDATAKEY(opaque); + /* * The item we were on may have moved right due to insertions. Find it * again. We use the heap TID to identify the item uniquely. diff -cpr pgsql-orig/src/backend/storage/buffer/bufmgr.c pgsql/src/backend/storage/buffer/bufmgr.c *** pgsql-orig/src/backend/storage/buffer/bufmgr.c 2006-01-24 21:39:25.000000000 +0900 --- pgsql/src/backend/storage/buffer/bufmgr.c 2006-01-25 05:19:50.000000000 +0900 *************** LockBufferForCleanup(Buffer buffer) *** 1880,1885 **** --- 1880,1904 ---- } /* + * BufferSuperLockHeldByMe + * + * @param buffer Must have BUFFER_LOCK_EXCLUSIVE. + */ + bool + BufferSuperLockHeldByMe(Buffer buffer) + { + volatile BufferDesc *bufHdr; + unsigned refcount; + + bufHdr = &BufferDescriptors[buffer - 1]; + LockBufHdr(bufHdr); + refcount = bufHdr->refcount; + UnlockBufHdr(bufHdr); + + return refcount == 1; + } + + /* * Functions for buffer I/O handling * * Note: We assume that nested buffer I/O never occurs. diff -cpr pgsql-orig/src/include/access/nbtree.h pgsql/src/include/access/nbtree.h *** pgsql-orig/src/include/access/nbtree.h 2006-01-24 21:39:25.000000000 +0900 --- pgsql/src/include/access/nbtree.h 2006-01-25 04:58:05.000000000 +0900 *************** typedef BTPageOpaqueData *BTPageOpaque; *** 55,60 **** --- 55,61 ---- #define BTP_DELETED (1 << 2) /* page has been deleted from tree */ #define BTP_META (1 << 3) /* meta-page */ #define BTP_HALF_DEAD (1 << 4) /* empty, but still in tree */ + #define BTP_HAS_DEAD (1 << 5) /* page has dead tuples */ /* *************** typedef BTItemData *BTItem; *** 155,160 **** --- 156,162 ---- #define P_ISROOT(opaque) ((opaque)->btpo_flags & BTP_ROOT) #define P_ISDELETED(opaque) ((opaque)->btpo_flags & BTP_DELETED) #define P_IGNORE(opaque) ((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD)) + #define P_HAS_DEAD(opaque) ((opaque)->btpo_flags & BTP_HAS_DEAD) /* * Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost diff -cpr pgsql-orig/src/include/storage/bufmgr.h pgsql/src/include/storage/bufmgr.h *** pgsql-orig/src/include/storage/bufmgr.h 2006-01-24 21:39:25.000000000 +0900 --- pgsql/src/include/storage/bufmgr.h 2006-01-25 05:06:04.000000000 +0900 *************** extern void UnlockBuffers(void); *** 149,154 **** --- 149,155 ---- extern void LockBuffer(Buffer buffer, int mode); extern bool ConditionalLockBuffer(Buffer buffer); extern void LockBufferForCleanup(Buffer buffer); + extern bool BufferSuperLockHeldByMe(Buffer buffer); extern void AbortBufferIO(void);
pgsql-patches by date: