diff --git a/src/backend/access/heap/hio.c b/src/backend/access/heap/hio.c index 8140418..9ac1eb7 100644 --- a/src/backend/access/heap/hio.c +++ b/src/backend/access/heap/hio.c @@ -169,6 +169,62 @@ GetVisibilityMapPins(Relation relation, Buffer buffer1, Buffer buffer2, } /* + * Extend a relation by multiple blocks to avoid future contention on the + * relation extension lock. Our goal is to pre-extend the relation by an + * amount which ramps up as the degree of contention ramps up, but limiting + * the result to some sane overall value. + */ +static void +RelationAddExtraBlocks(Relation relation, BulkInsertState bistate) +{ + Page page; + BlockNumber blockNum, + firstBlock, + lastBlock; + int extraBlocks = 0; + int lockWaiters = 0; + Buffer buffer; + + /* + * We use the length of the lock wait queue to judge how much to extend. + * It might seem like multiplying the number of lock waiters by as much + * as 20 is too aggressive, but benchmarking revealed that smaller numbers + * were insufficient. 512 is just an arbitrary cap to prevent pathological + * results (and excessive wasted disk space). + */ + lockWaiters = RelationExtensionLockWaiterCount(relation); + extraBlocks = Min(512, lockWaiters * 20); + + firstBlock = lastBlock = InvalidBlockNumber; + + while (extraBlocks-- >= 0) + { + /* Ouch - an unnecessary lseek() each time through the loop! */ + buffer = ReadBufferBI(relation, P_NEW, bistate); + + /* Extend by one page. */ + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + page = BufferGetPage(buffer); + PageInit(page, BufferGetPageSize(buffer), 0); + MarkBufferDirty(buffer); + blockNum = BufferGetBlockNumber(buffer); + UnlockReleaseBuffer(buffer); + + if (firstBlock == InvalidBlockNumber) + firstBlock = blockNum; + + lastBlock = blockNum; + } + + /* + * Put the page in the freespace map so other backends can find it. + * This is what will keep those other backends from also queueing up + * on the relation extension lock. + */ + FreeSpaceMapBulkExtend(relation, firstBlock, lastBlock); +} + +/* * RelationGetBufferForTuple * * Returns pinned and exclusive-locked buffer of a page in given relation @@ -233,8 +289,8 @@ RelationGetBufferForTuple(Relation relation, Size len, bool use_fsm = !(options & HEAP_INSERT_SKIP_FSM); Buffer buffer = InvalidBuffer; Page page; - Size pageFreeSpace, - saveFreeSpace; + Size pageFreeSpace = 0, + saveFreeSpace = 0; BlockNumber targetBlock, otherBlock; bool needLock; @@ -308,6 +364,7 @@ RelationGetBufferForTuple(Relation relation, Size len, } } +loop: while (targetBlock != InvalidBlockNumber) { /* @@ -440,10 +497,46 @@ RelationGetBufferForTuple(Relation relation, Size len, */ needLock = !RELATION_IS_LOCAL(relation); + /* + * If we need the lock but are not able to acquire it immediately, we'll + * consider extending the relation by multiple blocks at a time to manage + * contention on the relation extension lock. However, this only makes + * sense if we're using the FSM; otherwise, there's no point. + */ if (needLock) - LockRelationForExtension(relation, ExclusiveLock); + { + if (!use_fsm) + LockRelationForExtension(relation, ExclusiveLock); + else if (!ConditionLockRelationForExtension(relation, ExclusiveLock)) + { + /* Couldn't get the lock immediately; wait for it. */ + LockRelationForExtension(relation, ExclusiveLock); + + /* + * Check if some other backend has extended a block for us while + * we were waiting on the lock. + */ + targetBlock = GetPageWithFreeSpace(relation, len + saveFreeSpace); + + /* + * If some other waiter has already extended the relation, we + * don't need to do so; just use the existing freespace. + */ + if (targetBlock != InvalidBlockNumber) + { + UnlockRelationForExtension(relation, ExclusiveLock); + goto loop; + } + + /* Time to bulk-extend. */ + RelationAddExtraBlocks(relation, bistate); + } + } /* + * In addition to whatever extension we performed above, we always add + * at least one block to satisfy our own request. + * * XXX This does an lseek - rather expensive - but at the moment it is the * only way to accurately determine how many blocks are in a relation. Is * it worth keeping an accurate file length in shared memory someplace, diff --git a/src/backend/storage/freespace/freespace.c b/src/backend/storage/freespace/freespace.c index 2631080..5f45891 100644 --- a/src/backend/storage/freespace/freespace.c +++ b/src/backend/storage/freespace/freespace.c @@ -109,7 +109,10 @@ static int fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot, uint8 newValue, uint8 minValue); static BlockNumber fsm_search(Relation rel, uint8 min_cat); static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr, bool *eof); - +static bool fsm_addr_on_same_page(FSMAddress addr1, FSMAddress addr2); +static void fsm_set_range(Relation rel, FSMAddress addr, uint16 firstSlot, + uint16 lastSlot, uint8 newValue); +static void fsm_update_recursive(Relation rel, FSMAddress addr, uint8 new_cat); /******** Public API ********/ @@ -189,6 +192,49 @@ RecordPageWithFreeSpace(Relation rel, BlockNumber heapBlk, Size spaceAvail) } /* + * Sets all the FSM entries for pages between firstBlock and lastBlock to + * BLKSIZE and then bubbles that up to the higher levels of the tree and + * all the way to the root. + */ +void +FreeSpaceMapBulkExtend(Relation rel, BlockNumber firstBlock, + BlockNumber lastBlock) +{ + int new_cat = fsm_space_avail_to_cat(BLCKSZ); + FSMAddress addr1, + addr2; + uint16 firstSlot; + uint16 lastSlot; + uint16 slot; + BlockNumber blockNum; + + blockNum = firstBlock; + addr1 = fsm_get_location(firstBlock, &slot); + + while (blockNum < lastBlock) + { + blockNum++; + + addr2 = fsm_get_location(blockNum, &slot); + + if (!fsm_addr_on_same_page(addr1, addr2) + || blockNum == lastBlock) + { + /* This BLock is on the next FSM page so update the FSM page */ + fsm_get_location(firstBlock, &firstSlot); + fsm_get_location(blockNum - 1, &lastSlot); + fsm_set_range(rel, addr1, firstSlot, lastSlot, new_cat); + fsm_update_recursive(rel, addr1, new_cat); + /* + * Continue updating FSM till last page, so set the addr1 as new + * Address. and continue search for this page. + */ + addr1 = addr2; + } + } +} + +/* * XLogRecordPageWithFreeSpace - like RecordPageWithFreeSpace, for use in * WAL replay */ @@ -788,3 +834,56 @@ fsm_vacuum_page(Relation rel, FSMAddress addr, bool *eof_p) return max_avail; } + +static bool +fsm_addr_on_same_page(FSMAddress addr1, FSMAddress addr2) +{ + Assert(addr1); + Assert(addr2); + + if ((addr1.level != addr2.level) + || (addr1.logpageno != addr2.logpageno)) + return false; + + return true; +} + +/* + * Set value in given FSM page for given slot range. + */ +static void +fsm_set_range(Relation rel, FSMAddress addr, uint16 firstSlot, + uint16 lastSlot, uint8 newValue) +{ + Buffer buf; + Page page; + int slot; + + buf = fsm_readbuf(rel, addr, true); + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + + page = BufferGetPage(buf); + + slot = firstSlot; + + for (slot = firstSlot; slot <= lastSlot; slot++) + fsm_set_avail(page, slot, newValue); + + MarkBufferDirtyHint(buf, false); + + UnlockReleaseBuffer(buf); +} + +static void +fsm_update_recursive(Relation rel, FSMAddress addr, uint8 new_cat) +{ + uint16 parentslot; + FSMAddress parent; + + if (addr.level == FSM_ROOT_LEVEL) + return; + + parent = fsm_get_parent(addr, &parentslot); + fsm_set_and_search(rel, parent, parentslot, new_cat, 0); + fsm_update_recursive(rel, parent, new_cat); +} diff --git a/src/backend/storage/lmgr/lmgr.c b/src/backend/storage/lmgr/lmgr.c index 0632fc0..7e04137 100644 --- a/src/backend/storage/lmgr/lmgr.c +++ b/src/backend/storage/lmgr/lmgr.c @@ -341,6 +341,41 @@ LockRelationForExtension(Relation relation, LOCKMODE lockmode) } /* + * ConditionLockRelationForExtension + * + * As above, but only lock if we can get the lock without blocking. + * Returns TRUE iff the lock was acquired. + */ +bool +ConditionLockRelationForExtension(Relation relation, LOCKMODE lockmode) +{ + LOCKTAG tag; + + SET_LOCKTAG_RELATION_EXTEND(tag, + relation->rd_lockInfo.lockRelId.dbId, + relation->rd_lockInfo.lockRelId.relId); + + return (LockAcquire(&tag, lockmode, false, true) != LOCKACQUIRE_NOT_AVAIL); +} + +/* + * RelationExtensionLockWaiterCount + * + * Count the lock requester for the RelationExtension lock. + */ +int +RelationExtensionLockWaiterCount(Relation relation) +{ + LOCKTAG tag; + + SET_LOCKTAG_RELATION_EXTEND(tag, + relation->rd_lockInfo.lockRelId.dbId, + relation->rd_lockInfo.lockRelId.relId); + + return LockWaiterCount(&tag); +} + +/* * UnlockRelationForExtension */ void diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c index b30b7b1..353f705 100644 --- a/src/backend/storage/lmgr/lock.c +++ b/src/backend/storage/lmgr/lock.c @@ -4380,3 +4380,40 @@ VirtualXactLock(VirtualTransactionId vxid, bool wait) LockRelease(&tag, ShareLock, false); return true; } + +/* + * LockWaiterCount + * + * Find the number of lock requester on this locktag + */ +int +LockWaiterCount(const LOCKTAG *locktag) +{ + LOCKMETHODID lockmethodid = locktag->locktag_lockmethodid; + LOCK *lock; + bool found; + uint32 hashcode; + LWLock *partitionLock; + int waiters = 0; + + if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods)) + elog(ERROR, "unrecognized lock method: %d", lockmethodid); + + hashcode = LockTagHashCode(locktag); + partitionLock = LockHashPartitionLock(hashcode); + LWLockAcquire(partitionLock, LW_EXCLUSIVE); + + lock = (LOCK *) hash_search_with_hash_value(LockMethodLockHash, + (const void *) locktag, + hashcode, + HASH_FIND, + &found); + if (found) + { + Assert(lock != NULL); + waiters = lock->nRequested; + } + LWLockRelease(partitionLock); + + return waiters; +} diff --git a/src/include/storage/freespace.h b/src/include/storage/freespace.h index 19dcb8d..0f2a2be 100644 --- a/src/include/storage/freespace.h +++ b/src/include/storage/freespace.h @@ -32,5 +32,7 @@ extern void XLogRecordPageWithFreeSpace(RelFileNode rnode, BlockNumber heapBlk, extern void FreeSpaceMapTruncateRel(Relation rel, BlockNumber nblocks); extern void FreeSpaceMapVacuum(Relation rel); +extern void FreeSpaceMapBulkExtend(Relation rel, BlockNumber firstBlock, + BlockNumber lastBlock); #endif /* FREESPACE_H_ */ diff --git a/src/include/storage/lmgr.h b/src/include/storage/lmgr.h index 975b6f8..4460756 100644 --- a/src/include/storage/lmgr.h +++ b/src/include/storage/lmgr.h @@ -53,6 +53,9 @@ extern void UnlockRelationIdForSession(LockRelId *relid, LOCKMODE lockmode); /* Lock a relation for extension */ extern void LockRelationForExtension(Relation relation, LOCKMODE lockmode); extern void UnlockRelationForExtension(Relation relation, LOCKMODE lockmode); +extern bool ConditionLockRelationForExtension(Relation relation, + LOCKMODE lockmode); +extern int RelationExtensionLockWaiterCount(Relation relation); /* Lock a page (currently only used within indexes) */ extern void LockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode); diff --git a/src/include/storage/lock.h b/src/include/storage/lock.h index b26427d..9c08679 100644 --- a/src/include/storage/lock.h +++ b/src/include/storage/lock.h @@ -574,6 +574,8 @@ extern void RememberSimpleDeadLock(PGPROC *proc1, PGPROC *proc2); extern void InitDeadLockChecking(void); +extern int LockWaiterCount(const LOCKTAG *locktag); + #ifdef LOCK_DEBUG extern void DumpLocks(PGPROC *proc); extern void DumpAllLocks(void);