diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
new file mode 100644
index 3e13394..00a1807
*** a/src/backend/storage/lmgr/lwlock.c
--- b/src/backend/storage/lmgr/lwlock.c
*************** GetLWLockIdentifier(uint32 classId, uint
*** 728,791 ****
  static bool
  LWLockAttemptLock(LWLock *lock, LWLockMode mode)
  {
! 	uint32		old_state;
  
  	AssertArg(mode == LW_EXCLUSIVE || mode == LW_SHARED);
  
  	/*
! 	 * Read once outside the loop, later iterations will get the newer value
! 	 * via compare & exchange.
  	 */
! 	old_state = pg_atomic_read_u32(&lock->state);
  
! 	/* loop until we've determined whether we could acquire the lock or not */
! 	while (true)
  	{
! 		uint32		desired_state;
! 		bool		lock_free;
! 
! 		desired_state = old_state;
! 
! 		if (mode == LW_EXCLUSIVE)
! 		{
! 			lock_free = (old_state & LW_LOCK_MASK) == 0;
! 			if (lock_free)
! 				desired_state += LW_VAL_EXCLUSIVE;
! 		}
! 		else
! 		{
! 			lock_free = (old_state & LW_VAL_EXCLUSIVE) == 0;
! 			if (lock_free)
! 				desired_state += LW_VAL_SHARED;
! 		}
! 
! 		/*
! 		 * Attempt to swap in the state we are expecting. If we didn't see
! 		 * lock to be free, that's just the old value. If we saw it as free,
! 		 * we'll attempt to mark it acquired. The reason that we always swap
! 		 * in the value is that this doubles as a memory barrier. We could try
! 		 * to be smarter and only swap in values if we saw the lock as free,
! 		 * but benchmark haven't shown it as beneficial so far.
! 		 *
! 		 * Retry if the value changed since we last looked at it.
! 		 */
! 		if (pg_atomic_compare_exchange_u32(&lock->state,
! 										   &old_state, desired_state))
! 		{
! 			if (lock_free)
! 			{
! 				/* Great! Got the lock. */
  #ifdef LOCK_DEBUG
! 				if (mode == LW_EXCLUSIVE)
! 					lock->owner = MyProc;
  #endif
! 				return false;
! 			}
! 			else
! 				return true;	/* somebody else has the lock */
! 		}
  	}
- 	pg_unreachable();
  }
  
  /*
--- 728,773 ----
  static bool
  LWLockAttemptLock(LWLock *lock, LWLockMode mode)
  {
! 	uint32		old_state,
! 				mask,
! 				increment;
  
  	AssertArg(mode == LW_EXCLUSIVE || mode == LW_SHARED);
  
+ 	if (mode == LW_EXCLUSIVE)
+ 	{
+ 		mask = LW_LOCK_MASK;
+ 		increment = LW_VAL_EXCLUSIVE;
+ 	}
+ 	else
+ 	{
+ 		mask = LW_VAL_EXCLUSIVE;
+ 		increment = LW_VAL_SHARED;
+ 	}
+ 
  	/*
! 	 * Use the 'check mask then add' atomic, which does all the useful work
! 	 * for us.
  	 */
! 	old_state = pg_atomic_fetch_mask_add_u32(&lock->state, mask, increment);
  
! 	/*
! 	 * If the state was free according to the mask, the operation was
! 	 * successful.
! 	 */
! 	if ((old_state & mask) == 0)
  	{
! 		/* Great! Got the lock. */
  #ifdef LOCK_DEBUG
! 		if (mode == LW_EXCLUSIVE)
! 			lock->owner = MyProc;
  #endif
! 		return false;
! 	}
! 	else
! 	{
! 		return true;			/* somebody else has the lock */
  	}
  }
  
  /*
diff --git a/src/include/port/atomics.h b/src/include/port/atomics.h
new file mode 100644
index 2e2ec27..74c2a41
*** a/src/include/port/atomics.h
--- b/src/include/port/atomics.h
*************** pg_atomic_sub_fetch_u32(volatile pg_atom
*** 415,420 ****
--- 415,437 ----
  	return pg_atomic_sub_fetch_u32_impl(ptr, sub_);
  }
  
+ /*
+  * pg_atomic_fetch_mask_add_u32 - atomically check the masked bits in the
+  * variable and, if they are all clear, add to the variable.
+  *
+  * Returns the value of ptr before the atomic operation.
+  *
+  * Full barrier semantics.
+  */
+ static inline uint32
+ pg_atomic_fetch_mask_add_u32(volatile pg_atomic_uint32 *ptr,
+ 							 uint32 mask_, uint32 add_)
+ {
+ 	AssertPointerAlignment(ptr, 4);
+ 	return pg_atomic_fetch_mask_add_u32_impl(ptr, mask_, add_);
+ }
+ 
  /* ----
   * The 64 bit operations have the same semantics as their 32bit counterparts
   * if they are available. Check the corresponding 32bit function for
diff --git a/src/include/port/atomics/arch-ppc.h b/src/include/port/atomics/arch-ppc.h
new file mode 100644
index ed1cd9d..cce2b55
*** a/src/include/port/atomics/arch-ppc.h
--- b/src/include/port/atomics/arch-ppc.h
***************
*** 23,26 ****
--- 23,83 ----
  #define pg_memory_barrier_impl()	__asm__ __volatile__ ("sync" : : : "memory")
  #define pg_read_barrier_impl()		__asm__ __volatile__ ("lwsync" : : : "memory")
  #define pg_write_barrier_impl()		__asm__ __volatile__ ("lwsync" : : : "memory")
+ 
+ #if defined(HAVE_ATOMICS) \
+ 	&& (defined(HAVE_GCC__ATOMIC_INT32_CAS) || defined(HAVE_GCC__SYNC_INT32_CAS))
+ 
+ /*
+  * Declare the pg_atomic_uint32 structure before generic-gcc.h does, so that
+  * it can be used in function arguments.
+  */
+ #define PG_HAVE_ATOMIC_U32_SUPPORT
+ typedef struct pg_atomic_uint32
+ {
+ 	volatile uint32 value;
+ } pg_atomic_uint32;
+ 
+ /*
+  * Optimized implementation of pg_atomic_fetch_mask_add_u32() for Power
+  * processors.  Atomic operations on Power are implemented with optimistic
+  * locking: the 'lwarx' instruction creates a reservation on the target
+  * address, but that reservation can be lost before the matching 'stwcx.',
+  * in which case we have to retry.  Thus, each CAS operation is itself a
+  * loop, and building this operation out of CAS gives a two-level nested
+  * loop.  Experiments on multicore Power machines show a large benefit from
+  * doing the whole operation in a single loop written in assembly.
+  */
+ #define PG_HAVE_ATOMIC_FETCH_MASK_ADD_U32
+ static inline uint32
+ pg_atomic_fetch_mask_add_u32_impl(volatile pg_atomic_uint32 *ptr,
+ 								  uint32 mask, uint32 increment)
+ {
+ 	uint32		result,
+ 				tmp;
+ 
+ 	__asm__ __volatile__(
+ 	/* read *ptr and create the reservation */
+ #ifdef USE_PPC_LWARX_MUTEX_HINT
+ "	lwarx	%0,0,%5,1	\n"
+ #else
+ "	lwarx	%0,0,%5		\n"
+ #endif
+ "	and		%1,%0,%3	\n"		/* calculate '*ptr & mask' */
+ "	cmpwi	%1,0		\n"		/* compare '*ptr & mask' with 0 */
+ "	bne-	$+16		\n"		/* exit if '*ptr & mask' != 0 */
+ "	add		%1,%0,%4	\n"		/* calculate '*ptr + increment' */
+ "	stwcx.	%1,0,%5		\n"		/* try to store '*ptr + increment' into *ptr */
+ "	bne-	$-24		\n"		/* retry if the reservation was lost */
+ #ifdef USE_PPC_LWSYNC
+ "	lwsync				\n"
+ #else
+ "	isync				\n"
+ #endif
+ 	: "=&r"(result), "=&r"(tmp), "+m"(*ptr)
+ 	: "r"(mask), "r"(increment), "r"(ptr)
+ 	: "memory", "cc");
+ 	return result;
+ }
+ 
+ #endif
+ 
  #endif
diff --git a/src/include/port/atomics/generic.h b/src/include/port/atomics/generic.h
new file mode 100644
index a5b29d8..ac934ce
*** a/src/include/port/atomics/generic.h
--- b/src/include/port/atomics/generic.h
*************** pg_atomic_sub_fetch_u64_impl(volatile pg
*** 390,392 ****
--- 390,439 ----
  #endif
  
  #endif /* PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U64 */
+ 
+ #if !defined(PG_HAVE_ATOMIC_FETCH_MASK_ADD_U32) && defined(PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U32)
+ #define PG_HAVE_ATOMIC_FETCH_MASK_ADD_U32
+ /*
+  * Generic implementation of pg_atomic_fetch_mask_add_u32() as a loop of
+  * compare & exchange.
+  */
+ static inline uint32
+ pg_atomic_fetch_mask_add_u32_impl(volatile pg_atomic_uint32 *ptr,
+ 								  uint32 mask_, uint32 add_)
+ {
+ 	uint32		old_value;
+ 
+ 	/*
+ 	 * Read once outside the loop, later iterations will get the newer value
+ 	 * via compare & exchange.
+ 	 */
+ 	old_value = pg_atomic_read_u32_impl(ptr);
+ 
+ 	/* loop until we've determined whether we can perform the increment */
+ 	while (true)
+ 	{
+ 		uint32		desired_value;
+ 		bool		free;
+ 
+ 		desired_value = old_value;
+ 		free = (old_value & mask_) == 0;
+ 		if (free)
+ 			desired_value += add_;
+ 
+ 		/*
+ 		 * Attempt to swap in the value we are expecting.  If we didn't see
+ 		 * the masked bits as clear, that's just the old value.  If we saw
+ 		 * them as clear, we'll attempt to add the increment.  The reason we
+ 		 * always swap in the value is that this doubles as a memory barrier.
+ 		 * We could try to be smarter and only swap in values when we saw the
+ 		 * masked bits as clear, but benchmarks haven't shown that to be
+ 		 * beneficial so far.
+ 		 *
+ 		 * Retry if the value changed since we last looked at it.
+ 		 */
+ 		if (pg_atomic_compare_exchange_u32_impl(ptr, &old_value, desired_value))
+ 			return old_value;
+ 	}
+ 	pg_unreachable();
+ }
+ #endif
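For illustration only, not part of the patch: a minimal standalone sketch of the 'check mask, then add' semantics, written with GCC's __atomic builtins instead of PostgreSQL's atomics layer. The names fetch_mask_add_u32, LOCK_MASK and VAL_EXCLUSIVE below are made up for the example; only the behavior mirrors pg_atomic_fetch_mask_add_u32(): return the previous value, and add only when all masked bits were clear.

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

#define VAL_EXCLUSIVE	((uint32_t) 1 << 24)		/* stand-in for LW_VAL_EXCLUSIVE */
#define LOCK_MASK		(((uint32_t) 1 << 25) - 1)	/* stand-in for LW_LOCK_MASK */

/*
 * Add 'add' to *ptr only if (*ptr & mask) == 0; always return the value
 * *ptr had before the operation.  A CAS loop, like the generic fallback.
 */
static uint32_t
fetch_mask_add_u32(uint32_t *ptr, uint32_t mask, uint32_t add)
{
	uint32_t	old = __atomic_load_n(ptr, __ATOMIC_RELAXED);

	for (;;)
	{
		uint32_t	desired = old;

		if ((old & mask) == 0)
			desired += add;

		/*
		 * Swap unconditionally so the call acts as a full barrier even when
		 * the masked bits were set; on failure 'old' is reloaded and we retry.
		 */
		if (__atomic_compare_exchange_n(ptr, &old, desired, false,
										__ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST))
			return old;
	}
}

int
main(void)
{
	uint32_t	state = 0;
	uint32_t	prev;

	/* First "exclusive" acquisition succeeds: masked bits were clear. */
	prev = fetch_mask_add_u32(&state, LOCK_MASK, VAL_EXCLUSIVE);
	printf("acquired=%d state=%08x\n", (prev & LOCK_MASK) == 0, (unsigned) state);

	/* Second attempt fails: the exclusive bit is already set. */
	prev = fetch_mask_add_u32(&state, LOCK_MASK, VAL_EXCLUSIVE);
	printf("acquired=%d state=%08x\n", (prev & LOCK_MASK) == 0, (unsigned) state);
	return 0;
}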