From: Andres Freund
Date: Thu, 18 Sep 2014 14:14:16 +0000 (+0200)
Subject: Wait-free LW_SHARED lwlock acquisition
X-Git-Url: http://git.postgresql.org/gitweb/static/developers.postgresql.org?a=commitdiff_plain;h=eedffb0e25b6cc1fa37dce9c8b548605e46a26c5;p=users%2Fandresfreund%2Fpostgres.git

Wait-free LW_SHARED lwlock acquisition
---

diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index 1fd26aaaf2..82e85c971c 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -24,6 +24,82 @@
  * IDENTIFICATION
  *	  src/backend/storage/lmgr/lwlock.c
  *
+ * NOTES:
+ *
+ * This used to be a pretty straightforward reader-writer lock
+ * implementation, in which the internal state was protected by a
+ * spinlock. Unfortunately the overhead of taking the spinlock proved to be
+ * too high for workloads/locks that were locked in shared mode very
+ * frequently.
+ * Thus a new implementation was devised that provides wait-free shared lock
+ * acquisition for locks that aren't exclusively locked.
+ *
+ * The basic idea is to have a single atomic variable 'lockcount' instead of
+ * the formerly separate shared and exclusive counters, and to use an atomic
+ * increment to acquire the lock. That's fairly easy to do for rw-spinlocks,
+ * but a lot harder for something like LWLocks that want to wait in the OS.
+ *
+ * For exclusive lock acquisition we use an atomic compare-and-exchange on the
+ * lockcount variable, swapping in EXCLUSIVE_LOCK (1 << 30, i.e. 0x40000000)
+ * if and only if the current value of lockcount is 0. If the swap was not
+ * successful, we have to wait.
+ *
+ * For shared lock acquisition we use an atomic add (lock xadd) to the
+ * lockcount variable to add 1. If the resulting value is bigger than
+ * EXCLUSIVE_LOCK we know that somebody actually has an exclusive lock, and we
+ * back out by atomically decrementing by 1 again. In that case we have to
+ * wait for the exclusive locker to release the lock.
+ *
+ * To release the lock we use an atomic decrement. If the new value is zero
+ * (which we get back atomically), we know we have to wake up waiters.
+ *
+ * The attentive reader might have noticed that naively doing the above has
+ * two glaring race conditions:
+ *
+ * 1) too-quick-for-queueing: We try to lock using the atomic operations and
+ * notice that we have to wait. Unfortunately, by the time we have finished
+ * queueing, the former locker very well might have already finished its
+ * work. That's problematic because we're now stuck waiting inside the OS.
+ *
+ * 2) spurious failed locks: Due to the logic of backing out of shared
+ * locks after we unconditionally added 1 to lockcount, we might have
+ * prevented another exclusive locker from getting the lock:
+ * 1) Session A: LWLockAcquire(LW_EXCLUSIVE) - success
+ * 2) Session B: LWLockAcquire(LW_SHARED) - lockcount += 1
+ * 3) Session B: LWLockAcquire(LW_SHARED) - oops, bigger than EXCLUSIVE_LOCK
+ * 4) Session A: LWLockRelease()
+ * 5) Session C: LWLockAcquire(LW_EXCLUSIVE) - check if lockcount = 0, no. wait.
+ * 6) Session B: LWLockAcquire(LW_SHARED) - lockcount -= 1
+ * 7) Session B: LWLockAcquire(LW_SHARED) - wait
+ *
+ * So we'd now have both B and C waiting on a lock that nobody is holding
+ * anymore. Not good.
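The fast path just described is easiest to see in isolation. Below is a minimal stand-alone sketch, not part of the patch: it uses C11 atomics rather than the pg_atomic_* wrappers, and demo_lwlock/demo_try_lock are invented names for illustration; the real code additionally has to cooperate with the wait queue discussed next.

/* Sketch (not from the patch): single-word rw-lock fast path with C11 atomics. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define EXCLUSIVE_LOCK	(UINT32_C(1) << 30)	/* matches ((uint32) 1) << (31 - 1) */

typedef struct
{
	_Atomic uint32_t lockcount;		/* exclusive bit plus shared holder count */
} demo_lwlock;

/* Returns true if the lock was acquired; false means the caller must wait. */
static bool
demo_try_lock(demo_lwlock *lock, bool exclusive)
{
	if (exclusive)
	{
		uint32_t	expected = 0;

		/* succeeds only if no shared or exclusive holder exists */
		return atomic_compare_exchange_strong(&lock->lockcount,
											  &expected, EXCLUSIVE_LOCK);
	}
	else
	{
		/* optimistically register as a shared holder ... */
		uint32_t	old = atomic_fetch_add(&lock->lockcount, 1);

		if (old >= EXCLUSIVE_LOCK)
		{
			/* ... and back out again if an exclusive holder was present */
			atomic_fetch_sub(&lock->lockcount, 1);
			return false;
		}
		return true;
	}
}

Note how the shared path's back-out is exactly what can make an exclusive locker spuriously believe the lock is taken, i.e. race 2) above; that is why the patch later threads a potentially_spurious flag through LWLockAttemptLock.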
+ *
+ * To mitigate those races we use a two-phase attempt at locking:
+ * Phase 1: Try to do it atomically; if we succeed, nice
+ * Phase 2: Add ourselves to the wait queue of the lock
+ * Phase 3: Try to grab the lock again; if we succeed, remove ourselves from
+ * the queue
+ * Phase 4: Sleep till wakeup, then go to Phase 1
+ *
+ * This protects us against both problems from above:
+ * 1) Nobody can release too quickly, before we're queued, since after Phase 2
+ * we're already queued.
+ * 2) If somebody spuriously got blocked from acquiring the lock, they will
+ * get queued in Phase 2 and we can wake them up if necessary, or they will
+ * have gotten the lock in Phase 3.
+ *
+ * The above algorithm only works for LWLockAcquire, not directly for
+ * LWLockConditionalAcquire, where we don't want to wait. In that case we just
+ * need to retry acquiring the lock until we're sure we didn't disturb anybody
+ * in doing so.
+ *
+ * TODO:
+ * - decide if we need a spinlock fallback
+ * - expand documentation
+ * - make LWLOCK_STATS do something sensible again
+ * - make LOCK_DEBUG output nicer
  *-------------------------------------------------------------------------
  */
 #include "postgres.h"
@@ -35,7 +111,6 @@
 #include "miscadmin.h"
 #include "pg_trace.h"
 #include "replication/slot.h"
-#include "storage/barrier.h"
 #include "storage/ipc.h"
 #include "storage/predicate.h"
 #include "storage/proc.h"
@@ -50,6 +125,10 @@
 /* We use the ShmemLock spinlock to protect LWLockAssign */
 extern slock_t *ShmemLock;

+#define EXCLUSIVE_LOCK (((uint32) 1) << (31 - 1))
+/* must be greater than MAX_BACKENDS */
+#define SHARED_LOCK_MASK (~EXCLUSIVE_LOCK)
+
 /*
  * This is indexed by tranche ID and stores metadata for all tranches known
  * to the current backend.
@@ -80,8 +159,14 @@ static LWLockTranche MainLWLockTranche;
  */
 #define MAX_SIMUL_LWLOCKS	100

+typedef struct LWLockHandle
+{
+	LWLock	   *lock;
+	LWLockMode	mode;
+} LWLockHandle;
+
 static int	num_held_lwlocks = 0;
-static LWLock *held_lwlocks[MAX_SIMUL_LWLOCKS];
+static LWLockHandle held_lwlocks[MAX_SIMUL_LWLOCKS];

 static int	lock_addin_request = 0;
 static bool lock_addin_request_allowed = true;
@@ -100,8 +185,11 @@ typedef struct lwlock_stats
 {
 	lwlock_stats_key key;
 	int			sh_acquire_count;
+	int			sh_attempt_backout;
 	int			ex_acquire_count;
+	int			ex_race;
 	int			block_count;
+	int			dequeue_self_count;
 	int			spin_delay_count;
 } lwlock_stats;

@@ -113,23 +201,30 @@ static lwlock_stats lwlock_stats_dummy;
 bool		Trace_lwlocks = false;

 inline static void
-PRINT_LWDEBUG(const char *where, const LWLock *lock)
+PRINT_LWDEBUG(const char *where, const LWLock *lock, LWLockMode mode)
 {
 	if (Trace_lwlocks)
-		elog(LOG, "%s(%s %d): excl %d shared %d rOK %d",
+	{
+		uint32 lockcount = pg_atomic_read_u32(&lock->lockcount);
+
+		elog(LOG, "%d: %s(%s %d): excl %u shared %u waiters %u rOK %d\n",
+			 MyProcPid,
 			 where, T_NAME(lock), T_ID(lock),
-			 (int) lock->exclusive, lock->shared,
+			 lockcount >= EXCLUSIVE_LOCK,
+			 lockcount & SHARED_LOCK_MASK,
+			 pg_atomic_read_u32(&lock->nwaiters),
 			 (int) lock->releaseOK);
+	}
 }

 inline static void
 LOG_LWDEBUG(const char *where, const char *name, int index, const char *msg)
 {
 	if (Trace_lwlocks)
-		elog(LOG, "%s(%s %d): %s", where, name, index, msg);
+		elog(LOG, "%d: %s(%s %d): %s\n", MyProcPid, where, name, index, msg);
 }

 #else							/* not LOCK_DEBUG */
-#define PRINT_LWDEBUG(a,b)
+#define PRINT_LWDEBUG(a,b,c)
 #define LOG_LWDEBUG(a,b,c,d)
 #endif   /* LOCK_DEBUG */

@@ -192,11 +287,12 @@ print_lwlock_stats(int code, Datum arg)
 	while ((lwstats = (lwlock_stats *) hash_seq_search(&scan)) !=
NULL) { fprintf(stderr, - "PID %d lwlock %s %d: shacq %u exacq %u blk %u spindelay %u\n", + "PID %d lwlock %s %d: shacq %u exacq %u blk %u spindelay %u, backout %u, ex race %u, dequeue self %u\n", MyProcPid, LWLockTrancheArray[lwstats->key.tranche]->name, lwstats->key.instance, lwstats->sh_acquire_count, lwstats->ex_acquire_count, lwstats->block_count, - lwstats->spin_delay_count); + lwstats->spin_delay_count, lwstats->sh_attempt_backout, + lwstats->ex_race, lwstats->dequeue_self_count); } LWLockRelease(&MainLWLockArray[0].lock); @@ -224,8 +320,11 @@ get_lwlock_stats_entry(LWLock *lock) if (!found) { lwstats->sh_acquire_count = 0; + lwstats->sh_attempt_backout = 0; lwstats->ex_acquire_count = 0; + lwstats->ex_race = 0; lwstats->block_count = 0; + lwstats->dequeue_self_count = 0; lwstats->spin_delay_count = 0; } return lwstats; @@ -473,12 +572,299 @@ LWLockInitialize(LWLock *lock, int tranche_id) { SpinLockInit(&lock->mutex); lock->releaseOK = true; - lock->exclusive = 0; - lock->shared = 0; + pg_atomic_init_u32(&lock->lockcount, 0); + pg_atomic_init_u32(&lock->nwaiters, 0); lock->tranche = tranche_id; dlist_init(&lock->waiters); } +/* + * Internal function handling the atomic manipulation of lock->lockcount. + * + * 'double_check' = true means that we try to check more carefully + * against causing somebody else to spuriously believe the lock is + * already taken, although we're just about to back out of it. + */ +static inline bool +LWLockAttemptLock(LWLock* l, LWLockMode mode, bool double_check, bool *potentially_spurious) +{ + bool mustwait; + uint32 oldstate; +#ifdef LWLOCK_STATS + lwlock_stats *lwstats; + lwstats = get_lwlock_stats_entry(l); +#endif + + Assert(mode == LW_EXCLUSIVE || mode == LW_SHARED); + + *potentially_spurious = false; + + if (mode == LW_EXCLUSIVE) + { + uint32 expected; + + /* check without CAS first; it's way cheaper, frequently locked otherwise */ + expected = pg_atomic_read_u32(&l->lockcount); + + Assert(expected < EXCLUSIVE_LOCK + (1 << 16)); + + if (expected != 0) + mustwait = true; + else if (!pg_atomic_compare_exchange_u32(&l->lockcount, + &expected, EXCLUSIVE_LOCK)) + { + /* + * ok, no can do. Between the pg_atomic_read() above and the + * CAS somebody else acquired the lock. + */ + mustwait = true; + Assert(expected < EXCLUSIVE_LOCK + (1 << 16)); + } + else + { + /* yipeyyahee */ + mustwait = false; +#ifdef LOCK_DEBUG + l->owner = MyProc; +#endif + Assert(expected == 0); + } + } + else + { + /* + * If requested by caller, do an unlocked check first. This is useful + * if potentially spurious results have a noticeable cost. + */ + if (double_check) + { + if (pg_atomic_read_u32(&l->lockcount) >= EXCLUSIVE_LOCK) + { + mustwait = true; + goto out; + } + } + + /* + * Acquire the share lock unconditionally using an atomic addition. We + * might have to back out again if it turns out somebody else has an + * exclusive lock. + */ + oldstate = pg_atomic_fetch_add_u32(&l->lockcount, 1); + + if (oldstate >= EXCLUSIVE_LOCK) + { + /* + * Ok, somebody else holds the lock exclusively. We need to back + * away from the shared lock, since we don't actually hold it right + * now. Since there's a window between lockcount += 1 and lockcount + * -= 1, the previous exclusive locker could have released and + * another exclusive locker could have seen our +1. We need to + * signal that to the upper layers so they can deal with the race + * condition. + */ + + /* + * FIXME: check return value if (double_check), it's not + * spurious if still exclusively locked. 
+ */ + pg_atomic_fetch_sub_u32(&l->lockcount, 1); + + + mustwait = true; + *potentially_spurious = true; +#ifdef LWLOCK_STATS + lwstats->sh_attempt_backout++; +#endif + } + else + { + /* yipeyyahee */ + mustwait = false; + } + } + +out: + return mustwait; +} + +/* + * Wakeup all the lockers that currently have a chance to run. + */ +static void +LWLockWakeup(LWLock *lock, LWLockMode mode) +{ + bool releaseOK; + bool wokeup_somebody = false; + dlist_head wakeup; + dlist_mutable_iter iter; +#ifdef LWLOCK_STATS + lwlock_stats *lwstats; + lwstats = get_lwlock_stats_entry(l); +#endif + + dlist_init(&wakeup); + + /* remove the to-be-awakened PGPROCs from the queue */ + releaseOK = true; + + /* Acquire mutex. Time spent holding mutex should be short! */ +#ifdef LWLOCK_STATS + lwstats->spin_delay_count += SpinLockAcquire(&lock->mutex); +#else + SpinLockAcquire(&lock->mutex); +#endif + + /* + * We're still waiting for backends to get scheduled, don't wake them up + * again. + */ + if (!lock->releaseOK) + { + SpinLockRelease(&lock->mutex); + PRINT_LWDEBUG("LWLockRelease skip releaseok", lock, mode); + return; + } + + dlist_foreach_modify(iter, (dlist_head *) &lock->waiters) + { + PGPROC *waiter = dlist_container(PGPROC, lwWaitLink, iter.cur); + + if (wokeup_somebody && waiter->lwWaitMode == LW_EXCLUSIVE) + continue; + + dlist_delete(&waiter->lwWaitLink); + dlist_push_tail(&wakeup, &waiter->lwWaitLink); + + if (waiter->lwWaitMode != LW_WAIT_UNTIL_FREE) + { + /* + * Prevent additional wakeups until retryer gets to run. Backends + * that are just waiting for the lock to become free don't retry + * automatically. + */ + releaseOK = false; + /* + * Don't wakeup (further) exclusive locks. + */ + wokeup_somebody = true; + } + + /* + * Once we've woken up an exclusive lock, there's no point in waking + * up anybody else. + */ + if(waiter->lwWaitMode == LW_EXCLUSIVE) + break; + } + lock->releaseOK = releaseOK; + + + /* We are done updating shared state of the lock queue. */ + SpinLockRelease(&lock->mutex); + + /* + * Awaken any waiters I removed from the queue. + */ + dlist_foreach_modify(iter, &wakeup) + { + PGPROC *waiter = dlist_container(PGPROC, lwWaitLink, iter.cur); + + LOG_LWDEBUG("LWLockRelease", T_NAME(l), T_ID(l), "release waiter"); + dlist_delete(&waiter->lwWaitLink); + pg_write_barrier(); + waiter->lwWaiting = false; + PGSemaphoreUnlock(&waiter->sem); + } +} + +/* + * Add ourselves to the end of the queue. Mode can be LW_WAIT_UNTIL_FREE here! + */ +static inline void +LWLockQueueSelf(LWLock *lock, LWLockMode mode) +{ +#ifdef LWLOCK_STATS + lwlock_stats *lwstats; + lwstats = get_lwlock_stats_entry(l); +#endif + + /* + * If we don't have a PGPROC structure, there's no way to wait. This + * should never occur, since MyProc should only be null during shared + * memory initialization. 
+ */ + if (MyProc == NULL) + elog(PANIC, "cannot wait without a PGPROC structure"); + + pg_atomic_fetch_add_u32(&lock->nwaiters, 1); + +#ifdef LWLOCK_STATS + lwstats->spin_delay_count += SpinLockAcquire(&lock->mutex); +#else + SpinLockAcquire(&lock->mutex); +#endif + + if (MyProc->lwWaiting) + elog(PANIC, "queueing for lock while waiting on another one"); + + MyProc->lwWaiting = true; + MyProc->lwWaitMode = mode; + + /* LW_WAIT_UNTIL_FREE waiters are always at the front of the queue */ + if (mode == LW_WAIT_UNTIL_FREE) + dlist_push_head((dlist_head *) &lock->waiters, &MyProc->lwWaitLink); + else + dlist_push_tail((dlist_head *) &lock->waiters, &MyProc->lwWaitLink); + + /* Can release the mutex now */ + SpinLockRelease(&lock->mutex); +} + +/* + * Remove ourselves from the waitlist. This is used if we queued ourselves + * because we thought we needed to sleep but, after further checking, we + * discovered that we don't actually need to do so. Somebody else might have + * already woken us up though, in that case return false. + */ +static inline bool +LWLockDequeueSelf(LWLock *lock) +{ + bool found = false; + dlist_mutable_iter iter; + +#ifdef LWLOCK_STATS + lwlock_stats *lwstats; + lwstats = get_lwlock_stats_entry(l); +#endif + +#ifdef LWLOCK_STATS + lwstats->spin_delay_count += SpinLockAcquire(&lock->mutex); +#else + SpinLockAcquire(&lock->mutex); +#endif + + /* need to iterate, somebody else could have unqueued us */ + dlist_foreach_modify(iter, (dlist_head *) &lock->waiters) + { + PGPROC *proc = dlist_container(PGPROC, lwWaitLink, iter.cur); + if (proc == MyProc) + { + found = true; + dlist_delete(&proc->lwWaitLink); + break; + } + } + + /* clear waiting state again, nice for debugging */ + if (found) + MyProc->lwWaiting = false; + + SpinLockRelease(&lock->mutex); + + pg_atomic_fetch_sub_u32(&lock->nwaiters, 1); + return found; +} /* * LWLockAcquire - acquire a lightweight lock in the specified mode @@ -510,14 +896,17 @@ static inline bool LWLockAcquireCommon(LWLock *lock, LWLockMode mode, uint64 *valptr, uint64 val) { PGPROC *proc = MyProc; - bool retry = false; bool result = true; int extraWaits = 0; + bool potentially_spurious; + #ifdef LWLOCK_STATS lwlock_stats *lwstats; #endif - PRINT_LWDEBUG("LWLockAcquire", lock); + AssertArg(mode == LW_SHARED || mode == LW_EXCLUSIVE); + + PRINT_LWDEBUG("LWLockAcquire", lock, mode); #ifdef LWLOCK_STATS lwstats = get_lwlock_stats_entry(lock); @@ -567,58 +956,77 @@ LWLockAcquireCommon(LWLock *lock, LWLockMode mode, uint64 *valptr, uint64 val) { bool mustwait; - /* Acquire mutex. Time spent holding mutex should be short! */ -#ifdef LWLOCK_STATS - lwstats->spin_delay_count += SpinLockAcquire(&lock->mutex); -#else - SpinLockAcquire(&lock->mutex); -#endif + /* + * try to grab the lock the first time, we're not in the waitqueue yet. + */ + mustwait = LWLockAttemptLock(lock, mode, false, &potentially_spurious); - /* If retrying, allow LWLockRelease to release waiters again */ - if (retry) - lock->releaseOK = true; + if (!mustwait) + { + LOG_LWDEBUG("LWLockAcquire", T_NAME(l), T_ID(l), "success"); + break; /* got the lock */ + } - /* If I can get the lock, do so quickly. */ - if (mode == LW_EXCLUSIVE) + /* + * Ok, at this point we couldn't grab the lock on the first try. We + * cannot simply queue ourselves to the end of the list and wait to be + * woken up because by now the lock could long have been released. + * Instead add us to the queue and try to grab the lock again. 
If we + * succeed we need to revert the queuing and be happy, otherwise we + * recheck the lock. If we still couldn't grab it, we know that the + * other lock will see our queue entries when releasing since they + * existed before we checked for the lock. + * FIXME: add note referring to overall notes + */ + + /* add to the queue */ + LWLockQueueSelf(lock, mode); + + /* we're now guaranteed to be woken up if necessary */ + mustwait = LWLockAttemptLock(lock, mode, false, &potentially_spurious); + + /* ok, grabbed the lock the second time round, need to undo queueing */ + if (!mustwait) { - if (lock->exclusive == 0 && lock->shared == 0) +#ifdef LWLOCK_STATS + lwstats->dequeue_self_count++; +#endif + if (!LWLockDequeueSelf(lock)) { - lock->exclusive++; - mustwait = false; + /* + * Somebody else dequeued us and has or will wake us up. Wait + * for the correct wakeup, otherwise our ->lwWaiting would get + * reset at some inconvenient point later, and releaseOk + * wouldn't be managed correctly. + */ + for (;;) + { + PGSemaphoreLock(&proc->sem, false); + if (!proc->lwWaiting) + break; + extraWaits++; + } + /* + * Reset releaseOk - if somebody woke us they'll have set it + * to false. + */ + SpinLockAcquire(&lock->mutex); + lock->releaseOK = true; + SpinLockRelease(&lock->mutex); } - else - mustwait = true; + PRINT_LWDEBUG("LWLockAcquire success: undo queue", lock, mode); + break; } else { - if (lock->exclusive == 0) - { - lock->shared++; - mustwait = false; - } - else - mustwait = true; + PRINT_LWDEBUG("LWLockAcquire waiting 4", lock, mode); } - if (!mustwait) - break; /* got the lock */ - /* - * Add myself to wait queue. - * - * If we don't have a PGPROC structure, there's no way to wait. This - * should never occur, since MyProc should only be null during shared - * memory initialization. + * NB: There's no need to deal with spurious lock attempts + * here. Anyone we prevented from acquiring the lock will + * enqueue themselves using the same protocol we used here. */ - if (proc == NULL) - elog(PANIC, "cannot wait without a PGPROC structure"); - - proc->lwWaiting = true; - proc->lwWaitMode = mode; - dlist_push_head((dlist_head *) &lock->waiters, &proc->lwWaitLink); - - /* Can release the mutex now */ - SpinLockRelease(&lock->mutex); /* * Wait until awakened. @@ -653,8 +1061,14 @@ LWLockAcquireCommon(LWLock *lock, LWLockMode mode, uint64 *valptr, uint64 val) LOG_LWDEBUG("LWLockAcquire", T_NAME(lock), T_ID(lock), "awakened"); + /* not waiting anymore */ + pg_atomic_fetch_sub_u32(&lock->nwaiters, 1); + /* Now loop back and try to acquire lock again. */ - retry = true; + SpinLockAcquire(&lock->mutex); + lock->releaseOK = true; + SpinLockRelease(&lock->mutex); + result = false; } @@ -662,13 +1076,11 @@ LWLockAcquireCommon(LWLock *lock, LWLockMode mode, uint64 *valptr, uint64 val) if (valptr) *valptr = val; - /* We are done updating shared state of the lock itself. */ - SpinLockRelease(&lock->mutex); - TRACE_POSTGRESQL_LWLOCK_ACQUIRE(T_NAME(lock), T_ID(lock), mode); /* Add lock to list of locks held by this backend */ - held_lwlocks[num_held_lwlocks++] = lock; + held_lwlocks[num_held_lwlocks].lock = lock; + held_lwlocks[num_held_lwlocks++].mode = mode; /* * Fix the process wait semaphore's count for any absorbed wakeups. 
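The queue-then-retry loop that LWLockAcquireCommon now runs can be summarized in a self-contained sketch. This is an illustration under stated assumptions, not the patch's code: it repeats the fast-path helper from the earlier sketch so it stays self-contained, and it collapses the dlist wait queue plus per-backend semaphores into a waiter counter and a pthread condition variable, so none of the releaseOK/potentially_spurious bookkeeping is needed here; all demo_* names are invented.

/* Sketch (not from the patch): the four-phase acquire loop, with the wait
 * queue approximated by a waiter counter plus a condition variable. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define EXCLUSIVE_LOCK	(UINT32_C(1) << 30)

typedef struct
{
	_Atomic uint32_t lockcount;
	_Atomic uint32_t nwaiters;
	pthread_mutex_t mutex;			/* stands in for lock->mutex */
	pthread_cond_t wakeup;			/* stands in for the per-backend semaphore */
} demo_lwlock;

#define DEMO_LWLOCK_INIT \
	{0, 0, PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER}

static bool
demo_try_lock(demo_lwlock *lock, bool exclusive)
{
	if (exclusive)
	{
		uint32_t	expected = 0;

		return atomic_compare_exchange_strong(&lock->lockcount,
											  &expected, EXCLUSIVE_LOCK);
	}
	if (atomic_fetch_add(&lock->lockcount, 1) >= EXCLUSIVE_LOCK)
	{
		atomic_fetch_sub(&lock->lockcount, 1);	/* back out, must wait */
		return false;
	}
	return true;
}

static void
demo_acquire(demo_lwlock *lock, bool exclusive)
{
	for (;;)
	{
		/* Phase 1: try without queueing */
		if (demo_try_lock(lock, exclusive))
			return;

		/* Phase 2: register as a waiter before going to sleep */
		atomic_fetch_add(&lock->nwaiters, 1);

		/* Phase 3: retry; the holder may have released since Phase 1 */
		if (demo_try_lock(lock, exclusive))
		{
			atomic_fetch_sub(&lock->nwaiters, 1);	/* undo queueing */
			return;
		}

		/* Phase 4: sleep until a releaser signals us, then start over */
		pthread_mutex_lock(&lock->mutex);
		while (exclusive
			   ? atomic_load(&lock->lockcount) != 0
			   : atomic_load(&lock->lockcount) >= EXCLUSIVE_LOCK)
			pthread_cond_wait(&lock->wakeup, &lock->mutex);
		pthread_mutex_unlock(&lock->mutex);

		atomic_fetch_sub(&lock->nwaiters, 1);
	}
}

static void
demo_release(demo_lwlock *lock, bool exclusive)
{
	atomic_fetch_sub(&lock->lockcount, exclusive ? EXCLUSIVE_LOCK : 1);

	if (atomic_load(&lock->nwaiters) > 0)
	{
		/* broadcast is crude but safe; the patch wakes waiters selectively */
		pthread_mutex_lock(&lock->mutex);
		pthread_cond_broadcast(&lock->wakeup);
		pthread_mutex_unlock(&lock->mutex);
	}
}

Because the sketch re-checks the lock word under the mutex before sleeping, a condition variable already rules out the too-quick-for-queueing and lost-wakeup problems; the patch has to obtain the same guarantee from bare semaphores, which is exactly what LWLockQueueSelf/LWLockDequeueSelf and the releaseOK flag are for.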
@@ -690,8 +1102,11 @@ bool LWLockConditionalAcquire(LWLock *lock, LWLockMode mode) { bool mustwait; + bool potentially_spurious; + + AssertArg(mode == LW_SHARED || mode == LW_EXCLUSIVE); - PRINT_LWDEBUG("LWLockConditionalAcquire", lock); + PRINT_LWDEBUG("LWLockConditionalAcquire", l, mode); /* Ensure we will have room to remember the lock */ if (num_held_lwlocks >= MAX_SIMUL_LWLOCKS) @@ -704,50 +1119,44 @@ LWLockConditionalAcquire(LWLock *lock, LWLockMode mode) */ HOLD_INTERRUPTS(); - /* Acquire mutex. Time spent holding mutex should be short! */ - SpinLockAcquire(&lock->mutex); - - /* If I can get the lock, do so quickly. */ - if (mode == LW_EXCLUSIVE) - { - if (lock->exclusive == 0 && lock->shared == 0) - { - lock->exclusive++; - mustwait = false; - } - else - mustwait = true; - } - else - { - if (lock->exclusive == 0) - { - lock->shared++; - mustwait = false; - } - else - mustwait = true; - } - - /* We are done updating shared state of the lock itself. */ - SpinLockRelease(&lock->mutex); +retry: + /* + * passing 'true' to check more carefully to avoid potential + * spurious acquisitions + */ + mustwait = LWLockAttemptLock(lock, mode, true, &potentially_spurious); if (mustwait) { /* Failed to get lock, so release interrupt holdoff */ RESUME_INTERRUPTS(); + LOG_LWDEBUG("LWLockConditionalAcquire", T_NAME(lock), T_ID(lock), "failed"); - TRACE_POSTGRESQL_LWLOCK_CONDACQUIRE_FAIL(T_NAME(lock), - T_ID(lock), mode); + TRACE_POSTGRESQL_LWLOCK_CONDACQUIRE_FAIL(T_NAME(lock), T_ID(lock), mode); + + /* + * We ran into an exclusive lock and might have blocked another + * exclusive lock from taking a shot because it took a time to back + * off. Retry till we are either sure we didn't block somebody (because + * somebody else certainly has the lock) or till we got it. + * + * We cannot rely on the two-step lock-acquisition protocol as in + * LWLockAcquire because we're not using it. + */ + if (potentially_spurious) + { + SPIN_DELAY(); + goto retry; + } } else { /* Add lock to list of locks held by this backend */ - held_lwlocks[num_held_lwlocks++] = lock; + held_lwlocks[num_held_lwlocks].lock = lock; + held_lwlocks[num_held_lwlocks++].mode = mode; TRACE_POSTGRESQL_LWLOCK_CONDACQUIRE(T_NAME(lock), T_ID(lock), mode); } - return !mustwait; } @@ -771,11 +1180,15 @@ LWLockAcquireOrWait(LWLock *lock, LWLockMode mode) PGPROC *proc = MyProc; bool mustwait; int extraWaits = 0; + bool potentially_spurious_first; + bool potentially_spurious_second; #ifdef LWLOCK_STATS lwlock_stats *lwstats; #endif - PRINT_LWDEBUG("LWLockAcquireOrWait", lock); + Assert(mode == LW_SHARED || mode == LW_EXCLUSIVE); + + PRINT_LWDEBUG("LWLockAcquireOrWait", lock, mode); #ifdef LWLOCK_STATS lwstats = get_lwlock_stats_entry(lock); @@ -792,81 +1205,58 @@ LWLockAcquireOrWait(LWLock *lock, LWLockMode mode) */ HOLD_INTERRUPTS(); - /* Acquire mutex. Time spent holding mutex should be short! */ - SpinLockAcquire(&lock->mutex); - - /* If I can get the lock, do so quickly. */ - if (mode == LW_EXCLUSIVE) - { - if (lock->exclusive == 0 && lock->shared == 0) - { - lock->exclusive++; - mustwait = false; - } - else - mustwait = true; - } - else - { - if (lock->exclusive == 0) - { - lock->shared++; - mustwait = false; - } - else - mustwait = true; - } + /* + * NB: We're using nearly the same twice-in-a-row lock acquisition + * protocol as LWLockAcquire(). Check its comments for details. + */ + mustwait = LWLockAttemptLock(lock, mode, false, &potentially_spurious_first); if (mustwait) { - /* - * Add myself to wait queue. 
- * - * If we don't have a PGPROC structure, there's no way to wait. This - * should never occur, since MyProc should only be null during shared - * memory initialization. - */ - if (proc == NULL) - elog(PANIC, "cannot wait without a PGPROC structure"); - - proc->lwWaiting = true; - proc->lwWaitMode = LW_WAIT_UNTIL_FREE; - dlist_push_head((dlist_head *) &lock->waiters, &proc->lwWaitLink); + LWLockQueueSelf(lock, LW_WAIT_UNTIL_FREE); - /* Can release the mutex now */ - SpinLockRelease(&lock->mutex); + mustwait = LWLockAttemptLock(lock, mode, false, &potentially_spurious_second); - /* - * Wait until awakened. Like in LWLockAcquire, be prepared for bogus - * wakups, because we share the semaphore with ProcWaitForSignal. - */ - LOG_LWDEBUG("LWLockAcquireOrWait", T_NAME(lock), T_ID(lock), - "waiting"); + if (mustwait) + { + /* + * Wait until awakened. Like in LWLockAcquire, be prepared for bogus + * wakups, because we share the semaphore with ProcWaitForSignal. + */ + LOG_LWDEBUG("LWLockAcquireOrWait", T_NAME(lock), T_ID(lock), + "waiting"); #ifdef LWLOCK_STATS - lwstats->block_count++; + lwstats->block_count++; #endif + TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(l), T_ID(l), mode); - TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(lock), T_ID(lock), mode); + for (;;) + { + /* "false" means cannot accept cancel/die interrupt here. */ + PGSemaphoreLock(&proc->sem, false); + if (!proc->lwWaiting) + break; + extraWaits++; + } + pg_atomic_fetch_sub_u32(&lock->nwaiters, 1); - for (;;) + TRACE_POSTGRESQL_LWLOCK_WAIT_DONE(T_NAME(lock), T_ID(lock), mode); + } + else { - /* "false" means cannot accept cancel/die interrupt here. */ - PGSemaphoreLock(&proc->sem, false); - if (!proc->lwWaiting) - break; - extraWaits++; + /* got lock in the second attempt, undo queueing */ + if (!LWLockDequeueSelf(lock)) + { + for (;;) + { + PGSemaphoreLock(&proc->sem, false); + if (!proc->lwWaiting) + break; + extraWaits++; + } + } } - - TRACE_POSTGRESQL_LWLOCK_WAIT_DONE(T_NAME(lock), T_ID(lock), mode); - - LOG_LWDEBUG("LWLockAcquireOrWait", T_NAME(lock), T_ID(lock), - "awakened"); - } - else - { - /* We are done updating shared state of the lock itself. */ - SpinLockRelease(&lock->mutex); } /* @@ -885,10 +1275,11 @@ LWLockAcquireOrWait(LWLock *lock, LWLockMode mode) } else { + LOG_LWDEBUG("LWLockAcquireOrWait", T_NAME(lock), T_ID(lock), "succeeded"); /* Add lock to list of locks held by this backend */ - held_lwlocks[num_held_lwlocks++] = lock; - TRACE_POSTGRESQL_LWLOCK_ACQUIRE_OR_WAIT(T_NAME(lock), T_ID(lock), - mode); + held_lwlocks[num_held_lwlocks].lock = lock; + held_lwlocks[num_held_lwlocks++].mode = mode; + TRACE_POSTGRESQL_LWLOCK_ACQUIRE_OR_WAIT(T_NAME(lock), T_ID(lock), mode); } return !mustwait; @@ -922,7 +1313,7 @@ LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval) lwlock_stats *lwstats; #endif - PRINT_LWDEBUG("LWLockWaitForVar", lock); + PRINT_LWDEBUG("LWLockWaitForVar", lock, LW_EXCLUSIVE); #ifdef LWLOCK_STATS lwstats = get_lwlock_stats_entry(lock); @@ -935,7 +1326,7 @@ LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval) * barrier here as far as the current usage is concerned. But that might * not be safe in general. */ - if (lock->exclusive == 0) + if (pg_atomic_read_u32(&lock->lockcount) == 0) return true; /* @@ -953,21 +1344,16 @@ LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval) bool mustwait; uint64 value; - /* Acquire mutex. Time spent holding mutex should be short! 
*/ -#ifdef LWLOCK_STATS - lwstats->spin_delay_count += SpinLockAcquire(&lock->mutex); -#else - SpinLockAcquire(&lock->mutex); -#endif + mustwait = pg_atomic_read_u32(&lock->lockcount) != 0; - /* Is the lock now free, and if not, does the value match? */ - if (lock->exclusive == 0) - { - result = true; - mustwait = false; - } - else + if (mustwait) { + /* + * Perform comparison using spinlock as we can't rely on atomic 64 + * bit reads/stores. + */ + SpinLockAcquire(&lock->mutex); + value = *valptr; if (value != oldval) { @@ -977,21 +1363,62 @@ LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval) } else mustwait = true; + SpinLockRelease(&lock->mutex); } + else + mustwait = false; if (!mustwait) break; /* the lock was free or value didn't match */ /* - * Add myself to wait queue. + * Add myself to wait queue. Note that this is racy, somebody else + * could wakeup before we're finished queuing. + * NB: We're using nearly the same twice-in-a-row lock acquisition + * protocol as LWLockAcquire(). Check its comments for details. */ - proc->lwWaiting = true; - proc->lwWaitMode = LW_WAIT_UNTIL_FREE; - /* waiters are added to the front of the queue */ - dlist_push_head((dlist_head *) &lock->waiters, &proc->lwWaitLink); + LWLockQueueSelf(lock, LW_WAIT_UNTIL_FREE); - /* Can release the mutex now */ - SpinLockRelease(&lock->mutex); + /* + * We're now guaranteed to be woken up if necessary. Recheck the + * lock's state. + */ + pg_read_barrier(); + mustwait = pg_atomic_read_u32(&lock->lockcount) != 0; + + /* ok, grabbed the lock the second time round, need to undo queueing */ + if (!mustwait) + { +#ifdef LWLOCK_STATS + lwstats->dequeue_self_count++; +#endif + if (!LWLockDequeueSelf(lock)) + { + /* + * Somebody else dequeued us and has or will wake us up. Wait + * for the correct wakeup, otherwise our ->lwWaiting would get + * reset at some inconvenient point later. + */ + for (;;) + { + PGSemaphoreLock(&proc->sem, false); + if (!proc->lwWaiting) + break; + extraWaits++; + } + } + PRINT_LWDEBUG("LWLockWaitForVar undo queue", lock, LW_EXCLUSIVE); + break; + } + else + { + PRINT_LWDEBUG("LWLockWaitForVar waiting 4", lock, LW_EXCLUSIVE); + } + + /* + * NB: Just as in LWLockAcquireCommon() there's no need to deal with + * spurious lock attempts here. + */ /* * Wait until awakened. @@ -1027,13 +1454,11 @@ LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval) LW_EXCLUSIVE); LOG_LWDEBUG("LWLockWaitForVar", T_NAME(lock), T_ID(lock), "awakened"); + pg_atomic_fetch_sub_u32(&lock->nwaiters, 1); /* Now loop back and check the status of the lock again. */ } - /* We are done updating shared state of the lock itself. */ - SpinLockRelease(&lock->mutex); - TRACE_POSTGRESQL_LWLOCK_ACQUIRE(T_NAME(lock), T_ID(lock), LW_EXCLUSIVE); /* @@ -1072,8 +1497,7 @@ LWLockUpdateVar(LWLock *lock, uint64 *valptr, uint64 val) /* Acquire mutex. Time spent holding mutex should be short! */ SpinLockAcquire(&lock->mutex); - /* we should hold the lock */ - Assert(lock->exclusive == 1); + Assert(pg_atomic_read_u32(&lock->lockcount) >= EXCLUSIVE_LOCK); /* Update the lock's value */ *valptr = val; @@ -1099,7 +1523,7 @@ LWLockUpdateVar(LWLock *lock, uint64 *valptr, uint64 val) /* * Awaken any waiters I removed from the queue. 
*/ - dlist_foreach_modify(iter, (dlist_head *) &wakeup) + dlist_foreach_modify(iter, &wakeup) { PGPROC *waiter = dlist_container(PGPROC, lwWaitLink, iter.cur); dlist_delete(&waiter->lwWaitLink); @@ -1116,22 +1540,23 @@ LWLockUpdateVar(LWLock *lock, uint64 *valptr, uint64 val) void LWLockRelease(LWLock *lock) { - dlist_head wakeup; - dlist_mutable_iter iter; + LWLockMode mode; + uint32 lockcount; + bool check_waiters; + bool have_waiters = false; int i; - dlist_init(&wakeup); - - PRINT_LWDEBUG("LWLockRelease", lock); - /* * Remove lock from list of locks held. Usually, but not always, it will * be the latest-acquired lock; so search array backwards. */ for (i = num_held_lwlocks; --i >= 0;) { - if (lock == held_lwlocks[i]) + if (lock == held_lwlocks[i].lock) + { + mode = held_lwlocks[i].mode; break; + } } if (i < 0) elog(ERROR, "lock %s %d is not held", T_NAME(lock), T_ID(lock)); @@ -1139,77 +1564,44 @@ LWLockRelease(LWLock *lock) for (; i < num_held_lwlocks; i++) held_lwlocks[i] = held_lwlocks[i + 1]; - /* Acquire mutex. Time spent holding mutex should be short! */ - SpinLockAcquire(&lock->mutex); + PRINT_LWDEBUG("LWLockRelease", lock, mode); - /* Release my hold on lock */ - if (lock->exclusive > 0) - lock->exclusive--; + /* Release my hold on lock, both are a full barrier */ + if (mode == LW_EXCLUSIVE) + lockcount = pg_atomic_sub_fetch_u32(&lock->lockcount, EXCLUSIVE_LOCK); else - { - Assert(lock->shared > 0); - lock->shared--; - } + lockcount = pg_atomic_sub_fetch_u32(&lock->lockcount, 1); + + /* nobody else can have that kind of lock */ + Assert(lockcount < EXCLUSIVE_LOCK); /* - * See if I need to awaken any waiters. If I released a non-last shared - * hold, there cannot be anything to do. Also, do not awaken any waiters - * if someone has already awakened waiters that haven't yet acquired the - * lock. + * Anybody we need to wakeup needs to have started queueing before + * we removed ourselves from the queue and the __sync_ operations + * above are full barriers. */ - if (lock->exclusive == 0 && lock->shared == 0 && lock->releaseOK) - { - /* - * Remove the to-be-awakened PGPROCs from the queue. - */ - bool releaseOK = true; - bool wokeup_somebody = false; - dlist_foreach_modify(iter, (dlist_head *) &lock->waiters) - { - PGPROC *waiter = dlist_container(PGPROC, lwWaitLink, iter.cur); + if (pg_atomic_read_u32(&lock->nwaiters) > 0) + have_waiters = true; - if (wokeup_somebody && waiter->lwWaitMode == LW_EXCLUSIVE) - continue; - - dlist_delete(&waiter->lwWaitLink); - dlist_push_tail(&wakeup, &waiter->lwWaitLink); - - /* - * Prevent additional wakeups until retryer gets to - * run. Backends that are just waiting for the lock to become - * free don't retry automatically. - */ - if (waiter->lwWaitMode != LW_WAIT_UNTIL_FREE) - { - releaseOK = false; - wokeup_somebody = true; - } + /* grant permission to run, even if a spurious share lock increases lockcount */ + if (mode == LW_EXCLUSIVE && have_waiters) + check_waiters = true; + /* nobody has this locked anymore, potential exclusive lockers get a chance */ + else if (lockcount == 0 && have_waiters) + check_waiters = true; + /* nobody queued or not free */ + else + check_waiters = false; - if(waiter->lwWaitMode == LW_EXCLUSIVE) - break; - } - lock->releaseOK = releaseOK; + if (check_waiters) + { + PRINT_LWDEBUG("LWLockRelease releasing", lock, mode); + LWLockWakeup(lock, mode); } - /* We are done updating shared state of the lock itself. 
*/ - SpinLockRelease(&lock->mutex); - TRACE_POSTGRESQL_LWLOCK_RELEASE(T_NAME(lock), T_ID(lock)); - /* - * Awaken any waiters I removed from the queue. - */ - dlist_foreach_modify(iter, (dlist_head *) &wakeup) - { - PGPROC *waiter = dlist_container(PGPROC, lwWaitLink, iter.cur); - LOG_LWDEBUG("LWLockRelease", T_NAME(l), T_ID(l), "release waiter"); - dlist_delete(&waiter->lwWaitLink); - pg_write_barrier(); - waiter->lwWaiting = false; - PGSemaphoreUnlock(&waiter->sem); - } - /* * Now okay to allow cancel/die interrupts. */ @@ -1233,7 +1625,7 @@ LWLockReleaseAll(void) { HOLD_INTERRUPTS(); /* match the upcoming RESUME_INTERRUPTS */ - LWLockRelease(held_lwlocks[num_held_lwlocks - 1]); + LWLockRelease(held_lwlocks[num_held_lwlocks - 1].lock); } } @@ -1241,8 +1633,8 @@ LWLockReleaseAll(void) /* * LWLockHeldByMe - test whether my process currently holds a lock * - * This is meant as debug support only. We do not distinguish whether the - * lock is held shared or exclusive. + * This is meant as debug support only. We currently do not distinguish + * whether the lock is held shared or exclusive. */ bool LWLockHeldByMe(LWLock *l) @@ -1251,7 +1643,7 @@ LWLockHeldByMe(LWLock *l) for (i = 0; i < num_held_lwlocks; i++) { - if (held_lwlocks[i] == l) + if (held_lwlocks[i].lock == l) return true; } return false; diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h index 8b20fd9f9f..3376b1435e 100644 --- a/src/include/storage/lwlock.h +++ b/src/include/storage/lwlock.h @@ -16,6 +16,7 @@ #include "lib/ilist.h" #include "storage/s_lock.h" +#include "port/atomics.h" struct PGPROC; @@ -48,10 +49,14 @@ typedef struct LWLock { slock_t mutex; /* Protects LWLock and queue of PGPROCs */ bool releaseOK; /* T if ok to release waiters */ - char exclusive; /* # of exclusive holders (0 or 1) */ - int shared; /* # of shared holders (0..MaxBackends) */ + + pg_atomic_uint32 lockcount; /* state of exlusive/nonexclusive lockers */ + pg_atomic_uint32 nwaiters; /* number of waiters */ int tranche; /* tranche ID */ dlist_head waiters; /* list of waiting PGPROCs */ +#ifdef LOCK_DEBUG + struct PGPROC *owner; +#endif } LWLock; /* diff --git a/src/include/storage/s_lock.h b/src/include/storage/s_lock.h index f49d17f0c0..f217bd6c46 100644 --- a/src/include/storage/s_lock.h +++ b/src/include/storage/s_lock.h @@ -906,6 +906,8 @@ extern int tas_sema(volatile slock_t *lock); #define S_LOCK_FREE(lock) (*(lock) == 0) #endif /* S_LOCK_FREE */ +#include "storage/barrier.h" + #if !defined(S_UNLOCK) /* * Our default implementation of S_UNLOCK is essentially *(lock) = 0. This
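One small consequence of held_lwlocks[] now recording the acquisition mode next to the lock pointer is that a mode-aware variant of LWLockHeldByMe becomes straightforward. The helper below is only a sketch of that possibility, not something this patch adds; the trimmed type definitions stand in for the real ones in lwlock.h/lwlock.c.

/* Sketch (not from the patch): mode-aware "held by me" check enabled by
 * the LWLockHandle bookkeeping introduced above. */
#include <stdbool.h>

typedef enum { LW_EXCLUSIVE, LW_SHARED, LW_WAIT_UNTIL_FREE } LWLockMode;
typedef struct LWLock LWLock;		/* opaque for this sketch */

typedef struct LWLockHandle
{
	LWLock	   *lock;
	LWLockMode	mode;
} LWLockHandle;

#define MAX_SIMUL_LWLOCKS 100

static int	num_held_lwlocks = 0;
static LWLockHandle held_lwlocks[MAX_SIMUL_LWLOCKS];

/* true iff this backend holds 'l' in exactly the given mode */
static bool
LWLockHeldByMeInMode_sketch(LWLock *l, LWLockMode mode)
{
	int			i;

	for (i = 0; i < num_held_lwlocks; i++)
	{
		if (held_lwlocks[i].lock == l && held_lwlocks[i].mode == mode)
			return true;
	}
	return false;
}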