From: Andres Freund
Date: Thu, 18 Sep 2014 14:14:16 +0000 (+0200)
Subject: Wait-free LW_SHARED lwlock acquisition
X-Git-Url: http://git.postgresql.org/gitweb/static/developers.postgresql.org?a=commitdiff_plain;h=eedffb0e25b6cc1fa37dce9c8b548605e46a26c5;p=users%2Fandresfreund%2Fpostgres.git

Wait-free LW_SHARED lwlock acquisition
---

diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index 1fd26aaaf2..82e85c971c 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -24,6 +24,82 @@
  * IDENTIFICATION
  *	  src/backend/storage/lmgr/lwlock.c
  *
+ * NOTES:
+ *
+ * This used to be a pretty straightforward reader-writer lock
+ * implementation, in which the internal state was protected by a
+ * spinlock. Unfortunately the overhead of taking the spinlock proved to be
+ * too high for workloads/locks that were locked in shared mode very
+ * frequently.
+ * Thus a new implementation was devised that provides wait-free shared lock
+ * acquisition for locks that aren't exclusively locked.
+ *
+ * The basic idea is to have a single atomic variable 'lockcount' instead of
+ * the formerly separate shared and exclusive counters, and to use an atomic
+ * increment to acquire the lock. That's fairly easy to do for rw-spinlocks,
+ * but a lot harder for something like LWLocks that want to wait in the OS.
+ *
+ * For exclusive lock acquisition we use an atomic compare-and-exchange on the
+ * lockcount variable, swapping in EXCLUSIVE_LOCK (1 << 30, i.e. 0x40000000)
+ * if and only if the current value of lockcount is 0. If the swap was not
+ * successful, we have to wait.
+ *
+ * For shared lock acquisition we use an atomic add (lock xadd) to the
+ * lockcount variable to add 1. If the resulting value is bigger than
+ * EXCLUSIVE_LOCK we know that somebody actually has an exclusive lock, and we
+ * back out by atomically decrementing by 1 again. In that case we have to
+ * wait for the exclusive locker to release the lock.
+ *
+ * To release the lock we use an atomic decrement. If the new value is zero
+ * (which we get back atomically), we know we have to wake up waiters.
+ *
+ * The attentive reader might have noticed that naively doing the above has
+ * two glaring race conditions:
+ *
+ * 1) too-quick-for-queueing: We try to lock using the atomic operations and
+ * notice that we have to wait. Unfortunately, by the time we have finished
+ * queueing, the former locker very well might have already finished its
+ * work. That's problematic because we're now stuck waiting inside the OS.
+ *
+ * 2) spurious failed locks: Due to the logic of backing out of shared
+ * locks after we unconditionally added 1 to lockcount, we might have
+ * prevented another exclusive locker from getting the lock:
+ * 1) Session A: LWLockAcquire(LW_EXCLUSIVE) - success
+ * 2) Session B: LWLockAcquire(LW_SHARED) - lockcount += 1
+ * 3) Session B: LWLockAcquire(LW_SHARED) - oops, bigger than EXCLUSIVE_LOCK
+ * 4) Session A: LWLockRelease()
+ * 5) Session C: LWLockAcquire(LW_EXCLUSIVE) - check if lockcount = 0, no. wait.
+ * 6) Session B: LWLockAcquire(LW_SHARED) - lockcount -= 1
+ * 7) Session B: LWLockAcquire(LW_SHARED) - wait
+ *
+ * So we'd now have both B and C waiting on a lock that nobody is holding
+ * anymore. Not good.
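The fast path just described is easiest to see in isolation. Below is a minimal stand-alone sketch, not part of the patch: it uses C11 atomics rather than the pg_atomic_* wrappers, and demo_lwlock/demo_try_lock are invented names for illustration; the real code additionally has to cooperate with the wait queue discussed next.

/* Sketch (not from the patch): single-word rw-lock fast path with C11 atomics. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define EXCLUSIVE_LOCK	(UINT32_C(1) << 30)	/* matches ((uint32) 1) << (31 - 1) */

typedef struct
{
	_Atomic uint32_t lockcount;		/* exclusive bit plus shared holder count */
} demo_lwlock;

/* Returns true if the lock was acquired; false means the caller must wait. */
static bool
demo_try_lock(demo_lwlock *lock, bool exclusive)
{
	if (exclusive)
	{
		uint32_t	expected = 0;

		/* succeeds only if no shared or exclusive holder exists */
		return atomic_compare_exchange_strong(&lock->lockcount,
											  &expected, EXCLUSIVE_LOCK);
	}
	else
	{
		/* optimistically register as a shared holder ... */
		uint32_t	old = atomic_fetch_add(&lock->lockcount, 1);

		if (old >= EXCLUSIVE_LOCK)
		{
			/* ... and back out again if an exclusive holder was present */
			atomic_fetch_sub(&lock->lockcount, 1);
			return false;
		}
		return true;
	}
}

Note how the shared path's back-out is exactly what can make an exclusive locker spuriously believe the lock is taken, i.e. race 2) above; that is why the patch later threads a potentially_spurious flag through LWLockAttemptLock.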
+ *
+ * To mitigate those races we use a two-phase attempt at locking:
+ * Phase 1: Try to do it atomically; if we succeed, nice
+ * Phase 2: Add ourselves to the wait queue of the lock
+ * Phase 3: Try to grab the lock again; if we succeed, remove ourselves from
+ * the queue
+ * Phase 4: Sleep till wakeup, then go to Phase 1
+ *
+ * This protects us against both problems from above:
+ * 1) Nobody can release too quickly, before we're queued, since after Phase 2
+ * we're already queued.
+ * 2) If somebody spuriously got blocked from acquiring the lock, they will
+ * get queued in Phase 2 and we can wake them up if necessary, or they will
+ * have gotten the lock in Phase 3.
+ *
+ * The above algorithm only works for LWLockAcquire, not directly for
+ * LWLockConditionalAcquire, where we don't want to wait. In that case we just
+ * need to retry acquiring the lock until we're sure we didn't disturb anybody
+ * in doing so.
+ *
+ * TODO:
+ * - decide if we need a spinlock fallback
+ * - expand documentation
+ * - make LWLOCK_STATS do something sensible again
+ * - make LOCK_DEBUG output nicer
  *-------------------------------------------------------------------------
  */
 #include "postgres.h"
@@ -35,7 +111,6 @@
 #include "miscadmin.h"
 #include "pg_trace.h"
 #include "replication/slot.h"
-#include "storage/barrier.h"
 #include "storage/ipc.h"
 #include "storage/predicate.h"
 #include "storage/proc.h"
@@ -50,6 +125,10 @@
 /* We use the ShmemLock spinlock to protect LWLockAssign */
 extern slock_t *ShmemLock;

+#define EXCLUSIVE_LOCK (((uint32) 1) << (31 - 1))
+/* must be greater than MAX_BACKENDS */
+#define SHARED_LOCK_MASK (~EXCLUSIVE_LOCK)
+
 /*
  * This is indexed by tranche ID and stores metadata for all tranches known
  * to the current backend.
@@ -80,8 +159,14 @@ static LWLockTranche MainLWLockTranche;
  */
 #define MAX_SIMUL_LWLOCKS	100

+typedef struct LWLockHandle
+{
+	LWLock	   *lock;
+	LWLockMode	mode;
+} LWLockHandle;
+
 static int	num_held_lwlocks = 0;
-static LWLock *held_lwlocks[MAX_SIMUL_LWLOCKS];
+static LWLockHandle held_lwlocks[MAX_SIMUL_LWLOCKS];

 static int	lock_addin_request = 0;
 static bool lock_addin_request_allowed = true;
@@ -100,8 +185,11 @@ typedef struct lwlock_stats
 {
 	lwlock_stats_key key;
 	int			sh_acquire_count;
+	int			sh_attempt_backout;
 	int			ex_acquire_count;
+	int			ex_race;
 	int			block_count;
+	int			dequeue_self_count;
 	int			spin_delay_count;
 } lwlock_stats;

@@ -113,23 +201,30 @@ static lwlock_stats lwlock_stats_dummy;
 bool		Trace_lwlocks = false;

 inline static void
-PRINT_LWDEBUG(const char *where, const LWLock *lock)
+PRINT_LWDEBUG(const char *where, const LWLock *lock, LWLockMode mode)
 {
 	if (Trace_lwlocks)
-		elog(LOG, "%s(%s %d): excl %d shared %d rOK %d",
+	{
+		uint32 lockcount = pg_atomic_read_u32(&lock->lockcount);
+
+		elog(LOG, "%d: %s(%s %d): excl %u shared %u waiters %u rOK %d\n",
+			 MyProcPid,
 			 where, T_NAME(lock), T_ID(lock),
-			 (int) lock->exclusive, lock->shared,
+			 lockcount >= EXCLUSIVE_LOCK,
+			 lockcount & SHARED_LOCK_MASK,
+			 pg_atomic_read_u32(&lock->nwaiters),
 			 (int) lock->releaseOK);
+	}
 }

 inline static void
 LOG_LWDEBUG(const char *where, const char *name, int index, const char *msg)
 {
 	if (Trace_lwlocks)
-		elog(LOG, "%s(%s %d): %s", where, name, index, msg);
+		elog(LOG, "%d: %s(%s %d): %s\n", MyProcPid, where, name, index, msg);
 }

 #else							/* not LOCK_DEBUG */
-#define PRINT_LWDEBUG(a,b)
+#define PRINT_LWDEBUG(a,b,c)
 #define LOG_LWDEBUG(a,b,c,d)
 #endif   /* LOCK_DEBUG */

@@ -192,11 +287,12 @@ print_lwlock_stats(int code, Datum arg)
 	while ((lwstats = (lwlock_stats *) hash_seq_search(&scan)) !=
NULL) { fprintf(stderr, - "PID %d lwlock %s %d: shacq %u exacq %u blk %u spindelay %u\n", + "PID %d lwlock %s %d: shacq %u exacq %u blk %u spindelay %u, backout %u, ex race %u, dequeue self %u\n", MyProcPid, LWLockTrancheArray[lwstats->key.tranche]->name, lwstats->key.instance, lwstats->sh_acquire_count, lwstats->ex_acquire_count, lwstats->block_count, - lwstats->spin_delay_count); + lwstats->spin_delay_count, lwstats->sh_attempt_backout, + lwstats->ex_race, lwstats->dequeue_self_count); } LWLockRelease(&MainLWLockArray[0].lock); @@ -224,8 +320,11 @@ get_lwlock_stats_entry(LWLock *lock) if (!found) { lwstats->sh_acquire_count = 0; + lwstats->sh_attempt_backout = 0; lwstats->ex_acquire_count = 0; + lwstats->ex_race = 0; lwstats->block_count = 0; + lwstats->dequeue_self_count = 0; lwstats->spin_delay_count = 0; } return lwstats; @@ -473,12 +572,299 @@ LWLockInitialize(LWLock *lock, int tranche_id) { SpinLockInit(&lock->mutex); lock->releaseOK = true; - lock->exclusive = 0; - lock->shared = 0; + pg_atomic_init_u32(&lock->lockcount, 0); + pg_atomic_init_u32(&lock->nwaiters, 0); lock->tranche = tranche_id; dlist_init(&lock->waiters); } +/* + * Internal function handling the atomic manipulation of lock->lockcount. + * + * 'double_check' = true means that we try to check more carefully + * against causing somebody else to spuriously believe the lock is + * already taken, although we're just about to back out of it. + */ +static inline bool +LWLockAttemptLock(LWLock* l, LWLockMode mode, bool double_check, bool *potentially_spurious) +{ + bool mustwait; + uint32 oldstate; +#ifdef LWLOCK_STATS + lwlock_stats *lwstats; + lwstats = get_lwlock_stats_entry(l); +#endif + + Assert(mode == LW_EXCLUSIVE || mode == LW_SHARED); + + *potentially_spurious = false; + + if (mode == LW_EXCLUSIVE) + { + uint32 expected; + + /* check without CAS first; it's way cheaper, frequently locked otherwise */ + expected = pg_atomic_read_u32(&l->lockcount); + + Assert(expected < EXCLUSIVE_LOCK + (1 << 16)); + + if (expected != 0) + mustwait = true; + else if (!pg_atomic_compare_exchange_u32(&l->lockcount, + &expected, EXCLUSIVE_LOCK)) + { + /* + * ok, no can do. Between the pg_atomic_read() above and the + * CAS somebody else acquired the lock. + */ + mustwait = true; + Assert(expected < EXCLUSIVE_LOCK + (1 << 16)); + } + else + { + /* yipeyyahee */ + mustwait = false; +#ifdef LOCK_DEBUG + l->owner = MyProc; +#endif + Assert(expected == 0); + } + } + else + { + /* + * If requested by caller, do an unlocked check first. This is useful + * if potentially spurious results have a noticeable cost. + */ + if (double_check) + { + if (pg_atomic_read_u32(&l->lockcount) >= EXCLUSIVE_LOCK) + { + mustwait = true; + goto out; + } + } + + /* + * Acquire the share lock unconditionally using an atomic addition. We + * might have to back out again if it turns out somebody else has an + * exclusive lock. + */ + oldstate = pg_atomic_fetch_add_u32(&l->lockcount, 1); + + if (oldstate >= EXCLUSIVE_LOCK) + { + /* + * Ok, somebody else holds the lock exclusively. We need to back + * away from the shared lock, since we don't actually hold it right + * now. Since there's a window between lockcount += 1 and lockcount + * -= 1, the previous exclusive locker could have released and + * another exclusive locker could have seen our +1. We need to + * signal that to the upper layers so they can deal with the race + * condition. + */ + + /* + * FIXME: check return value if (double_check), it's not + * spurious if still exclusively locked. 
+ */ + pg_atomic_fetch_sub_u32(&l->lockcount, 1); + + + mustwait = true; + *potentially_spurious = true; +#ifdef LWLOCK_STATS + lwstats->sh_attempt_backout++; +#endif + } + else + { + /* yipeyyahee */ + mustwait = false; + } + } + +out: + return mustwait; +} + +/* + * Wakeup all the lockers that currently have a chance to run. + */ +static void +LWLockWakeup(LWLock *lock, LWLockMode mode) +{ + bool releaseOK; + bool wokeup_somebody = false; + dlist_head wakeup; + dlist_mutable_iter iter; +#ifdef LWLOCK_STATS + lwlock_stats *lwstats; + lwstats = get_lwlock_stats_entry(l); +#endif + + dlist_init(&wakeup); + + /* remove the to-be-awakened PGPROCs from the queue */ + releaseOK = true; + + /* Acquire mutex. Time spent holding mutex should be short! */ +#ifdef LWLOCK_STATS + lwstats->spin_delay_count += SpinLockAcquire(&lock->mutex); +#else + SpinLockAcquire(&lock->mutex); +#endif + + /* + * We're still waiting for backends to get scheduled, don't wake them up + * again. + */ + if (!lock->releaseOK) + { + SpinLockRelease(&lock->mutex); + PRINT_LWDEBUG("LWLockRelease skip releaseok", lock, mode); + return; + } + + dlist_foreach_modify(iter, (dlist_head *) &lock->waiters) + { + PGPROC *waiter = dlist_container(PGPROC, lwWaitLink, iter.cur); + + if (wokeup_somebody && waiter->lwWaitMode == LW_EXCLUSIVE) + continue; + + dlist_delete(&waiter->lwWaitLink); + dlist_push_tail(&wakeup, &waiter->lwWaitLink); + + if (waiter->lwWaitMode != LW_WAIT_UNTIL_FREE) + { + /* + * Prevent additional wakeups until retryer gets to run. Backends + * that are just waiting for the lock to become free don't retry + * automatically. + */ + releaseOK = false; + /* + * Don't wakeup (further) exclusive locks. + */ + wokeup_somebody = true; + } + + /* + * Once we've woken up an exclusive lock, there's no point in waking + * up anybody else. + */ + if(waiter->lwWaitMode == LW_EXCLUSIVE) + break; + } + lock->releaseOK = releaseOK; + + + /* We are done updating shared state of the lock queue. */ + SpinLockRelease(&lock->mutex); + + /* + * Awaken any waiters I removed from the queue. + */ + dlist_foreach_modify(iter, &wakeup) + { + PGPROC *waiter = dlist_container(PGPROC, lwWaitLink, iter.cur); + + LOG_LWDEBUG("LWLockRelease", T_NAME(l), T_ID(l), "release waiter"); + dlist_delete(&waiter->lwWaitLink); + pg_write_barrier(); + waiter->lwWaiting = false; + PGSemaphoreUnlock(&waiter->sem); + } +} + +/* + * Add ourselves to the end of the queue. Mode can be LW_WAIT_UNTIL_FREE here! + */ +static inline void +LWLockQueueSelf(LWLock *lock, LWLockMode mode) +{ +#ifdef LWLOCK_STATS + lwlock_stats *lwstats; + lwstats = get_lwlock_stats_entry(l); +#endif + + /* + * If we don't have a PGPROC structure, there's no way to wait. This + * should never occur, since MyProc should only be null during shared + * memory initialization. 
+ */ + if (MyProc == NULL) + elog(PANIC, "cannot wait without a PGPROC structure"); + + pg_atomic_fetch_add_u32(&lock->nwaiters, 1); + +#ifdef LWLOCK_STATS + lwstats->spin_delay_count += SpinLockAcquire(&lock->mutex); +#else + SpinLockAcquire(&lock->mutex); +#endif + + if (MyProc->lwWaiting) + elog(PANIC, "queueing for lock while waiting on another one"); + + MyProc->lwWaiting = true; + MyProc->lwWaitMode = mode; + + /* LW_WAIT_UNTIL_FREE waiters are always at the front of the queue */ + if (mode == LW_WAIT_UNTIL_FREE) + dlist_push_head((dlist_head *) &lock->waiters, &MyProc->lwWaitLink); + else + dlist_push_tail((dlist_head *) &lock->waiters, &MyProc->lwWaitLink); + + /* Can release the mutex now */ + SpinLockRelease(&lock->mutex); +} + +/* + * Remove ourselves from the waitlist. This is used if we queued ourselves + * because we thought we needed to sleep but, after further checking, we + * discovered that we don't actually need to do so. Somebody else might have + * already woken us up though, in that case return false. + */ +static inline bool +LWLockDequeueSelf(LWLock *lock) +{ + bool found = false; + dlist_mutable_iter iter; + +#ifdef LWLOCK_STATS + lwlock_stats *lwstats; + lwstats = get_lwlock_stats_entry(l); +#endif + +#ifdef LWLOCK_STATS + lwstats->spin_delay_count += SpinLockAcquire(&lock->mutex); +#else + SpinLockAcquire(&lock->mutex); +#endif + + /* need to iterate, somebody else could have unqueued us */ + dlist_foreach_modify(iter, (dlist_head *) &lock->waiters) + { + PGPROC *proc = dlist_container(PGPROC, lwWaitLink, iter.cur); + if (proc == MyProc) + { + found = true; + dlist_delete(&proc->lwWaitLink); + break; + } + } + + /* clear waiting state again, nice for debugging */ + if (found) + MyProc->lwWaiting = false; + + SpinLockRelease(&lock->mutex); + + pg_atomic_fetch_sub_u32(&lock->nwaiters, 1); + return found; +} /* * LWLockAcquire - acquire a lightweight lock in the specified mode @@ -510,14 +896,17 @@ static inline bool LWLockAcquireCommon(LWLock *lock, LWLockMode mode, uint64 *valptr, uint64 val) { PGPROC *proc = MyProc; - bool retry = false; bool result = true; int extraWaits = 0; + bool potentially_spurious; + #ifdef LWLOCK_STATS lwlock_stats *lwstats; #endif - PRINT_LWDEBUG("LWLockAcquire", lock); + AssertArg(mode == LW_SHARED || mode == LW_EXCLUSIVE); + + PRINT_LWDEBUG("LWLockAcquire", lock, mode); #ifdef LWLOCK_STATS lwstats = get_lwlock_stats_entry(lock); @@ -567,58 +956,77 @@ LWLockAcquireCommon(LWLock *lock, LWLockMode mode, uint64 *valptr, uint64 val) { bool mustwait; - /* Acquire mutex. Time spent holding mutex should be short! */ -#ifdef LWLOCK_STATS - lwstats->spin_delay_count += SpinLockAcquire(&lock->mutex); -#else - SpinLockAcquire(&lock->mutex); -#endif + /* + * try to grab the lock the first time, we're not in the waitqueue yet. + */ + mustwait = LWLockAttemptLock(lock, mode, false, &potentially_spurious); - /* If retrying, allow LWLockRelease to release waiters again */ - if (retry) - lock->releaseOK = true; + if (!mustwait) + { + LOG_LWDEBUG("LWLockAcquire", T_NAME(l), T_ID(l), "success"); + break; /* got the lock */ + } - /* If I can get the lock, do so quickly. */ - if (mode == LW_EXCLUSIVE) + /* + * Ok, at this point we couldn't grab the lock on the first try. We + * cannot simply queue ourselves to the end of the list and wait to be + * woken up because by now the lock could long have been released. + * Instead add us to the queue and try to grab the lock again. 
If we + * succeed we need to revert the queuing and be happy, otherwise we + * recheck the lock. If we still couldn't grab it, we know that the + * other lock will see our queue entries when releasing since they + * existed before we checked for the lock. + * FIXME: add note referring to overall notes + */ + + /* add to the queue */ + LWLockQueueSelf(lock, mode); + + /* we're now guaranteed to be woken up if necessary */ + mustwait = LWLockAttemptLock(lock, mode, false, &potentially_spurious); + + /* ok, grabbed the lock the second time round, need to undo queueing */ + if (!mustwait) { - if (lock->exclusive == 0 && lock->shared == 0) +#ifdef LWLOCK_STATS + lwstats->dequeue_self_count++; +#endif + if (!LWLockDequeueSelf(lock)) { - lock->exclusive++; - mustwait = false; + /* + * Somebody else dequeued us and has or will wake us up. Wait + * for the correct wakeup, otherwise our ->lwWaiting would get + * reset at some inconvenient point later, and releaseOk + * wouldn't be managed correctly. + */ + for (;;) + { + PGSemaphoreLock(&proc->sem, false); + if (!proc->lwWaiting) + break; + extraWaits++; + } + /* + * Reset releaseOk - if somebody woke us they'll have set it + * to false. + */ + SpinLockAcquire(&lock->mutex); + lock->releaseOK = true; + SpinLockRelease(&lock->mutex); } - else - mustwait = true; + PRINT_LWDEBUG("LWLockAcquire success: undo queue", lock, mode); + break; } else { - if (lock->exclusive == 0) - { - lock->shared++; - mustwait = false; - } - else - mustwait = true; + PRINT_LWDEBUG("LWLockAcquire waiting 4", lock, mode); } - if (!mustwait) - break; /* got the lock */ - /* - * Add myself to wait queue. - * - * If we don't have a PGPROC structure, there's no way to wait. This - * should never occur, since MyProc should only be null during shared - * memory initialization. + * NB: There's no need to deal with spurious lock attempts + * here. Anyone we prevented from acquiring the lock will + * enqueue themselves using the same protocol we used here. */ - if (proc == NULL) - elog(PANIC, "cannot wait without a PGPROC structure"); - - proc->lwWaiting = true; - proc->lwWaitMode = mode; - dlist_push_head((dlist_head *) &lock->waiters, &proc->lwWaitLink); - - /* Can release the mutex now */ - SpinLockRelease(&lock->mutex); /* * Wait until awakened. @@ -653,8 +1061,14 @@ LWLockAcquireCommon(LWLock *lock, LWLockMode mode, uint64 *valptr, uint64 val) LOG_LWDEBUG("LWLockAcquire", T_NAME(lock), T_ID(lock), "awakened"); + /* not waiting anymore */ + pg_atomic_fetch_sub_u32(&lock->nwaiters, 1); + /* Now loop back and try to acquire lock again. */ - retry = true; + SpinLockAcquire(&lock->mutex); + lock->releaseOK = true; + SpinLockRelease(&lock->mutex); + result = false; } @@ -662,13 +1076,11 @@ LWLockAcquireCommon(LWLock *lock, LWLockMode mode, uint64 *valptr, uint64 val) if (valptr) *valptr = val; - /* We are done updating shared state of the lock itself. */ - SpinLockRelease(&lock->mutex); - TRACE_POSTGRESQL_LWLOCK_ACQUIRE(T_NAME(lock), T_ID(lock), mode); /* Add lock to list of locks held by this backend */ - held_lwlocks[num_held_lwlocks++] = lock; + held_lwlocks[num_held_lwlocks].lock = lock; + held_lwlocks[num_held_lwlocks++].mode = mode; /* * Fix the process wait semaphore's count for any absorbed wakeups. 
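The queue-then-retry loop that LWLockAcquireCommon now runs can be summarized in a self-contained sketch. This is an illustration under stated assumptions, not the patch's code: it repeats the fast-path helper from the earlier sketch so it stays self-contained, and it collapses the dlist wait queue plus per-backend semaphores into a waiter counter and a pthread condition variable, so none of the releaseOK/potentially_spurious bookkeeping is needed here; all demo_* names are invented.

/* Sketch (not from the patch): the four-phase acquire loop, with the wait
 * queue approximated by a waiter counter plus a condition variable. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define EXCLUSIVE_LOCK	(UINT32_C(1) << 30)

typedef struct
{
	_Atomic uint32_t lockcount;
	_Atomic uint32_t nwaiters;
	pthread_mutex_t mutex;			/* stands in for lock->mutex */
	pthread_cond_t wakeup;			/* stands in for the per-backend semaphore */
} demo_lwlock;

#define DEMO_LWLOCK_INIT \
	{0, 0, PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER}

static bool
demo_try_lock(demo_lwlock *lock, bool exclusive)
{
	if (exclusive)
	{
		uint32_t	expected = 0;

		return atomic_compare_exchange_strong(&lock->lockcount,
											  &expected, EXCLUSIVE_LOCK);
	}
	if (atomic_fetch_add(&lock->lockcount, 1) >= EXCLUSIVE_LOCK)
	{
		atomic_fetch_sub(&lock->lockcount, 1);	/* back out, must wait */
		return false;
	}
	return true;
}

static void
demo_acquire(demo_lwlock *lock, bool exclusive)
{
	for (;;)
	{
		/* Phase 1: try without queueing */
		if (demo_try_lock(lock, exclusive))
			return;

		/* Phase 2: register as a waiter before going to sleep */
		atomic_fetch_add(&lock->nwaiters, 1);

		/* Phase 3: retry; the holder may have released since Phase 1 */
		if (demo_try_lock(lock, exclusive))
		{
			atomic_fetch_sub(&lock->nwaiters, 1);	/* undo queueing */
			return;
		}

		/* Phase 4: sleep until a releaser signals us, then start over */
		pthread_mutex_lock(&lock->mutex);
		while (exclusive
			   ? atomic_load(&lock->lockcount) != 0
			   : atomic_load(&lock->lockcount) >= EXCLUSIVE_LOCK)
			pthread_cond_wait(&lock->wakeup, &lock->mutex);
		pthread_mutex_unlock(&lock->mutex);

		atomic_fetch_sub(&lock->nwaiters, 1);
	}
}

static void
demo_release(demo_lwlock *lock, bool exclusive)
{
	atomic_fetch_sub(&lock->lockcount, exclusive ? EXCLUSIVE_LOCK : 1);

	if (atomic_load(&lock->nwaiters) > 0)
	{
		/* broadcast is crude but safe; the patch wakes waiters selectively */
		pthread_mutex_lock(&lock->mutex);
		pthread_cond_broadcast(&lock->wakeup);
		pthread_mutex_unlock(&lock->mutex);
	}
}

Because the sketch re-checks the lock word under the mutex before sleeping, a condition variable already rules out the too-quick-for-queueing and lost-wakeup problems; the patch has to obtain the same guarantee from bare semaphores, which is exactly what LWLockQueueSelf/LWLockDequeueSelf and the releaseOK flag are for.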
@@ -690,8 +1102,11 @@ bool LWLockConditionalAcquire(LWLock *lock, LWLockMode mode) { bool mustwait; + bool potentially_spurious; + + AssertArg(mode == LW_SHARED || mode == LW_EXCLUSIVE); - PRINT_LWDEBUG("LWLockConditionalAcquire", lock); + PRINT_LWDEBUG("LWLockConditionalAcquire", l, mode); /* Ensure we will have room to remember the lock */ if (num_held_lwlocks >= MAX_SIMUL_LWLOCKS) @@ -704,50 +1119,44 @@ LWLockConditionalAcquire(LWLock *lock, LWLockMode mode) */ HOLD_INTERRUPTS(); - /* Acquire mutex. Time spent holding mutex should be short! */ - SpinLockAcquire(&lock->mutex); - - /* If I can get the lock, do so quickly. */ - if (mode == LW_EXCLUSIVE) - { - if (lock->exclusive == 0 && lock->shared == 0) - { - lock->exclusive++; - mustwait = false; - } - else - mustwait = true; - } - else - { - if (lock->exclusive == 0) - { - lock->shared++; - mustwait = false; - } - else - mustwait = true; - } - - /* We are done updating shared state of the lock itself. */ - SpinLockRelease(&lock->mutex); +retry: + /* + * passing 'true' to check more carefully to avoid potential + * spurious acquisitions + */ + mustwait = LWLockAttemptLock(lock, mode, true, &potentially_spurious); if (mustwait) { /* Failed to get lock, so release interrupt holdoff */ RESUME_INTERRUPTS(); + LOG_LWDEBUG("LWLockConditionalAcquire", T_NAME(lock), T_ID(lock), "failed"); - TRACE_POSTGRESQL_LWLOCK_CONDACQUIRE_FAIL(T_NAME(lock), - T_ID(lock), mode); + TRACE_POSTGRESQL_LWLOCK_CONDACQUIRE_FAIL(T_NAME(lock), T_ID(lock), mode); + + /* + * We ran into an exclusive lock and might have blocked another + * exclusive lock from taking a shot because it took a time to back + * off. Retry till we are either sure we didn't block somebody (because + * somebody else certainly has the lock) or till we got it. + * + * We cannot rely on the two-step lock-acquisition protocol as in + * LWLockAcquire because we're not using it. + */ + if (potentially_spurious) + { + SPIN_DELAY(); + goto retry; + } } else { /* Add lock to list of locks held by this backend */ - held_lwlocks[num_held_lwlocks++] = lock; + held_lwlocks[num_held_lwlocks].lock = lock; + held_lwlocks[num_held_lwlocks++].mode = mode; TRACE_POSTGRESQL_LWLOCK_CONDACQUIRE(T_NAME(lock), T_ID(lock), mode); } - return !mustwait; } @@ -771,11 +1180,15 @@ LWLockAcquireOrWait(LWLock *lock, LWLockMode mode) PGPROC *proc = MyProc; bool mustwait; int extraWaits = 0; + bool potentially_spurious_first; + bool potentially_spurious_second; #ifdef LWLOCK_STATS lwlock_stats *lwstats; #endif - PRINT_LWDEBUG("LWLockAcquireOrWait", lock); + Assert(mode == LW_SHARED || mode == LW_EXCLUSIVE); + + PRINT_LWDEBUG("LWLockAcquireOrWait", lock, mode); #ifdef LWLOCK_STATS lwstats = get_lwlock_stats_entry(lock); @@ -792,81 +1205,58 @@ LWLockAcquireOrWait(LWLock *lock, LWLockMode mode) */ HOLD_INTERRUPTS(); - /* Acquire mutex. Time spent holding mutex should be short! */ - SpinLockAcquire(&lock->mutex); - - /* If I can get the lock, do so quickly. */ - if (mode == LW_EXCLUSIVE) - { - if (lock->exclusive == 0 && lock->shared == 0) - { - lock->exclusive++; - mustwait = false; - } - else - mustwait = true; - } - else - { - if (lock->exclusive == 0) - { - lock->shared++; - mustwait = false; - } - else - mustwait = true; - } + /* + * NB: We're using nearly the same twice-in-a-row lock acquisition + * protocol as LWLockAcquire(). Check its comments for details. + */ + mustwait = LWLockAttemptLock(lock, mode, false, &potentially_spurious_first); if (mustwait) { - /* - * Add myself to wait queue. 
- * - * If we don't have a PGPROC structure, there's no way to wait. This - * should never occur, since MyProc should only be null during shared - * memory initialization. - */ - if (proc == NULL) - elog(PANIC, "cannot wait without a PGPROC structure"); - - proc->lwWaiting = true; - proc->lwWaitMode = LW_WAIT_UNTIL_FREE; - dlist_push_head((dlist_head *) &lock->waiters, &proc->lwWaitLink); + LWLockQueueSelf(lock, LW_WAIT_UNTIL_FREE); - /* Can release the mutex now */ - SpinLockRelease(&lock->mutex); + mustwait = LWLockAttemptLock(lock, mode, false, &potentially_spurious_second); - /* - * Wait until awakened. Like in LWLockAcquire, be prepared for bogus - * wakups, because we share the semaphore with ProcWaitForSignal. - */ - LOG_LWDEBUG("LWLockAcquireOrWait", T_NAME(lock), T_ID(lock), - "waiting"); + if (mustwait) + { + /* + * Wait until awakened. Like in LWLockAcquire, be prepared for bogus + * wakups, because we share the semaphore with ProcWaitForSignal. + */ + LOG_LWDEBUG("LWLockAcquireOrWait", T_NAME(lock), T_ID(lock), + "waiting"); #ifdef LWLOCK_STATS - lwstats->block_count++; + lwstats->block_count++; #endif + TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(l), T_ID(l), mode); - TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(lock), T_ID(lock), mode); + for (;;) + { + /* "false" means cannot accept cancel/die interrupt here. */ + PGSemaphoreLock(&proc->sem, false); + if (!proc->lwWaiting) + break; + extraWaits++; + } + pg_atomic_fetch_sub_u32(&lock->nwaiters, 1); - for (;;) + TRACE_POSTGRESQL_LWLOCK_WAIT_DONE(T_NAME(lock), T_ID(lock), mode); + } + else { - /* "false" means cannot accept cancel/die interrupt here. */ - PGSemaphoreLock(&proc->sem, false); - if (!proc->lwWaiting) - break; - extraWaits++; + /* got lock in the second attempt, undo queueing */ + if (!LWLockDequeueSelf(lock)) + { + for (;;) + { + PGSemaphoreLock(&proc->sem, false); + if (!proc->lwWaiting) + break; + extraWaits++; + } + } } - - TRACE_POSTGRESQL_LWLOCK_WAIT_DONE(T_NAME(lock), T_ID(lock), mode); - - LOG_LWDEBUG("LWLockAcquireOrWait", T_NAME(lock), T_ID(lock), - "awakened"); - } - else - { - /* We are done updating shared state of the lock itself. */ - SpinLockRelease(&lock->mutex); } /* @@ -885,10 +1275,11 @@ LWLockAcquireOrWait(LWLock *lock, LWLockMode mode) } else { + LOG_LWDEBUG("LWLockAcquireOrWait", T_NAME(lock), T_ID(lock), "succeeded"); /* Add lock to list of locks held by this backend */ - held_lwlocks[num_held_lwlocks++] = lock; - TRACE_POSTGRESQL_LWLOCK_ACQUIRE_OR_WAIT(T_NAME(lock), T_ID(lock), - mode); + held_lwlocks[num_held_lwlocks].lock = lock; + held_lwlocks[num_held_lwlocks++].mode = mode; + TRACE_POSTGRESQL_LWLOCK_ACQUIRE_OR_WAIT(T_NAME(lock), T_ID(lock), mode); } return !mustwait; @@ -922,7 +1313,7 @@ LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval) lwlock_stats *lwstats; #endif - PRINT_LWDEBUG("LWLockWaitForVar", lock); + PRINT_LWDEBUG("LWLockWaitForVar", lock, LW_EXCLUSIVE); #ifdef LWLOCK_STATS lwstats = get_lwlock_stats_entry(lock); @@ -935,7 +1326,7 @@ LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval) * barrier here as far as the current usage is concerned. But that might * not be safe in general. */ - if (lock->exclusive == 0) + if (pg_atomic_read_u32(&lock->lockcount) == 0) return true; /* @@ -953,21 +1344,16 @@ LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval) bool mustwait; uint64 value; - /* Acquire mutex. Time spent holding mutex should be short! 
*/ -#ifdef LWLOCK_STATS - lwstats->spin_delay_count += SpinLockAcquire(&lock->mutex); -#else - SpinLockAcquire(&lock->mutex); -#endif + mustwait = pg_atomic_read_u32(&lock->lockcount) != 0; - /* Is the lock now free, and if not, does the value match? */ - if (lock->exclusive == 0) - { - result = true; - mustwait = false; - } - else + if (mustwait) { + /* + * Perform comparison using spinlock as we can't rely on atomic 64 + * bit reads/stores. + */ + SpinLockAcquire(&lock->mutex); + value = *valptr; if (value != oldval) { @@ -977,21 +1363,62 @@ LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval) } else mustwait = true; + SpinLockRelease(&lock->mutex); } + else + mustwait = false; if (!mustwait) break; /* the lock was free or value didn't match */ /* - * Add myself to wait queue. + * Add myself to wait queue. Note that this is racy, somebody else + * could wakeup before we're finished queuing. + * NB: We're using nearly the same twice-in-a-row lock acquisition + * protocol as LWLockAcquire(). Check its comments for details. */ - proc->lwWaiting = true; - proc->lwWaitMode = LW_WAIT_UNTIL_FREE; - /* waiters are added to the front of the queue */ - dlist_push_head((dlist_head *) &lock->waiters, &proc->lwWaitLink); + LWLockQueueSelf(lock, LW_WAIT_UNTIL_FREE); - /* Can release the mutex now */ - SpinLockRelease(&lock->mutex); + /* + * We're now guaranteed to be woken up if necessary. Recheck the + * lock's state. + */ + pg_read_barrier(); + mustwait = pg_atomic_read_u32(&lock->lockcount) != 0; + + /* ok, grabbed the lock the second time round, need to undo queueing */ + if (!mustwait) + { +#ifdef LWLOCK_STATS + lwstats->dequeue_self_count++; +#endif + if (!LWLockDequeueSelf(lock)) + { + /* + * Somebody else dequeued us and has or will wake us up. Wait + * for the correct wakeup, otherwise our ->lwWaiting would get + * reset at some inconvenient point later. + */ + for (;;) + { + PGSemaphoreLock(&proc->sem, false); + if (!proc->lwWaiting) + break; + extraWaits++; + } + } + PRINT_LWDEBUG("LWLockWaitForVar undo queue", lock, LW_EXCLUSIVE); + break; + } + else + { + PRINT_LWDEBUG("LWLockWaitForVar waiting 4", lock, LW_EXCLUSIVE); + } + + /* + * NB: Just as in LWLockAcquireCommon() there's no need to deal with + * spurious lock attempts here. + */ /* * Wait until awakened. @@ -1027,13 +1454,11 @@ LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval) LW_EXCLUSIVE); LOG_LWDEBUG("LWLockWaitForVar", T_NAME(lock), T_ID(lock), "awakened"); + pg_atomic_fetch_sub_u32(&lock->nwaiters, 1); /* Now loop back and check the status of the lock again. */ } - /* We are done updating shared state of the lock itself. */ - SpinLockRelease(&lock->mutex); - TRACE_POSTGRESQL_LWLOCK_ACQUIRE(T_NAME(lock), T_ID(lock), LW_EXCLUSIVE); /* @@ -1072,8 +1497,7 @@ LWLockUpdateVar(LWLock *lock, uint64 *valptr, uint64 val) /* Acquire mutex. Time spent holding mutex should be short! */ SpinLockAcquire(&lock->mutex); - /* we should hold the lock */ - Assert(lock->exclusive == 1); + Assert(pg_atomic_read_u32(&lock->lockcount) >= EXCLUSIVE_LOCK); /* Update the lock's value */ *valptr = val; @@ -1099,7 +1523,7 @@ LWLockUpdateVar(LWLock *lock, uint64 *valptr, uint64 val) /* * Awaken any waiters I removed from the queue. 
*/ - dlist_foreach_modify(iter, (dlist_head *) &wakeup) + dlist_foreach_modify(iter, &wakeup) { PGPROC *waiter = dlist_container(PGPROC, lwWaitLink, iter.cur); dlist_delete(&waiter->lwWaitLink); @@ -1116,22 +1540,23 @@ LWLockUpdateVar(LWLock *lock, uint64 *valptr, uint64 val) void LWLockRelease(LWLock *lock) { - dlist_head wakeup; - dlist_mutable_iter iter; + LWLockMode mode; + uint32 lockcount; + bool check_waiters; + bool have_waiters = false; int i; - dlist_init(&wakeup); - - PRINT_LWDEBUG("LWLockRelease", lock); - /* * Remove lock from list of locks held. Usually, but not always, it will * be the latest-acquired lock; so search array backwards. */ for (i = num_held_lwlocks; --i >= 0;) { - if (lock == held_lwlocks[i]) + if (lock == held_lwlocks[i].lock) + { + mode = held_lwlocks[i].mode; break; + } } if (i < 0) elog(ERROR, "lock %s %d is not held", T_NAME(lock), T_ID(lock)); @@ -1139,77 +1564,44 @@ LWLockRelease(LWLock *lock) for (; i < num_held_lwlocks; i++) held_lwlocks[i] = held_lwlocks[i + 1]; - /* Acquire mutex. Time spent holding mutex should be short! */ - SpinLockAcquire(&lock->mutex); + PRINT_LWDEBUG("LWLockRelease", lock, mode); - /* Release my hold on lock */ - if (lock->exclusive > 0) - lock->exclusive--; + /* Release my hold on lock, both are a full barrier */ + if (mode == LW_EXCLUSIVE) + lockcount = pg_atomic_sub_fetch_u32(&lock->lockcount, EXCLUSIVE_LOCK); else - { - Assert(lock->shared > 0); - lock->shared--; - } + lockcount = pg_atomic_sub_fetch_u32(&lock->lockcount, 1); + + /* nobody else can have that kind of lock */ + Assert(lockcount < EXCLUSIVE_LOCK); /* - * See if I need to awaken any waiters. If I released a non-last shared - * hold, there cannot be anything to do. Also, do not awaken any waiters - * if someone has already awakened waiters that haven't yet acquired the - * lock. + * Anybody we need to wakeup needs to have started queueing before + * we removed ourselves from the queue and the __sync_ operations + * above are full barriers. */ - if (lock->exclusive == 0 && lock->shared == 0 && lock->releaseOK) - { - /* - * Remove the to-be-awakened PGPROCs from the queue. - */ - bool releaseOK = true; - bool wokeup_somebody = false; - dlist_foreach_modify(iter, (dlist_head *) &lock->waiters) - { - PGPROC *waiter = dlist_container(PGPROC, lwWaitLink, iter.cur); + if (pg_atomic_read_u32(&lock->nwaiters) > 0) + have_waiters = true; - if (wokeup_somebody && waiter->lwWaitMode == LW_EXCLUSIVE) - continue; - - dlist_delete(&waiter->lwWaitLink); - dlist_push_tail(&wakeup, &waiter->lwWaitLink); - - /* - * Prevent additional wakeups until retryer gets to - * run. Backends that are just waiting for the lock to become - * free don't retry automatically. - */ - if (waiter->lwWaitMode != LW_WAIT_UNTIL_FREE) - { - releaseOK = false; - wokeup_somebody = true; - } + /* grant permission to run, even if a spurious share lock increases lockcount */ + if (mode == LW_EXCLUSIVE && have_waiters) + check_waiters = true; + /* nobody has this locked anymore, potential exclusive lockers get a chance */ + else if (lockcount == 0 && have_waiters) + check_waiters = true; + /* nobody queued or not free */ + else + check_waiters = false; - if(waiter->lwWaitMode == LW_EXCLUSIVE) - break; - } - lock->releaseOK = releaseOK; + if (check_waiters) + { + PRINT_LWDEBUG("LWLockRelease releasing", lock, mode); + LWLockWakeup(lock, mode); } - /* We are done updating shared state of the lock itself. 
*/ - SpinLockRelease(&lock->mutex); - TRACE_POSTGRESQL_LWLOCK_RELEASE(T_NAME(lock), T_ID(lock)); - /* - * Awaken any waiters I removed from the queue. - */ - dlist_foreach_modify(iter, (dlist_head *) &wakeup) - { - PGPROC *waiter = dlist_container(PGPROC, lwWaitLink, iter.cur); - LOG_LWDEBUG("LWLockRelease", T_NAME(l), T_ID(l), "release waiter"); - dlist_delete(&waiter->lwWaitLink); - pg_write_barrier(); - waiter->lwWaiting = false; - PGSemaphoreUnlock(&waiter->sem); - } - /* * Now okay to allow cancel/die interrupts. */ @@ -1233,7 +1625,7 @@ LWLockReleaseAll(void) { HOLD_INTERRUPTS(); /* match the upcoming RESUME_INTERRUPTS */ - LWLockRelease(held_lwlocks[num_held_lwlocks - 1]); + LWLockRelease(held_lwlocks[num_held_lwlocks - 1].lock); } } @@ -1241,8 +1633,8 @@ LWLockReleaseAll(void) /* * LWLockHeldByMe - test whether my process currently holds a lock * - * This is meant as debug support only. We do not distinguish whether the - * lock is held shared or exclusive. + * This is meant as debug support only. We currently do not distinguish + * whether the lock is held shared or exclusive. */ bool LWLockHeldByMe(LWLock *l) @@ -1251,7 +1643,7 @@ LWLockHeldByMe(LWLock *l) for (i = 0; i < num_held_lwlocks; i++) { - if (held_lwlocks[i] == l) + if (held_lwlocks[i].lock == l) return true; } return false; diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h index 8b20fd9f9f..3376b1435e 100644 --- a/src/include/storage/lwlock.h +++ b/src/include/storage/lwlock.h @@ -16,6 +16,7 @@ #include "lib/ilist.h" #include "storage/s_lock.h" +#include "port/atomics.h" struct PGPROC; @@ -48,10 +49,14 @@ typedef struct LWLock { slock_t mutex; /* Protects LWLock and queue of PGPROCs */ bool releaseOK; /* T if ok to release waiters */ - char exclusive; /* # of exclusive holders (0 or 1) */ - int shared; /* # of shared holders (0..MaxBackends) */ + + pg_atomic_uint32 lockcount; /* state of exlusive/nonexclusive lockers */ + pg_atomic_uint32 nwaiters; /* number of waiters */ int tranche; /* tranche ID */ dlist_head waiters; /* list of waiting PGPROCs */ +#ifdef LOCK_DEBUG + struct PGPROC *owner; +#endif } LWLock; /* diff --git a/src/include/storage/s_lock.h b/src/include/storage/s_lock.h index f49d17f0c0..f217bd6c46 100644 --- a/src/include/storage/s_lock.h +++ b/src/include/storage/s_lock.h @@ -906,6 +906,8 @@ extern int tas_sema(volatile slock_t *lock); #define S_LOCK_FREE(lock) (*(lock) == 0) #endif /* S_LOCK_FREE */ +#include "storage/barrier.h" + #if !defined(S_UNLOCK) /* * Our default implementation of S_UNLOCK is essentially *(lock) = 0. This
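One small consequence of held_lwlocks[] now recording the acquisition mode next to the lock pointer is that a mode-aware variant of LWLockHeldByMe becomes straightforward. The helper below is only a sketch of that possibility, not something this patch adds; the trimmed type definitions stand in for the real ones in lwlock.h/lwlock.c.

/* Sketch (not from the patch): mode-aware "held by me" check enabled by
 * the LWLockHandle bookkeeping introduced above. */
#include <stdbool.h>

typedef enum { LW_EXCLUSIVE, LW_SHARED, LW_WAIT_UNTIL_FREE } LWLockMode;
typedef struct LWLock LWLock;		/* opaque for this sketch */

typedef struct LWLockHandle
{
	LWLock	   *lock;
	LWLockMode	mode;
} LWLockHandle;

#define MAX_SIMUL_LWLOCKS 100

static int	num_held_lwlocks = 0;
static LWLockHandle held_lwlocks[MAX_SIMUL_LWLOCKS];

/* true iff this backend holds 'l' in exactly the given mode */
static bool
LWLockHeldByMeInMode_sketch(LWLock *l, LWLockMode mode)
{
	int			i;

	for (i = 0; i < num_held_lwlocks; i++)
	{
		if (held_lwlocks[i].lock == l && held_lwlocks[i].mode == mode)
			return true;
	}
	return false;
}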