Wait-free LW_SHARED lwlock acquisition
author Andres Freund <[email protected]>
Thu, 18 Sep 2014 14:14:16 +0000 (16:14 +0200)
committer Andres Freund <[email protected]>
Wed, 1 Oct 2014 16:01:47 +0000 (18:01 +0200)
src/backend/storage/lmgr/lwlock.c
src/include/storage/lwlock.h
src/include/storage/s_lock.h

index 1fd26aaaf294ea86617ee38c58f9d99ddcbee3e9..82e85c971ce7077a7476ed5143f191d3396a8061 100644 (file)
  * IDENTIFICATION
  *   src/backend/storage/lmgr/lwlock.c
  *
+ * NOTES:
+ *
+ * This used to be a pretty straightforward reader-writer lock
+ * implementation, in which the internal state was protected by a
+ * spinlock. Unfortunately the overhead of taking the spinlock proved to be
+ * too high for workloads/locks that were locked in shared mode very
+ * frequently.
+ *
+ * Thus a new implementation was devised that provides wait-free shared lock
+ * acquisition for locks that aren't exclusively locked.
+ *
+ * The basic idea is to have a single atomic variable 'lockcount' instead of
+ * the formerly separate shared and exclusive counters and to use an atomic
+ * increment to acquire the lock. That's fairly easy to do for rw-spinlocks,
+ * but a lot harder for something like LWLocks that want to wait in the OS.
+ *
+ * For exclusive lock acquisition we use an atomic compare-and-exchange on the
+ * lockcount variable, swapping in the sentinel value EXCLUSIVE_LOCK if and
+ * only if the current value of lockcount is 0. If the swap was not
+ * successful, we have to wait.
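+ *
+ * As a rough sketch (using the pg_atomic_* API from port/atomics.h; the real
+ * code in LWLockAttemptLock() below additionally does a cheap unlocked read
+ * first):
+ *
+ *   uint32 expected = 0;
+ *
+ *   if (pg_atomic_compare_exchange_u32(&lock->lockcount, &expected,
+ *                                      EXCLUSIVE_LOCK))
+ *       we got the exclusive lock
+ *   else
+ *       somebody else holds the lock, we have to wait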
+ *
+ * For shared lock acquisition we use an atomic fetch-and-add (lock xadd) to
+ * increment the lockcount variable by 1. If the previous value was at least
+ * EXCLUSIVE_LOCK we know that somebody actually has an exclusive lock, and we
+ * back out by atomically decrementing by 1 again. In that case we have to
+ * wait for the exclusive locker to release the lock.
+ *
+ * To release the lock we use an atomic decrement. If the new value (which we
+ * get back atomically) is zero, we know we have to release waiters.
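+ *
+ * Sketched the same way (simplified; the real code additionally tracks
+ * whether a failed shared attempt might have been spurious, and an exclusive
+ * release checks for waiters even when the count isn't zero):
+ *
+ *   shared acquire:
+ *     if (pg_atomic_fetch_add_u32(&lock->lockcount, 1) >= EXCLUSIVE_LOCK)
+ *     {
+ *         pg_atomic_fetch_sub_u32(&lock->lockcount, 1);
+ *         back out, we have to wait
+ *     }
+ *
+ *   release:
+ *     if (pg_atomic_sub_fetch_u32(&lock->lockcount,
+ *                                 exclusive ? EXCLUSIVE_LOCK : 1) == 0)
+ *         wake up waiters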
+ *
+ * The attentive reader might have noticed that naively doing the above has
+ * two glaring race conditions:
+ *
+ * 1) too-quick-for-queueing: We try to lock using the atomic operations and
+ * notice that we have to wait. Unfortunately, by the time we have finished
+ * queueing, the former locker very well might already have finished its
+ * work. That's problematic because we're now stuck waiting inside the OS.
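+ *
+ * For illustration, one possible interleaving under the naive scheme (in the
+ * same style as the trace for 2) below):
+ *   1) Session A: LWLockAcquire(LW_EXCLUSIVE) - success
+ *   2) Session B: LWLockAcquire(LW_SHARED) - sees the exclusive lock, must wait
+ *   3) Session A: LWLockRelease() - wait queue still empty, nobody to wake up
+ *   4) Session B: finishes queueing and sleeps - nobody left to wake it up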
+ *
+ * 2) spurious failed locks: Because we back out of a shared lock attempt
+ * only after having unconditionally added 1 to lockcount, we might have
+ * prevented another exclusive locker from getting the lock:
+ *   1) Session A: LWLockAcquire(LW_EXCLUSIVE) - success
+ *   2) Session B: LWLockAcquire(LW_SHARED) - lockcount += 1
+ *   3) Session B: LWLockAcquire(LW_SHARED) - oops, bigger than EXCLUSIVE_LOCK
+ *   4) Session A: LWLockRelease()
+ *   5) Session C: LWLockAcquire(LW_EXCLUSIVE) - check if lockcount = 0, no. wait.
+ *   6) Session B: LWLockAcquire(LW_SHARED) - lockcount -= 1
+ *   7) Session B: LWLockAcquire(LW_SHARED) - wait
+ *
+ * So we'd now have both B and C waiting on a lock that nobody is holding
+ * anymore. Not good.
+ *
+ * To mitigate those races we use a two-phased attempt at locking:
+ *   Phase 1: Try to do it atomically; if we succeed, nice
+ *   Phase 2: Add ourselves to the waitqueue of the lock
+ *   Phase 3: Try to grab the lock again; if we succeed, remove ourselves from
+ *            the queue
+ *   Phase 4: Sleep till wakeup, goto Phase 1
+ *
+ * This protects us against both problems from above:
+ * 1) Nobody can release too quickly before we're queued, since after Phase 2
+ *    we're already queued.
+ * 2) If somebody spuriously got blocked from acquiring the lock, they will
+ *    get queued in Phase 2 and we can wake them up if necessary, or they will
+ *    have gotten the lock in Phase 3.
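+ *
+ * In pseudo-code, LWLockAcquire's loop therefore looks roughly like this (a
+ * sketch only; the real code below additionally manages releaseOK, nwaiters
+ * and the wait queue's spinlock):
+ *
+ *   for (;;)
+ *   {
+ *       if (!LWLockAttemptLock(lock, mode, ...))        Phase 1: got it
+ *           break;
+ *       LWLockQueueSelf(lock, mode);                    Phase 2
+ *       if (!LWLockAttemptLock(lock, mode, ...))        Phase 3: got it anyway
+ *       {
+ *           LWLockDequeueSelf(lock);
+ *           break;
+ *       }
+ *       sleep on our PGPROC semaphore until woken up;   Phase 4
+ *   }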
+ *
+ * The above algorithm only works for LWLockAcquire, not directly for
+ * LWLockConditionalAcquire, where we don't want to wait. In that case we just
+ * need to retry acquiring the lock until we're sure we didn't disturb anybody
+ * in doing so.
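+ *
+ * A sketch of that retry, mirroring LWLockConditionalAcquire() below (the
+ * real code also adds a SPIN_DELAY() between attempts):
+ *
+ *   do
+ *   {
+ *       mustwait = LWLockAttemptLock(lock, mode, true, &potentially_spurious);
+ *   } while (mustwait && potentially_spurious);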
+ *
+ * TODO:
+ * - decide if we need a spinlock fallback
+ * - expand documentation
+ * - make LWLOCK_STATS do something sensible again
+ * - make LOCK_DEBUG output nicer
  *-------------------------------------------------------------------------
  */
 #include "postgres.h"
 #include "miscadmin.h"
 #include "pg_trace.h"
 #include "replication/slot.h"
-#include "storage/barrier.h"
 #include "storage/ipc.h"
 #include "storage/predicate.h"
 #include "storage/proc.h"
 /* We use the ShmemLock spinlock to protect LWLockAssign */
 extern slock_t *ShmemLock;
 
+#define EXCLUSIVE_LOCK (((uint32) 1) << (31 - 1))
+/* must be greater than MAX_BACKENDS */
+#define SHARED_LOCK_MASK (~EXCLUSIVE_LOCK)
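+
+/*
+ * For illustration, the values lockcount can take under the scheme described
+ * in the NOTES above (the shared count may additionally be transiently
+ * inflated by shared attempts that are about to back out):
+ *   0                      lock is free
+ *   1 .. MAX_BACKENDS      that many shared holders
+ *   EXCLUSIVE_LOCK (+ n)   one exclusive holder (+ n backing-out attempts)
+ */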
+
 /*
  * This is indexed by tranche ID and stores metadata for all tranches known
  * to the current backend.
@@ -80,8 +159,14 @@ static LWLockTranche MainLWLockTranche;
  */
 #define MAX_SIMUL_LWLOCKS  100
 
+typedef struct LWLockHandle
+{
+   LWLock *lock;
+   LWLockMode  mode;
+} LWLockHandle;
+
 static int num_held_lwlocks = 0;
-static LWLock *held_lwlocks[MAX_SIMUL_LWLOCKS];
+static LWLockHandle held_lwlocks[MAX_SIMUL_LWLOCKS];
 
 static int lock_addin_request = 0;
 static bool lock_addin_request_allowed = true;
@@ -100,8 +185,11 @@ typedef struct lwlock_stats
 {
    lwlock_stats_key key;
    int         sh_acquire_count;
+   int         sh_attempt_backout;
    int         ex_acquire_count;
+   int         ex_race;
    int         block_count;
+   int         dequeue_self_count;
    int         spin_delay_count;
 }  lwlock_stats;
 
@@ -113,23 +201,30 @@ static lwlock_stats lwlock_stats_dummy;
 bool       Trace_lwlocks = false;
 
 inline static void
-PRINT_LWDEBUG(const char *where, const LWLock *lock)
+PRINT_LWDEBUG(const char *where, const LWLock *lock, LWLockMode mode)
 {
    if (Trace_lwlocks)
-       elog(LOG, "%s(%s %d): excl %d shared %d rOK %d",
+   {
+       uint32 lockcount = pg_atomic_read_u32(&lock->lockcount);
+
+       elog(LOG, "%d: %s(%s %d): excl %u shared %u waiters %u rOK %d\n",
+            MyProcPid,
             where, T_NAME(lock), T_ID(lock),
-            (int) lock->exclusive, lock->shared,
+            lockcount >= EXCLUSIVE_LOCK,
+            lockcount & SHARED_LOCK_MASK,
+            pg_atomic_read_u32(&lock->nwaiters),
             (int) lock->releaseOK);
+   }
 }
 
 inline static void
 LOG_LWDEBUG(const char *where, const char *name, int index, const char *msg)
 {
    if (Trace_lwlocks)
-       elog(LOG, "%s(%s %d): %s", where, name, index, msg);
+       elog(LOG, "%d: %s(%s %d): %s\n", MyProcPid, where, name, index, msg);
 }
 #else                          /* not LOCK_DEBUG */
-#define PRINT_LWDEBUG(a,b)
+#define PRINT_LWDEBUG(a,b,c)
 #define LOG_LWDEBUG(a,b,c,d)
 #endif   /* LOCK_DEBUG */
 
@@ -192,11 +287,12 @@ print_lwlock_stats(int code, Datum arg)
    while ((lwstats = (lwlock_stats *) hash_seq_search(&scan)) != NULL)
    {
        fprintf(stderr,
-             "PID %d lwlock %s %d: shacq %u exacq %u blk %u spindelay %u\n",
+               "PID %d lwlock %s %d: shacq %u exacq %u blk %u spindelay %u, backout %u, ex race %u, dequeue self %u\n",
                MyProcPid, LWLockTrancheArray[lwstats->key.tranche]->name,
                lwstats->key.instance, lwstats->sh_acquire_count,
                lwstats->ex_acquire_count, lwstats->block_count,
-               lwstats->spin_delay_count);
+               lwstats->spin_delay_count, lwstats->sh_attempt_backout,
+               lwstats->ex_race, lwstats->dequeue_self_count);
    }
 
    LWLockRelease(&MainLWLockArray[0].lock);
@@ -224,8 +320,11 @@ get_lwlock_stats_entry(LWLock *lock)
    if (!found)
    {
        lwstats->sh_acquire_count = 0;
+       lwstats->sh_attempt_backout = 0;
        lwstats->ex_acquire_count = 0;
+       lwstats->ex_race = 0;
        lwstats->block_count = 0;
+       lwstats->dequeue_self_count = 0;
        lwstats->spin_delay_count = 0;
    }
    return lwstats;
@@ -473,12 +572,299 @@ LWLockInitialize(LWLock *lock, int tranche_id)
 {
    SpinLockInit(&lock->mutex);
    lock->releaseOK = true;
-   lock->exclusive = 0;
-   lock->shared = 0;
+   pg_atomic_init_u32(&lock->lockcount, 0);
+   pg_atomic_init_u32(&lock->nwaiters, 0);
    lock->tranche = tranche_id;
    dlist_init(&lock->waiters);
 }
 
+/*
+ * Internal function handling the atomic manipulation of lock->lockcount.
+ *
+ * 'double_check' = true means that we try to check more carefully
+ * against causing somebody else to spuriously believe the lock is
+ * already taken, although we're just about to back out of it.
+ */
+static inline bool
+LWLockAttemptLock(LWLock* l, LWLockMode mode, bool double_check, bool *potentially_spurious)
+{
+   bool        mustwait;
+   uint32      oldstate;
+#ifdef LWLOCK_STATS
+   lwlock_stats *lwstats;
+   lwstats = get_lwlock_stats_entry(l);
+#endif
+
+   Assert(mode == LW_EXCLUSIVE || mode == LW_SHARED);
+
+   *potentially_spurious = false;
+
+   if (mode == LW_EXCLUSIVE)
+   {
+       uint32 expected;
+
+       /*
+        * Check without CAS first; a plain read is much cheaper and the lock
+        * is frequently already taken.
+        */
+       expected = pg_atomic_read_u32(&l->lockcount);
+
+       Assert(expected < EXCLUSIVE_LOCK + (1 << 16));
+
+       if (expected != 0)
+           mustwait = true;
+       else if (!pg_atomic_compare_exchange_u32(&l->lockcount,
+                                                &expected, EXCLUSIVE_LOCK))
+       {
+           /*
+            * ok, no can do. Between the pg_atomic_read() above and the
+            * CAS somebody else acquired the lock.
+            */
+           mustwait = true;
+           Assert(expected < EXCLUSIVE_LOCK + (1 << 16));
+       }
+       else
+       {
+           /* yipeyyahee */
+           mustwait = false;
+#ifdef LOCK_DEBUG
+           l->owner = MyProc;
+#endif
+           Assert(expected == 0);
+       }
+   }
+   else
+   {
+       /*
+        * If requested by caller, do an unlocked check first.  This is useful
+        * if potentially spurious results have a noticeable cost.
+        */
+       if (double_check)
+       {
+           if (pg_atomic_read_u32(&l->lockcount) >= EXCLUSIVE_LOCK)
+           {
+               mustwait = true;
+               goto out;
+           }
+       }
+
+       /*
+        * Acquire the share lock unconditionally using an atomic addition. We
+        * might have to back out again if it turns out somebody else has an
+        * exclusive lock.
+        */
+       oldstate = pg_atomic_fetch_add_u32(&l->lockcount, 1);
+
+       if (oldstate >= EXCLUSIVE_LOCK)
+       {
+           /*
+            * Ok, somebody else holds the lock exclusively. We need to back
+            * away from the shared lock, since we don't actually hold it right
+            * now.  Since there's a window between lockcount += 1 and lockcount
+            * -= 1, the previous exclusive locker could have released and
+            * another exclusive locker could have seen our +1. We need to
+            * signal that to the upper layers so they can deal with the race
+            * condition.
+            */
+
+           /*
+            * FIXME: if double_check, inspect the value returned by the
+            * subtraction; the failure is not spurious if the lock is still
+            * exclusively locked.
+            */
+           pg_atomic_fetch_sub_u32(&l->lockcount, 1);
+
+           mustwait = true;
+           *potentially_spurious = true;
+#ifdef LWLOCK_STATS
+           lwstats->sh_attempt_backout++;
+#endif
+       }
+       else
+       {
+           /* yipeyyahee */
+           mustwait = false;
+       }
+   }
+
+out:
+   return mustwait;
+}
+
+/*
+ * Wakeup all the lockers that currently have a chance to run.
+ */
+static void
+LWLockWakeup(LWLock *lock, LWLockMode mode)
+{
+   bool        releaseOK;
+   bool        wokeup_somebody = false;
+   dlist_head  wakeup;
+   dlist_mutable_iter iter;
+#ifdef LWLOCK_STATS
+   lwlock_stats   *lwstats;
+   lwstats = get_lwlock_stats_entry(lock);
+#endif
+
+   dlist_init(&wakeup);
+
+   /* remove the to-be-awakened PGPROCs from the queue */
+   releaseOK = true;
+
+   /* Acquire mutex.  Time spent holding mutex should be short! */
+#ifdef LWLOCK_STATS
+   lwstats->spin_delay_count += SpinLockAcquire(&lock->mutex);
+#else
+   SpinLockAcquire(&lock->mutex);
+#endif
+
+   /*
+    * We're still waiting for backends to get scheduled, don't wake them up
+    * again.
+    */
+   if (!lock->releaseOK)
+   {
+       SpinLockRelease(&lock->mutex);
+       PRINT_LWDEBUG("LWLockRelease skip releaseok", lock, mode);
+       return;
+   }
+
+   dlist_foreach_modify(iter, (dlist_head *) &lock->waiters)
+   {
+       PGPROC *waiter = dlist_container(PGPROC, lwWaitLink, iter.cur);
+
+       if (wokeup_somebody && waiter->lwWaitMode == LW_EXCLUSIVE)
+           continue;
+
+       dlist_delete(&waiter->lwWaitLink);
+       dlist_push_tail(&wakeup, &waiter->lwWaitLink);
+
+       if (waiter->lwWaitMode != LW_WAIT_UNTIL_FREE)
+       {
+           /*
+            * Prevent additional wakeups until retryer gets to run. Backends
+            * that are just waiting for the lock to become free don't retry
+            * automatically.
+            */
+           releaseOK = false;
+           /*
+            * Don't wakeup (further) exclusive locks.
+            */
+           wokeup_somebody = true;
+       }
+
+       /*
+        * Once we've woken up an exclusive lock, there's no point in waking
+        * up anybody else.
+        */
+       if (waiter->lwWaitMode == LW_EXCLUSIVE)
+           break;
+   }
+   lock->releaseOK = releaseOK;
+
+   /* We are done updating shared state of the lock queue. */
+   SpinLockRelease(&lock->mutex);
+
+   /*
+    * Awaken any waiters I removed from the queue.
+    */
+   dlist_foreach_modify(iter, &wakeup)
+   {
+       PGPROC *waiter = dlist_container(PGPROC, lwWaitLink, iter.cur);
+
+       LOG_LWDEBUG("LWLockRelease", T_NAME(l), T_ID(l),  "release waiter");
+       dlist_delete(&waiter->lwWaitLink);
+       pg_write_barrier();
+       waiter->lwWaiting = false;
+       PGSemaphoreUnlock(&waiter->sem);
+   }
+}
+
+/*
+ * Add ourselves to the end of the queue. Mode can be LW_WAIT_UNTIL_FREE here!
+ */
+static inline void
+LWLockQueueSelf(LWLock *lock, LWLockMode mode)
+{
+#ifdef LWLOCK_STATS
+   lwlock_stats   *lwstats;
+   lwstats = get_lwlock_stats_entry(lock);
+#endif
+
+   /*
+    * If we don't have a PGPROC structure, there's no way to wait. This
+    * should never occur, since MyProc should only be null during shared
+    * memory initialization.
+    */
+   if (MyProc == NULL)
+       elog(PANIC, "cannot wait without a PGPROC structure");
+
+   pg_atomic_fetch_add_u32(&lock->nwaiters, 1);
+
+#ifdef LWLOCK_STATS
+   lwstats->spin_delay_count += SpinLockAcquire(&lock->mutex);
+#else
+   SpinLockAcquire(&lock->mutex);
+#endif
+
+   if (MyProc->lwWaiting)
+       elog(PANIC, "queueing for lock while waiting on another one");
+
+   MyProc->lwWaiting = true;
+   MyProc->lwWaitMode = mode;
+
+   /* LW_WAIT_UNTIL_FREE waiters are always at the front of the queue */
+   if (mode == LW_WAIT_UNTIL_FREE)
+       dlist_push_head((dlist_head *) &lock->waiters, &MyProc->lwWaitLink);
+   else
+       dlist_push_tail((dlist_head *) &lock->waiters, &MyProc->lwWaitLink);
+
+   /* Can release the mutex now */
+   SpinLockRelease(&lock->mutex);
+}
+
+/*
+ * Remove ourselves from the waitlist.  This is used if we queued ourselves
+ * because we thought we needed to sleep but, after further checking, we
+ * discovered that we don't actually need to do so. Somebody else might have
+ * already woken us up though, in that case return false.
+ */
+static inline bool
+LWLockDequeueSelf(LWLock *lock)
+{
+   bool    found = false;
+   dlist_mutable_iter iter;
+
+#ifdef LWLOCK_STATS
+   lwlock_stats *lwstats;
+   lwstats = get_lwlock_stats_entry(lock);
+#endif
+
+#ifdef LWLOCK_STATS
+   lwstats->spin_delay_count += SpinLockAcquire(&lock->mutex);
+#else
+   SpinLockAcquire(&lock->mutex);
+#endif
+
+   /* need to iterate, somebody else could have unqueued us */
+   dlist_foreach_modify(iter, (dlist_head *) &lock->waiters)
+   {
+       PGPROC *proc = dlist_container(PGPROC, lwWaitLink, iter.cur);
+       if (proc == MyProc)
+       {
+           found = true;
+           dlist_delete(&proc->lwWaitLink);
+           break;
+       }
+   }
+
+   /* clear waiting state again, nice for debugging */
+   if (found)
+       MyProc->lwWaiting = false;
+
+   SpinLockRelease(&lock->mutex);
+
+   pg_atomic_fetch_sub_u32(&lock->nwaiters, 1);
+   return found;
+}
 
 /*
  * LWLockAcquire - acquire a lightweight lock in the specified mode
@@ -510,14 +896,17 @@ static inline bool
 LWLockAcquireCommon(LWLock *lock, LWLockMode mode, uint64 *valptr, uint64 val)
 {
    PGPROC     *proc = MyProc;
-   bool        retry = false;
    bool        result = true;
    int         extraWaits = 0;
+   bool        potentially_spurious;
+
 #ifdef LWLOCK_STATS
    lwlock_stats *lwstats;
 #endif
 
-   PRINT_LWDEBUG("LWLockAcquire", lock);
+   AssertArg(mode == LW_SHARED || mode == LW_EXCLUSIVE);
+
+   PRINT_LWDEBUG("LWLockAcquire", lock, mode);
 
 #ifdef LWLOCK_STATS
    lwstats = get_lwlock_stats_entry(lock);
@@ -567,58 +956,77 @@ LWLockAcquireCommon(LWLock *lock, LWLockMode mode, uint64 *valptr, uint64 val)
    {
        bool        mustwait;
 
-       /* Acquire mutex.  Time spent holding mutex should be short! */
-#ifdef LWLOCK_STATS
-       lwstats->spin_delay_count += SpinLockAcquire(&lock->mutex);
-#else
-       SpinLockAcquire(&lock->mutex);
-#endif
+       /*
+        * Try to grab the lock the first time; we're not in the waitqueue yet.
+        */
+       mustwait = LWLockAttemptLock(lock, mode, false, &potentially_spurious);
 
-       /* If retrying, allow LWLockRelease to release waiters again */
-       if (retry)
-           lock->releaseOK = true;
+       if (!mustwait)
+       {
+           LOG_LWDEBUG("LWLockAcquire", T_NAME(l), T_ID(l), "success");
+           break;              /* got the lock */
+       }
 
-       /* If I can get the lock, do so quickly. */
-       if (mode == LW_EXCLUSIVE)
+       /*
+        * Ok, at this point we couldn't grab the lock on the first try. We
+        * cannot simply queue ourselves to the end of the list and wait to be
+        * woken up because by now the lock could long have been released.
+        * Instead add ourselves to the queue and try to grab the lock again.
+        * If we succeed we need to revert the queueing; otherwise we sleep.
+        * If we couldn't grab the lock on the second try either, whoever
+        * releases it is guaranteed to see our queue entry, since it existed
+        * before we rechecked the lock. See the NOTES at the top of this file
+        * for the full protocol.
+        */
+
+       /* add to the queue */
+       LWLockQueueSelf(lock, mode);
+
+       /* we're now guaranteed to be woken up if necessary */
+       mustwait = LWLockAttemptLock(lock, mode, false, &potentially_spurious);
+
+       /* ok, grabbed the lock the second time round, need to undo queueing */
+       if (!mustwait)
        {
-           if (lock->exclusive == 0 && lock->shared == 0)
+#ifdef LWLOCK_STATS
+           lwstats->dequeue_self_count++;
+#endif
+           if (!LWLockDequeueSelf(lock))
            {
-               lock->exclusive++;
-               mustwait = false;
+               /*
+                * Somebody else dequeued us and has or will wake us up. Wait
+                * for the correct wakeup, otherwise our ->lwWaiting would get
+                * reset at some inconvenient point later, and releaseOK
+                * wouldn't be managed correctly.
+                */
+               for (;;)
+               {
+                   PGSemaphoreLock(&proc->sem, false);
+                   if (!proc->lwWaiting)
+                       break;
+                   extraWaits++;
+               }
+               /*
+                * Reset releaseOK - if somebody woke us up they'll have set it
+                * to false.
+                */
+               SpinLockAcquire(&lock->mutex);
+               lock->releaseOK = true;
+               SpinLockRelease(&lock->mutex);
            }
-           else
-               mustwait = true;
+           PRINT_LWDEBUG("LWLockAcquire success: undo queue", lock, mode);
+           break;
        }
        else
        {
-           if (lock->exclusive == 0)
-           {
-               lock->shared++;
-               mustwait = false;
-           }
-           else
-               mustwait = true;
+           PRINT_LWDEBUG("LWLockAcquire waiting 4", lock, mode);
        }
 
-       if (!mustwait)
-           break;              /* got the lock */
-
        /*
-        * Add myself to wait queue.
-        *
-        * If we don't have a PGPROC structure, there's no way to wait. This
-        * should never occur, since MyProc should only be null during shared
-        * memory initialization.
+        * NB: There's no need to deal with spurious lock attempts
+        * here. Anyone we prevented from acquiring the lock will
+        * enqueue themselves using the same protocol we used here.
         */
-       if (proc == NULL)
-           elog(PANIC, "cannot wait without a PGPROC structure");
-
-       proc->lwWaiting = true;
-       proc->lwWaitMode = mode;
-       dlist_push_head((dlist_head *) &lock->waiters, &proc->lwWaitLink);
-
-       /* Can release the mutex now */
-       SpinLockRelease(&lock->mutex);
 
        /*
         * Wait until awakened.
@@ -653,8 +1061,14 @@ LWLockAcquireCommon(LWLock *lock, LWLockMode mode, uint64 *valptr, uint64 val)
 
        LOG_LWDEBUG("LWLockAcquire", T_NAME(lock), T_ID(lock), "awakened");
 
+       /* not waiting anymore */
+       pg_atomic_fetch_sub_u32(&lock->nwaiters, 1);
+
        /* Now loop back and try to acquire lock again. */
-       retry = true;
+       SpinLockAcquire(&lock->mutex);
+       lock->releaseOK = true;
+       SpinLockRelease(&lock->mutex);
+
        result = false;
    }
 
@@ -662,13 +1076,11 @@ LWLockAcquireCommon(LWLock *lock, LWLockMode mode, uint64 *valptr, uint64 val)
    if (valptr)
        *valptr = val;
 
-   /* We are done updating shared state of the lock itself. */
-   SpinLockRelease(&lock->mutex);
-
    TRACE_POSTGRESQL_LWLOCK_ACQUIRE(T_NAME(lock), T_ID(lock), mode);
 
    /* Add lock to list of locks held by this backend */
-   held_lwlocks[num_held_lwlocks++] = lock;
+   held_lwlocks[num_held_lwlocks].lock = lock;
+   held_lwlocks[num_held_lwlocks++].mode = mode;
 
    /*
     * Fix the process wait semaphore's count for any absorbed wakeups.
@@ -690,8 +1102,11 @@ bool
 LWLockConditionalAcquire(LWLock *lock, LWLockMode mode)
 {
    bool        mustwait;
+   bool        potentially_spurious;
+
+   AssertArg(mode == LW_SHARED || mode == LW_EXCLUSIVE);
 
-   PRINT_LWDEBUG("LWLockConditionalAcquire", lock);
+   PRINT_LWDEBUG("LWLockConditionalAcquire", l, mode);
 
    /* Ensure we will have room to remember the lock */
    if (num_held_lwlocks >= MAX_SIMUL_LWLOCKS)
@@ -704,50 +1119,44 @@ LWLockConditionalAcquire(LWLock *lock, LWLockMode mode)
     */
    HOLD_INTERRUPTS();
 
-   /* Acquire mutex.  Time spent holding mutex should be short! */
-   SpinLockAcquire(&lock->mutex);
-
-   /* If I can get the lock, do so quickly. */
-   if (mode == LW_EXCLUSIVE)
-   {
-       if (lock->exclusive == 0 && lock->shared == 0)
-       {
-           lock->exclusive++;
-           mustwait = false;
-       }
-       else
-           mustwait = true;
-   }
-   else
-   {
-       if (lock->exclusive == 0)
-       {
-           lock->shared++;
-           mustwait = false;
-       }
-       else
-           mustwait = true;
-   }
-
-   /* We are done updating shared state of the lock itself. */
-   SpinLockRelease(&lock->mutex);
+retry:
+   /*
+    * Pass double_check = true so we check more carefully against making
+    * somebody else spuriously believe the lock is already taken.
+    */
+   mustwait = LWLockAttemptLock(lock, mode, true, &potentially_spurious);
 
    if (mustwait)
    {
        /* Failed to get lock, so release interrupt holdoff */
        RESUME_INTERRUPTS();
+
        LOG_LWDEBUG("LWLockConditionalAcquire",
                    T_NAME(lock), T_ID(lock), "failed");
-       TRACE_POSTGRESQL_LWLOCK_CONDACQUIRE_FAIL(T_NAME(lock),
-                                                T_ID(lock), mode);
+       TRACE_POSTGRESQL_LWLOCK_CONDACQUIRE_FAIL(T_NAME(lock), T_ID(lock), mode);
+
+       /*
+        * We ran into an exclusive lock and might have blocked another
+        * exclusive locker from taking a shot, because backing out of our
+        * shared-lock attempt takes some time. Retry until we're either sure
+        * we didn't block somebody (because somebody else certainly holds the
+        * lock) or until we got it.
+        *
+        * We cannot rely on the two-phase lock-acquisition protocol of
+        * LWLockAcquire here, since we never queue ourselves.
+        */
+       if (potentially_spurious)
+       {
+           SPIN_DELAY();
+           goto retry;
+       }
    }
    else
    {
        /* Add lock to list of locks held by this backend */
-       held_lwlocks[num_held_lwlocks++] = lock;
+       held_lwlocks[num_held_lwlocks].lock = lock;
+       held_lwlocks[num_held_lwlocks++].mode = mode;
        TRACE_POSTGRESQL_LWLOCK_CONDACQUIRE(T_NAME(lock), T_ID(lock), mode);
    }
-
    return !mustwait;
 }
 
@@ -771,11 +1180,15 @@ LWLockAcquireOrWait(LWLock *lock, LWLockMode mode)
    PGPROC     *proc = MyProc;
    bool        mustwait;
    int         extraWaits = 0;
+   bool        potentially_spurious_first;
+   bool        potentially_spurious_second;
 #ifdef LWLOCK_STATS
    lwlock_stats *lwstats;
 #endif
 
-   PRINT_LWDEBUG("LWLockAcquireOrWait", lock);
+   Assert(mode == LW_SHARED || mode == LW_EXCLUSIVE);
+
+   PRINT_LWDEBUG("LWLockAcquireOrWait", lock, mode);
 
 #ifdef LWLOCK_STATS
    lwstats = get_lwlock_stats_entry(lock);
@@ -792,81 +1205,58 @@ LWLockAcquireOrWait(LWLock *lock, LWLockMode mode)
     */
    HOLD_INTERRUPTS();
 
-   /* Acquire mutex.  Time spent holding mutex should be short! */
-   SpinLockAcquire(&lock->mutex);
-
-   /* If I can get the lock, do so quickly. */
-   if (mode == LW_EXCLUSIVE)
-   {
-       if (lock->exclusive == 0 && lock->shared == 0)
-       {
-           lock->exclusive++;
-           mustwait = false;
-       }
-       else
-           mustwait = true;
-   }
-   else
-   {
-       if (lock->exclusive == 0)
-       {
-           lock->shared++;
-           mustwait = false;
-       }
-       else
-           mustwait = true;
-   }
+   /*
+    * NB: We're using nearly the same twice-in-a-row lock acquisition
+    * protocol as LWLockAcquire(). Check its comments for details.
+    */
+   mustwait = LWLockAttemptLock(lock, mode, false, &potentially_spurious_first);
 
    if (mustwait)
    {
-       /*
-        * Add myself to wait queue.
-        *
-        * If we don't have a PGPROC structure, there's no way to wait.  This
-        * should never occur, since MyProc should only be null during shared
-        * memory initialization.
-        */
-       if (proc == NULL)
-           elog(PANIC, "cannot wait without a PGPROC structure");
-
-       proc->lwWaiting = true;
-       proc->lwWaitMode = LW_WAIT_UNTIL_FREE;
-       dlist_push_head((dlist_head *) &lock->waiters, &proc->lwWaitLink);
+       LWLockQueueSelf(lock, LW_WAIT_UNTIL_FREE);
 
-       /* Can release the mutex now */
-       SpinLockRelease(&lock->mutex);
+       mustwait = LWLockAttemptLock(lock, mode, false, &potentially_spurious_second);
 
-       /*
-        * Wait until awakened.  Like in LWLockAcquire, be prepared for bogus
-        * wakups, because we share the semaphore with ProcWaitForSignal.
-        */
-       LOG_LWDEBUG("LWLockAcquireOrWait", T_NAME(lock), T_ID(lock),
-                   "waiting");
+       if (mustwait)
+       {
+           /*
+            * Wait until awakened.  Like in LWLockAcquire, be prepared for bogus
+            * wakeups, because we share the semaphore with ProcWaitForSignal.
+            */
+           LOG_LWDEBUG("LWLockAcquireOrWait", T_NAME(lock), T_ID(lock),
+                       "waiting");
 
 #ifdef LWLOCK_STATS
-       lwstats->block_count++;
+           lwstats->block_count++;
 #endif
+           TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(lock), T_ID(lock), mode);
 
-       TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(lock), T_ID(lock), mode);
+           for (;;)
+           {
+               /* "false" means cannot accept cancel/die interrupt here. */
+               PGSemaphoreLock(&proc->sem, false);
+               if (!proc->lwWaiting)
+                   break;
+               extraWaits++;
+           }
+           pg_atomic_fetch_sub_u32(&lock->nwaiters, 1);
 
-       for (;;)
+           TRACE_POSTGRESQL_LWLOCK_WAIT_DONE(T_NAME(lock), T_ID(lock), mode);
+       }
+       else
        {
-           /* "false" means cannot accept cancel/die interrupt here. */
-           PGSemaphoreLock(&proc->sem, false);
-           if (!proc->lwWaiting)
-               break;
-           extraWaits++;
+           /* got lock in the second attempt, undo queueing */
+           if (!LWLockDequeueSelf(lock))
+           {
+               for (;;)
+               {
+                   PGSemaphoreLock(&proc->sem, false);
+                   if (!proc->lwWaiting)
+                       break;
+                   extraWaits++;
+               }
+           }
        }
-
-       TRACE_POSTGRESQL_LWLOCK_WAIT_DONE(T_NAME(lock), T_ID(lock), mode);
-
-       LOG_LWDEBUG("LWLockAcquireOrWait", T_NAME(lock), T_ID(lock),
-                   "awakened");
-   }
-   else
-   {
-       /* We are done updating shared state of the lock itself. */
-       SpinLockRelease(&lock->mutex);
    }
 
    /*
@@ -885,10 +1275,11 @@ LWLockAcquireOrWait(LWLock *lock, LWLockMode mode)
    }
    else
    {
+       LOG_LWDEBUG("LWLockAcquireOrWait", T_NAME(lock), T_ID(lock), "succeeded");
        /* Add lock to list of locks held by this backend */
-       held_lwlocks[num_held_lwlocks++] = lock;
-       TRACE_POSTGRESQL_LWLOCK_ACQUIRE_OR_WAIT(T_NAME(lock), T_ID(lock),
-                                               mode);
+       held_lwlocks[num_held_lwlocks].lock = lock;
+       held_lwlocks[num_held_lwlocks++].mode = mode;
+       TRACE_POSTGRESQL_LWLOCK_ACQUIRE_OR_WAIT(T_NAME(lock), T_ID(lock), mode);
    }
 
    return !mustwait;
@@ -922,7 +1313,7 @@ LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval)
    lwlock_stats *lwstats;
 #endif
 
-   PRINT_LWDEBUG("LWLockWaitForVar", lock);
+   PRINT_LWDEBUG("LWLockWaitForVar", lock, LW_EXCLUSIVE);
 
 #ifdef LWLOCK_STATS
    lwstats = get_lwlock_stats_entry(lock);
@@ -935,7 +1326,7 @@ LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval)
     * barrier here as far as the current usage is concerned.  But that might
     * not be safe in general.
     */
-   if (lock->exclusive == 0)
+   if (pg_atomic_read_u32(&lock->lockcount) == 0)
        return true;
 
    /*
@@ -953,21 +1344,16 @@ LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval)
        bool        mustwait;
        uint64      value;
 
-       /* Acquire mutex.  Time spent holding mutex should be short! */
-#ifdef LWLOCK_STATS
-       lwstats->spin_delay_count += SpinLockAcquire(&lock->mutex);
-#else
-       SpinLockAcquire(&lock->mutex);
-#endif
+       mustwait = pg_atomic_read_u32(&lock->lockcount) != 0;
 
-       /* Is the lock now free, and if not, does the value match? */
-       if (lock->exclusive == 0)
-       {
-           result = true;
-           mustwait = false;
-       }
-       else
+       if (mustwait)
        {
+           /*
+            * Perform comparison using spinlock as we can't rely on atomic 64
+            * bit reads/stores.
+            */
+           SpinLockAcquire(&lock->mutex);
+
            value = *valptr;
            if (value != oldval)
            {
@@ -977,21 +1363,62 @@ LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval)
            }
            else
                mustwait = true;
+           SpinLockRelease(&lock->mutex);
        }
+       else
+           mustwait = false;
 
        if (!mustwait)
            break;              /* the lock was free or value didn't match */
 
        /*
-        * Add myself to wait queue.
+        * Add myself to wait queue. Note that this is racy: the lock could be
+        * released, and its waiters woken up, before we're finished queueing.
+        * NB: We're using nearly the same twice-in-a-row lock acquisition
+        * protocol as LWLockAcquire(). Check its comments for details.
         */
-       proc->lwWaiting = true;
-       proc->lwWaitMode = LW_WAIT_UNTIL_FREE;
-       /* waiters are added to the front of the queue */
-       dlist_push_head((dlist_head *) &lock->waiters, &proc->lwWaitLink);
+       LWLockQueueSelf(lock, LW_WAIT_UNTIL_FREE);
 
-       /* Can release the mutex now */
-       SpinLockRelease(&lock->mutex);
+       /*
+        * We're now guaranteed to be woken up if necessary. Recheck the
+        * lock's state.
+        */
+       pg_read_barrier();
+       mustwait = pg_atomic_read_u32(&lock->lockcount) != 0;
+
+       /* ok, grabbed the lock the second time round, need to undo queueing */
+       if (!mustwait)
+       {
+#ifdef LWLOCK_STATS
+           lwstats->dequeue_self_count++;
+#endif
+           if (!LWLockDequeueSelf(lock))
+           {
+               /*
+                * Somebody else dequeued us and has or will wake us up. Wait
+                * for the correct wakeup, otherwise our ->lwWaiting would get
+                * reset at some inconvenient point later.
+                */
+               for (;;)
+               {
+                   PGSemaphoreLock(&proc->sem, false);
+                   if (!proc->lwWaiting)
+                       break;
+                   extraWaits++;
+               }
+           }
+           PRINT_LWDEBUG("LWLockWaitForVar undo queue", lock, LW_EXCLUSIVE);
+           break;
+       }
+       else
+       {
+           PRINT_LWDEBUG("LWLockWaitForVar waiting 4", lock, LW_EXCLUSIVE);
+       }
+
+       /*
+        * NB: Just as in LWLockAcquireCommon() there's no need to deal with
+        * spurious lock attempts here.
+        */
 
        /*
         * Wait until awakened.
@@ -1027,13 +1454,11 @@ LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval)
                                          LW_EXCLUSIVE);
 
        LOG_LWDEBUG("LWLockWaitForVar", T_NAME(lock), T_ID(lock), "awakened");
+       pg_atomic_fetch_sub_u32(&lock->nwaiters, 1);
 
        /* Now loop back and check the status of the lock again. */
    }
 
-   /* We are done updating shared state of the lock itself. */
-   SpinLockRelease(&lock->mutex);
-
    TRACE_POSTGRESQL_LWLOCK_ACQUIRE(T_NAME(lock), T_ID(lock), LW_EXCLUSIVE);
 
    /*
@@ -1072,8 +1497,7 @@ LWLockUpdateVar(LWLock *lock, uint64 *valptr, uint64 val)
    /* Acquire mutex.  Time spent holding mutex should be short! */
    SpinLockAcquire(&lock->mutex);
 
-   /* we should hold the lock */
-   Assert(lock->exclusive == 1);
+   Assert(pg_atomic_read_u32(&lock->lockcount) >= EXCLUSIVE_LOCK);
 
    /* Update the lock's value */
    *valptr = val;
@@ -1099,7 +1523,7 @@ LWLockUpdateVar(LWLock *lock, uint64 *valptr, uint64 val)
    /*
     * Awaken any waiters I removed from the queue.
     */
-   dlist_foreach_modify(iter, (dlist_head *) &wakeup)
+   dlist_foreach_modify(iter, &wakeup)
    {
        PGPROC *waiter = dlist_container(PGPROC, lwWaitLink, iter.cur);
        dlist_delete(&waiter->lwWaitLink);
@@ -1116,22 +1540,23 @@ LWLockUpdateVar(LWLock *lock, uint64 *valptr, uint64 val)
 void
 LWLockRelease(LWLock *lock)
 {
-   dlist_head  wakeup;
-   dlist_mutable_iter iter;
+   LWLockMode  mode;
+   uint32      lockcount;
+   bool        check_waiters;
+   bool        have_waiters = false;
    int         i;
 
-   dlist_init(&wakeup);
-
-   PRINT_LWDEBUG("LWLockRelease", lock);
-
    /*
     * Remove lock from list of locks held.  Usually, but not always, it will
     * be the latest-acquired lock; so search array backwards.
     */
    for (i = num_held_lwlocks; --i >= 0;)
    {
-       if (lock == held_lwlocks[i])
+       if (lock == held_lwlocks[i].lock)
+       {
+           mode = held_lwlocks[i].mode;
            break;
+       }
    }
    if (i < 0)
        elog(ERROR, "lock %s %d is not held", T_NAME(lock), T_ID(lock));
@@ -1139,77 +1564,44 @@ LWLockRelease(LWLock *lock)
    for (; i < num_held_lwlocks; i++)
        held_lwlocks[i] = held_lwlocks[i + 1];
 
-   /* Acquire mutex.  Time spent holding mutex should be short! */
-   SpinLockAcquire(&lock->mutex);
+   PRINT_LWDEBUG("LWLockRelease", lock, mode);
 
-   /* Release my hold on lock */
-   if (lock->exclusive > 0)
-       lock->exclusive--;
+   /* Release my hold on the lock; both operations are full barriers */
+   if (mode == LW_EXCLUSIVE)
+       lockcount = pg_atomic_sub_fetch_u32(&lock->lockcount, EXCLUSIVE_LOCK);
    else
-   {
-       Assert(lock->shared > 0);
-       lock->shared--;
-   }
+       lockcount = pg_atomic_sub_fetch_u32(&lock->lockcount, 1);
+
+   /* nobody else can have that kind of lock */
+   Assert(lockcount < EXCLUSIVE_LOCK);
 
    /*
-    * See if I need to awaken any waiters.  If I released a non-last shared
-    * hold, there cannot be anything to do.  Also, do not awaken any waiters
-    * if someone has already awakened waiters that haven't yet acquired the
-    * lock.
+    * Anybody we need to wake up must have started queueing before we
+    * released the lock; the atomic subtraction above is a full barrier, so
+    * we are guaranteed to see their nwaiters increment.
     */
-   if (lock->exclusive == 0 && lock->shared == 0 && lock->releaseOK)
-   {
-       /*
-        * Remove the to-be-awakened PGPROCs from the queue.
-        */
-       bool        releaseOK = true;
-       bool        wokeup_somebody = false;
 
-       dlist_foreach_modify(iter, (dlist_head *) &lock->waiters)
-       {
-           PGPROC *waiter = dlist_container(PGPROC, lwWaitLink, iter.cur);
+   if (pg_atomic_read_u32(&lock->nwaiters) > 0)
+       have_waiters = true;
 
-           if (wokeup_somebody && waiter->lwWaitMode == LW_EXCLUSIVE)
-               continue;
-
-           dlist_delete(&waiter->lwWaitLink);
-           dlist_push_tail(&wakeup, &waiter->lwWaitLink);
-
-           /*
-            * Prevent additional wakeups until retryer gets to
-            * run. Backends that are just waiting for the lock to become
-            * free don't retry automatically.
-            */
-           if (waiter->lwWaitMode != LW_WAIT_UNTIL_FREE)
-           {
-               releaseOK = false;
-               wokeup_somebody = true;
-           }
+   /* grant waiters permission to run, even if a spurious shared lock attempt keeps lockcount above zero */
+   if (mode == LW_EXCLUSIVE && have_waiters)
+       check_waiters = true;
+   /* nobody has this locked anymore, potential exclusive lockers get a chance */
+   else if (lockcount == 0 && have_waiters)
+       check_waiters = true;
+   /* nobody queued or not free */
+   else
+       check_waiters = false;
 
-           if(waiter->lwWaitMode == LW_EXCLUSIVE)
-               break;
-       }
-       lock->releaseOK = releaseOK;
+   if (check_waiters)
+   {
+       PRINT_LWDEBUG("LWLockRelease releasing", lock, mode);
+       LWLockWakeup(lock, mode);
    }
 
-   /* We are done updating shared state of the lock itself. */
-   SpinLockRelease(&lock->mutex);
-
    TRACE_POSTGRESQL_LWLOCK_RELEASE(T_NAME(lock), T_ID(lock));
 
-   /*
-    * Awaken any waiters I removed from the queue.
-    */
-   dlist_foreach_modify(iter, (dlist_head *) &wakeup)
-   {
-       PGPROC *waiter = dlist_container(PGPROC, lwWaitLink, iter.cur);
-       LOG_LWDEBUG("LWLockRelease", T_NAME(l), T_ID(l), "release waiter");
-       dlist_delete(&waiter->lwWaitLink);
-       pg_write_barrier();
-       waiter->lwWaiting = false;
-       PGSemaphoreUnlock(&waiter->sem);
-   }
-
    /*
     * Now okay to allow cancel/die interrupts.
     */
@@ -1233,7 +1625,7 @@ LWLockReleaseAll(void)
    {
        HOLD_INTERRUPTS();      /* match the upcoming RESUME_INTERRUPTS */
 
-       LWLockRelease(held_lwlocks[num_held_lwlocks - 1]);
+       LWLockRelease(held_lwlocks[num_held_lwlocks - 1].lock);
    }
 }
 
@@ -1241,8 +1633,8 @@ LWLockReleaseAll(void)
 /*
  * LWLockHeldByMe - test whether my process currently holds a lock
  *
- * This is meant as debug support only.  We do not distinguish whether the
- * lock is held shared or exclusive.
+ * This is meant as debug support only.  We currently do not distinguish
+ * whether the lock is held shared or exclusive.
  */
 bool
 LWLockHeldByMe(LWLock *l)
@@ -1251,7 +1643,7 @@ LWLockHeldByMe(LWLock *l)
 
    for (i = 0; i < num_held_lwlocks; i++)
    {
-       if (held_lwlocks[i] == l)
+       if (held_lwlocks[i].lock == l)
            return true;
    }
    return false;
index 8b20fd9f9f074efe2b18ff710adb65fce8072ca5..3376b1435e9828e7491bdee5b8c2555847ba8f88 100644 (file)
@@ -16,6 +16,7 @@
 
 #include "lib/ilist.h"
 #include "storage/s_lock.h"
+#include "port/atomics.h"
 
 struct PGPROC;
 
@@ -48,10 +49,14 @@ typedef struct LWLock
 {
    slock_t     mutex;          /* Protects LWLock and queue of PGPROCs */
    bool        releaseOK;      /* T if ok to release waiters */
-   char        exclusive;      /* # of exclusive holders (0 or 1) */
-   int         shared;         /* # of shared holders (0..MaxBackends) */
+
+   pg_atomic_uint32 lockcount; /* state of exclusive/nonexclusive lockers */
+   pg_atomic_uint32 nwaiters;  /* number of waiters */
    int         tranche;        /* tranche ID */
    dlist_head  waiters;        /* list of waiting PGPROCs */
+#ifdef LOCK_DEBUG
+   struct PGPROC *owner;
+#endif
 } LWLock;
 
 /*
index f49d17f0c0245389b6b197b74232eae7593dea88..f217bd6c464e6a373b87cc19faae2588123d92a5 100644 (file)
@@ -906,6 +906,8 @@ extern int  tas_sema(volatile slock_t *lock);
 #define S_LOCK_FREE(lock)  (*(lock) == 0)
 #endif  /* S_LOCK_FREE */
 
+#include "storage/barrier.h"
+
 #if !defined(S_UNLOCK)
 /*
  * Our default implementation of S_UNLOCK is essentially *(lock) = 0.  This