Avoid uninterruptible waits in SharedQ
author Pavan Deolasee <[email protected]>
Wed, 9 Jan 2019 12:36:55 +0000 (18:06 +0530)
committer Pavan Deolasee <[email protected]>
Wed, 9 Jan 2019 12:36:55 +0000 (18:06 +0530)
Instead, we wait with a short timeout (10s) and then retry (which we were
already doing anyway). This also allows us to process any pending interrupts.

src/backend/pgxc/squeue/squeue.c

index abe74c8893ade6498deb0eefe00a2be1af437314..21bcebe52513bd22af0a64a145f3ea7e744ae152 100644 (file)
@@ -1061,11 +1061,21 @@ SharedQueueRead(SharedQueue squeue, int consumerIdx,
                        SetLatch(&sqsync->sqs_producer_latch);
                        LWLockRelease(sqsync->sqs_producer_lwlock);
 
-                       /* Wait for notification about available info */
+                       /*
+                        * Wait for notification about available info.
+                        *
+                        * We use a timed-wait to avoid non-interruptible waits, as seen
+                        * from some complaints about a backend remaining stuck even after
+                        * the query is cancelled.
+                        */
                        WaitLatch(&sqsync->sqs_consumer_sync[consumerIdx].cs_latch,
-                                       WL_LATCH_SET | WL_POSTMASTER_DEATH, -1,
+                                       WL_LATCH_SET | WL_POSTMASTER_DEATH | WL_TIMEOUT,
+                                       10000L,
                                        WAIT_EVENT_MQ_INTERNAL);
 
+                       /* Interrupt may have come while waiting */
+                       CHECK_FOR_INTERRUPTS();
+
                        /* got the notification, restore lock and try again */
                        LWLockAcquire(sqsync->sqs_producer_lwlock, LW_SHARED);
                        LWLockAcquire(sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock, LW_EXCLUSIVE);
@@ -1955,9 +1965,24 @@ sq_pull_long_tuple(ConsState *cstate, RemoteDataRow datarow,
                        LWLockRelease(sync->cs_lwlock);
                        LWLockRelease(sqsync->sqs_producer_lwlock);
 
-                       /* Wait for notification about available info */
-                       WaitLatch(&sync->cs_latch, WL_LATCH_SET | WL_POSTMASTER_DEATH, -1,
+                       /*
+                        * Wait for notification about available info. We use a timed-wait
+                        * to avoid infinite hang in case the producer does not produce
+                        * more data, for whatever reason. For example, we have seen issues
+                        * with query cancellation where a backend may remain stuck in
+                        * these waits.
+                        *
+                        * 10s wait is quite arbitrary, but matches with what we use
+                        * elsewhere in SharedQ code.
+                        */
+                       WaitLatch(&sync->cs_latch,
+                                       WL_LATCH_SET | WL_POSTMASTER_DEATH | WL_TIMEOUT,
+                                       10000L,
                                        WAIT_EVENT_MQ_INTERNAL);
+
+                       /* Interrupt may have come while waiting */
+                       CHECK_FOR_INTERRUPTS();
+
                        /* got the notification, restore lock and try again */
                        LWLockAcquire(sqsync->sqs_producer_lwlock, LW_SHARED);
                        LWLockAcquire(sync->cs_lwlock, LW_EXCLUSIVE);