From: Pavan Deolasee Date: Wed, 9 Jan 2019 12:36:55 +0000 (+0530) Subject: Avoid uninterruptible waits in SharedQ X-Git-Tag: XL_10_R1_1~12 X-Git-Url: http://git.postgresql.org/gitweb/static/gitweb.js?a=commitdiff_plain;h=d1f5935d6e21feeed7c099025fce336bf4930823;p=postgres-xl.git Avoid uninterruptible waits in SharedQ Instead we wait with a small timeout (10s) and retry things (which we were doing anyway). This also allows us to process any pending interrupts. --- diff --git a/src/backend/pgxc/squeue/squeue.c b/src/backend/pgxc/squeue/squeue.c index abe74c8893..21bcebe525 100644 --- a/src/backend/pgxc/squeue/squeue.c +++ b/src/backend/pgxc/squeue/squeue.c @@ -1061,11 +1061,21 @@ SharedQueueRead(SharedQueue squeue, int consumerIdx, SetLatch(&sqsync->sqs_producer_latch); LWLockRelease(sqsync->sqs_producer_lwlock); - /* Wait for notification about available info */ + /* + * Wait for notification about available info. + * + * We use a timed-wait to avoid non-interruptable waits, as seen + * from some complaints about a backend remaining stuck even after + * the query is cancelled. + */ WaitLatch(&sqsync->sqs_consumer_sync[consumerIdx].cs_latch, - WL_LATCH_SET | WL_POSTMASTER_DEATH, -1, + WL_LATCH_SET | WL_POSTMASTER_DEATH | WL_TIMEOUT, + 10000L, WAIT_EVENT_MQ_INTERNAL); + /* Interrupt may have come while waiting */ + CHECK_FOR_INTERRUPTS(); + /* got the notification, restore lock and try again */ LWLockAcquire(sqsync->sqs_producer_lwlock, LW_SHARED); LWLockAcquire(sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock, LW_EXCLUSIVE); @@ -1955,9 +1965,24 @@ sq_pull_long_tuple(ConsState *cstate, RemoteDataRow datarow, LWLockRelease(sync->cs_lwlock); LWLockRelease(sqsync->sqs_producer_lwlock); - /* Wait for notification about available info */ - WaitLatch(&sync->cs_latch, WL_LATCH_SET | WL_POSTMASTER_DEATH, -1, + /* + * Wait for notification about available info. 
We use a timed-wait + * to avoid infinite hang in case the producer does not produce + * more data, for whatever reason. For example, we have seen issues + * with query cancellation where a backend may remain stuck in + * these waits. + * + * 10s wait is quite arbitrary, but matches with what we use + * elsewhere in SharedQ code. + */ + WaitLatch(&sync->cs_latch, + WL_LATCH_SET | WL_POSTMASTER_DEATH | WL_TIMEOUT, + 10000L, WAIT_EVENT_MQ_INTERNAL); + + /* Interrupt may have come while waiting */ + CHECK_FOR_INTERRUPTS(); + /* got the notification, restore lock and try again */ LWLockAcquire(sqsync->sqs_producer_lwlock, LW_SHARED); LWLockAcquire(sync->cs_lwlock, LW_EXCLUSIVE);