Use timeouts in communications between GTM proxy and GTM.
author Pavan Deolasee <[email protected]>
Sun, 10 Feb 2019 06:04:29 +0000 (11:34 +0530)
committer Pavan Deolasee <[email protected]>
Sun, 10 Feb 2019 06:04:29 +0000 (11:34 +0530)
We now have a new comm_timeout connection parameter while connecting to the
GTM/GTM Proxy. The default value for the parameter is 60s. If the client
doesn't get a response within the timeout, it errors out. Most GTM clients at
this point use the compile-time default, but we can later turn this into a
configurable parameter.

This may help resolve certain conditions where a proxy waits indefinitely for a
response from the GTM because of some protocol error, thus freezing the
entire node.

contrib/pgxc_monitor/pgxc_monitor.c
src/backend/access/transam/gtm.c
src/gtm/client/fe-connect.c
src/gtm/client/fe-protocol.c
src/gtm/proxy/proxy_main.c
src/include/gtm/libpq-int.h

index 243ebcd0af000e3a800d3d83e9695d683291a098..ef73cbcdb8a8b1943498f2954919c01820a87e14 100644 (file)
@@ -181,7 +181,7 @@ do_gtm_ping(char *host, char* port, nodetype_t nodetype, char *nodename, bool ve
                exit(3);
        }
        /* Use 60s as connection timeout */
-       sprintf(connect_str, "host=%s port=%s node_name=%s remote_type=%d postmaster=0 connect_timeout=60",
+       sprintf(connect_str, "host=%s port=%s node_name=%s remote_type=%d postmaster=0 connect_timeout=60 comm_timeout=60",
                        host, port, nodename ? nodename : "pgxc_monitor", GTM_NODE_COORDINATOR);
        if ((conn = PQconnectGTM(connect_str)) == NULL)
        {
index e6c489f3b86e1c7b3fc7a7d0787005e7cd335308..c30403620305496454612eed47deb8ecf21ceb7a 100644 (file)
@@ -36,6 +36,7 @@
 char *GtmHost = "localhost";
 int GtmPort = 6666;
 static int GtmConnectTimeout = 60;
+static int GtmCommTimeout = 60;
 bool IsXidFromGTM = false;
 bool gtm_backup_barrier = false;
 extern bool FirstSnapshotSet;
@@ -82,9 +83,9 @@ InitGTM(void)
                        remote_type = GTM_NODE_DATANODE;
 
                /* Use 60s as connection timeout */
-               sprintf(conn_str, "host=%s port=%d node_name=%s remote_type=%d postmaster=1 connect_timeout=%d",
+               sprintf(conn_str, "host=%s port=%d node_name=%s remote_type=%d postmaster=1 connect_timeout=%d comm_timeout=%d",
                                                                GtmHost, GtmPort, PGXCNodeName, remote_type,
-                                                               GtmConnectTimeout);
+                                                               GtmConnectTimeout, GtmCommTimeout);
 
                /* Log activity of GTM connections */
                elog(DEBUG1, "Postmaster: connection established to GTM with string %s", conn_str);
@@ -92,8 +93,9 @@ InitGTM(void)
        else
        {
                /* Use 60s as connection timeout */
-               sprintf(conn_str, "host=%s port=%d node_name=%s connect_timeout=%d",
-                               GtmHost, GtmPort, PGXCNodeName, GtmConnectTimeout);
+               sprintf(conn_str, "host=%s port=%d node_name=%s connect_timeout=%d comm_timeout=%d",
+                               GtmHost, GtmPort, PGXCNodeName, GtmConnectTimeout,
+                               GtmCommTimeout);
 
                /* Log activity of GTM connections */
                if (IsAutoVacuumWorkerProcess())
index 58b39007fa8413846e83bbe7fc09dd2a102832a1..6faeabe616101bdd4d82241a1a5266549db5bdfc 100644 (file)
@@ -59,6 +59,7 @@ static const GTMPQconninfoOption GTMPQconninfoOptions[] = {
        {"remote_type", NULL},
        {"postmaster", NULL},
        {"client_id", NULL},
+       {"comm_timeout", NULL},
        /* Terminating entry --- MUST BE LAST */
        {NULL, NULL}
 };
@@ -176,6 +177,8 @@ connectOptions1(GTM_Conn *conn, const char *conninfo)
        conn->pgport = tmp ? strdup(tmp) : NULL;
        tmp = conninfo_getval(connOptions, "connect_timeout");
        conn->connect_timeout = tmp ? strdup(tmp) : NULL;
+       tmp = conninfo_getval(connOptions, "comm_timeout");
+       conn->comm_timeout = tmp ? atoi(tmp) : 0;
        tmp = conninfo_getval(connOptions, "node_name");
        conn->gc_node_name = tmp ? strdup(tmp) : NULL;
        tmp = conninfo_getval(connOptions, "postmaster");
index d4be13803c1170cbc06f046199dc2463bd43cee7..2ab9c5d20bc66fad1953ffcb42c74a90158be2eb 100644 (file)
@@ -263,6 +263,7 @@ GTMPQgetResult(GTM_Conn *conn)
        while ((res = pqParseInput(conn)) == NULL)
        {
                int                     flushResult;
+               time_t          finish_time;
 
                /*
                 * If data remains unsent, send it.  Else we might be waiting for the
@@ -270,16 +271,35 @@ GTMPQgetResult(GTM_Conn *conn)
                 */
                while ((flushResult = gtmpqFlush(conn)) > 0)
                {
-                       if (gtmpqWait(false, true, conn))
+                       if (conn->comm_timeout)
+                               finish_time = time(NULL) + conn->comm_timeout;
+                       else
+                               finish_time = -1;
+
+                       if (gtmpqWaitTimed(false, true, conn, finish_time))
                        {
-                               flushResult = -1;
-                               break;
+                               /*
+                                * conn->errorMessage has been set by gtmpqWait or
+                                * gtmpqReadData.
+                                */
+                               return NULL;
                        }
                }
 
+               /*
+                * By now we should have sent all pending data. If gtmpqFlush returned
+                * failure (< 0), then this is an error condition.
+                */
+               if (flushResult)
+                       return NULL;
+
+               if (conn->comm_timeout)
+                       finish_time = time(NULL) + conn->comm_timeout;
+               else
+                       finish_time = -1;
+
                /* Wait for some more data, and load it. */
-               if (flushResult ||
-                       gtmpqWait(true, false, conn) ||
+               if (gtmpqWaitTimed(true, false, conn, finish_time) ||
                        gtmpqReadData(conn) < 0)
                {
                        /*
index 871c6a5ee3a4ccbee4c57c0997d31aa2baed96ee..b679369f879b356e5dc17d7a48452a5fa55f0bb3 100644 (file)
@@ -60,7 +60,7 @@ extern char *optarg;
 #ifdef  GTM_DEBUG
 #define PROXY_CLIENT_TIMEOUT   3600
 #else
-#define PROXY_CLIENT_TIMEOUT   20
+#define PROXY_CLIENT_TIMEOUT   60
 #endif
 #endif
 
@@ -1109,8 +1109,9 @@ GTMProxy_ThreadMain(void *argp)
        /*
         * Set up connection with the GTM server
         */
-       sprintf(gtm_connect_string, "host=%s port=%d node_name=%s remote_type=%d",
-                       GTMServerHost, GTMServerPortNumber, GTMProxyNodeName, GTM_NODE_GTM_PROXY);
+       sprintf(gtm_connect_string, "host=%s port=%d node_name=%s remote_type=%d comm_timeout=%d",
+                       GTMServerHost, GTMServerPortNumber, GTMProxyNodeName,
+                       GTM_NODE_GTM_PROXY, PROXY_CLIENT_TIMEOUT);
 
        thrinfo->thr_gtm_conn = PQconnectGTM(gtm_connect_string);
 
@@ -1697,8 +1698,10 @@ HandleGTMError(GTM_Conn *gtm_conn)
                /* Close and free previous connection object if still active */
                GTMPQfinish(gtm_conn);
                /* Reconnect */
-               sprintf(gtm_connect_string, "host=%s port=%d node_name=%s remote_type=%d",
-                               GTMServerHost, GTMServerPortNumber, GTMProxyNodeName, GTM_NODE_GTM_PROXY);
+               sprintf(gtm_connect_string, "host=%s port=%d node_name=%s "
+                               "remote_type=%d comm_timeout=%d",
+                               GTMServerHost, GTMServerPortNumber, GTMProxyNodeName,
+                               GTM_NODE_GTM_PROXY, PROXY_CLIENT_TIMEOUT);
                gtm_conn = PQconnectGTM(gtm_connect_string);
                /*
                 * If reconnect succeeded the connection will be ready to use out of
@@ -2992,7 +2995,11 @@ UnregisterProxy(void)
        if (gtmpqFlush(master_conn))
                goto failed;
 
-       finish_time = time(NULL) + PROXY_CLIENT_TIMEOUT;
+       if (master_conn->comm_timeout)
+               finish_time = time(NULL) + master_conn->comm_timeout;
+       else
+               finish_time = -1;
+
        if (gtmpqWaitTimed(true, false, master_conn, finish_time) ||
                gtmpqReadData(master_conn) < 0)
                goto failed;
@@ -3092,7 +3099,11 @@ RegisterProxy(bool is_reconnect)
        if (gtmpqFlush(master_conn))
                goto failed;
 
-       finish_time = time(NULL) + PROXY_CLIENT_TIMEOUT;
+       if (master_conn->comm_timeout)
+               finish_time = time(NULL) + master_conn->comm_timeout;
+       else
+               finish_time = -1;
+
        if (gtmpqWaitTimed(true, false, master_conn, finish_time) ||
                gtmpqReadData(master_conn) < 0)
        {
@@ -3127,8 +3138,9 @@ ConnectGTM(void)
        char conn_str[256];
        GTM_Conn *conn;
 
-       sprintf(conn_str, "host=%s port=%d node_name=%s remote_type=%d postmaster=1",
-                       GTMServerHost, GTMServerPortNumber, GTMProxyNodeName, GTM_NODE_GTM_PROXY_POSTMASTER);
+       sprintf(conn_str, "host=%s port=%d node_name=%s remote_type=%d postmaster=1 comm_timeout=%d",
+                       GTMServerHost, GTMServerPortNumber, GTMProxyNodeName,
+                       GTM_NODE_GTM_PROXY_POSTMASTER, PROXY_CLIENT_TIMEOUT);
 
        conn = PQconnectGTM(conn_str);
        if (GTMPQstatus(conn) != CONNECTION_OK)
@@ -3189,9 +3201,10 @@ workerThreadReconnectToGTM(void)
                GTMPQfinish(GetMyThreadInfo->thr_gtm_conn);
        }
 
-       sprintf(gtm_connect_string, "host=%s port=%d node_name=%s remote_type=%d client_id=%u",
+       sprintf(gtm_connect_string, "host=%s port=%d node_name=%s remote_type=%d "
+                       "client_id=%u comm_timeout=%d",
                        GTMServerHost, GTMServerPortNumber, GTMProxyNodeName,
-                       GTM_NODE_GTM_PROXY, saveMyClientId);
+                       GTM_NODE_GTM_PROXY, saveMyClientId, PROXY_CLIENT_TIMEOUT);
        elog(DEBUG1, "Worker thread connecting to %s", gtm_connect_string);
        GetMyThreadInfo->thr_gtm_conn = PQconnectGTM(gtm_connect_string);
 
index b11426ad994f77a488a34c14b0d9bd87e28e4b3c..d62ef3bb01536782b6a6465016c5a924621a8330 100644 (file)
@@ -43,6 +43,7 @@ struct gtm_conn
                                                                         * over above. */
        char            *pgport;                        /* the server's communication port */
        char            *connect_timeout;       /* connection timeout (numeric string) */
+       int                      comm_timeout;          /* communication timeout, 0 means infinite */
        char            *gc_node_name;          /* PGXC Node Name */
        int                     remote_type;            /* is this a connection to/from a proxy ? */
        int                     is_postmaster;          /* is this connection to/from a postmaster instance */