-PostgreSQL Database Management System
-(formerly known as Postgres, then as Postgres95)
+Postgres-XL Cluster Database Management System
- Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+Portions Copyright (c) 2012-2014, TransLattice, Inc.
+Portions Copyright (c) 2010-2013, Postgres-XC Development Group
+ Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
+Portions Copyright (c) 2015-1016, 2ndQuadrant Limited
Portions Copyright (c) 1994, The Regents of the University of California
-PostgreSQL Database Management System
-=====================================
+Postgres-XL Database Management System
+======================================
-This directory contains the source code distribution of the PostgreSQL
+This directory contains the source code distribution of the Postgres-XL
database management system.
-PostgreSQL is an advanced object-relational database management system
-that supports an extended subset of the SQL standard, including
-transactions, foreign keys, subqueries, triggers, user-defined types
-and functions. This distribution also contains C language bindings.
+Postgres-XL is an advanced object-relational cluster database management
+ system that supports an extended subset of the SQL standard, including
+transactions, foreign keys, user-defined types and functions. This
+distribution also contains C language bindings.
-PostgreSQL has many language interfaces, many of which are listed here:
+Postgres-XL has many language interfaces similar to PostgreSQL, many of
+which are listed here:
- http://www.postgresql.org/download
+ https://www.postgresql.org/download
See the file INSTALL for instructions on how to build and install
-PostgreSQL. That file also lists supported operating systems and
+Postgres-XL. That file also lists supported operating systems and
hardware platforms and contains information regarding any other
-software packages that are required to build or run the PostgreSQL
+software packages that are required to build or run the PostgreSQL-XL
system. Copyright and license information can be found in the
file COPYRIGHT. A comprehensive documentation set is included in this
distribution; it can be read as described in the installation
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
- # Generated by GNU Autoconf 2.69 for PostgreSQL 9.6beta4 (Postgres-XL 9.6alpha1).
-# Generated by GNU Autoconf 2.69 for PostgreSQL 10beta1.
++# Generated by GNU Autoconf 2.69 for PostgreSQL 10beta1 (Postgres-XL 10alpha1).
#
-# Report bugs to <pgsql-bugs@postgresql.org>.
+# Report bugs to <bugs@postgres-xl.org>.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
# Identity of this package.
PACKAGE_NAME='PostgreSQL'
PACKAGE_TARNAME='postgresql'
- PACKAGE_VERSION='9.6beta4 (Postgres-XL 9.6alpha1)'
- PACKAGE_XC_VERSION='9.6alpha1'
- PACKAGE_STRING='PostgreSQL 9.6beta4 (Postgres-XL 9.6alpha1)'
-PACKAGE_VERSION='10beta1'
-PACKAGE_STRING='PostgreSQL 10beta1'
++PACKAGE_VERSION='10beta1 (Postgres-XL 10alpha1)'
++PACKAGE_XC_VERSION='10alpha1'
++PACKAGE_STRING='PostgreSQL 10beta1 (Postgres-XL 10alpha1)'
PACKAGE_URL=''
ac_unique_file="src/backend/access/common/heaptuple.c"
with_wal_segsize
with_CC
enable_depend
+enable_genmsgids
enable_cassert
enable_thread_safety
+ with_icu
with_tcl
with_tclconfig
with_perl
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
- \`configure' configures PostgreSQL 9.6beta4 (Postgres-XL 9.6alpha1) to adapt to many kinds of systems.
-\`configure' configures PostgreSQL 10beta1 to adapt to many kinds of systems.
++\`configure' configures PostgreSQL 10beta1 (Postgres-XL 10alpha1) to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
if test -n "$ac_init_help"; then
case $ac_init_help in
- short | recursive ) echo "Configuration of PostgreSQL 9.6beta4 (Postgres-XL 9.6alpha1):";;
- short | recursive ) echo "Configuration of PostgreSQL 10beta1:";;
++ short | recursive ) echo "Configuration of PostgreSQL 10beta1 (Postgres-XL 10alpha1):";;
esac
cat <<\_ACEOF
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
- PostgreSQL configure 9.6beta4 (Postgres-XL 9.6alpha1)
-PostgreSQL configure 10beta1
++PostgreSQL configure 10beta1 (Postgres-XL 10alpha1)
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
- It was created by PostgreSQL $as_me 9.6beta4 (Postgres-XL 9.6alpha1), which was
-It was created by PostgreSQL $as_me 10beta1, which was
++It was created by PostgreSQL $as_me 10beta1 (Postgres-XL 10alpha1), which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
- This file was extended by PostgreSQL $as_me 9.6beta4 (Postgres-XL 9.6alpha1), which was
-This file was extended by PostgreSQL $as_me 10beta1, which was
++This file was extended by PostgreSQL $as_me 10beta1 (Postgres-XL 10alpha1), which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
- PostgreSQL config.status 9.6beta4 (Postgres-XL 9.6alpha1)
-PostgreSQL config.status 10beta1
++PostgreSQL config.status 10beta1 (Postgres-XL 10alpha1)
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"
dnl
m4_pattern_forbid(^PGAC_)dnl to catch undefined macros
m4_if(m4_defn([m4_PACKAGE_VERSION]), [2.69], [], [m4_fatal([Autoconf version 2.69 is required.
Untested combinations of 'autoconf' and PostgreSQL versions are not
test_decoding \
tsm_system_rows \
tsm_system_time \
- tsearch2 \
unaccent \
- vacuumlo
+ vacuumlo \
+ stormstats
ifeq ($(with_openssl),yes)
SUBDIRS += sslinfo
static void pgss_ExecutorStart(QueryDesc *queryDesc, int eflags);
static void pgss_ExecutorRun(QueryDesc *queryDesc,
ScanDirection direction,
- uint64 count);
+ uint64 count, bool execute_once);
static void pgss_ExecutorFinish(QueryDesc *queryDesc);
static void pgss_ExecutorEnd(QueryDesc *queryDesc);
- static void pgss_ProcessUtility(Node *parsetree, const char *queryString,
+ static void pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
ProcessUtilityContext context, ParamListInfo params,
- DestReceiver *dest, char *completionTag);
+ QueryEnvironment *queryEnv,
+ DestReceiver *dest,
+#ifdef PGXC
+ bool sentToRemote,
+#endif /* PGXC */
+ char *completionTag);
static uint32 pgss_hash_fn(const void *key, Size keysize);
static int pgss_match_fn(const void *key1, const void *key2, Size keysize);
- static uint32 pgss_hash_string(const char *str);
+ static uint32 pgss_hash_string(const char *str, int len);
static void pgss_store(const char *query, uint32 queryId,
+ int query_location, int query_len,
double total_time, uint64 rows,
const BufferUsage *bufusage,
pgssJumbleState *jstate);
* ProcessUtility hook
*/
static void
- pgss_ProcessUtility(Node *parsetree, const char *queryString,
+ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
- ProcessUtilityContext context,
- ParamListInfo params, QueryEnvironment *queryEnv,
- DestReceiver *dest, char *completionTag)
+ ProcessUtilityContext context, ParamListInfo params,
++ QueryEnvironment *queryEnv,
+ DestReceiver *dest,
+#ifdef PGXC
+ bool sentToRemote,
+#endif /* PGXC */
+ char *completionTag)
{
+ Node *parsetree = pstmt->utilityStmt;
+
/*
* If it's an EXECUTE statement, we don't track it and don't increment the
* nesting level. This allows the cycles to be charged to the underlying
PG_TRY();
{
if (prev_ProcessUtility)
- prev_ProcessUtility(parsetree, queryString,
- context, params,
+ prev_ProcessUtility(pstmt, queryString,
+ context, params, queryEnv,
- dest, completionTag);
+ dest,
- #ifdef PGXC
+ sentToRemote,
- #endif /* PGXC */
+ completionTag);
else
- standard_ProcessUtility(parsetree, queryString,
- context, params,
+ standard_ProcessUtility(pstmt, queryString,
+ context, params, queryEnv,
- dest, completionTag);
+ dest,
- #ifdef PGXC
+ sentToRemote,
- #endif /* PGXC */
+ completionTag);
nested_level--;
}
PG_CATCH();
else
{
if (prev_ProcessUtility)
- prev_ProcessUtility(parsetree, queryString,
- context, params,
+ prev_ProcessUtility(pstmt, queryString,
+ context, params, queryEnv,
- dest, completionTag);
+ dest,
- #ifdef PGXC
+ sentToRemote,
- #endif /* PGXC */
+ completionTag);
else
- standard_ProcessUtility(parsetree, queryString,
- context, params,
+ standard_ProcessUtility(pstmt, queryString,
+ context, params, queryEnv,
- dest, completionTag);
+ dest,
- #ifdef PGXC
+ sentToRemote,
- #endif /* PGXC */
+ completionTag);
}
}
const char *queryString,
ProcessUtilityContext context,
ParamListInfo params,
+ QueryEnvironment *queryEnv,
DestReceiver *dest,
+#ifdef PGXC
+ bool sentToRemote,
+#endif /* PGXC */
char *completionTag)
{
+ Node *parsetree = pstmt->utilityStmt;
sepgsql_context_info_t saved_context_info = sepgsql_context_info;
ListCell *cell;
}
if (next_ProcessUtility_hook)
- (*next_ProcessUtility_hook) (parsetree, queryString,
- context, params,
+ (*next_ProcessUtility_hook) (pstmt, queryString,
+ context, params, queryEnv,
- dest, completionTag);
+ dest,
- #ifdef PGXC
+ sentToRemote,
- #endif
+ completionTag);
else
- standard_ProcessUtility(parsetree, queryString,
- context, params,
+ standard_ProcessUtility(pstmt, queryString,
+ context, params, queryEnv,
- dest, completionTag);
+ dest,
- #ifdef PGXC
+ sentToRemote,
- #endif
+ completionTag);
}
PG_CATCH();
{
Operating System (example: Linux 2.4.18) :
- PostgreSQL version (example: PostgreSQL 9.6beta4): Postgres-XL 9.6alpha1
- PostgreSQL version (example: PostgreSQL 10beta1): PostgreSQL 10beta1
++ PostgreSQL version (example: PostgreSQL 10beta1): Postgres-XL 10alpha1
Compiler used (example: gcc 3.3.5) :
<!ENTITY config SYSTEM "config.sgml">
<!ENTITY user-manag SYSTEM "user-manag.sgml">
<!ENTITY wal SYSTEM "wal.sgml">
+<!ENTITY add-node SYSTEM "add-node.sgml">
+<!ENTITY remove-node SYSTEM "remove-node.sgml">
+ <!ENTITY logical-replication SYSTEM "logical-replication.sgml">
<!-- programmer's guide -->
<!ENTITY bgworker SYSTEM "bgworker.sgml">
<!ENTITY sourcerepo SYSTEM "sourcerepo.sgml">
<!ENTITY release SYSTEM "release.sgml">
+ <!ENTITY release-10 SYSTEM "release-10.sgml">
<!ENTITY release-9.6 SYSTEM "release-9.6.sgml">
+<!ENTITY release-xl-9.5r1 SYSTEM "release-xl-9.5r1.sgml">
<!ENTITY release-9.5 SYSTEM "release-9.5.sgml">
<!ENTITY release-9.4 SYSTEM "release-9.4.sgml">
<!ENTITY release-9.3 SYSTEM "release-9.3.sgml">
needs to be archived.
</para>
+ <para>
+ Please note that these functions works just locally. To issue
+ these functions to another Coordinators or Datanodes, you should
+ issue these functions through <type>EXECUTE DIRECT</> statement.
+ </para>
+
<para>
- <function>pg_xlog_location_diff</> calculates the difference in bytes
- between two transaction log locations. It can be used with
+ <function>pg_wal_lsn_diff</> calculates the difference in bytes
+ between two write-ahead log locations. It can be used with
<structname>pg_stat_replication</structname> or some functions shown in
<xref linkend="functions-admin-backup-table"> to get the replication lag.
</para>
<term>Web Site</term>
<listitem>
<para>
- The <productname>Postgres-XL</productname>
- <ulink url="http://www.postgres-xl.org/">web site</ulink>
- The <productname>PostgreSQL</productname>
- <ulink url="https://www.postgresql.org">web site</ulink>
++ The <productname>Postgres-XL</productname>
++ <ulink url="http://www.postgres-xl.org">web site</ulink>
carries details on the latest release and other
information to make your work or play with
- <productname>PostgreSQL</productname> more productive.
+ <productname>Postgres-XL</productname> more productive.
</para>
</listitem>
</varlistentry>
<!-- doc/src/sgml/legal.sgml -->
- <date>2016</date>
+ <date>2017</date>
<copyright>
- <year>1996-2016</year>
+ <year>1996-2017</year>
<holder>The PostgreSQL Global Development Group</holder>
</copyright>
+<copyright>
+ <year>2014-2016</year>
+ <holder>Postgres-XL Development Group</holder>
+</copyright>
+<copyright>
+ <year>2009-2012</year>
+ <holder>Postgres-XC Development Group</holder>
+</copyright>
+<copyright>
+ <year>2012-2014</year>
+ <holder>TransLattice, Inc.</holder>
+</copyright>
+<copyright>
+ <year>2015-2016</year>
+ <holder>2ndQuadrant Ltd</holder>
+</copyright>
<legalnotice id="legalnotice">
<title>Legal Notice</title>
<secondary>of transaction IDs</secondary>
</indexterm>
+ <para>
+ Please note that this section describes the tasks of individual
+ Coordinators and Datanodes. It should be done for each of them.
+ </para>
+
<para>
- <productname>PostgreSQL</productname>'s MVCC transaction semantics
+ <productname>PostgreSQL</productname>'s
+ <link linkend="mvcc-intro">MVCC</link> transaction semantics
depend on being able to compare transaction ID (<acronym>XID</>)
numbers: a row version with an insertion XID greater than the current
transaction's XID is <quote>in the future</> and should not be visible
debugging purposes. All of these functions may be used only by superusers.
</para>
+ <para>
+ Functions of this module returns information about connecting Coordinators
+ locally. To get information from a specific a Datanode, you can use EXECUTE
+ DIRECT from a Coordinator.
+ </para>
+
<sect2>
- <title>Functions</title>
+ <title>General Functions</title>
<variablelist>
<varlistentry>
</para>
<para>
- By default public access is revoked from both of these, just in case there
- are security issues lurking.
+ By default use is restricted to superusers and members of the
+ <literal>pg_read_all_stats</literal> role. Access may be granted to others
+ using <command>GRANT</command>.
</para>
+ <para>
+ <filename>pg_buffercache</filename> returns information local to the
+ connecting Coordinator. To inquire information local to other node,
+ use <command>EXECUTE DIRECT</command>.
+ </para>
+
<sect2>
<title>The <structname>pg_buffercache</structname> View</title>
</para>
<para>
- By default public access is revoked from the functions, just in case
- there are security issues lurking.
+ By default use is restricted to superusers and members of the
+ <literal>pg_stat_scan_tables</literal> role. Access may be granted to others
+ using <command>GRANT</command>.
</para>
+ <para>
+ Functions of this module return information from the Coordinator that the
+ session is currently connected to. To get information about a Datanode, you
+ can use <command>EXECUTE DIRECT</command>.
+ </para>
+
<sect2>
<title>Functions</title>
locking information for a specified table.
</para>
+ <para>
+ By default use is restricted to superusers, members of the
+ <literal>pg_stat_scan_tables</literal> role, and users with
+ <literal>SELECT</literal> permissions on the table.
+ </para>
+
+ <para>
+ Functions of this module return information from the
+ Coordinator that the session is currently connect to.
+ To get information about a Datanode, you can
+ use <command>EXECUTE DIRECT</command>.
+ </para>
<sect2>
<title>Overview</title>
obtain tuple-level statistics.
</para>
+ <para>
+ As these functions return detailed page-level information, only the superuser
+ has EXECUTE privileges on them upon installation. After the functions have
+ been installed, users may issue <command>GRANT</command> commands to change
+ the privileges on the functions to allow non-superusers to execute them. Members
+ of the <literal>pg_stat_scan_tables</literal> role are granted access by default. See
+ the description of the <xref linkend="sql-grant"> command for specifics.
+ </para>
+ <para>
+ Functions of this module return information from the Coordinator that the
+ session is currently connected to. To get information about a Datanode, you
+ can use <command>EXECUTE DIRECT</command>.
+ </para>
<sect2>
<title>Functions</title>
By default, recovery will recover to the end of the WAL log. The
following parameters can be used to specify an earlier stopping point.
At most one of <varname>recovery_target</>,
- <varname>recovery_target_name</>, <varname>recovery_target_time</>,
- <varname>recovery_target_lsn</>, <varname>recovery_target_name</>,
- <varname>recovery_target_time</>, or <varname>recovery_target_xid</>
- can be used; if more than one of these is specified in the configuration
- file, the last entry will be used.
++ <varname>recovery_target_lsn</>, <varname>recovery_target_name</>, <varname>recovery_target_time</>,
+ <varname>recovery_target_xid</> or <varname>recovery_target_barrier</> can be used; if more than one of these
+ is specified in the configuration file, the last entry will be used.
</para>
<variablelist>
</para>
</listitem>
</varlistentry>
+ <varlistentry id="recovery-target-barrier" xreflabel="recovery_target_barrier">
+ <term><varname>recovery_target_barrier</varname> (<type>string</type>)
+ <indexterm>
+ <primary><varname>recovery_target_barrier</> recovery parameter</primary>
+ </indexterm>
+ </term>
+ <listitem>
+ <para>
+ This parameter specifies the barrier ID up to which recovery
+ will proceed. A global consistency is guaranteed when recovery is
+ stopped at a previously successfully completed barrier. At most
+ one of <varname>recovery_target_xid</>,
+ <xref linkend="recovery-target-time"> and
+ <varname>recovery_target_barrier</> can be specified.
++ </varlistentry>
+
+ <varlistentry id="recovery-target-lsn" xreflabel="recovery_target_lsn">
+ <term><varname>recovery_target_lsn</varname> (<type>pg_lsn</type>)
+ <indexterm>
+ <primary><varname>recovery_target_lsn</> recovery parameter</primary>
+ </indexterm>
+ </term>
+ <listitem>
+ <para>
+ This parameter specifies the LSN of the write-ahead log location up
+ to which recovery will proceed. The precise stopping point is also
+ influenced by <xref linkend="recovery-target-inclusive">. This
+ parameter is parsed using the system data type
+ <link linkend="datatype-pg-lsn"><type>pg_lsn</></link>.
</para>
</listitem>
</varlistentry>
<!-- applications and utilities -->
<!ENTITY clusterdb SYSTEM "clusterdb.sgml">
<!ENTITY createdb SYSTEM "createdb.sgml">
- <!ENTITY createlang SYSTEM "createlang.sgml">
<!ENTITY createuser SYSTEM "createuser.sgml">
<!ENTITY dropdb SYSTEM "dropdb.sgml">
- <!ENTITY droplang SYSTEM "droplang.sgml">
<!ENTITY dropuser SYSTEM "dropuser.sgml">
<!ENTITY ecpgRef SYSTEM "ecpg-ref.sgml">
+<!ENTITY gtm system "gtm.sgml">
+<!ENTITY gtmPxy system "gtm_proxy.sgml">
+<!ENTITY gtmCtl system "gtm_ctl.sgml">
<!ENTITY initdb SYSTEM "initdb.sgml">
+<!ENTITY initgtm SYSTEM "initgtm.sgml">
<!ENTITY pgarchivecleanup SYSTEM "pgarchivecleanup.sgml">
<!ENTITY pgBasebackup SYSTEM "pg_basebackup.sgml">
<!ENTITY pgbench SYSTEM "pgbench.sgml">
</varlistentry>
<varlistentry>
+ <term><literal>DISTRIBUTE BY</literal></term>
+ <listitem>
+ <para>
+ This clause specifies how the table is distributed or replicated among Datanodes.
+ </para>
+
+ <variablelist>
+
+ <varlistentry>
+ <term><literal>REPLICATION</literal></term>
+ <listitem>
+ <para>
+ Each row of the table will be replicated into all the
+ Datanodes of the <productname>Postgres-XL</> database
+ cluster.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term><literal>ROUNDROBIN</literal></term>
+ <listitem>
+ <para>
+ Each row of the table will be placed in one of the Datanodes in a
+ round-robin manner. The value of the row will not be needed to
+ determine what Datanode to go.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term><literal>HASH ( <replaceable class="PARAMETER">column_name</> )</literal></term>
+ <listitem>
+ <para>
+ Each row of the table will be placed based on the hash value
+ of the specified column. Following type is allowed as
+ distribution column: INT8, INT2, OID, INT4, BOOL, INT2VECTOR,
+ OIDVECTOR, CHAR, NAME, TEXT, BPCHAR, BYTEA, VARCHAR, NUMERIC, MONEY,
+ ABSTIME, RELTIME, DATE, TIME,TIMESTAMP, TIMESTAMPTZ, INTERVAL, and TIMETZ.
+ </para>
+ <para>
+ Please note that floating point is not allowed as a basis of
+ the distribution column.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term><literal>MODULO ( <replaceable class="PARAMETER">column_name</> )</literal></term>
+ <listitem>
+ <para>
+ Each row of the table will be placed based on the modulo
+ of the specified column. Following type is allowed as
+ distribution column: INT8, INT2, INT4, BOOL, ABSTIME, RELTIME,
+ DATE.
+ </para>
+ <para>
+ Please note that floating point is not allowed as a basis of
+ the distribution column.
+ </para>
+ </listitem>
+ </varlistentry>
+ </variablelist>
+ <term><literal>ATTACH PARTITION <replaceable class="PARAMETER">partition_name</replaceable> FOR VALUES <replaceable class="PARAMETER">partition_bound_spec</replaceable></literal></term>
+ <listitem>
+ <para>
+ This form attaches an existing table (which might itself be partitioned)
+ as a partition of the target table using the same syntax for
+ <replaceable class="PARAMETER">partition_bound_spec</replaceable> as
+ <xref linkend="sql-createtable">. The partition bound specification
+ must correspond to the partitioning strategy and partition key of the
+ target table. The table to be attached must have all the same columns
+ as the target table and no more; moreover, the column types must also
+ match. Also, it must have all the <literal>NOT NULL</literal> and
+ <literal>CHECK</literal> constraints of the target table. Currently
+ <literal>UNIQUE</literal>, <literal>PRIMARY KEY</literal>, and
+ <literal>FOREIGN KEY</literal> constraints are not considered.
+ If any of the <literal>CHECK</literal> constraints of the table being
+ attached is marked <literal>NO INHERIT</literal>, the command will fail;
+ such a constraint must be recreated without the <literal>NO INHERIT</literal>
+ clause.
+ </para>
+
+ <para>
+ If the new partition is a regular table, a full table scan is performed
+ to check that no existing row in the table violates the partition
+ constraint. It is possible to avoid this scan by adding a valid
+ <literal>CHECK</literal> constraint to the table that would allow only
+ the rows satisfying the desired partition constraint before running this
+ command. It will be determined using such a constraint that the table
+ need not be scanned to validate the partition constraint. This does not
+ work, however, if any of the partition keys is an expression and the
+ partition does not accept <literal>NULL</literal> values. If attaching
+ a list partition that will not accept <literal>NULL</literal> values,
+ also add <literal>NOT NULL</literal> constraint to the partition key
+ column, unless it's an expression.
+ </para>
+
+ <para>
+ If the new partition is a foreign table, nothing is done to verify
+ that all the rows in the foreign table obey the partition constraint.
+ (See the discussion in <xref linkend="SQL-CREATEFOREIGNTABLE"> about
+ constraints on the foreign table.)
+ </para>
</listitem>
</varlistentry>
-
+ <varlistentry>
+ <term><literal>TO GROUP</literal></term>
+ <term><literal>TO NODE</literal></term>
+ <listitem>
+ <para>
+ This defines the list of nodes on which table data exists.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term><literal>ADD NODE</literal></term>
+ <listitem>
+ <para>
+ This adds a list of nodes where data of table is distributed
+ to the existing list. If the list of nodes added contains nodes
+ already used by table, an error is returned.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term><literal>DELETE NODE</literal></term>
+ <listitem>
+ <para>
+ This deletes a list of nodes where the data of a table is distributed
+ to the existing list. If the list of nodes deleted contains nodes not
+ used by table, an error is returned.
+ </para>
+ </listitem>
+ </varlistentry>
+ <varlistentry>
+ <term><literal>DETACH PARTITION</literal> <replaceable class="PARAMETER">partition_name</replaceable></term>
+ <listitem>
+ <para>
+ This form detaches specified partition of the target table. The detached
+ partition continues to exist as a standalone table, but no longer has any
+ ties to the table from which it was detached.
+ </para>
+ </listitem>
+ </varlistentry>
</variablelist>
</para>
</listitem>
</varlistentry>
+ <varlistentry>
+ <term><replaceable class="PARAMETER">nodename</replaceable></term>
+ <listitem>
+ <para>
+ It defines a <productname>Postgres-XL</productname> node of catalog pgxc_node.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term><replaceable class="PARAMETER">groupname</replaceable></term>
+ <listitem>
+ <para>
+ It defines a <productname>Postgres-XL</productname> node group in catalog pgxc_group.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term><replaceable class="PARAMETER">partition_name</replaceable></term>
+ <listitem>
+ <para>
+ The name of the table to attach as a new partition or to detach from this table.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term><replaceable class="PARAMETER">partition_bound_spec</replaceable></term>
+ <listitem>
+ <para>
+ The partition bound specification for a new partition. Refer to
+ <xref linkend="sql-createtable"> for more details on the syntax of the same.
+ </para>
+ </listitem>
+ </varlistentry>
+
</variablelist>
</refsect1>
ADD CONSTRAINT distributors_pkey PRIMARY KEY USING INDEX dist_id_temp_idx;
</programlisting></para>
+ <para>
+ To change the distribution type and the list of nodes where table data
+ is located:
+<programlisting>
+ALTER TABLE distributors TO NODE (dn1, dn7), DISTRIBUTE BY HASH(dist_id);
+</programlisting>
+ </para>
+
+ <para>
+ To add a node where data of table is distributed:
+<programlisting>
+ALTER TABLE distributors ADD NODE (dn9, dn14);
+</programlisting>
+ </para>
+
+ <para>
+ To remove a node where data of table is distributed:
+<programlisting>
+ALTER TABLE distributors DELETE NODE (dn4, dn0);
+</programlisting>
+ </para>
+
+ <para>
+ Attach a partition to range partitioned table:
+ <programlisting>
+ ALTER TABLE measurement
+ ATTACH PARTITION measurement_y2016m07 FOR VALUES FROM ('2016-07-01') TO ('2016-08-01');
+ </programlisting></para>
+
+ <para>
+ Attach a partition to list partitioned table:
+ <programlisting>
+ ALTER TABLE cities
+ ATTACH PARTITION cities_ab FOR VALUES IN ('a', 'b');
+ </programlisting></para>
+
+ <para>
+ Detach a partition from partitioned table:
+ <programlisting>
+ ALTER TABLE cities
+ DETACH PARTITION measurement_y2015m12;
+ </programlisting></para>
+
</refsect1>
<refsect1>
effect can be had using the OID feature.
</para>
</refsect2>
+
+ <refsect2>
+ <title><literal>PARTITION BY</> Clause</title>
+
+ <para>
+ The <literal>PARTITION BY</> clause is a
+ <productname>PostgreSQL</productname> extension.
+ </para>
+ </refsect2>
+
+ <refsect2>
+ <title><literal>PARTITION OF</> Clause</title>
+
+ <para>
+ The <literal>PARTITION OF</> clause is a
+ <productname>PostgreSQL</productname> extension.
+ </para>
+ </refsect2>
+
+ <refsect2>
+ <title><productname>Postgres-XL</> Specifics</title>
+
+ <para>
+ Currently, immutable, stable, volatile functions and nextval are allowed in DEFAULT clause.
+ as <literal>DEFAULT</> values.
+ </para>
+ <para>
+ <literal>PRIMARY KEY</> and foreign key must include the
+ distribution column.
+ </para>
+ <para>
+ <literal>TEMP</> tables and exclusion constraint are not supported
+ yet.
+ </para>
+ <para>
+ </para>
+ <para>
+ In <productname>Postgres-XL</>, OID is maintained locally in each
+ Datanode and Coordinator. The OID value may be inconsistent for rows
+ stored in different Datanodes.
+ </para>
+
+ </refsect2>
</refsect1>
<arg choice="plain"><option>i[mmediate]</option></arg>
</group>
</arg>
+ <arg choice="opt"><option>-W</option></arg>
+ <arg choice="opt"><option>-t</option> <replaceable>seconds</replaceable></arg>
+ <arg choice="opt"><option>-s</option></arg>
<arg choice="opt"><option>-o</option> <replaceable>options</replaceable></arg>
+ <arg>-Z <replaceable>nodeopt</replaceable></arg>
+ <arg choice="opt"><option>-c</option></arg>
</cmdsynopsis>
<cmdsynopsis>
utilities,
also uses the environment variables supported by <application>libpq</>
(see <xref linkend="libpq-envars">).
- For additional server variables, see <xref linkend="app-postgres">.
+ </para>
+
+ <para>
+ For additional variables that affect the server,
+ see <xref linkend="app-postgres">.
</para>
+
+ <para>
+ In <productname>Postgres-XL</>, this command controls individual Coordinator or Datanode.
+ </para>
</refsect1>
</para>
<para>
- In <productname>Postgres-XL</>, <command>pg_resetxlog</command>
+ <command>pg_resetwal</command> works only with servers of the same
+ major version.
+ </para>
++
++ <para>
++ In <productname>Postgres-XL</>, <command>pg_resetwal</command>
+ will only run locally for Coordinators and Datanodes. You should run it
+ for each Coordinator or Datanode manually.
+ </para>
</refsect1>
<refsect1>
The reason for splitting the release notes this way is so that appropriate
subsets can easily be copied into back branches.
-->
+ &release-10;
&release-9.6;
+&release-xl-9.5r1;
&release-9.5;
&release-9.4;
&release-9.3;
<productname>PostgreSQL</> release to a newer one.
</para>
+ <para>
+ Because <productname>Postgres-XL</>'s Coordinators and Datanodes
+ are essentially <productname>PostgreSQL</> servers, you can follw
+ the steps described below to upgrade each of them. Please note
+ that you should do this manually.
+ </para>
+
<para>
- <productname>PostgreSQL</> major versions are represented by the
- first two digit groups of the version number, e.g., 8.4.
- <productname>PostgreSQL</> minor versions are represented by the
- third group of version digits, e.g., 8.4.2 is the second minor
- release of 8.4. Minor releases never change the internal storage
- format and are always compatible with earlier and later minor
- releases of the same major version number, e.g., 8.4.2 is compatible
- with 8.4, 8.4.1 and 8.4.6. To update between compatible versions,
- you simply replace the executables while the server is down and
- restart the server. The data directory remains unchanged —
- minor upgrades are that simple.
+ Current <productname>PostgreSQL</productname> version numbers consist of a
+ major and a minor version number. For example, in the version number 10.1,
+ the 10 is the major version number and the 1 is the minor version number,
+ meaning this would be the first minor release of the major release 10. For
+ releases before <productname>PostgreSQL</productname> version 10.0, version
+ numbers consist of three numbers, for example, 9.5.3. In those cases, the
+ major version consists of the first two digit groups of the version number,
+ e.g., 9.5, and the minor version is the third number, e.g., 3, meaning this
+ would be the third minor release of the major release 9.5.
+ </para>
+
+ <para>
+ Minor releases never change the internal storage format and are always
+ compatible with earlier and later minor releases of the same major version
+ number. For example, version 10.1 is compatible with version 10.0 and
+ version 10.6. Similarly, for example, 9.5.3 is compatible with 9.5.0,
+ 9.5.1, and 9.5.6. To update between compatible versions, you simply
+ replace the executables while the server is down and restart the server.
+ The data directory remains unchanged — minor upgrades are that
+ simple.
</para>
<para>
<para>
The Git mirror can also be reached via the HTTP protocol, if for example
a firewall is blocking access to the Git protocol. Just change the URL
- prefix to <literal>http</>, as in:
+ prefix to <literal>https</>, as in:
<programlisting>
- git clone http://git.postgresql.org/git/postgres-xl.git
-git clone https://git.postgresql.org/git/postgresql.git
++git clone https://git.postgresql.org/git/postgres-xl.git
</programlisting>
The HTTP protocol is less efficient than the Git protocol, so it will be
backend/utils/mb/conversion_procs \
backend/snowball \
include \
- interfaces \
backend/replication/libpqwalreceiver \
+ backend/replication/pgoutput \
fe_utils \
bin \
pl \
top_builddir = ../..
include $(top_builddir)/src/Makefile.global
+ifneq ($(PORTNAME), win32)
+override CFLAGS += $(PTHREAD_CFLAGS)
+endif
+
SUBDIRS = access bootstrap catalog parser commands executor foreign lib libpq \
- main nodes optimizer port postmaster regex replication rewrite \
- statistics storage tcop tsearch utils $(top_builddir)/src/timezone
+ pgxc main nodes optimizer port postmaster regex replication rewrite \
- storage tcop tsearch utils $(top_builddir)/src/timezone $(top_builddir)/src/interfaces/libpq
++ statistics storage tcop tsearch utils $(top_builddir)/src/timezone $(top_builddir)/src/interfaces/libpq
include $(srcdir)/common.mk
endif
endif
-OBJS = $(SUBDIROBJS) $(LOCALOBJS) $(top_builddir)/src/port/libpgport_srv.a \
- $(top_builddir)/src/common/libpgcommon_srv.a
+OBJS = $(SUBDIROBJS) $(LOCALOBJS) \
+ $(top_builddir)/src/port/libpgport_srv.a \
+ $(top_builddir)/src/common/libpgcommon_srv.a \
+ $(top_builddir)/src/interfaces/libpq/fe-connect.o \
+ $(top_builddir)/src/interfaces/libpq/fe-secure.o \
+ $(top_builddir)/src/interfaces/libpq/fe-misc.o \
+ $(top_builddir)/src/interfaces/libpq/fe-protocol3.o \
+ $(top_builddir)/src/interfaces/libpq/fe-protocol2.o \
+ $(top_builddir)/src/interfaces/libpq/fe-exec.o \
+ $(top_builddir)/src/interfaces/libpq/fe-auth.o \
+ $(top_builddir)/src/interfaces/libpq/pqexpbuffer.o \
++ $(top_builddir)/src/interfaces/libpq/fe-auth-scram.o \
+ $(top_builddir)/src/gtm/client/libgtmclient.a \
+ $(top_builddir)/src/gtm/common/libgtm.a \
+ $(top_builddir)/src/gtm/libpq/libpqcomm.a
+
+ifeq ($(with_openssl), yes)
+OBJS += $(top_builddir)/src/interfaces/libpq/fe-secure-openssl.o
+endif
# We put libpgport and libpgcommon into OBJS, so remove it from LIBS; also add
# libldap
ifneq ($(PORTNAME), aix)
postgres: $(OBJS)
- $(CC) $(CFLAGS) $(LDFLAGS) $(LDFLAGS_EX) -L$(top_builddir)/src/gtm/libpg $(export_dynamic) $(call expand_subsys,$^) $(LIBS) -o $@
- $(CC) $(CFLAGS) $(LDFLAGS) $(LDFLAGS_EX) $(export_dynamic) $(call expand_subsys,$^) $(LIBS) $(ICU_LIBS) -o $@
++ $(CC) $(CFLAGS) $(LDFLAGS) $(LDFLAGS_EX) -L$(top_builddir)/src/gtm/libpg $(export_dynamic) $(call expand_subsys,$^) $(LIBS) $(ICU_LIBS) -o $@
endif
endif
* and we'd like to still refer to them via C struct offsets.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* clients and standalone backends are supported here).
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
#include "postgres.h"
#include "access/hash.h"
+ #include "utils/builtins.h"
+#ifdef PGXC
+#include "catalog/pg_type.h"
+#include "utils/builtins.h"
+#include "utils/timestamp.h"
+#include "utils/date.h"
+#include "utils/nabstime.h"
+#endif
+
+ /*
+ * Datatype-specific hash functions.
+ *
+ * These support both hash indexes and hash joins.
+ *
+ * NOTE: some of these are also used by catcache operations, without
+ * any direct connection to hash indexes. Also, the common hash_any
+ * routine is also used by dynahash tables.
+ */
+
/* Note: this is used for both "char" and boolean datatypes */
Datum
hashchar(PG_FUNCTION_ARGS)
/* report the result */
return UInt32GetDatum(c);
}
- case INT2VECTOROID:
- return DirectFunctionCall1(hashint2vector, value);
+
+
+/*
+ * compute_hash()
+ * Generic hash function for all datatypes
+ *
+ * For the types that check the locator explicitly below, the type's hash
+ * function is applied only for LOCATOR_TYPE_HASH; otherwise (e.g. modulo
+ * distribution) the raw value itself is returned as the Datum.  For the
+ * remaining types the hash function is always applied.
+ */
+Datum
+compute_hash(Oid type, Datum value, char locator)
+{
+ int16 tmp16;
+ int32 tmp32;
+ int64 tmp64;
+ Oid tmpoid;
+ char tmpch;
+
+ switch (type)
+ {
+ case INT8OID:
+ /* Fetch the value as int64 first; this gives the added
+ * advantage that a = 8446744073709551359 and
+ * a = 8446744073709551359::int8 both work. */
+ tmp64 = DatumGetInt64(value);
+ if (locator == LOCATOR_TYPE_HASH)
+ return DirectFunctionCall1(hashint8, value);
+ /* NOTE(review): returning an int64 as Datum may truncate on
+ * 32-bit platforms -- confirm intended for modulo locators. */
+ return tmp64;
+ case INT2OID:
+ tmp16 = DatumGetInt16(value);
+ if (locator == LOCATOR_TYPE_HASH)
+ return DirectFunctionCall1(hashint2, tmp16);
+ return tmp16;
+ case OIDOID:
+ tmpoid = DatumGetObjectId(value);
+ if (locator == LOCATOR_TYPE_HASH)
+ return DirectFunctionCall1(hashoid, tmpoid);
+ return tmpoid;
+ case INT4OID:
+ tmp32 = DatumGetInt32(value);
+ if (locator == LOCATOR_TYPE_HASH)
+ return DirectFunctionCall1(hashint4, tmp32);
+ return tmp32;
+ case BOOLOID:
+ tmpch = DatumGetBool(value);
+ if (locator == LOCATOR_TYPE_HASH)
+ return DirectFunctionCall1(hashchar, tmpch);
+ return tmpch;
+
+ /* Types below always go through the hash function, whatever the
+ * locator type. */
+ case CHAROID:
+ return DirectFunctionCall1(hashchar, value);
+ case NAMEOID:
+ return DirectFunctionCall1(hashname, value);
- case INT2VECTOROID:
- return "hashint2vector";
+
+ case VARCHAROID:
+ case TEXTOID:
+ return DirectFunctionCall1(hashtext, value);
+
+ case OIDVECTOROID:
+ return DirectFunctionCall1(hashoidvector, value);
+ case FLOAT4OID:
+ return DirectFunctionCall1(hashfloat4, value);
+ case FLOAT8OID:
+ return DirectFunctionCall1(hashfloat8, value);
+
+ /* abstime/reltime/date reduce to int32 and share hashint4. */
+ case ABSTIMEOID:
+ tmp32 = DatumGetAbsoluteTime(value);
+ if (locator == LOCATOR_TYPE_HASH)
+ return DirectFunctionCall1(hashint4, tmp32);
+ return tmp32;
+ case RELTIMEOID:
+ tmp32 = DatumGetRelativeTime(value);
+ if (locator == LOCATOR_TYPE_HASH)
+ return DirectFunctionCall1(hashint4, tmp32);
+ return tmp32;
+ case CASHOID:
+ return DirectFunctionCall1(hashint8, value);
+
+ case BPCHAROID:
+ return DirectFunctionCall1(hashbpchar, value);
+ case BYTEAOID:
+ return DirectFunctionCall1(hashvarlena, value);
+
+ case DATEOID:
+ tmp32 = DatumGetDateADT(value);
+ if (locator == LOCATOR_TYPE_HASH)
+ return DirectFunctionCall1(hashint4, tmp32);
+ return tmp32;
+ case TIMEOID:
+ return DirectFunctionCall1(time_hash, value);
+ case TIMESTAMPOID:
+ return DirectFunctionCall1(timestamp_hash, value);
+ case TIMESTAMPTZOID:
+ return DirectFunctionCall1(timestamp_hash, value);
+ case INTERVALOID:
+ return DirectFunctionCall1(interval_hash, value);
+ case TIMETZOID:
+ return DirectFunctionCall1(timetz_hash, value);
+
+ case NUMERICOID:
+ return DirectFunctionCall1(hash_numeric, value);
+ default:
+ ereport(ERROR,(errmsg("Unhandled datatype for modulo or hash distribution\n")));
+ }
+ /* Control should not come here. */
+ ereport(ERROR,(errmsg("Unhandled datatype for modulo or hash distribution\n")));
+ /* Keep compiler silent */
+ return (Datum)0;
+}
+
+
+/*
+ * get_compute_hash_function
+ * Return the name of the hash function used for the given type and
+ * locator, or NULL when no function is needed and the raw value is used
+ * directly (the non-LOCATOR_TYPE_HASH cases for integer-like types).
+ *
+ * This mapping must stay in sync with compute_hash() above.
+ */
+char *
+get_compute_hash_function(Oid type, char locator)
+{
+ switch (type)
+ {
+ case INT8OID:
+ if (locator == LOCATOR_TYPE_HASH)
+ return "hashint8";
+ return NULL;
+ case INT2OID:
+ if (locator == LOCATOR_TYPE_HASH)
+ return "hashint2";
+ return NULL;
+ case OIDOID:
+ if (locator == LOCATOR_TYPE_HASH)
+ return "hashoid";
+ return NULL;
+ case DATEOID:
+ case INT4OID:
+ if (locator == LOCATOR_TYPE_HASH)
+ return "hashint4";
+ return NULL;
+ case BOOLOID:
+ if (locator == LOCATOR_TYPE_HASH)
+ return "hashchar";
+ return NULL;
+ /* Types below always need their hash function, whatever the
+ * locator type (matches compute_hash). */
+ case CHAROID:
+ return "hashchar";
+ case NAMEOID:
+ return "hashname";
+ case VARCHAROID:
+ case TEXTOID:
+ return "hashtext";
+ case OIDVECTOROID:
+ return "hashoidvector";
+ case FLOAT4OID:
+ return "hashfloat4";
+ case FLOAT8OID:
+ return "hashfloat8";
+ case RELTIMEOID:
+ case ABSTIMEOID:
+ if (locator == LOCATOR_TYPE_HASH)
+ return "hashint4";
+ return NULL;
+ case CASHOID:
+ return "hashint8";
+ case BPCHAROID:
+ return "hashbpchar";
+ case BYTEAOID:
+ return "hashvarlena";
+ case TIMEOID:
+ return "time_hash";
+ case TIMESTAMPOID:
+ case TIMESTAMPTZOID:
+ return "timestamp_hash";
+ case INTERVALOID:
+ return "interval_hash";
+ case TIMETZOID:
+ return "timetz_hash";
+ case NUMERICOID:
+ return "hash_numeric";
+ default:
+ ereport(ERROR,(errmsg("Unhandled datatype for modulo or hash distribution\n")));
+ }
+
+ /* Keep compiler quiet */
+ return NULL;
+}
+#endif
* for aborts (whether sync or async), since the post-crash assumption would
* be that such transactions failed anyway.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
* src/backend/access/transam/clog.c
*
#include "libpq/pqmq.h"
#include "miscadmin.h"
#include "optimizer/planmain.h"
+ #include "pgstat.h"
+#include "pgxc/pgxcnode.h"
#include "storage/ipc.h"
#include "storage/sinval.h"
#include "storage/spin.h"
# must set a recovery target.
#
# You may set a recovery target either by transactionId, by name,
- # or by timestamp or by barrier. Recovery may either include or exclude the
- # transaction(s) with the recovery target value (ie, stop either
- # just after or just before the given target, respectively). In case of
- # barrier, the recovery stops exactly at that point.
-# by timestamp or by WAL location (LSN). Recovery may either include or
-# exclude the transaction(s) with the recovery target value (ie, stop either
-# just after or just before the given target, respectively).
++# or by timestamp or by WAL location (LSN) or by barrier. Recovery may either
++# include or exclude the transaction(s) with the recovery target value (ie,
++# stop either just after or just before the given target, respectively). In
++# case of barrier, the recovery stops exactly at that point.
#
#
#recovery_target_name = '' # e.g. 'daily backup 2011-01-26'
#
#recovery_target_xid = ''
#
+#recovery_target_barrier = ''
+#
+ #recovery_target_lsn = '' # e.g. '0/70006B8'
+ #
#recovery_target_inclusive = true
#
#
* data across crashes. During database startup, we simply force the
* currently-active page of SUBTRANS to zeroes.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
* src/backend/access/transam/subtrans.c
*
* twophase.c
* Two-phase commit support functions.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
* IDENTIFICATION
* src/backend/access/transam/twophase.c
* varsup.c
* postgres OID & XID variables support routines
*
- * Copyright (c) 2000-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
+ * Copyright (c) 2000-2017, PostgreSQL Global Development Group
*
* IDENTIFICATION
* src/backend/access/transam/varsup.c
*
* See src/backend/access/transam/README for more information.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
*
* IDENTIFICATION
static void AtSubStart_Memory(void);
static void AtSubStart_ResourceOwner(void);
+#ifdef XCP
+static void AtSubCommit_WaitedXids(void);
+static void AtSubAbort_WaitedXids(void);
+static void AtEOXact_WaitedXids(void);
+static void TransactionRecordXidWait_Internal(TransactionState s,
+ TransactionId xid);
+#endif
+
static void ShowTransactionState(const char *str);
- static void ShowTransactionStateRec(TransactionState state);
+ static void ShowTransactionStateRec(const char *str, TransactionState state);
static const char *BlockStateAsString(TBlockState blockState);
static const char *TransStateAsString(TransState state);
+static void PrepareTransaction(void);
+static void AtEOXact_GlobalTxn(bool commit);
/* ----------------------------------------------------------------
{
s->startedInRecovery = false;
XactReadOnly = DefaultXactReadOnly;
+#ifdef PGXC
+ /* Save Postgres-XC session as read-only if necessary */
+ XactReadOnly |= IsPGXCNodeXactReadOnly();
+#endif
}
XactDeferrable = DefaultXactDeferrable;
+#ifdef PGXC
+ /* PGXCTODO - PGXC doesn't support 9.1 serializable transactions. They
+ * are silently turned into repeatable-read, which is the same as the
+ * pre-9.1 serializable isolation level.
+ */
+ if (DefaultXactIsoLevel == XACT_SERIALIZABLE)
+ DefaultXactIsoLevel = XACT_REPEATABLE_READ;
+#endif
XactIsoLevel = DefaultXactIsoLevel;
forceSyncCommit = false;
- MyXactAccessedTempRel = false;
+ XactLocalNodePrepared = false;
+ MyXactFlags = 0;
/*
* reinitialize within-transaction counters
if (!is_parallel_worker)
{
/*
- * We need to mark our XIDs as committed in pg_clog. This is where we
+ * We need to mark our XIDs as committed in pg_xact. This is where we
* durably commit.
*/
- latestXid = RecordTransactionCommit();
+#ifdef XCP
+ latestXid = InvalidTransactionId;
+ if (!IsConnFromDatanode())
+#endif
+ latestXid = RecordTransactionCommit();
}
else
{
#include "catalog/pg_database.h"
#include "commands/tablespace.h"
#include "miscadmin.h"
+#ifdef PGXC
+#include "pgxc/barrier.h"
+#endif
#include "pgstat.h"
+ #include "port/atomics.h"
#include "postmaster/bgwriter.h"
#include "postmaster/walwriter.h"
#include "postmaster/startup.h"
static RecoveryTargetAction recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE;
static TransactionId recoveryTargetXid;
static TimestampTz recoveryTargetTime;
+static char *recoveryTargetBarrierId;
static char *recoveryTargetName;
+ static XLogRecPtr recoveryTargetLSN;
static int recovery_min_apply_delay = 0;
static TimestampTz recoveryDelayUntilTime;
recoveryStopAfter = false;
recoveryStopXid = InvalidTransactionId;
+ recoveryStopLSN = InvalidXLogRecPtr;
+ recoveryStopTime = 0;
+ recoveryStopName[0] = '\0';
+ return true;
+ }
+
+ /* Check if target LSN has been reached */
+ if (recoveryTarget == RECOVERY_TARGET_LSN &&
+ !recoveryTargetInclusive &&
+ record->ReadRecPtr >= recoveryTargetLSN)
+ {
+ recoveryStopAfter = false;
+ recoveryStopXid = InvalidTransactionId;
+ recoveryStopLSN = record->ReadRecPtr;
recoveryStopTime = 0;
recoveryStopName[0] = '\0';
+ ereport(LOG,
+ (errmsg("recovery stopping before WAL location (LSN) \"%X/%X\"",
+ (uint32) (recoveryStopLSN >> 32),
+ (uint32) recoveryStopLSN)));
return true;
}
-
+#ifdef PGXC
+ /* Otherwise we only consider stopping before COMMIT, ABORT or BARRIER records. */
+ if ((XLogRecGetRmid(record) != RM_XACT_ID) && (XLogRecGetRmid(record) != RM_BARRIER_ID))
+#else
/* Otherwise we only consider stopping before COMMIT or ABORT records. */
if (XLogRecGetRmid(record) != RM_XACT_ID)
+#endif
return false;
xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
#include <unistd.h>
+#include "miscadmin.h"
+ #include "access/timeline.h"
#include "access/xlog.h"
#include "access/xlog_internal.h"
#include "access/xlogutils.h"
* routines to support running postgres in 'bootstrap' mode
* bootstrap mode is used to create the initial template database
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
* IDENTIFICATION
* src/backend/bootstrap/bootstrap.c
include $(top_builddir)/src/Makefile.global
OBJS = catalog.o dependency.o heap.o index.o indexing.o namespace.o aclchk.o \
- objectaccess.o objectaddress.o pg_aggregate.o pg_collation.o \
+ objectaccess.o objectaddress.o partition.o pg_aggregate.o pg_collation.o \
pg_constraint.o pg_conversion.o \
pg_depend.o pg_enum.o pg_inherits.o pg_largeobject.o pg_namespace.o \
- pg_operator.o pg_proc.o pg_range.o pg_db_role_setting.o pg_shdepend.o \
- pg_type.o pgxc_class.o storage.o toasting.o
+ pg_operator.o pg_proc.o pg_publication.o pg_range.o \
+ pg_db_role_setting.o pg_shdepend.o pg_subscription.o pg_type.o \
- storage.o toasting.o
++ pgxc_class.o storage.o toasting.o
BKIFILES = postgres.bki postgres.description postgres.shdescription
pg_ts_config.h pg_ts_config_map.h pg_ts_dict.h \
pg_ts_parser.h pg_ts_template.h pg_extension.h \
pg_foreign_data_wrapper.h pg_foreign_server.h pg_user_mapping.h \
+ pgxc_class.h pgxc_node.h pgxc_group.h \
pg_foreign_table.h pg_policy.h pg_replication_origin.h \
pg_default_acl.h pg_init_privs.h pg_seclabel.h pg_shseclabel.h \
- pg_collation.h pg_range.h pg_transform.h \
+ pg_collation.h pg_partitioned_table.h pg_range.h pg_transform.h \
+ pg_sequence.h pg_publication.h pg_publication_rel.h pg_subscription.h \
+ pg_subscription_rel.h toasting.h indexing.h \
toasting.h indexing.h \
)
* bits of hard-wired knowledge
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
#include "catalog/pg_shdepend.h"
#include "catalog/pg_shdescription.h"
#include "catalog/pg_shseclabel.h"
+ #include "catalog/pg_subscription.h"
#include "catalog/pg_tablespace.h"
#include "catalog/toasting.h"
+#include "catalog/pgxc_node.h"
+#include "catalog/pgxc_group.h"
#include "miscadmin.h"
#include "storage/fd.h"
#include "utils/fmgroids.h"
relationId == SharedDependRelationId ||
relationId == SharedSecLabelRelationId ||
relationId == TableSpaceRelationId ||
+#ifdef PGXC
+ relationId == PgxcGroupRelationId ||
+ relationId == PgxcNodeRelationId ||
+#endif
relationId == DbRoleSettingRelationId ||
- relationId == ReplicationOriginRelationId)
+ relationId == ReplicationOriginRelationId ||
+ relationId == SubscriptionRelationId)
return true;
/* These are their indexes (see indexing.h) */
if (relationId == AuthIdRolnameIndexId ||
relationId == SharedSecLabelObjectIndexId ||
relationId == TablespaceOidIndexId ||
relationId == TablespaceNameIndexId ||
+#ifdef PGXC
+ relationId == PgxcNodeNodeNameIndexId ||
+ relationId == PgxcNodeNodeIdIndexId ||
+ relationId == PgxcNodeOidIndexId ||
+ relationId == PgxcGroupGroupNameIndexId ||
+ relationId == PgxcGroupOidIndexId ||
+#endif
relationId == DbRoleSettingDatidRolidIndexId ||
relationId == ReplicationOriginIdentIndex ||
- relationId == ReplicationOriginNameIndex)
+ relationId == ReplicationOriginNameIndex ||
+ relationId == SubscriptionObjectIndexId ||
+ relationId == SubscriptionNameIndexId)
return true;
/* These are their toast tables and toast indexes (see toasting.h) */
if (relationId == PgShdescriptionToastTable ||
* Routines to support inter-object dependencies.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
* IDENTIFICATION
* src/backend/catalog/dependency.c
UserMappingRelationId, /* OCLASS_USER_MAPPING */
DefaultAclRelationId, /* OCLASS_DEFACL */
ExtensionRelationId, /* OCLASS_EXTENSION */
+#ifdef PGXC
+ PgxcClassRelationId, /* OCLASS_PGXCCLASS */
+ PgxcNodeRelationId, /* OCLASS_PGXC_NODE */
+ PgxcGroupRelationId, /* OCLASS_PGXC_GROUP */
+#endif
EventTriggerRelationId, /* OCLASS_EVENT_TRIGGER */
PolicyRelationId, /* OCLASS_POLICY */
+ PublicationRelationId, /* OCLASS_PUBLICATION */
+ PublicationRelRelationId, /* OCLASS_PUBLICATION_REL */
+ SubscriptionRelationId, /* OCLASS_SUBSCRIPTION */
TransformRelationId /* OCLASS_TRANSFORM */
};
-
static void findDependentObjects(const ObjectAddress *object,
+ int objflags,
int flags,
ObjectAddressStack *stack,
ObjectAddresses *targetObjects,
heap_close(depRel, RowExclusiveLock);
}
- /*
- * deleteWhatDependsOn: attempt to drop everything that depends on the
- * specified object, though not the object itself. Behavior is always
- * CASCADE.
- *
- * This is currently used only to clean out the contents of a schema
- * (namespace): the passed object is a namespace. We normally want this
- * to be done silently, so there's an option to suppress NOTICE messages.
- *
- * Note we don't fire object drop event triggers here; it would be wrong to do
- * so for the current only use of this function, but if more callers are added
- * this might need to be reconsidered.
- */
- void
- deleteWhatDependsOn(const ObjectAddress *object,
- bool showNotices)
- {
- Relation depRel;
- ObjectAddresses *targetObjects;
- int i;
-
- /*
- * We save some cycles by opening pg_depend just once and passing the
- * Relation pointer down to all the recursive deletion steps.
- */
- depRel = heap_open(DependRelationId, RowExclusiveLock);
-
- /*
- * Acquire deletion lock on the target object. (Ideally the caller has
- * done this already, but many places are sloppy about it.)
- */
- AcquireDeletionLock(object, 0);
-
- /*
- * Construct a list of objects to delete (ie, the given object plus
- * everything directly or indirectly dependent on it).
- */
- targetObjects = new_object_addresses();
-
- findDependentObjects(object,
- DEPFLAG_ORIGINAL,
- NULL, /* empty stack */
- targetObjects,
- NULL, /* no pendingObjects */
- &depRel);
-
- /*
- * Check if deletion is allowed, and report about cascaded deletes.
- */
- reportDependentObjects(targetObjects,
- DROP_CASCADE,
- showNotices ? NOTICE : DEBUG2,
- object);
-
- /*
- * Delete all the objects in the proper order, except we skip the original
- * object.
- */
- for (i = 0; i < targetObjects->numrefs; i++)
- {
- ObjectAddress *thisobj = targetObjects->refs + i;
- ObjectAddressExtra *thisextra = targetObjects->extras + i;
-
- if (thisextra->flags & DEPFLAG_ORIGINAL)
- continue;
-
- /*
- * Since this function is currently only used to clean out temporary
- * schemas, we pass PERFORM_DELETION_INTERNAL here, indicating that
- * the operation is an automatic system operation rather than a user
- * action. If, in the future, this function is used for other
- * purposes, we might need to revisit this.
- */
- deleteOneObject(thisobj, &depRel, PERFORM_DELETION_INTERNAL);
- }
-
- /* And clean up */
- free_object_addresses(targetObjects);
-
- heap_close(depRel, RowExclusiveLock);
- }
-
+#ifdef PGXC
+/*
+ * doRename
+ * Check the type and class of the given dependent object and, when it is
+ * a sequence affected by the rename, rename it on GTM as well.
+ */
+static void
+doRename(const ObjectAddress *object, const char *oldname, const char *newname)
+{
+ switch (getObjectClass(object))
+ {
+ case OCLASS_CLASS:
+ {
+ char relKind = get_rel_relkind(object->objectId);
+
+ /*
+ * If we are here, a schema is being renamed and a sequence
+ * depends on it.  As sequences' global names use the schema
+ * name, this sequence also has to be renamed on GTM.
+ * GTM interaction is done only from the local Coordinator.
+ */
+ if (relKind == RELKIND_SEQUENCE &&
+ IS_PGXC_LOCAL_COORDINATOR)
+ {
+ Relation relseq = relation_open(object->objectId, AccessShareLock);
+ char *seqname = GetGlobalSeqName(relseq, NULL, oldname);
+ char *newseqname = GetGlobalSeqName(relseq, NULL, newname);
+
+ /* We also need to rename this sequence on GTM, it has a global name ! */
+ if (RenameSequenceGTM(seqname, newseqname) < 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_CONNECTION_FAILURE),
+ errmsg("GTM error, could not rename sequence")));
+
+
+ pfree(seqname);
+ pfree(newseqname);
+
+ relation_close(relseq, AccessShareLock);
+ }
+ /* Explicit break: do not fall through into default */
+ break;
+ }
+ default:
+ /* Nothing to do, this object does not have to be renamed. */
+ break;
+ }
+}
+
+/*
+ * performRename: used to rename objects
+ * on GTM depending on another object(s)
+ */
+void
+performRename(const ObjectAddress *object, const char *oldname, const char *newname)
+{
+ Relation depRel;
+ ObjectAddresses *targetObjects;
+ int i;
+
+ /*
+ * Check the dependencies on this object
+ * And rename object dependent if necessary
+ */
+
+ depRel = heap_open(DependRelationId, RowExclusiveLock);
+
+ targetObjects = new_object_addresses();
+
+ findDependentObjects(object,
+ DEPFLAG_ORIGINAL,
++ 0, /* XXX seems like flags are only used while
++ dropping objects */
+ NULL, /* empty stack */
+ targetObjects,
+ NULL,
+ &depRel);
+
+ /* Check Objects one by one to see if some of them have to be renamed on GTM */
+ for (i = 0; i < targetObjects->numrefs; i++)
+ {
+ ObjectAddress *thisobj = targetObjects->refs + i;
+ doRename(thisobj, oldname, newname);
+ }
+
+ /* And clean up */
+ free_object_addresses(targetObjects);
+
+ heap_close(depRel, RowExclusiveLock);
+}
+#endif
+
/*
* findDependentObjects - find all objects that depend on 'object'
*
heap_drop_with_catalog(object->objectId);
}
+ /*
+ * for a sequence, in addition to dropping the heap, also
+ * delete pg_sequence tuple
+ */
+ if (relKind == RELKIND_SEQUENCE)
+ DeleteSequenceTuple(object->objectId);
+#ifdef PGXC
+ /*
+ * Do not do extra process if this session is connected to a remote
+ * Coordinator.
+ */
+ if (IsConnFromCoord())
+ break;
+
+ /*
+ * This session is connected directly to application, so extra
+ * process related to remote nodes and GTM is needed.
+ */
+ switch (relKind)
+ {
+ case RELKIND_SEQUENCE:
+ /*
+ * Drop the sequence on GTM.
+ * Sequence is dropped on GTM by a remote Coordinator only
+ * for a non temporary sequence.
+ */
+ {
+ /*
+ * The sequence has already been removed from Coordinator,
+ * finish the stuff on GTM too
+ */
+
+ Relation relseq;
+ char *seqname;
+ /*
+ * Open the relation to get the schema and database names, as
+ * that information is not otherwise available at this point.
+ */
+ relseq = relation_open(object->objectId, AccessShareLock);
+ seqname = GetGlobalSeqName(relseq, NULL, NULL);
+ DropSequenceGTM(seqname, GTM_SEQ_FULL_NAME);
+ pfree(seqname);
+
+ /* Then close the relation opened previously */
+ relation_close(relseq, AccessShareLock);
+ }
+ break;
+ case RELKIND_RELATION:
+ case RELKIND_VIEW:
+ break;
+ default:
+ break;
+ }
+#endif /* PGXC */
break;
}
DropTransformById(object->objectId);
break;
- default:
- elog(ERROR, "unrecognized object class: %u",
- object->classId);
+ /*
+ * These global object types are not supported here.
+ */
+ case OCLASS_ROLE:
+ case OCLASS_DATABASE:
+ case OCLASS_TBLSPACE:
+ case OCLASS_SUBSCRIPTION:
++ case OCLASS_PGXC_NODE:
++ case OCLASS_PGXC_GROUP:
+ elog(ERROR, "global objects cannot be deleted by doDeletion");
+ break;
+
+ /*
+ * There's intentionally no default: case here; we want the
+ * compiler to warn if a new OCLASS hasn't been handled above.
+ */
++
}
}
* heap.c
* code to create and destroy POSTGRES heap relations
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
*
* IDENTIFICATION
static FormData_pg_attribute a7 = {
0, {"tableoid"}, OIDOID, 0, sizeof(Oid),
TableOidAttributeNumber, 0, -1, -1,
- true, 'p', 'i', true, false, false, true, 0
+ true, 'p', 'i', true, false, '\0', false, true, 0
};
+#ifdef PGXC
+/*
+ * In XC we need some sort of node identification for each tuple.
+ * We are adding another system column that serves as the node identifier.
+ * This is not only required by WHERE CURRENT OF, but can be used anywhere
+ * we want to know the originating Datanode of a tuple received at the
+ * Coordinator.
+ */
+static FormData_pg_attribute a8 = {
+ 0, {"xc_node_id"}, INT4OID, 0, sizeof(int32),
+ XC_NodeIdAttributeNumber, 0, -1, -1,
+ true, 'p', 'i', true, false, false, true, 0
+};
+
+static const Form_pg_attribute SysAtt[] = {&a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8};
+#else
static const Form_pg_attribute SysAtt[] = {&a1, &a2, &a3, &a4, &a5, &a6, &a7};
+#endif
/*
* This function returns a Form_pg_attribute pointer for a system attribute.
* and implementing search-path-controlled searches.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
static void NamespaceCallback(Datum arg, int cacheid, uint32 hashvalue);
static bool MatchNamedCall(HeapTuple proctup, int nargs, List *argnames,
int **argnumbers);
+#ifdef XCP
+static void FindTemporaryNamespace(void);
+#endif
- /* These don't really need to appear in any header file */
- Datum pg_table_is_visible(PG_FUNCTION_ARGS);
- Datum pg_type_is_visible(PG_FUNCTION_ARGS);
- Datum pg_function_is_visible(PG_FUNCTION_ARGS);
- Datum pg_operator_is_visible(PG_FUNCTION_ARGS);
- Datum pg_opclass_is_visible(PG_FUNCTION_ARGS);
- Datum pg_opfamily_is_visible(PG_FUNCTION_ARGS);
- Datum pg_collation_is_visible(PG_FUNCTION_ARGS);
- Datum pg_conversion_is_visible(PG_FUNCTION_ARGS);
- Datum pg_ts_parser_is_visible(PG_FUNCTION_ARGS);
- Datum pg_ts_dict_is_visible(PG_FUNCTION_ARGS);
- Datum pg_ts_template_is_visible(PG_FUNCTION_ARGS);
- Datum pg_ts_config_is_visible(PG_FUNCTION_ARGS);
- Datum pg_my_temp_schema(PG_FUNCTION_ARGS);
- Datum pg_is_other_temp_schema(PG_FUNCTION_ARGS);
-
/*
* RangeVarGetRelid
break;
}
- default:
- appendStringInfo(&buffer, "unrecognized object %u %u %d",
- object->classId,
- object->objectId,
- object->objectSubId);
- break;
+ case OCLASS_SUBSCRIPTION:
+ {
+ appendStringInfo(&buffer, _("subscription %s"),
+ get_subscription_name(object->objectId));
+ break;
+ }
+
+ case OCLASS_TRANSFORM:
+ {
+ HeapTuple trfTup;
+ Form_pg_transform trfForm;
+
+ trfTup = SearchSysCache1(TRFOID,
+ ObjectIdGetDatum(object->objectId));
+ if (!HeapTupleIsValid(trfTup))
+ elog(ERROR, "could not find tuple for transform %u",
+ object->objectId);
+
+ trfForm = (Form_pg_transform) GETSTRUCT(trfTup);
+
+ appendStringInfo(&buffer, _("transform for %s language %s"),
+ format_type_be(trfForm->trftype),
+ get_language_name(trfForm->trflang, false));
+
+ ReleaseSysCache(trfTup);
+ break;
+ }
+
++ case OCLASS_PGXC_NODE:
++ {
++ appendStringInfo(&buffer, _("node %s"),
++ get_pgxc_nodename(object->objectId));
++ break;
++ }
++
++ case OCLASS_PGXC_GROUP:
++ {
++ appendStringInfo(&buffer, _("node group %s"),
++ get_pgxc_groupname(object->objectId));
++ break;
++ }
++
+ /*
+ * There's intentionally no default: case here; we want the
+ * compiler to warn if a new OCLASS hasn't been handled above.
+ */
}
return buffer.data;
appendStringInfoString(&buffer, "transform");
break;
- case OCLASS_AM:
- appendStringInfoString(&buffer, "access method");
++ case OCLASS_PGXC_CLASS:
++ appendStringInfoString(&buffer, "pgxc_class");
+ break;
+
- default:
- appendStringInfo(&buffer, "unrecognized %u", object->classId);
++ case OCLASS_PGXC_NODE:
++ appendStringInfoString(&buffer, "node");
++ break;
++
++ case OCLASS_PGXC_GROUP:
++ appendStringInfoString(&buffer, "node group");
+ break;
++
+ /*
+ * There's intentionally no default: case here; we want the
+ * compiler to warn if a new OCLASS hasn't been handled above.
+ */
}
return buffer.data;
heap_close(transformDesc, AccessShareLock);
}
break;
- case OCLASS_AM:
++
++ case OCLASS_PGXC_CLASS:
++ /*
++ * XXX PG10MERGE: ISTM that we don't record dependencies on
++ * pgxc_class, pgxc_node and pgxc_group. So it's not clear if we
++ * really need corresponding OCLASS_* either. We should check this
++ * in more detail.
++ */
++ break;
+
- char *amname;
++ case OCLASS_PGXC_NODE:
+ {
- amname = get_am_name(object->objectId);
- if (!amname)
- elog(ERROR, "cache lookup failed for access method %u",
- object->objectId);
- appendStringInfoString(&buffer, quote_identifier(amname));
++ char *nodename;
+
- *objname = list_make1(amname);
++ nodename = get_pgxc_nodename(object->objectId);
+ if (objname)
- break;
++ *objname = list_make1(nodename);
++ appendStringInfoString(&buffer,
++ quote_identifier(nodename));
++ break;
+ }
- default:
- appendStringInfo(&buffer, "unrecognized object %u %u %d",
- object->classId,
- object->objectId,
- object->objectSubId);
- break;
+
++ case OCLASS_PGXC_GROUP:
++ {
++ char *groupname;
++
++ groupname = get_pgxc_groupname(object->objectId);
++ if (objname)
++ *objname = list_make1(groupname);
++ appendStringInfoString(&buffer,
++ quote_identifier(groupname));
++ break;
++ }
+
+ /*
+ * There's intentionally no default: case here; we want the
+ * compiler to warn if a new OCLASS hasn't been handled above.
+ */
}
/*
* pg_proc.c
* routines to support manipulation of the pg_proc relation
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
#include "utils/acl.h"
#include "utils/builtins.h"
#include "utils/lsyscache.h"
+ #include "utils/regproc.h"
#include "utils/rel.h"
#include "utils/syscache.h"
+#ifdef PGXC
+#include "pgxc/execRemote.h"
+#include "pgxc/pgxc.h"
+#include "pgxc/planner.h"
+#endif
- Datum fmgr_internal_validator(PG_FUNCTION_ARGS);
- Datum fmgr_c_validator(PG_FUNCTION_ARGS);
- Datum fmgr_sql_validator(PG_FUNCTION_ARGS);
-
typedef struct
{
char *proname;
querytree_list = NIL;
foreach(lc, raw_parsetree_list)
{
- Node *parsetree = (Node *) lfirst(lc);
+ RawStmt *parsetree = lfirst_node(RawStmt, lc);
List *querytree_sublist;
+#ifdef PGXC
+ /* Block CTAS in SQL functions */
+ if (IsA(parsetree, CreateTableAsStmt))
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("In XC, SQL functions cannot contain utility statements")));
+#endif
+
querytree_sublist = pg_analyze_and_rewrite_params(parsetree,
prosrc,
(ParserSetupHook) sql_fn_parser_setup,
--- /dev/null
- (void) simple_heap_insert(pgxcclassrel, htup);
-
- CatalogUpdateIndexes(pgxcclassrel, htup);
+/*-------------------------------------------------------------------------
+ *
+ * pgxc_class.c
+ * routines to support manipulation of the pgxc_class relation
+ *
+ * Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
+ *
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/heapam.h"
+#include "access/htup_details.h"
+#include "catalog/dependency.h"
+#include "catalog/indexing.h"
+#include "catalog/namespace.h"
+#include "catalog/pg_type.h"
+#include "catalog/pgxc_class.h"
+#include "utils/builtins.h"
+#include "utils/rel.h"
+#include "utils/syscache.h"
+#include "pgxc/locator.h"
+#include "utils/array.h"
+
+/*
+ * PgxcClassCreate
+ * Create a pgxc_class entry
+ *
+ * pcrelid - OID of the relation the entry describes; must be valid.
+ * pclocatortype - distribution (locator) type stored for the relation.
+ * pcattnum, pchashalgorithm, pchashbuckets - only stored when the
+ * locator type is HASH or MODULO; left as zero datums otherwise.
+ * nodes / numnodes - OIDs of the nodes the relation is distributed on,
+ * stored in the catalog as an oidvector.
+ */
+void
+PgxcClassCreate(Oid pcrelid,
+ char pclocatortype,
+ int pcattnum,
+ int pchashalgorithm,
+ int pchashbuckets,
+ int numnodes,
+ Oid *nodes)
+{
+ Relation pgxcclassrel;
+ HeapTuple htup;
+ bool nulls[Natts_pgxc_class];
+ Datum values[Natts_pgxc_class];
+ int i;
+ oidvector *nodes_array;
+
+ /* Build array of Oids to be inserted */
+ nodes_array = buildoidvector(nodes, numnodes);
+
+ /* Iterate through attributes initializing nulls and values */
+ for (i = 0; i < Natts_pgxc_class; i++)
+ {
+ nulls[i] = false;
+ values[i] = (Datum) 0;
+ }
+
+ /* should not happen */
+ if (pcrelid == InvalidOid)
+ {
+ elog(ERROR,"pgxc class relid invalid.");
+ /* NOTE(review): elog(ERROR) does not return, so this return is unreachable */
+ return;
+ }
+
+ values[Anum_pgxc_class_pcrelid - 1] = ObjectIdGetDatum(pcrelid);
+ values[Anum_pgxc_class_pclocatortype - 1] = CharGetDatum(pclocatortype);
+
+ /* Distribution-column info is only meaningful for HASH/MODULO locators */
+ if (pclocatortype == LOCATOR_TYPE_HASH || pclocatortype == LOCATOR_TYPE_MODULO)
+ {
+ values[Anum_pgxc_class_pcattnum - 1] = UInt16GetDatum(pcattnum);
+ values[Anum_pgxc_class_pchashalgorithm - 1] = UInt16GetDatum(pchashalgorithm);
+ values[Anum_pgxc_class_pchashbuckets - 1] = UInt16GetDatum(pchashbuckets);
+ }
+
+ /* Node information */
+ values[Anum_pgxc_class_nodes - 1] = PointerGetDatum(nodes_array);
+
+ /* Open the relation for insertion */
+ pgxcclassrel = heap_open(PgxcClassRelationId, RowExclusiveLock);
+
+ htup = heap_form_tuple(pgxcclassrel->rd_att, values, nulls);
+
+ /*
+ * NOTE(review): the two removed lines below reference rel/oldtup/newtup,
+ * which belong to PgxcClassAlter's hunk — presumably merge-conflict
+ * spillover; verify the resolved patch applies cleanly.
+ */
- simple_heap_update(rel, &oldtup->t_self, newtup);
- CatalogUpdateIndexes(rel, newtup);
++ CatalogTupleInsert(pgxcclassrel, htup);
+
+ heap_close(pgxcclassrel, RowExclusiveLock);
+}
+
+
+/*
+ * PgxcClassAlter
+ * Modify a pgxc_class entry with given data
+ *
+ * Which catalog fields get replaced is driven by 'type':
+ * PGXC_CLASS_ALTER_DISTRIBUTION - locator type and distribution-column
+ * fields only;
+ * PGXC_CLASS_ALTER_NODES - node list only;
+ * PGXC_CLASS_ALTER_ALL (and any unrecognized value, via the fall-through
+ * default) - every field.
+ * Remaining parameters supply the new values for whichever fields are
+ * selected for replacement.
+ */
+void
+PgxcClassAlter(Oid pcrelid,
+ char pclocatortype,
+ int pcattnum,
+ int pchashalgorithm,
+ int pchashbuckets,
+ int numnodes,
+ Oid *nodes,
+ PgxcClassAlterType type)
+{
+ Relation rel;
+ HeapTuple oldtup, newtup;
+ oidvector *nodes_array;
+ Datum new_record[Natts_pgxc_class];
+ bool new_record_nulls[Natts_pgxc_class];
+ bool new_record_repl[Natts_pgxc_class];
+
+ Assert(OidIsValid(pcrelid));
+
+ /* oldtup is a palloc'd copy, safe to scribble on / pass to heap_modify_tuple */
+ rel = heap_open(PgxcClassRelationId, RowExclusiveLock);
+ oldtup = SearchSysCacheCopy1(PGXCCLASSRELID,
+ ObjectIdGetDatum(pcrelid));
+
+ if (!HeapTupleIsValid(oldtup)) /* should not happen */
+ elog(ERROR, "cache lookup failed for pgxc_class %u", pcrelid);
+
+ /*
+ * Build array of Oids to be inserted.
+ * NOTE(review): built unconditionally, even for operation types that do
+ * not replace the node list; the unused array is just leaked into the
+ * current memory context.
+ */
+ nodes_array = buildoidvector(nodes, numnodes);
+
+ /* Initialize fields */
+ MemSet(new_record, 0, sizeof(new_record));
+ MemSet(new_record_nulls, false, sizeof(new_record_nulls));
+ MemSet(new_record_repl, false, sizeof(new_record_repl));
+
+ /* Fields are updated depending on operation type */
+ switch (type)
+ {
+ case PGXC_CLASS_ALTER_DISTRIBUTION:
+ new_record_repl[Anum_pgxc_class_pclocatortype - 1] = true;
+ new_record_repl[Anum_pgxc_class_pcattnum - 1] = true;
+ new_record_repl[Anum_pgxc_class_pchashalgorithm - 1] = true;
+ new_record_repl[Anum_pgxc_class_pchashbuckets - 1] = true;
+ break;
+ case PGXC_CLASS_ALTER_NODES:
+ new_record_repl[Anum_pgxc_class_nodes - 1] = true;
+ break;
+ case PGXC_CLASS_ALTER_ALL:
+ default:
+ new_record_repl[Anum_pgxc_class_pcrelid - 1] = true;
+ new_record_repl[Anum_pgxc_class_pclocatortype - 1] = true;
+ new_record_repl[Anum_pgxc_class_pcattnum - 1] = true;
+ new_record_repl[Anum_pgxc_class_pchashalgorithm - 1] = true;
+ new_record_repl[Anum_pgxc_class_pchashbuckets - 1] = true;
+ new_record_repl[Anum_pgxc_class_nodes - 1] = true;
+ }
+
+ /* Set up new fields */
+ /* Relation Oid */
+ if (new_record_repl[Anum_pgxc_class_pcrelid - 1])
+ new_record[Anum_pgxc_class_pcrelid - 1] = ObjectIdGetDatum(pcrelid);
+
+ /* Locator type */
+ if (new_record_repl[Anum_pgxc_class_pclocatortype - 1])
+ new_record[Anum_pgxc_class_pclocatortype - 1] = CharGetDatum(pclocatortype);
+
+ /* Attribute number of distribution column */
+ if (new_record_repl[Anum_pgxc_class_pcattnum - 1])
+ new_record[Anum_pgxc_class_pcattnum - 1] = UInt16GetDatum(pcattnum);
+
+ /* Hash algorithm type */
+ if (new_record_repl[Anum_pgxc_class_pchashalgorithm - 1])
+ new_record[Anum_pgxc_class_pchashalgorithm - 1] = UInt16GetDatum(pchashalgorithm);
+
+ /* Hash buckets */
+ if (new_record_repl[Anum_pgxc_class_pchashbuckets - 1])
+ new_record[Anum_pgxc_class_pchashbuckets - 1] = UInt16GetDatum(pchashbuckets);
+
+ /* Node information */
+ if (new_record_repl[Anum_pgxc_class_nodes - 1])
+ new_record[Anum_pgxc_class_nodes - 1] = PointerGetDatum(nodes_array);
+
+ /* Update relation */
+ newtup = heap_modify_tuple(oldtup, RelationGetDescr(rel),
+ new_record,
+ new_record_nulls, new_record_repl);
++ CatalogTupleUpdate(rel, &oldtup->t_self, newtup);
+
+ heap_close(rel, RowExclusiveLock);
+}
+
+/*
+ * RemovePgxcClass()
+ * Remove extended PGXC information for the relation with OID 'pcrelid'.
+ *
+ * Deletes the relation's pgxc_class tuple; errors out if no such
+ * tuple exists (callers are expected to pass a distributed relation).
+ */
+void
+RemovePgxcClass(Oid pcrelid)
+{
+ Relation relation;
+ HeapTuple tup;
+
+ /*
+ * Delete the pgxc_class tuple.
+ */
+ relation = heap_open(PgxcClassRelationId, RowExclusiveLock);
+ /* NOTE(review): old-style call; SearchSysCache1() is the modern idiom */
+ tup = SearchSysCache(PGXCCLASSRELID,
+ ObjectIdGetDatum(pcrelid),
+ 0, 0, 0);
+
+ if (!HeapTupleIsValid(tup)) /* should not happen */
+ elog(ERROR, "cache lookup failed for pgxc_class %u", pcrelid);
+
+ simple_heap_delete(relation, &tup->t_self);
+
+ ReleaseSysCache(tup);
+
+ heap_close(relation, RowExclusiveLock);
+}
* storage.c
* code to create and destroy physical storage for relations
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* analyze.c
* the Postgres statistics generator
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
*
* PostgreSQL object comments utility code.
*
- * Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
+ * Copyright (c) 1996-2017, PostgreSQL Global Development Group
*
* IDENTIFICATION
* src/backend/commands/comment.c
* copy.c
* Implements the COPY utility command
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
#include "miscadmin.h"
#include "optimizer/clauses.h"
#include "optimizer/planner.h"
+#ifdef PGXC
+#include "pgxc/pgxc.h"
+#include "pgxc/execRemote.h"
+#include "pgxc/locator.h"
+#include "pgxc/remotecopy.h"
+#include "nodes/nodes.h"
+#include "pgxc/poolmgr.h"
+#include "catalog/pgxc_node.h"
+#endif
#include "nodes/makefuncs.h"
+#include "optimizer/pgxcship.h"
+ #include "parser/parse_relation.h"
#include "rewrite/rewriteHandler.h"
#include "storage/fd.h"
#include "tcop/tcopprot.h"
{
COPY_FILE, /* to/from file (or a piped program) */
COPY_OLD_FE, /* to/from frontend (2.0 protocol) */
- COPY_NEW_FE /* to/from frontend (3.0 protocol) */
- COPY_NEW_FE, /* to/from frontend (3.0 protocol) */
++ COPY_NEW_FE, /* to/from frontend (3.0 protocol) */
+#ifdef PGXC
- ,COPY_BUFFER /* Do not send, just prepare */
++ COPY_BUFFER, /* Do not send, just prepare */
+#endif
+ COPY_CALLBACK /* to/from callback function */
} CopyDest;
/*
/* Dump the accumulated row as one CopyData message */
(void) pq_putmessage('d', fe_msgbuf->data, fe_msgbuf->len);
break;
+#ifdef PGXC
+ case COPY_BUFFER:
+ /* Do not send anywhere yet, just return */
+ return;
+#endif
+ case COPY_CALLBACK:
+ Assert(false); /* Not yet supported. */
+ break;
}
resetStringInfo(fe_msgbuf);
bytesread += avail;
}
break;
+#ifdef PGXC
+ case COPY_BUFFER:
+ elog(ERROR, "COPY_BUFFER not allowed in this context");
+ break;
+#endif
+ case COPY_CALLBACK:
+ bytesread = cstate->data_source_cb(databuf, minread, maxread);
+ break;
}
return bytesread;
PreventCommandIfReadOnly("COPY FROM");
PreventCommandIfParallelMode("COPY FROM");
- cstate = BeginCopyFrom(rel, stmt->filename, stmt->is_program,
- stmt->attlist, stmt->options);
- cstate->range_table = range_table;
+ cstate = BeginCopyFrom(pstate, rel, stmt->filename, stmt->is_program,
+ NULL, stmt->attlist, stmt->options);
*processed = CopyFrom(cstate); /* copy from file to database */
+#ifdef XCP
+ /*
+ * We should record insert to distributed table.
+ * Bulk inserts into local tables are recorded when heap tuples are
+ * written.
+ */
+ if (IS_PGXC_COORDINATOR && rel->rd_locator_info)
+ pgstat_count_remote_insert(rel, (int) *processed);
+#endif
EndCopyFrom(cstate);
}
else
(errcode(ERRCODE_UNDEFINED_COLUMN),
errmsg("table \"%s\" does not have OIDs",
RelationGetRelationName(cstate->rel))));
+
+ /* Initialize state for CopyFrom tuple routing. */
+ if (is_from && rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
+ {
+ PartitionDispatch *partition_dispatch_info;
+ ResultRelInfo *partitions;
+ TupleConversionMap **partition_tupconv_maps;
+ TupleTableSlot *partition_tuple_slot;
+ int num_parted,
+ num_partitions;
+
+ ExecSetupPartitionTupleRouting(rel,
+ &partition_dispatch_info,
+ &partitions,
+ &partition_tupconv_maps,
+ &partition_tuple_slot,
+ &num_parted, &num_partitions);
+ cstate->partition_dispatch_info = partition_dispatch_info;
+ cstate->num_dispatch = num_parted;
+ cstate->partitions = partitions;
+ cstate->num_partitions = num_partitions;
+ cstate->partition_tupconv_maps = partition_tupconv_maps;
+ cstate->partition_tuple_slot = partition_tuple_slot;
+ }
+#ifdef PGXC
+ /* Get copy statement and execution node information */
+ if (IS_PGXC_COORDINATOR)
+ {
+ RemoteCopyData *remoteCopyState = (RemoteCopyData *) palloc0(sizeof(RemoteCopyData));
+ List *attnums = CopyGetAttnums(tupDesc, cstate->rel, attnamelist);
+
+ /* Setup correct COPY FROM/TO flag */
+ remoteCopyState->is_from = is_from;
+
+ /* Get execution node list */
+ RemoteCopy_GetRelationLoc(remoteCopyState,
+ cstate->rel,
+ attnums);
+ /* Build remote query */
+ RemoteCopy_BuildStatement(remoteCopyState,
+ cstate->rel,
+ GetRemoteCopyOptions(cstate),
+ attnamelist,
+ attnums);
+
+ /* Then assign built structure */
+ cstate->remoteCopyState = remoteCopyState;
+ }
+#endif
}
else
{
errmsg("multi-statement DO INSTEAD rules are not supported for COPY")));
}
- query = (Query *) linitial(rewritten);
+ query = linitial_node(Query, rewritten);
- /* The grammar allows SELECT INTO, but we don't support that */
- if (query->utilityStmt != NULL &&
- IsA(query->utilityStmt, CreateTableAsStmt))
- ereport(ERROR,
- (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
- errmsg("COPY (SELECT INTO) is not supported")));
-
Assert(query->utilityStmt == NULL);
/*
* tuples inserted by an INSERT command.
*/
processed++;
+
+ if (saved_resultRelInfo)
+ {
+ resultRelInfo = saved_resultRelInfo;
+ estate->es_result_relation_info = resultRelInfo;
+ }
}
+#ifdef PGXC
+ }
+#endif
}
/* Flush any remaining buffered tuples */
Assert(CurrentMemoryContext == econtext->ecxt_per_tuple_memory);
values[defmap[i]] = ExecEvalExpr(defexprs[i], econtext,
- &nulls[defmap[i]], NULL);
+ &nulls[defmap[i]]);
}
+#ifdef PGXC
+ if (IS_PGXC_COORDINATOR)
+ {
+ /* Append default values to the data-row in output format. */
+ append_defvals(values, cstate);
+ }
+#endif
+
return true;
}
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("option \"%s\" cannot be specified with other options",
- dtablespace->defname)));
+ dtablespace->defname),
+ parser_errposition(pstate, dtablespace->location)));
/* this case isn't allowed within a transaction block */
- PreventTransactionChain(isTopLevel, "ALTER DATABASE SET TABLESPACE");
+#ifdef PGXC
+ /* ... but we allow it on remote nodes */
+ if (IS_PGXC_LOCAL_COORDINATOR)
+#endif
+ PreventTransactionChain(isTopLevel, "ALTER DATABASE SET TABLESPACE");
+
movedb(stmt->dbname, defGetString(dtablespace));
return InvalidOid;
}
case OCLASS_USER_MAPPING:
case OCLASS_DEFACL:
case OCLASS_EXTENSION:
+#ifdef PGXC
+ case OCLASS_PGXC_CLASS:
+ case OCLASS_PGXC_NODE:
+ case OCLASS_PGXC_GROUP:
+#endif
case OCLASS_POLICY:
- case OCLASS_AM:
+ case OCLASS_PUBLICATION:
+ case OCLASS_PUBLICATION_REL:
+ case OCLASS_SUBSCRIPTION:
+ case OCLASS_TRANSFORM:
return true;
+
+ /*
+ * There's intentionally no default: case here; we want the
+ * compiler to warn if a new OCLASS hasn't been handled above.
+ */
}
- return true;
+ /* Shouldn't get here, but if we do, say "no support" */
+ return false;
}
bool
* explain.c
* Explain query execution plans
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994-5, Regents of the University of California
*
* IDENTIFICATION
/* planner will not cope with utility statements */
if (query->commandType == CMD_UTILITY)
{
- ExplainOneUtility(query->utilityStmt, into, es, queryString, params,
- queryEnv);
+ /*
+ * If we are running EXPLAIN ANALYZE, transform the CTAS such that the
+ * target table is created first and select result is inserted into the
+ * table. The EXPLAIN ANALYZE would really just show the plan for the
+ * INSERT INTO generated by QueryRewriteCTAS, but that's OK.
+ */
+ if (es->analyze && IsA(query->utilityStmt, CreateTableAsStmt))
+ {
+ List *rewritten = QueryRewriteCTAS(query);
+ Assert(list_length(rewritten) == 1);
- ExplainOneQuery((Query *) linitial(rewritten), into, es,
- queryString, params);
++ ExplainOneQuery((Query *) linitial(rewritten), cursorOptions,
++ into, es, queryString, params, queryEnv);
+ }
+ else
+ ExplainOneUtility(query->utilityStmt, into, es,
- queryString, params);
++ queryString, params, queryEnv);
return;
}
* parsetree. We must fully execute each query before beginning parse
* analysis on the next one, since there may be interdependencies.
*/
- foreach(lc1, raw_parsetree_list)
+ forboth(lc1, raw_parsetree_list, lc3, querysource_list)
{
- Node *parsetree = (Node *) lfirst(lc1);
+ RawStmt *parsetree = lfirst_node(RawStmt, lc1);
+ char *querysource = (char *) lfirst(lc3);
List *stmt_list;
ListCell *lc2;
+ /* Be sure parser can see any DDL done so far */
+ CommandCounterIncrement();
+
stmt_list = pg_analyze_and_rewrite(parsetree,
- sql,
+ querysource,
NULL,
- 0);
+ 0,
+ NULL);
stmt_list = pg_plan_queries(stmt_list, CURSOR_OPT_PARALLEL_OK, NULL);
foreach(lc2, stmt_list)
{
QueryDesc *qdesc;
- qdesc = CreateQueryDesc((PlannedStmt *) stmt,
+ qdesc = CreateQueryDesc(stmt,
- sql,
+ querysource,
GetActiveSnapshot(), NULL,
- dest, NULL, 0);
+ dest, NULL, NULL, 0);
ExecutorStart(qdesc, 0);
- ExecutorRun(qdesc, ForwardScanDirection, 0);
+ ExecutorRun(qdesc, ForwardScanDirection, 0, true);
ExecutorFinish(qdesc);
ExecutorEnd(qdesc);
}
else
{
+ if (IsA(stmt->utilityStmt, TransactionStmt))
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("transaction control statements are not allowed within an extension script")));
+
ProcessUtility(stmt,
- sql,
+ querysource,
PROCESS_UTILITY_QUERY,
NULL,
+ NULL,
dest,
+#ifdef PGXC
+ true, /* this is created at remote node level */
+#endif /* PGXC */
NULL);
}
csstmt->authrole = NULL; /* will be created by current user */
csstmt->schemaElts = NIL;
csstmt->if_not_exists = false;
- #ifdef PGXC
- CreateSchemaCommand(csstmt, NULL, true);
- #else
- CreateSchemaCommand(csstmt, NULL);
- #endif
+ CreateSchemaCommand(csstmt, "(generated CREATE SCHEMA command)",
- -1, -1);
++ true, -1, -1);
/*
* CreateSchemaCommand includes CommandCounterIncrement, so new
/* Ensure creation schema is the one given in IMPORT statement */
cstmt->base.relation->schemaname = pstrdup(stmt->local_schema);
+ /* No planning needed, just make a wrapper PlannedStmt */
+ pstmt = makeNode(PlannedStmt);
+ pstmt->commandType = CMD_UTILITY;
+ pstmt->canSetTag = false;
+ pstmt->utilityStmt = (Node *) cstmt;
+ pstmt->stmt_location = rs->stmt_location;
+ pstmt->stmt_len = rs->stmt_len;
+
/* Execute statement */
- ProcessUtility((Node *) cstmt,
+ ProcessUtility(pstmt,
cmd,
- PROCESS_UTILITY_SUBCOMMAND, NULL, NULL,
- None_Receiver, NULL);
+ PROCESS_UTILITY_SUBCOMMAND, NULL,
++ NULL,
+ None_Receiver,
- #ifdef XCP
+ false,
- #endif
+ NULL);
/* Be sure to advance the command counter between subcommands */
CommandCounterIncrement();
* indexcmds.c
* POSTGRES define and remove index code.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
*
* IDENTIFICATION
#include "executor/executor.h"
#include "executor/spi.h"
#include "miscadmin.h"
+#ifdef PGXC
+#include "nodes/makefuncs.h"
+#endif
#include "parser/parse_relation.h"
+ #include "pgstat.h"
#include "rewrite/rewriteHandler.h"
#include "storage/lmgr.h"
#include "storage/smgr.h"
* storage management for portals (but doesn't run any queries in them).
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
*/
portal = CreatePortal(cstmt->portalname, false, false);
+#ifdef PGXC
+ /*
+ * Consume the command id of the command creating the cursor
+ */
+ if (IS_PGXC_LOCAL_COORDINATOR)
+ GetCurrentCommandId(true);
+#endif
+
oldContext = MemoryContextSwitchTo(PortalGetHeapMemory(portal));
- stmt = copyObject(stmt);
- stmt->utilityStmt = NULL; /* make it look like plain SELECT */
+ plan = copyObject(plan);
queryString = pstrdup(queryString);
* Create the CachedPlanSource before we do parse analysis, since it needs
* to see the unmodified raw parse tree.
*/
- plansource = CreateCachedPlan(stmt->query, queryString,
+ plansource = CreateCachedPlan(rawstmt, queryString,
+#ifdef PGXC
+ stmt->name,
+#endif
CreateCommandTag(stmt->query));
/* Transform list of TypeNames to array of type OIDs */
* schemacmds.c
* schema creation/manipulation commands
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
/*
* CREATE SCHEMA
+ *
+ * Note: caller should pass in location information for the whole
+ * CREATE SCHEMA statement, which in turn we pass down as the location
+ * of the component commands. This comports with our general plan of
+ * reporting location/len for the whole command even when executing
+ * a subquery.
*/
Oid
- #ifdef PGXC
- CreateSchemaCommand(CreateSchemaStmt *stmt, const char *queryString, bool sentToRemote)
- #else
- CreateSchemaCommand(CreateSchemaStmt *stmt, const char *queryString)
- #endif
+ CreateSchemaCommand(CreateSchemaStmt *stmt, const char *queryString,
++ bool sentToRemote,
+ int stmt_location, int stmt_len)
{
const char *schemaName = stmt->schemaname;
Oid namespaceId;
queryString,
PROCESS_UTILITY_SUBCOMMAND,
NULL,
+ NULL,
None_Receiver,
+#ifdef PGXC
+ true,
+#endif /* PGXC */
NULL);
+
/* make sure later steps can see the object created here */
CommandCounterIncrement();
}
* sequence.c
* PostgreSQL sequences support code.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
*
* IDENTIFICATION
*-------------------------------------------------------------------------
*/
#include "postgres.h"
++#include <math.h>
+ #include "access/bufmask.h"
#include "access/htup_details.h"
#include "access/multixact.h"
#include "access/transam.h"
#include "utils/lsyscache.h"
#include "utils/resowner.h"
#include "utils/syscache.h"
+#include "commands/dbcommands.h"
+
+#ifdef PGXC
+#include "pgxc/pgxc.h"
+/* PGXC_COORD */
+#include "access/gtm.h"
+#include "utils/memutils.h"
+#ifdef XCP
+#include "utils/timestamp.h"
+#endif
+#endif
+ #include "utils/varlena.h"
-
/*
* We don't want to log each fetching of a value from a sequence,
* so we pre-log a few fetches in advance. In the event of
int64 cached; /* last value already cached for nextval */
/* if last != cached, we have not used up all the cached values */
int64 increment; /* copy of sequence's increment field */
- /* note that increment is zero until we first do read_seq_tuple() */
+ /* note that increment is zero until we first do nextval_internal() */
+#ifdef XCP
+ TimestampTz last_call_time; /* the time when the last call as made */
+ int64 range_multiplier; /* multiply this value with 2 next time */
+#endif
} SeqTableData;
typedef SeqTableData *SeqTable;
static SeqTableData *last_used_seq = NULL;
static void fill_seq_with_data(Relation rel, HeapTuple tuple);
- static int64 nextval_internal(Oid relid);
- static Relation open_share_lock(SeqTable seq);
+ static Relation lock_and_open_sequence(SeqTable seq);
static void create_seq_hashtable(void);
static void init_sequence(Oid relid, SeqTable *p_elm, Relation *p_rel);
- static Form_pg_sequence read_seq_tuple(SeqTable elm, Relation rel,
- Buffer *buf, HeapTuple seqtuple);
- #ifdef PGXC
- static void init_params(List *options, bool isInit,
- Form_pg_sequence new, List **owned_by, bool *is_restart);
- #else
- static void init_params(List *options, bool isInit,
- Form_pg_sequence new, List **owned_by);
- #endif
+ static Form_pg_sequence_data read_seq_tuple(Relation rel,
+ Buffer *buf, HeapTuple seqdatatuple);
+ static LOCKMODE alter_sequence_get_lock_level(List *options);
+ static void init_params(ParseState *pstate, List *options, bool for_identity,
+ bool isInit,
+ Form_pg_sequence seqform,
+ bool *changed_seqform,
- Form_pg_sequence_data seqdataform, List **owned_by);
++ Form_pg_sequence_data seqdataform, List **owned_by,
++ bool *is_restart);
static void do_setval(Oid relid, int64 next, bool iscalled);
- static void process_owned_by(Relation seqrel, List *owned_by);
+ static void process_owned_by(Relation seqrel, List *owned_by, bool for_identity);
/*
TupleDesc tupDesc;
Datum value[SEQ_COL_LASTCOL];
bool null[SEQ_COL_LASTCOL];
+ Datum pgs_values[Natts_pg_sequence];
+ bool pgs_nulls[Natts_pg_sequence];
int i;
- NameData name;
+#ifdef PGXC /* PGXC_COORD */
+ GTM_Sequence start_value = 1;
+ GTM_Sequence min_value = 1;
+ GTM_Sequence max_value = InvalidSequenceValue;
+ GTM_Sequence increment = 1;
+ bool cycle = false;
+ bool is_restart;
+#endif
/* Unlogged sequences are not implemented -- not clear if useful. */
if (seq->sequence->relpersistence == RELPERSISTENCE_UNLOGGED)
}
/* Check and set all option values */
- #ifdef PGXC
- init_params(seq->options, true, &new, &owned_by, &is_restart);
- #else
- init_params(seq->options, true, &new, &owned_by);
- #endif
- init_params(pstate, seq->options, seq->for_identity, true, &seqform, &changed_seqform, &seqdataform, &owned_by);
++ init_params(pstate, seq->options, seq->for_identity, true, &seqform,
++ &changed_seqform, &seqdataform, &owned_by, &is_restart);
/*
* Create relation (and fill value[] and null[] for the tuple)
heap_close(rel, NoLock);
+ /* fill in pg_sequence */
+ rel = heap_open(SequenceRelationId, RowExclusiveLock);
+ tupDesc = RelationGetDescr(rel);
+
+ memset(pgs_nulls, 0, sizeof(pgs_nulls));
+
+ pgs_values[Anum_pg_sequence_seqrelid - 1] = ObjectIdGetDatum(seqoid);
+ pgs_values[Anum_pg_sequence_seqtypid - 1] = ObjectIdGetDatum(seqform.seqtypid);
+ pgs_values[Anum_pg_sequence_seqstart - 1] = Int64GetDatumFast(seqform.seqstart);
+ pgs_values[Anum_pg_sequence_seqincrement - 1] = Int64GetDatumFast(seqform.seqincrement);
+ pgs_values[Anum_pg_sequence_seqmax - 1] = Int64GetDatumFast(seqform.seqmax);
+ pgs_values[Anum_pg_sequence_seqmin - 1] = Int64GetDatumFast(seqform.seqmin);
+ pgs_values[Anum_pg_sequence_seqcache - 1] = Int64GetDatumFast(seqform.seqcache);
+ pgs_values[Anum_pg_sequence_seqcycle - 1] = BoolGetDatum(seqform.seqcycle);
+
+ tuple = heap_form_tuple(tupDesc, pgs_values, pgs_nulls);
+ CatalogTupleInsert(rel, tuple);
+
+ heap_freetuple(tuple);
+ heap_close(rel, RowExclusiveLock);
+
+#ifdef PGXC /* PGXC_COORD */
+ /*
+ * Remote Coordinator is in charge of creating sequence in GTM.
+ * If sequence is temporary, it is not necessary to create it on GTM.
+ */
+ if (IS_PGXC_LOCAL_COORDINATOR)
+ {
+ char *seqname = GetGlobalSeqName(rel, NULL, NULL);
+
+ /* We also need to create it on the GTM */
+ if (CreateSequenceGTM(seqname,
+ increment,
+ min_value,
+ max_value,
+ start_value, cycle) < 0)
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_CONNECTION_FAILURE),
+ errmsg("GTM error, could not create sequence")));
+ }
+
+
+ pfree(seqname);
+ }
+#endif
return address;
}
SeqTable elm;
Relation seqrel;
Buffer buf;
- HeapTupleData seqtuple;
- Form_pg_sequence seq;
- FormData_pg_sequence new;
+ HeapTupleData seqdatatuple;
+ Form_pg_sequence seqform;
+ Form_pg_sequence_data seqdata;
+ FormData_pg_sequence_data newseqdata;
+ bool changed_seqform = false;
List *owned_by;
+#ifdef PGXC
+ GTM_Sequence start_value;
+ GTM_Sequence last_value;
+ GTM_Sequence min_value;
+ GTM_Sequence max_value;
+ GTM_Sequence increment;
+ bool cycle;
+ bool is_restart;
+#endif
ObjectAddress address;
+ Relation rel;
+ HeapTuple tuple;
/* Open and lock sequence. */
- relid = RangeVarGetRelid(stmt->sequence, AccessShareLock, stmt->missing_ok);
+ relid = RangeVarGetRelid(stmt->sequence,
+ alter_sequence_get_lock_level(stmt->options),
+ stmt->missing_ok);
if (relid == InvalidOid)
{
ereport(NOTICE,
aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_CLASS,
stmt->sequence->relname);
- /* lock page' buffer and read tuple into new sequence structure */
- seq = read_seq_tuple(elm, seqrel, &buf, &seqtuple);
+ rel = heap_open(SequenceRelationId, RowExclusiveLock);
+ tuple = SearchSysCacheCopy1(SEQRELID,
+ ObjectIdGetDatum(relid));
+ if (!HeapTupleIsValid(tuple))
+ elog(ERROR, "cache lookup failed for sequence %u",
+ relid);
+
+ seqform = (Form_pg_sequence) GETSTRUCT(tuple);
- /* Copy old values of options into workspace */
- memcpy(&new, seq, sizeof(FormData_pg_sequence));
+ /* lock page's buffer and read tuple into new sequence structure */
+ seqdata = read_seq_tuple(seqrel, &buf, &seqdatatuple);
+
+ /* Copy old sequence data into workspace */
+ memcpy(&newseqdata, seqdata, sizeof(FormData_pg_sequence_data));
/* Check and set new values */
- #ifdef PGXC
- init_params(stmt->options, false, &new, &owned_by, &is_restart);
- #else
- init_params(stmt->options, false, &new, &owned_by);
- #endif
- init_params(pstate, stmt->options, stmt->for_identity, false, seqform, &changed_seqform, &newseqdata, &owned_by);
++ init_params(pstate, stmt->options, stmt->for_identity, false, seqform,
++ &changed_seqform, &newseqdata, &owned_by, &is_restart);
/* Clear local cache so that we don't think we have cached numbers */
/* Note that we do not change the currval() state */
GetTopTransactionId();
/* Now okay to update the on-disk tuple */
- increment = new.increment_by;
- min_value = new.min_value;
- max_value = new.max_value;
- start_value = new.start_value;
- last_value = new.last_value;
- cycle = new.is_cycled;
+#ifdef PGXC
++ increment = seqform->seqincrement;
++ min_value = seqform->seqmin;
++ max_value = seqform->seqmax;
++ start_value = seqform->seqstart;
++ last_value = elm->last;
++ cycle = seqform->seqcycle;
+#endif
+
START_CRIT_SECTION();
- memcpy(seq, &new, sizeof(FormData_pg_sequence));
+ memcpy(seqdata, &newseqdata, sizeof(FormData_pg_sequence_data));
MarkBufferDirty(buf);
ObjectAddressSet(address, RelationRelationId, relid);
+ if (changed_seqform)
+ CatalogTupleUpdate(rel, &tuple->t_self, tuple);
+ heap_close(rel, RowExclusiveLock);
+
relation_close(seqrel, NoLock);
+#ifdef PGXC
+ /*
+ * Remote Coordinator is in charge of creating the sequence in GTM.
+ * If sequence is temporary, no need to go through GTM.
+ */
+ if (IS_PGXC_LOCAL_COORDINATOR && seqrel->rd_backend != MyBackendId)
+ {
+ char *seqname = GetGlobalSeqName(seqrel, NULL, NULL);
+
+ /* We also need to create it on the GTM */
+ if (AlterSequenceGTM(seqname,
+ increment,
+ min_value,
+ max_value,
+ start_value,
+ last_value,
+ cycle,
+ is_restart) < 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_CONNECTION_FAILURE),
+ errmsg("GTM error, could not alter sequence")));
+ pfree(seqname);
+ }
+#endif
return address;
}
return elm->last;
}
+ pgstuple = SearchSysCache1(SEQRELID, ObjectIdGetDatum(relid));
+ if (!HeapTupleIsValid(pgstuple))
+ elog(ERROR, "cache lookup failed for sequence %u", relid);
+ pgsform = (Form_pg_sequence) GETSTRUCT(pgstuple);
+ incby = pgsform->seqincrement;
+ maxv = pgsform->seqmax;
+ minv = pgsform->seqmin;
+ cache = pgsform->seqcache;
+ cycle = pgsform->seqcycle;
+ ReleaseSysCache(pgstuple);
+
/* lock page' buffer and read tuple */
- seq = read_seq_tuple(elm, seqrel, &buf, &seqtuple);
+ seq = read_seq_tuple(seqrel, &buf, &seqdatatuple);
page = BufferGetPage(buf);
- int64 range = seq->cache_value; /* how many values to ask from GTM? */
+ {
- fetch = cache = seq->cache_value;
++ int64 range = cache; /* how many values to ask from GTM? */
+ int64 rangemax; /* the max value returned from the GTM for our request */
+ char *seqname = GetGlobalSeqName(seqrel, NULL, NULL);
+
+ /*
+ * Above, we still use the page as a locking mechanism to handle
+ * concurrency
+ *
+ * If the user has set a CACHE parameter, we use that. Else we pass in
+ * the SequenceRangeVal value
+ */
+ if (range == DEFAULT_CACHEVAL && SequenceRangeVal > range)
+ {
+ TimestampTz curtime = GetCurrentTimestamp();
+
+ if (!TimestampDifferenceExceeds(elm->last_call_time,
+ curtime, 1000))
+ {
+ /*
+ * The previous GetNextValGTM call was made just a while back.
+ * Request double the range of what was requested in the
+ * earlier call. Honor the SequenceRangeVal boundary
+ * value to limit very large range requests!
+ */
+ elm->range_multiplier *= 2;
+ if (elm->range_multiplier < SequenceRangeVal)
+ range = elm->range_multiplier;
+ else
+ elm->range_multiplier = range = SequenceRangeVal;
+
+ elog(DEBUG1, "increase sequence range %ld", range);
+ }
+ else if (TimestampDifferenceExceeds(elm->last_call_time,
+ curtime, 5000))
+ {
+ /* The previous GetNextValGTM call was pretty old */
+ range = elm->range_multiplier = DEFAULT_CACHEVAL;
+ elog(DEBUG1, "reset sequence range %ld", range);
+ }
+ else if (TimestampDifferenceExceeds(elm->last_call_time,
+ curtime, 3000))
+ {
+ /*
+ * The previous GetNextValGTM call was made quite some time
+ * ago. Try to reduce the range request to reduce the gap
+ */
+ if (elm->range_multiplier != DEFAULT_CACHEVAL)
+ {
+ range = elm->range_multiplier =
+ rint(elm->range_multiplier/2);
+ elog(DEBUG1, "decrease sequence range %ld", range);
+ }
+ }
+ else
+ {
+ /*
+ * Current range_multiplier allows caching sequence values
+ * for 1-3 seconds of work. Keep that rate.
+ */
+ range = elm->range_multiplier;
+ }
+ elm->last_call_time = curtime;
+ }
+
+ result = (int64) GetNextValGTM(seqname, range, &rangemax);
+ pfree(seqname);
+
+ /* Update the on-disk data */
+ seq->last_value = result; /* last fetched number */
+ seq->is_called = true;
+
+ /* save info in local cache */
+ elm->last = result; /* last returned number */
+ elm->cached = rangemax; /* last fetched range max limit */
+ elm->last_valid = true;
+
+ last_used_seq = elm;
+ }
+
+ elm->increment = incby;
+ last = next = result = seq->last_value;
+ fetch = cache;
log = seq->log_cnt;
if (!seq->is_called)
bufm, bufx)));
}
- /* Set the currval() state only if iscalled = true */
- if (iscalled)
{
- elm->last = next; /* last returned number */
- elm->last_valid = true;
- }
+ char *seqname = GetGlobalSeqName(seqrel, NULL, NULL);
- Page page = BufferGetPage(buf);
+ if (SetValGTM(seqname, next, iscalled) < 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_CONNECTION_FAILURE),
+ errmsg("GTM error, could not obtain sequence value")));
+ pfree(seqname);
+ /* Update the on-disk data */
+ seq->last_value = next; /* last fetched number */
+ seq->is_called = iscalled;
+ seq->log_cnt = (iscalled) ? 0 : 1;
+
+ if (iscalled)
+ {
+ elm->last = next; /* last returned number */
+ elm->last_valid = true;
+ }
++ }
+ /* In any case, forget any future cached numbers */
+ elm->cached = elm->last;
+
+ /* check the comment above nextval_internal()'s equivalent call. */
+ if (RelationNeedsWAL(seqrel))
+ GetTopTransactionId();
+
+ /* ready to change the on-disk (or really, in-buffer) tuple */
+ START_CRIT_SECTION();
+
+ seq->last_value = next; /* last fetched number */
+ seq->is_called = iscalled;
+ seq->log_cnt = 0;
+
+ MarkBufferDirty(buf);
+
+ /* XLOG stuff */
+ if (RelationNeedsWAL(seqrel))
+ {
+ xl_seq_rec xlrec;
+ XLogRecPtr recptr;
+
+ XLogBeginInsert();
+ XLogRegisterBuffer(0, buf, REGBUF_WILL_INIT);
+
+ xlrec.node = seqrel->rd_node;
+ XLogRegisterData((char *) &xlrec, sizeof(xl_seq_rec));
+ XLogRegisterData((char *) seqdatatuple.t_data, seqdatatuple.t_len);
+
+ recptr = XLogInsert(RM_SEQ_ID, XLOG_SEQ_LOG);
- PageSetLSN(page, recptr);
+ elm->cached = elm->last;
}
- END_CRIT_SECTION();
-
UnlockReleaseBuffer(buf);
relation_close(seqrel, NoLock);
elm->filenode = InvalidOid;
elm->lxid = InvalidLocalTransactionId;
elm->last_valid = false;
- elm->last = elm->cached = elm->increment = 0;
+#ifdef XCP
+ elm->last_call_time = 0;
+ elm->range_multiplier = DEFAULT_CACHEVAL;
+#endif
+ elm->last = elm->cached = 0;
}
/*
* otherwise, do not change existing options that aren't explicitly overridden.
*/
static void
- #ifdef PGXC
- init_params(List *options, bool isInit,
- Form_pg_sequence new, List **owned_by, bool *is_restart)
- #else
- init_params(List *options, bool isInit,
- Form_pg_sequence new, List **owned_by)
- #endif
+ init_params(ParseState *pstate, List *options, bool for_identity,
+ bool isInit,
+ Form_pg_sequence seqform,
+ bool *changed_seqform,
+ Form_pg_sequence_data seqdataform,
- List **owned_by)
++ List **owned_by,
++ bool *is_restart)
{
+ DefElem *as_type = NULL;
DefElem *start_value = NULL;
DefElem *restart_value = NULL;
DefElem *increment_by = NULL;
DefElem *cache_value = NULL;
DefElem *is_cycled = NULL;
ListCell *option;
+ bool reset_max_value = false;
+ bool reset_min_value = false;
+#ifdef PGXC
+ *is_restart = false;
+#endif
+
*owned_by = NIL;
foreach(option, options)
if (restart_value != NULL)
{
if (restart_value->arg != NULL)
- new->last_value = defGetInt64(restart_value);
+ seqdataform->last_value = defGetInt64(restart_value);
else
- new->last_value = new->start_value;
+ seqdataform->last_value = seqform->seqstart;
+#ifdef PGXC
+ *is_restart = true;
+#endif
- new->is_called = false;
- new->log_cnt = 0;
++ seqdataform->last_value = seqform->seqstart;
+ seqdataform->is_called = false;
+ seqdataform->log_cnt = 0;
}
else if (isInit)
{
errmsg("CACHE (%s) must be greater than zero",
buf)));
}
- new->log_cnt = 0;
+ seqdataform->log_cnt = 0;
}
else if (isInit)
- new->cache_value = 1;
+ {
+ seqform->seqcache = 1;
+ *changed_seqform = true;
+ }
}
+#ifdef PGXC
+/*
+ * GetGlobalSeqName
+ *
+ * Returns a global sequence name adapted to GTM
+ * Name format is dbname.schemaname.seqname
+ * so as to identify in a unique way in the whole cluster each sequence
+ *
+ * seqrel         - opened sequence relation whose global name is wanted
+ * new_seqname    - if non-NULL, used instead of seqrel's own relation name
+ *                  (for RENAME-style operations)
+ * new_schemaname - if non-NULL, used instead of seqrel's own schema name
+ *
+ * Returns a palloc'd string; caller is responsible for pfree'ing it.
+ */
+char *
+GetGlobalSeqName(Relation seqrel, const char *new_seqname, const char *new_schemaname)
+{
+ char *seqname, *dbname, *relname;
+ char namespace[NAMEDATALEN * 2];
+ int charlen;
+ /* rd_backend == MyBackendId marks a backend-local (temporary) sequence */
+ bool is_temp = seqrel->rd_backend == MyBackendId;
+ /* Get all the necessary relation names */
+ dbname = get_database_name(seqrel->rd_node.dbNode);
+
+ /*
+ * NOTE(review): get_database_name() can return NULL for an invalid OID;
+ * strlen(dbname) below would then crash, yet the pfree() at the end does
+ * check for NULL — the two should be made consistent. Confirm dbNode is
+ * always valid on this path.
+ */
+ if (new_seqname)
+ relname = (char *) new_seqname;
+ else
+ relname = RelationGetRelationName(seqrel);
+
+ if (!is_temp)
+ {
+ /*
+ * For a permanent sequence, use schema qualified name. That can
+ * uniquely identify the sequences.
+ */
+ char *schema = get_namespace_name(RelationGetNamespace(seqrel));
+ /*
+ * NOTE(review): plain sprintf into the fixed NAMEDATALEN*2 buffer;
+ * single schema names fit, but snprintf would be the defensive choice.
+ */
+ sprintf(namespace, "%s", new_schemaname ? new_schemaname : schema);
+ pfree(schema);
+ }
+ else
+ {
+ /*
+ * For temporary sequences, we use originating coordinator name and
+ * originating coordinator PID to qualify the sequence name. If we are
+ * running on the local coordinator, we can readily fetch that
+ * information from PGXCNodeName and MyProcPid, but when running on
+ * remote datanode, we must consult MyCoordName and MyProcPid to get
+ * the correct information.
+ */
+ if (IS_PGXC_LOCAL_COORDINATOR)
+ sprintf(namespace, "%s.%d", PGXCNodeName, MyProcPid);
+ else
+ sprintf(namespace, "%s.%d", MyCoordName, MyCoordPid);
+ }
+
+ /* Calculate the global name size including the dots and \0 */
+ charlen = strlen(dbname) + strlen(namespace) + strlen(relname) + 3;
+ seqname = (char *) palloc(charlen);
+
+ /* Form a unique sequence name with schema and database name for GTM */
+ snprintf(seqname,
+ charlen,
+ "%s.%s.%s",
+ dbname,
+ namespace,
+ relname);
+
+ if (dbname)
+ pfree(dbname);
+
+ return seqname;
+}
+
+/*
+ * IsTempSequence
+ *
+ * Determine if given sequence is temporary or not.
+ *
+ * relid - OID of the sequence relation to test.
+ * Returns true when the sequence is backend-local (temporary).
+ */
+bool
+IsTempSequence(Oid relid)
+{
+ Relation seqrel;
+ bool res;
+ SeqTable elm;
+
+ /* open and AccessShareLock sequence */
+ init_sequence(relid, &elm, &seqrel);
+
+ /* Same backend-locality test used by GetGlobalSeqName() */
+ res = seqrel->rd_backend == MyBackendId;
+ /* Close but keep the lock until transaction end */
+ relation_close(seqrel, NoLock);
+ return res;
+}
+#endif
+
/*
* Process an OWNED BY option for CREATE/ALTER SEQUENCE
*
* tablecmds.c
* Commands for creating and altering table structures and settings
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
*
* IDENTIFICATION
/* No command-specific prep needed */
pass = AT_PASS_MISC;
break;
+#ifdef PGXC
+ case AT_DistributeBy:
+ case AT_SubCluster:
+ case AT_AddNodeList:
+ case AT_DeleteNodeList:
+ ATSimplePermissions(rel, ATT_TABLE);
+ /* No command-specific prep needed */
+ pass = AT_PASS_DISTRIB;
+ break;
+#endif
+ case AT_AttachPartition:
+ case AT_DetachPartition:
+ ATSimplePermissions(rel, ATT_TABLE);
+ /* No command-specific prep needed */
+ pass = AT_PASS_MISC;
+ break;
default: /* oops */
elog(ERROR, "unrecognized alter table type: %d",
(int) cmd->subtype);
case AT_GenericOptions:
ATExecGenericOptions(rel, (List *) cmd->def);
break;
- #ifdef PGXC
+ case AT_DistributeBy:
+ AtExecDistributeBy(rel, (DistributeBy *) cmd->def);
+ break;
+ case AT_SubCluster:
+ AtExecSubCluster(rel, (PGXCSubCluster *) cmd->def);
+ break;
+ case AT_AddNodeList:
+ AtExecAddNode(rel, (List *) cmd->def);
+ break;
+ case AT_DeleteNodeList:
+ AtExecDeleteNode(rel, (List *) cmd->def);
+ break;
- #endif
+ case AT_AttachPartition:
+ ATExecAttachPartition(wqueue, rel, (PartitionCmd *) cmd->def);
+ break;
+ case AT_DetachPartition:
+ ATExecDetachPartition(rel, ((PartitionCmd *) cmd->def)->name);
+ break;
default: /* oops */
elog(ERROR, "unrecognized alter table type: %d",
(int) cmd->subtype);
{
AlteredTableInfo *tab = (AlteredTableInfo *) lfirst(ltab);
- /* Foreign tables have no storage. */
- if (tab->relkind == RELKIND_FOREIGN_TABLE)
+#ifdef PGXC
+ /* Forbid table rewrite operations with online data redistribution */
+ if (tab->rewrite &&
+ list_length(tab->subcmds[AT_PASS_DISTRIB]) > 0 &&
+ IS_PGXC_LOCAL_COORDINATOR)
+ ereport(ERROR,
+ (errcode(ERRCODE_STATEMENT_TOO_COMPLEX),
+ errmsg("Incompatible operation with data redistribution")));
+#endif
+
+ /* Foreign tables have no storage, nor do partitioned tables. */
+ if (tab->relkind == RELKIND_FOREIGN_TABLE ||
+ tab->relkind == RELKIND_PARTITIONED_TABLE)
continue;
/*
* relations, we can skip truncating ON COMMIT DELETE ROWS
* tables, as they must still be empty.
*/
- if (MyXactAccessedTempRel)
+#ifndef XCP
+ /*
+ * This optimization does not work in XL since temporary tables
+ * are handled differently in XL.
+ */
+ if ((MyXactFlags & XACT_FLAGS_ACCESSEDTEMPREL))
+#endif
oids_to_truncate = lappend_oid(oids_to_truncate, oc->relid);
break;
case ONCOMMIT_DROP:
ReleaseSysCache(tuple);
}
+#ifdef PGXC
+/*
+ * IsTempTable
+ *
+ * Check if given table Oid is temporary.
+ *
+ * Returns true when the relation's persistence is RELPERSISTENCE_TEMP.
+ */
+bool
+IsTempTable(Oid relid)
+{
+ Relation rel;
+ bool res;
+ /*
+ * PGXCTODO: Is it correct to open without locks?
+ * we just check if this table is temporary though...
+ */
+ rel = relation_open(relid, NoLock);
+ res = rel->rd_rel->relpersistence == RELPERSISTENCE_TEMP;
+ relation_close(rel, NoLock);
+ return res;
+}
+
+/*
+ * IsLocalTempTable
+ *
+ * Like IsTempTable(), but additionally requires that the relation has no
+ * distribution (locator) info, i.e. it exists only on this node.
+ * NOTE(review): opens the relation with NoLock, same caveat as IsTempTable.
+ */
+bool
+IsLocalTempTable(Oid relid)
+{
+ Relation rel;
+ bool res;
+ rel = relation_open(relid, NoLock);
+ res = (rel->rd_rel->relpersistence == RELPERSISTENCE_TEMP &&
+ rel->rd_locator_info == NULL);
+ relation_close(rel, NoLock);
+ return res;
+}
+
+/*
+ * IsIndexUsingTempTable
+ *
+ * Check if given index relation uses temporary tables.
+ *
+ * relid - OID of the index relation.
+ * Returns true when the index's parent table is temporary; false when the
+ * OID is not an index (or the parent is permanent).
+ */
+bool
+IsIndexUsingTempTable(Oid relid)
+{
+ bool res = false;
+ HeapTuple tuple;
+ Oid parent_id = InvalidOid;
+
+ tuple = SearchSysCache1(INDEXRELID, ObjectIdGetDatum(relid));
+ if (HeapTupleIsValid(tuple))
+ {
+ Form_pg_index index = (Form_pg_index) GETSTRUCT(tuple);
+ parent_id = index->indrelid;
+
+ /* Release system cache BEFORE looking at the parent table */
+ ReleaseSysCache(tuple);
+
+ res = IsTempTable(parent_id);
+ }
+ else
+ res = false; /* Default case */
+
+ return res;
+}
+
+/*
+ * IsOnCommitActions
+ *
+ * Check if there are any on-commit actions activated.
+ * Reads the file-static on_commits list maintained by this module.
+ */
+bool
+IsOnCommitActions(void)
+{
+ return list_length(on_commits) > 0;
+}
+
+/*
+ * DropTableThrowErrorExternal
+ *
+ * Error interface for DROP when looking for execution node type.
+ *
+ * relation   - range variable naming the object that could not be found
+ * removeType - kind of object the DROP statement targeted
+ * missing_ok - when true, the underlying reporter emits a NOTICE rather
+ *              than an ERROR
+ *
+ * Maps the ObjectType to the corresponding relkind and delegates the
+ * actual message to DropErrorMsgNonExistent().
+ */
+void
+DropTableThrowErrorExternal(RangeVar *relation, ObjectType removeType, bool missing_ok)
+{
+ char relkind;
+
+ /* Determine required relkind */
+ switch (removeType)
+ {
+ case OBJECT_TABLE:
+ relkind = RELKIND_RELATION;
+ break;
+
+ case OBJECT_INDEX:
+ relkind = RELKIND_INDEX;
+ break;
+
+ case OBJECT_SEQUENCE:
+ relkind = RELKIND_SEQUENCE;
+ break;
+
+ case OBJECT_VIEW:
+ relkind = RELKIND_VIEW;
+ break;
+
+ case OBJECT_FOREIGN_TABLE:
+ relkind = RELKIND_FOREIGN_TABLE;
+ break;
+
+ default:
+ /* elog(ERROR) does not return; the assignment below is unreachable */
+ elog(ERROR, "unrecognized drop object type: %d",
+ (int) removeType);
+ relkind = 0; /* keep compiler quiet */
+ break;
+ }
+
+ DropErrorMsgNonExistent(relation, relkind, missing_ok);
+}
+#endif
++
+ /*
+ * Transform any expressions present in the partition key
+ *
+ * Returns a transformed PartitionSpec, as well as the strategy code
+ * (PARTITION_STRATEGY_LIST or PARTITION_STRATEGY_RANGE) through *strategy.
+ * Raises an error for an unknown strategy name, for a multi-column LIST
+ * key, or for a duplicated column name in the key.
+ */
+ static PartitionSpec *
+ transformPartitionSpec(Relation rel, PartitionSpec *partspec, char *strategy)
+ {
+ PartitionSpec *newspec;
+ ParseState *pstate;
+ RangeTblEntry *rte;
+ ListCell *l;
+
+ newspec = makeNode(PartitionSpec);
+
+ newspec->strategy = partspec->strategy;
+ newspec->partParams = NIL;
+ newspec->location = partspec->location;
+
+ /* Parse partitioning strategy name */
+ if (pg_strcasecmp(partspec->strategy, "list") == 0)
+ *strategy = PARTITION_STRATEGY_LIST;
+ else if (pg_strcasecmp(partspec->strategy, "range") == 0)
+ *strategy = PARTITION_STRATEGY_RANGE;
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("unrecognized partitioning strategy \"%s\"",
+ partspec->strategy)));
+
+ /* Check valid number of columns for strategy */
+ if (*strategy == PARTITION_STRATEGY_LIST &&
+ list_length(partspec->partParams) != 1)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("cannot use \"list\" partition strategy with more than one column")));
+
+ /*
+ * Create a dummy ParseState and insert the target relation as its sole
+ * rangetable entry. We need a ParseState for transformExpr.
+ */
+ pstate = make_parsestate(NULL);
+ rte = addRangeTableEntryForRelation(pstate, rel, NULL, false, true);
+ addRTEtoQuery(pstate, rte, true, true, true);
+
+ /* take care of any partition expressions */
+ foreach(l, partspec->partParams)
+ {
+ PartitionElem *pelem = castNode(PartitionElem, lfirst(l));
+ ListCell *lc;
+
+ /* Check for PARTITION BY ... (foo, foo) */
+ foreach(lc, newspec->partParams)
+ {
+ PartitionElem *pparam = castNode(PartitionElem, lfirst(lc));
+
+ /* only name-vs-name duplicates are detectable here; expression
+ * duplicates are not compared */
+ if (pelem->name && pparam->name &&
+ strcmp(pelem->name, pparam->name) == 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_DUPLICATE_COLUMN),
+ errmsg("column \"%s\" appears more than once in partition key",
+ pelem->name),
+ parser_errposition(pstate, pelem->location)));
+ }
+
+ if (pelem->expr)
+ {
+ /* Copy, to avoid scribbling on the input */
+ pelem = copyObject(pelem);
+
+ /* Now do parse transformation of the expression */
+ pelem->expr = transformExpr(pstate, pelem->expr,
+ EXPR_KIND_PARTITION_EXPRESSION);
+
+ /* we have to fix its collations too */
+ assign_expr_collations(pstate, pelem->expr);
+ }
+
+ newspec->partParams = lappend(newspec->partParams, pelem);
+ }
+
+ return newspec;
+ }
+
+ /*
+ * Compute per-partition-column information from a list of PartitionElems.
+ * Expressions in the PartitionElems must be parse-analyzed already.
+ *
+ * Output arrays/lists (caller-allocated, one slot per key column):
+ *   partattrs     - attribute number of each key column, 0 for expressions
+ *   partexprs     - list of expression trees for expression key columns
+ *   partopclass   - resolved btree operator class OID per column
+ *   partcollation - collation OID per column (InvalidOid if none)
+ */
+ static void
+ ComputePartitionAttrs(Relation rel, List *partParams, AttrNumber *partattrs,
+ List **partexprs, Oid *partopclass, Oid *partcollation)
+ {
+ int attn;
+ ListCell *lc;
+
+ attn = 0;
+ foreach(lc, partParams)
+ {
+ PartitionElem *pelem = castNode(PartitionElem, lfirst(lc));
+ Oid atttype;
+ Oid attcollation;
+
+ if (pelem->name != NULL)
+ {
+ /* Simple attribute reference */
+ HeapTuple atttuple;
+ Form_pg_attribute attform;
+
+ atttuple = SearchSysCacheAttName(RelationGetRelid(rel),
+ pelem->name);
+ if (!HeapTupleIsValid(atttuple))
+ ereport(ERROR,
+ (errcode(ERRCODE_UNDEFINED_COLUMN),
+ errmsg("column \"%s\" named in partition key does not exist",
+ pelem->name)));
+ attform = (Form_pg_attribute) GETSTRUCT(atttuple);
+
+ if (attform->attnum <= 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("cannot use system column \"%s\" in partition key",
+ pelem->name)));
+
+ partattrs[attn] = attform->attnum;
+ atttype = attform->atttypid;
+ attcollation = attform->attcollation;
+ ReleaseSysCache(atttuple);
+ }
+ else
+ {
+ /* Expression */
+ Node *expr = pelem->expr;
+
+ Assert(expr != NULL);
+ atttype = exprType(expr);
+ attcollation = exprCollation(expr);
+
+ /*
+ * Strip any top-level COLLATE clause. This ensures that we treat
+ * "x COLLATE y" and "(x COLLATE y)" alike.
+ */
+ while (IsA(expr, CollateExpr))
+ expr = (Node *) ((CollateExpr *) expr)->arg;
+
+ if (IsA(expr, Var) &&
+ ((Var *) expr)->varattno > 0)
+ {
+ /*
+ * User wrote "(column)" or "(column COLLATE something)".
+ * Treat it like simple attribute anyway.
+ */
+ partattrs[attn] = ((Var *) expr)->varattno;
+ }
+ else
+ {
+ Bitmapset *expr_attrs = NULL;
+ int i;
+
+ partattrs[attn] = 0; /* marks the column as expression */
+ *partexprs = lappend(*partexprs, expr);
+
+ /*
+ * Try to simplify the expression before checking for
+ * mutability. The main practical value of doing it in this
+ * order is that an inline-able SQL-language function will be
+ * accepted if its expansion is immutable, whether or not the
+ * function itself is marked immutable.
+ *
+ * Note that expression_planner does not change the passed in
+ * expression destructively and we have already saved the
+ * expression to be stored into the catalog above.
+ */
+ expr = (Node *) expression_planner((Expr *) expr);
+
+ /*
+ * Partition expression cannot contain mutable functions,
+ * because a given row must always map to the same partition
+ * as long as there is no change in the partition boundary
+ * structure.
+ */
+ if (contain_mutable_functions(expr))
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("functions in partition key expression must be marked IMMUTABLE")));
+
+ /*
+ * transformPartitionSpec() should have already rejected
+ * subqueries, aggregates, window functions, and SRFs, based
+ * on the EXPR_KIND_ for partition expressions.
+ */
+
+ /*
+ * Cannot have expressions containing whole-row references or
+ * system column references.
+ */
+ pull_varattnos(expr, 1, &expr_attrs);
+ if (bms_is_member(0 - FirstLowInvalidHeapAttributeNumber,
+ expr_attrs))
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("partition key expressions cannot contain whole-row references")));
+ for (i = FirstLowInvalidHeapAttributeNumber; i < 0; i++)
+ {
+ if (bms_is_member(i - FirstLowInvalidHeapAttributeNumber,
+ expr_attrs))
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("partition key expressions cannot contain system column references")));
+ }
+
+ /*
+ * While it is not exactly *wrong* for a partition expression
+ * to be a constant, it seems better to reject such keys.
+ */
+ if (IsA(expr, Const))
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("cannot use constant expression as partition key")));
+ }
+ }
+
+ /*
+ * Apply collation override if any
+ */
+ if (pelem->collation)
+ attcollation = get_collation_oid(pelem->collation, false);
+
+ /*
+ * Check we have a collation iff it's a collatable type. The only
+ * expected failures here are (1) COLLATE applied to a noncollatable
+ * type, or (2) partition expression had an unresolved collation. But
+ * we might as well code this to be a complete consistency check.
+ */
+ if (type_is_collatable(atttype))
+ {
+ if (!OidIsValid(attcollation))
+ ereport(ERROR,
+ (errcode(ERRCODE_INDETERMINATE_COLLATION),
+ errmsg("could not determine which collation to use for partition expression"),
+ errhint("Use the COLLATE clause to set the collation explicitly.")));
+ }
+ else
+ {
+ if (OidIsValid(attcollation))
+ ereport(ERROR,
+ (errcode(ERRCODE_DATATYPE_MISMATCH),
+ errmsg("collations are not supported by type %s",
+ format_type_be(atttype))));
+ }
+
+ partcollation[attn] = attcollation;
+
+ /*
+ * Identify a btree opclass to use. Currently, we use only btree
+ * operators, which seems enough for list and range partitioning.
+ */
+ if (!pelem->opclass)
+ {
+ partopclass[attn] = GetDefaultOpClass(atttype, BTREE_AM_OID);
+
+ if (!OidIsValid(partopclass[attn]))
+ ereport(ERROR,
+ (errcode(ERRCODE_UNDEFINED_OBJECT),
+ errmsg("data type %s has no default btree operator class",
+ format_type_be(atttype)),
+ errhint("You must specify a btree operator class or define a default btree operator class for the data type.")));
+ }
+ else
+ partopclass[attn] = ResolveOpClass(pelem->opclass,
+ atttype,
+ "btree",
+ BTREE_AM_OID);
+
+ attn++;
+ }
+ }
+
+ /*
+ * ALTER TABLE <name> ATTACH PARTITION <partition-name> FOR VALUES
+ *
+ * Return the address of the newly attached partition.
+ *
+ * wqueue - ALTER TABLE work queue; validation scans for the attached
+ *          table (or its leaf partitions) are appended here unless the
+ *          existing constraints prove the partition predicate.
+ * rel    - the partitioned parent, already locked by the caller.
+ * cmd    - carries the partition's RangeVar (cmd->name) and bound spec
+ *          (cmd->bound).
+ */
+ static ObjectAddress
+ ATExecAttachPartition(List **wqueue, Relation rel, PartitionCmd *cmd)
+ {
+ PartitionKey key = RelationGetPartitionKey(rel);
+ Relation attachRel,
+ catalog;
+ List *childrels;
+ TupleConstr *attachRel_constr;
+ List *partConstraint,
+ *existConstraint;
+ SysScanDesc scan;
+ ScanKeyData skey;
+ AttrNumber attno;
+ int natts;
+ TupleDesc tupleDesc;
+ bool skip_validate = false;
+ ObjectAddress address;
+
+ attachRel = heap_openrv(cmd->name, AccessExclusiveLock);
+
+ /*
+ * Must be owner of both parent and source table -- parent was checked by
+ * ATSimplePermissions call in ATPrepCmd
+ */
+ ATSimplePermissions(attachRel, ATT_TABLE | ATT_FOREIGN_TABLE);
+
+ /* A partition can only have one parent */
+ if (attachRel->rd_rel->relispartition)
+ ereport(ERROR,
+ (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+ errmsg("\"%s\" is already a partition",
+ RelationGetRelationName(attachRel))));
+
+ if (OidIsValid(attachRel->rd_rel->reloftype))
+ ereport(ERROR,
+ (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+ errmsg("cannot attach a typed table as partition")));
+
+ /*
+ * Table being attached should not already be part of inheritance; either
+ * as a child table...
+ */
+ catalog = heap_open(InheritsRelationId, AccessShareLock);
+ ScanKeyInit(&skey,
+ Anum_pg_inherits_inhrelid,
+ BTEqualStrategyNumber, F_OIDEQ,
+ ObjectIdGetDatum(RelationGetRelid(attachRel)));
+ scan = systable_beginscan(catalog, InheritsRelidSeqnoIndexId, true,
+ NULL, 1, &skey);
+ if (HeapTupleIsValid(systable_getnext(scan)))
+ ereport(ERROR,
+ (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+ errmsg("cannot attach inheritance child as partition")));
+ systable_endscan(scan);
+
+ /* ...or as a parent table (except the case when it is partitioned) */
+ ScanKeyInit(&skey,
+ Anum_pg_inherits_inhparent,
+ BTEqualStrategyNumber, F_OIDEQ,
+ ObjectIdGetDatum(RelationGetRelid(attachRel)));
+ scan = systable_beginscan(catalog, InheritsParentIndexId, true, NULL,
+ 1, &skey);
+ if (HeapTupleIsValid(systable_getnext(scan)) &&
+ attachRel->rd_rel->relkind == RELKIND_RELATION)
+ ereport(ERROR,
+ (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+ errmsg("cannot attach inheritance parent as partition")));
+ systable_endscan(scan);
+ heap_close(catalog, AccessShareLock);
+
+ /*
+ * Prevent circularity by seeing if rel is a partition of attachRel. (In
+ * particular, this disallows making a rel a partition of itself.)
+ */
+ childrels = find_all_inheritors(RelationGetRelid(attachRel),
+ AccessShareLock, NULL);
+ if (list_member_oid(childrels, RelationGetRelid(rel)))
+ ereport(ERROR,
+ (errcode(ERRCODE_DUPLICATE_TABLE),
+ errmsg("circular inheritance not allowed"),
+ errdetail("\"%s\" is already a child of \"%s\".",
+ RelationGetRelationName(rel),
+ RelationGetRelationName(attachRel))));
+
+ /* Temp parent cannot have a partition that is itself not a temp */
+ if (rel->rd_rel->relpersistence == RELPERSISTENCE_TEMP &&
+ attachRel->rd_rel->relpersistence != RELPERSISTENCE_TEMP)
+ ereport(ERROR,
+ (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+ errmsg("cannot attach a permanent relation as partition of temporary relation \"%s\"",
+ RelationGetRelationName(rel))));
+
+ /* If the parent is temp, it must belong to this session */
+ if (rel->rd_rel->relpersistence == RELPERSISTENCE_TEMP &&
+ !rel->rd_islocaltemp)
+ ereport(ERROR,
+ (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+ errmsg("cannot attach as partition of temporary relation of another session")));
+
+ /* Ditto for the partition */
+ if (attachRel->rd_rel->relpersistence == RELPERSISTENCE_TEMP &&
+ !attachRel->rd_islocaltemp)
+ ereport(ERROR,
+ (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+ errmsg("cannot attach temporary relation of another session as partition")));
+
+ /* If parent has OIDs then child must have OIDs */
+ if (rel->rd_rel->relhasoids && !attachRel->rd_rel->relhasoids)
+ ereport(ERROR,
+ (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+ errmsg("cannot attach table \"%s\" without OIDs as partition of"
+ " table \"%s\" with OIDs", RelationGetRelationName(attachRel),
+ RelationGetRelationName(rel))));
+
+ /* OTOH, if parent doesn't have them, do not allow in attachRel either */
+ if (attachRel->rd_rel->relhasoids && !rel->rd_rel->relhasoids)
+ ereport(ERROR,
+ (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+ errmsg("cannot attach table \"%s\" with OIDs as partition of table"
+ " \"%s\" without OIDs", RelationGetRelationName(attachRel),
+ RelationGetRelationName(rel))));
+
+ /* Check if there are any columns in attachRel that aren't in the parent */
+ tupleDesc = RelationGetDescr(attachRel);
+ natts = tupleDesc->natts;
+ for (attno = 1; attno <= natts; attno++)
+ {
+ Form_pg_attribute attribute = tupleDesc->attrs[attno - 1];
+ char *attributeName = NameStr(attribute->attname);
+
+ /* Ignore dropped */
+ if (attribute->attisdropped)
+ continue;
+
+ /* Try to find the column in parent (matching on column name) */
+ if (!SearchSysCacheExists2(ATTNAME,
+ ObjectIdGetDatum(RelationGetRelid(rel)),
+ CStringGetDatum(attributeName)))
+ ereport(ERROR,
+ (errcode(ERRCODE_DATATYPE_MISMATCH),
+ errmsg("table \"%s\" contains column \"%s\" not found in parent \"%s\"",
+ RelationGetRelationName(attachRel), attributeName,
+ RelationGetRelationName(rel)),
+ errdetail("New partition should contain only the columns present in parent.")));
+ }
+
+ /* OK to create inheritance. Rest of the checks performed there */
+ CreateInheritance(attachRel, rel);
+
+ /*
+ * Check that the new partition's bound is valid and does not overlap any
+ * of existing partitions of the parent - note that it does not return on
+ * error.
+ */
+ check_new_partition_bound(RelationGetRelationName(attachRel), rel,
+ cmd->bound);
+
+ /* Update the pg_class entry. */
+ StorePartitionBound(attachRel, rel, cmd->bound);
+
+ /*
+ * Generate partition constraint from the partition bound specification.
+ * If the parent itself is a partition, make sure to include its
+ * constraint as well.
+ */
+ partConstraint = list_concat(get_qual_from_partbound(attachRel, rel,
+ cmd->bound),
+ RelationGetPartitionQual(rel));
+ partConstraint = (List *) eval_const_expressions(NULL,
+ (Node *) partConstraint);
+ partConstraint = (List *) canonicalize_qual((Expr *) partConstraint);
+ partConstraint = list_make1(make_ands_explicit(partConstraint));
+
+ /*
+ * Check if we can do away with having to scan the table being attached to
+ * validate the partition constraint, by *proving* that the existing
+ * constraints of the table *imply* the partition predicate. We include
+ * the table's check constraints and NOT NULL constraints in the list of
+ * clauses passed to predicate_implied_by().
+ *
+ * There is a case in which we cannot rely on just the result of the
+ * proof.
+ */
+ attachRel_constr = tupleDesc->constr;
+ existConstraint = NIL;
+ if (attachRel_constr != NULL)
+ {
+ int num_check = attachRel_constr->num_check;
+ int i;
+ Bitmapset *not_null_attrs = NULL;
+ List *part_constr;
+ ListCell *lc;
+ bool partition_accepts_null = true;
+ int partnatts;
+
+ if (attachRel_constr->has_not_null)
+ {
+ int natts = attachRel->rd_att->natts;
+
+ /* Translate each attnotnull column into an explicit IS NOT NULL
+ * clause for the implication proof below */
+ for (i = 1; i <= natts; i++)
+ {
+ Form_pg_attribute att = attachRel->rd_att->attrs[i - 1];
+
+ if (att->attnotnull && !att->attisdropped)
+ {
+ NullTest *ntest = makeNode(NullTest);
+
+ ntest->arg = (Expr *) makeVar(1,
+ i,
+ att->atttypid,
+ att->atttypmod,
+ att->attcollation,
+ 0);
+ ntest->nulltesttype = IS_NOT_NULL;
+
+ /*
+ * argisrow=false is correct even for a composite column,
+ * because attnotnull does not represent a SQL-spec IS NOT
+ * NULL test in such a case, just IS DISTINCT FROM NULL.
+ */
+ ntest->argisrow = false;
+ ntest->location = -1;
+ existConstraint = lappend(existConstraint, ntest);
+ not_null_attrs = bms_add_member(not_null_attrs, i);
+ }
+ }
+ }
+
+ for (i = 0; i < num_check; i++)
+ {
+ Node *cexpr;
+
+ /*
+ * If this constraint hasn't been fully validated yet, we must
+ * ignore it here.
+ */
+ if (!attachRel_constr->check[i].ccvalid)
+ continue;
+
+ cexpr = stringToNode(attachRel_constr->check[i].ccbin);
+
+ /*
+ * Run each expression through const-simplification and
+ * canonicalization. It is necessary, because we will be
+ * comparing it to similarly-processed qual clauses, and may fail
+ * to detect valid matches without this.
+ */
+ cexpr = eval_const_expressions(NULL, cexpr);
+ cexpr = (Node *) canonicalize_qual((Expr *) cexpr);
+
+ existConstraint = list_concat(existConstraint,
+ make_ands_implicit((Expr *) cexpr));
+ }
+
+ existConstraint = list_make1(make_ands_explicit(existConstraint));
+
+ /* And away we go ... */
+ if (predicate_implied_by(partConstraint, existConstraint))
+ skip_validate = true;
+
+ /*
+ * We choose to err on the safer side, i.e., give up on skipping the
+ * validation scan, if the partition key column doesn't have the NOT
+ * NULL constraint and the table is to become a list partition that
+ * does not accept nulls. In this case, the partition predicate
+ * (partConstraint) does include an 'key IS NOT NULL' expression,
+ * however, because of the way predicate_implied_by_simple_clause() is
+ * designed to handle IS NOT NULL predicates in the absence of a IS
+ * NOT NULL clause, we cannot rely on just the above proof.
+ *
+ * That is not an issue in case of a range partition, because if there
+ * were no NOT NULL constraint defined on the key columns, an error
+ * would be thrown before we get here anyway. That is not true,
+ * however, if any of the partition keys is an expression, which is
+ * handled below.
+ */
+ part_constr = linitial(partConstraint);
+ part_constr = make_ands_implicit((Expr *) part_constr);
+
+ /*
+ * part_constr contains an IS NOT NULL expression, if this is a list
+ * partition that does not accept nulls (in fact, also if this is a
+ * range partition and some partition key is an expression, but we
+ * never skip validation in that case anyway; see below)
+ */
+ foreach(lc, part_constr)
+ {
+ Node *expr = lfirst(lc);
+
+ if (IsA(expr, NullTest) &&
+ ((NullTest *) expr)->nulltesttype == IS_NOT_NULL)
+ {
+ partition_accepts_null = false;
+ break;
+ }
+ }
+
+ partnatts = get_partition_natts(key);
+ for (i = 0; i < partnatts; i++)
+ {
+ AttrNumber partattno;
+
+ partattno = get_partition_col_attnum(key, i);
+
+ /* If partition key is an expression, must not skip validation */
+ if (!partition_accepts_null &&
+ (partattno == 0 ||
+ !bms_is_member(partattno, not_null_attrs)))
+ skip_validate = false;
+ }
+ }
+
+ /* It's safe to skip the validation scan after all */
+ if (skip_validate)
+ ereport(INFO,
+ (errmsg("partition constraint for table \"%s\" is implied by existing constraints",
+ RelationGetRelationName(attachRel))));
+
+ /*
+ * Set up to have the table be scanned to validate the partition
+ * constraint (see partConstraint above). If it's a partitioned table, we
+ * instead schedule its leaf partitions to be scanned.
+ */
+ if (!skip_validate)
+ {
+ List *all_parts;
+ ListCell *lc;
+
+ /* Take an exclusive lock on the partitions to be checked */
+ if (attachRel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
+ all_parts = find_all_inheritors(RelationGetRelid(attachRel),
+ AccessExclusiveLock, NULL);
+ else
+ all_parts = list_make1_oid(RelationGetRelid(attachRel));
+
+ foreach(lc, all_parts)
+ {
+ AlteredTableInfo *tab;
+ Oid part_relid = lfirst_oid(lc);
+ Relation part_rel;
+ Expr *constr;
+
+ /* Lock already taken */
+ if (part_relid != RelationGetRelid(attachRel))
+ part_rel = heap_open(part_relid, NoLock);
+ else
+ part_rel = attachRel;
+
+ /*
+ * Skip if it's a partitioned table. Only RELKIND_RELATION
+ * relations (ie, leaf partitions) need to be scanned.
+ */
+ if (part_rel != attachRel &&
+ part_rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
+ {
+ heap_close(part_rel, NoLock);
+ continue;
+ }
+
+ /* Grab a work queue entry */
+ tab = ATGetQueueEntry(wqueue, part_rel);
+
+ /* Adjust constraint to match this partition */
+ constr = linitial(partConstraint);
+ tab->partition_constraint = (Expr *)
+ map_partition_varattnos((List *) constr, 1,
+ part_rel, rel);
+ /* keep our lock until commit */
+ if (part_rel != attachRel)
+ heap_close(part_rel, NoLock);
+ }
+ }
+
+ ObjectAddressSet(address, RelationRelationId, RelationGetRelid(attachRel));
+
+ /* keep our lock until commit */
+ heap_close(attachRel, NoLock);
+
+ return address;
+ }
+
+ /*
+ * ALTER TABLE DETACH PARTITION
+ *
+ * Return the address of the relation that is no longer a partition of rel.
+ *
+ * rel  - the partitioned parent (locked by the caller)
+ * name - range variable naming the partition to detach
+ *
+ * Removes the inheritance link, clears pg_class.relpartbound and resets
+ * relispartition for the detached relation, then invalidates the parent's
+ * relcache so its partition descriptor is rebuilt.
+ */
+ static ObjectAddress
+ ATExecDetachPartition(Relation rel, RangeVar *name)
+ {
+ Relation partRel,
+ classRel;
+ HeapTuple tuple,
+ newtuple;
+ Datum new_val[Natts_pg_class];
+ bool isnull,
+ new_null[Natts_pg_class],
+ new_repl[Natts_pg_class];
+ ObjectAddress address;
+
+ /*
+ * NOTE(review): only AccessShareLock is taken on the partition while its
+ * pg_class row is rewritten below — confirm this matches upstream's
+ * locking expectations for DETACH PARTITION.
+ */
+ partRel = heap_openrv(name, AccessShareLock);
+
+ /* All inheritance related checks are performed within the function */
+ RemoveInheritance(partRel, rel);
+
+ /* Update pg_class tuple */
+ classRel = heap_open(RelationRelationId, RowExclusiveLock);
+ tuple = SearchSysCacheCopy1(RELOID,
+ ObjectIdGetDatum(RelationGetRelid(partRel)));
+ Assert(((Form_pg_class) GETSTRUCT(tuple))->relispartition);
+
+ /* Fetch relpartbound only to assert it is currently set */
+ (void) SysCacheGetAttr(RELOID, tuple, Anum_pg_class_relpartbound,
+ &isnull);
+ Assert(!isnull);
+
+ /* Clear relpartbound and reset relispartition */
+ memset(new_val, 0, sizeof(new_val));
+ memset(new_null, false, sizeof(new_null));
+ memset(new_repl, false, sizeof(new_repl));
+ new_val[Anum_pg_class_relpartbound - 1] = (Datum) 0;
+ new_null[Anum_pg_class_relpartbound - 1] = true;
+ new_repl[Anum_pg_class_relpartbound - 1] = true;
+ newtuple = heap_modify_tuple(tuple, RelationGetDescr(classRel),
+ new_val, new_null, new_repl);
+
+ ((Form_pg_class) GETSTRUCT(newtuple))->relispartition = false;
+ CatalogTupleUpdate(classRel, &newtuple->t_self, newtuple);
+ heap_freetuple(newtuple);
+ heap_close(classRel, RowExclusiveLock);
+
+ /*
+ * Invalidate the parent's relcache so that the partition is no longer
+ * included in its partition descriptor.
+ */
+ CacheInvalidateRelcache(rel);
+
+ ObjectAddressSet(address, RelationRelationId, RelationGetRelid(partRel));
+
+ /* keep our lock until commit */
+ heap_close(partRel, NoLock);
+
+ return address;
+ }
#include "utils/memutils.h"
#include "utils/rel.h"
#include "utils/tqual.h"
+#ifdef PGXC
+#include "pgxc/execRemote.h"
+#include "pgxc/nodemgr.h"
+#include "pgxc/pgxc.h"
+#endif
+ #include "utils/varlena.h"
/* GUC variables */
* trigger.c
* PostgreSQL TRIGGERs support code.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
fkcon->skip_validation = false;
fkcon->initially_valid = true;
+ /* finally, wrap it in a dummy PlannedStmt */
+ wrapper->commandType = CMD_UTILITY;
+ wrapper->canSetTag = false;
+ wrapper->utilityStmt = (Node *) atstmt;
+ wrapper->stmt_location = -1;
+ wrapper->stmt_len = -1;
+
/* ... and execute it */
- ProcessUtility((Node *) atstmt,
+ ProcessUtility(wrapper,
"(generated ALTER TABLE ADD FOREIGN KEY command)",
- PROCESS_UTILITY_SUBCOMMAND, NULL,
+ PROCESS_UTILITY_SUBCOMMAND, NULL, NULL,
- None_Receiver, NULL);
+ None_Receiver,
- #ifdef PGXC
+ false,
- #endif /* PGXC */
+ NULL);
/* Remove the matched item from the list */
info_list = list_delete_ptr(info_list, info);
* in cluster.c.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
*
* IDENTIFICATION
* Routines for handling specialized SET variables.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* view.c
* use rewrite rules to construct views
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
top_builddir = ../../..
include $(top_builddir)/src/Makefile.global
- OBJS = execAmi.o execCurrent.o execGrouping.o execIndexing.o execJunk.o \
- execMain.o execParallel.o execProcnode.o execQual.o \
- execScan.o execTuples.o \
+ OBJS = execAmi.o execCurrent.o execExpr.o execExprInterp.o \
+ execGrouping.o execIndexing.o execJunk.o \
+ execMain.o execParallel.o execProcnode.o \
+ execReplication.o execScan.o execSRF.o execTuples.o \
execUtils.o functions.o instrument.o nodeAppend.o nodeAgg.o \
nodeBitmapAnd.o nodeBitmapOr.o \
- nodeBitmapHeapscan.o nodeBitmapIndexscan.o nodeCustom.o nodeGather.o \
+ nodeBitmapHeapscan.o nodeBitmapIndexscan.o \
+ nodeCustom.o nodeFunctionscan.o nodeGather.o \
nodeHash.o nodeHashjoin.o nodeIndexscan.o nodeIndexonlyscan.o \
- nodeLimit.o nodeLockRows.o \
+ nodeLimit.o nodeLockRows.o nodeGatherMerge.o \
nodeMaterial.o nodeMergeAppend.o nodeMergejoin.o nodeModifyTable.o \
- nodeNestloop.o nodeFunctionscan.o nodeRecursiveunion.o nodeResult.o \
+ nodeNestloop.o nodeProjectSet.o nodeRecursiveunion.o nodeResult.o \
nodeSamplescan.o nodeSeqscan.o nodeSetOp.o nodeSort.o nodeUnique.o \
- nodeValuesscan.o nodeCtescan.o nodeWorktablescan.o \
+ nodeValuesscan.o \
+ nodeCtescan.o nodeNamedtuplestorescan.o nodeWorktablescan.o \
nodeGroup.o nodeSubplan.o nodeSubqueryscan.o nodeTidscan.o \
- nodeForeignscan.o nodeWindowAgg.o producerReceiver.o tstoreReceiver.o tqueue.o spi.o
- nodeForeignscan.o nodeWindowAgg.o tstoreReceiver.o tqueue.o spi.o \
++ nodeForeignscan.o nodeWindowAgg.o producerReceiver.o tstoreReceiver.o tqueue.o spi.o \
+ nodeTableFuncscan.o
include $(top_srcdir)/src/backend/common.mk
* execAmi.c
* miscellaneous executor access method routines
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/backend/executor/execAmi.c
#include "nodes/relation.h"
#include "utils/rel.h"
#include "utils/syscache.h"
+#ifdef PGXC
+#include "pgxc/execRemote.h"
+#endif
- static bool TargetListSupportsBackwardScan(List *targetlist);
static bool IndexSupportsBackwardScan(Oid indexid);
* execCurrent.c
* executor support for WHERE CURRENT OF cursor
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/backend/executor/execCurrent.c
* before ExecutorEnd. This can be omitted only in case of EXPLAIN,
* which should also omit ExecutorRun.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
#include "utils/lsyscache.h"
#include "utils/memutils.h"
#include "utils/rls.h"
+ #include "utils/ruleutils.h"
#include "utils/snapmgr.h"
#include "utils/tqual.h"
-
+#ifdef PGXC
+#include "pgxc/pgxc.h"
+#include "commands/copy.h"
+#endif
+#ifdef XCP
+#include "access/gtm.h"
+#include "pgxc/execRemote.h"
+#include "pgxc/poolmgr.h"
+#endif
/* Hooks for plugins to get control in ExecutorStart/Run/Finish/End */
ExecutorStart_hook_type ExecutorStart_hook = NULL;
estate->es_param_list_info = queryDesc->params;
if (queryDesc->plannedstmt->nParamExec > 0)
+#ifdef XCP
+ {
+ estate->es_param_exec_vals = (ParamExecData *)
+ palloc0(queryDesc->plannedstmt->nParamExec * sizeof(ParamExecData));
+ if (queryDesc->plannedstmt->nParamRemote > 0)
+ {
+ ParamListInfo extparams = estate->es_param_list_info;
+ int i = queryDesc->plannedstmt->nParamRemote;
+ while (--i >= 0 &&
+ queryDesc->plannedstmt->remoteparams[i].paramkind == PARAM_EXEC)
+ {
+ int paramno = queryDesc->plannedstmt->remoteparams[i].paramid;
+ ParamExecData *prmdata;
+
+ Assert(paramno >= 0 &&
+ paramno < queryDesc->plannedstmt->nParamExec);
+ prmdata = &(estate->es_param_exec_vals[paramno]);
+ prmdata->value = extparams->params[i].value;
+ prmdata->isnull = extparams->params[i].isnull;
+ prmdata->ptype = extparams->params[i].ptype;
+ prmdata->done = true;
+ }
+ /*
+ * Truncate exec parameters from the list of received parameters
+ * to avoid sending down duplicates if there are multiple levels
+ * of RemoteSubplan statements
+ */
+ extparams->numParams = i + 1;
+ }
+ }
+#else
estate->es_param_exec_vals = (ParamExecData *)
palloc0(queryDesc->plannedstmt->nParamExec * sizeof(ParamExecData));
+#endif
+ estate->es_sourceText = queryDesc->sourceText;
+
+ /*
+ * Fill in the query environment, if any, from queryDesc.
+ */
+ estate->es_queryEnv = queryDesc->queryEnv;
+
/*
* If non-read-only query, set the command ID to mark output tuples with
*/
* ExecProcNode, or ExecEndNode on its subnodes and do the appropriate
* processing.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* This information is needed by routines manipulating tuples
* (getattribute, formtuple, etc.).
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* execUtils.c
* miscellaneous executor utility routines
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
ExecStatus status;
bool setsResult; /* true if this query produces func's result */
bool lazyEval; /* true if should fetch one row at a time */
- Node *stmt; /* PlannedStmt or utility statement */
+ PlannedStmt *stmt; /* plan for this query */
QueryDesc *qd; /* null unless status == RUN */
+ char *src; /* source query resulting in this state */
} execution_state;
{
List *eslist = NIL;
execution_state *lasttages = NULL;
- ListCell *lc1;
+ ListCell *lc1, *lc3;
- foreach(lc1, queryTree_list)
+ forboth(lc1, queryTree_list, lc3, querySource_list)
{
- List *qtlist = (List *) lfirst(lc1);
+ List *qtlist = lfirst_node(List, lc1);
+ char *querysource = (char *) lfirst(lc3);
execution_state *firstes = NULL;
execution_state *preves = NULL;
- ListCell *lc2;
+ ListCell *lc2, *lc4;
foreach(lc2, qtlist)
{
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
/* translator: %s is a SQL statement name */
errmsg("%s is not allowed in a non-volatile function",
- CreateCommandTag(stmt))));
+ CreateCommandTag((Node *) stmt))));
+#ifdef PGXC
+ if (IS_PGXC_LOCAL_COORDINATOR)
+ {
+ if (queryTree->commandType != CMD_UTILITY)
+ {
+ /*
+ * The parameterised queries in RemoteQuery nodes will be prepared
+ * on the Datanode, and need parameter types for the same. Set the
+ * parameter types and their number in all RemoteQuery nodes in the
+ * plan
+ */
+ SetRemoteStatementName(((PlannedStmt *)stmt)->planTree, NULL,
+ fcache->pinfo->nargs,
+ fcache->pinfo->argtypes, 0);
+ }
+ }
+#endif /* PGXC */
+
if (IsInParallelMode() && !CommandIsReadOnly(stmt))
- PreventCommandIfParallelMode(CreateCommandTag(stmt));
+ PreventCommandIfParallelMode(CreateCommandTag((Node *) stmt));
/* OK, build the execution_state for this query */
newes = (execution_state *) palloc(sizeof(execution_state));
queryTree_list = NIL;
flat_query_list = NIL;
- foreach(lc, raw_parsetree_list)
+ forboth(lc, raw_parsetree_list, lc2, querysource_list)
{
- Node *parsetree = (Node *) lfirst(lc);
+ RawStmt *parsetree = lfirst_node(RawStmt, lc);
+ char *querysource = (char *) lfirst(lc2);
List *queryTree_sublist;
queryTree_sublist = pg_analyze_and_rewrite_params(parsetree,
- fcache->src,
+ querysource,
(ParserSetupHook) sql_fn_parser_setup,
- fcache->pinfo);
+ fcache->pinfo,
+ NULL);
queryTree_list = lappend(queryTree_list, queryTree_sublist);
flat_query_list = list_concat(flat_query_list,
list_copy(queryTree_sublist));
else
dest = None_Receiver;
- if (IsA(es->stmt, PlannedStmt))
- es->qd = CreateQueryDesc((PlannedStmt *) es->stmt,
- es->src,
- GetActiveSnapshot(),
- InvalidSnapshot,
- dest,
- fcache->paramLI, 0);
- else
- es->qd = CreateUtilityQueryDesc(es->stmt,
- es->src,
- GetActiveSnapshot(),
- dest,
- fcache->paramLI);
+ es->qd = CreateQueryDesc(es->stmt,
- fcache->src,
++ es->src,
+ GetActiveSnapshot(),
+ InvalidSnapshot,
+ dest,
+ fcache->paramLI,
+ es->qd ? es->qd->queryEnv : NULL,
+ 0);
/* Utility commands don't need Executor. */
- if (es->qd->utilitystmt == NULL)
+ if (es->qd->operation != CMD_UTILITY)
{
/*
* In lazyEval mode, do not let the executor set up an AfterTrigger
{
bool result;
- if (es->qd->utilitystmt)
+ if (es->qd->operation == CMD_UTILITY)
{
- /* ProcessUtility needs the PlannedStmt for DECLARE CURSOR */
- ProcessUtility((es->qd->plannedstmt ?
- (Node *) es->qd->plannedstmt :
- es->qd->utilitystmt),
+ ProcessUtility(es->qd->plannedstmt,
- fcache->src,
+ es->src,
PROCESS_UTILITY_QUERY,
es->qd->params,
+ es->qd->queryEnv,
es->qd->dest,
+#ifdef PGXC
+ false,
+#endif /* PGXC */
NULL);
result = true; /* never stops early */
}
* sensitive to the grouping set for which the aggregate function is
* currently being called.
*
- * TODO: AGG_HASHED doesn't support multiple grouping sets yet.
+ * Plan structure:
+ *
+ * What we get from the planner is actually one "real" Agg node which is
+ * part of the plan tree proper, but which optionally has an additional list
+ * of Agg nodes hung off the side via the "chain" field. This is because an
+ * Agg node happens to be a convenient representation of all the data we
+ * need for grouping sets.
+ *
+ * For many purposes, we treat the "real" node as if it were just the first
+ * node in the chain. The chain must be ordered such that hashed entries
+ * come before sorted/plain entries; the real node is marked AGG_MIXED if
+ * there are both types present (in which case the real node describes one
+ * of the hashed groupings, other AGG_HASHED nodes may optionally follow in
+ * the chain, followed in turn by AGG_SORTED or (one) AGG_PLAIN node). If
+ * the real node is marked AGG_HASHED or AGG_SORTED, then all the chained
+ * nodes must be of the same type; if it is AGG_PLAIN, there can be no
+ * chained nodes.
+ *
+ * We collect all hashed nodes into a single "phase", numbered 0, and create
+ * a sorted phase (numbered 1..n) for each AGG_SORTED or AGG_PLAIN node.
+ * Phase 0 is allocated even if there are no hashes, but remains unused in
+ * that case.
+ *
+ * AGG_HASHED nodes actually refer to only a single grouping set each,
+ * because for each hashed grouping we need a separate grpColIdx and
+ * numGroups estimate. AGG_SORTED nodes represent a "rollup", a list of
+ * grouping sets that share a sort order. Each AGG_SORTED node other than
+ * the first one has an associated Sort node which describes the sort order
+ * to be used; the first sorted node takes its input from the outer subtree,
+ * which the planner has already arranged to provide ordered data.
+ *
+ * Memory and ExprContext usage:
+ *
+ * Because we're accumulating aggregate values across input rows, we need to
+ * use more memory contexts than just simple input/output tuple contexts.
+ * In fact, for a rollup, we need a separate context for each grouping set
+ * so that we can reset the inner (finer-grained) aggregates on their group
+ * boundaries while continuing to accumulate values for outer
+ * (coarser-grained) groupings. On top of this, we might be simultaneously
+ * populating hashtables; however, we only need one context for all the
+ * hashtables.
+ *
+ * So we create an array, aggcontexts, with an ExprContext for each grouping
+ * set in the largest rollup that we're going to process, and use the
+ * per-tuple memory context of those ExprContexts to store the aggregate
+ * transition values. hashcontext is the single context created to support
+ * all hash tables.
+ *
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* nodeModifyTable.c
* routines to handle ModifyTable nodes.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* get information on the (current) result relation
*/
resultRelInfo = estate->es_result_relation_info;
+
+ /* Determine the partition to heap_insert the tuple into */
+ if (mtstate->mt_partition_dispatch_info)
+ {
+ int leaf_part_index;
+ TupleConversionMap *map;
+
+ /*
+ * Away we go ... If we end up not finding a partition after all,
+ * ExecFindPartition() does not return and errors out instead.
+ * Otherwise, the returned value is to be used as an index into arrays
+ * mt_partitions[] and mt_partition_tupconv_maps[] that will get us
+ * the ResultRelInfo and TupleConversionMap for the partition,
+ * respectively.
+ */
+ leaf_part_index = ExecFindPartition(resultRelInfo,
+ mtstate->mt_partition_dispatch_info,
+ slot,
+ estate);
+ Assert(leaf_part_index >= 0 &&
+ leaf_part_index < mtstate->mt_num_partitions);
+
+ /*
+ * Save the old ResultRelInfo and switch to the one corresponding to
+ * the selected partition.
+ */
+ saved_resultRelInfo = resultRelInfo;
+ resultRelInfo = mtstate->mt_partitions + leaf_part_index;
+
+ /* We do not yet have a way to insert into a foreign partition */
+ if (resultRelInfo->ri_FdwRoutine)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("cannot route inserted tuples to a foreign table")));
+
+ /* For ExecInsertIndexTuples() to work on the partition's indexes */
+ estate->es_result_relation_info = resultRelInfo;
+
+ /*
+ * We might need to convert from the parent rowtype to the partition
+ * rowtype.
+ */
+ map = mtstate->mt_partition_tupconv_maps[leaf_part_index];
+ if (map)
+ {
+ Relation partrel = resultRelInfo->ri_RelationDesc;
+
+ tuple = do_convert_tuple(tuple, map);
+
+ /*
+ * We must use the partition's tuple descriptor from this point
+ * on, until we're finished dealing with the partition. Use the
+ * dedicated slot for that.
+ */
+ slot = mtstate->mt_partition_tuple_slot;
+ Assert(slot != NULL);
+ ExecSetSlotDescriptor(slot, RelationGetDescr(partrel));
+ ExecStoreTuple(tuple, slot, InvalidBuffer, true);
+ }
+ }
+
resultRelationDesc = resultRelInfo->ri_RelationDesc;
-
/*
* If the result relation has OIDs, force the tuple's OID to zero so that
* heap_insert will assign a fresh OID. Usually the OID already will be
prm->value = ExecEvalExprSwitchContext((ExprState *) lfirst(pvar),
econtext,
- &(prm->isnull),
- NULL);
+ &(prm->isnull));
+ prm->done = true;
planstate->chgParam = bms_add_member(planstate->chgParam, paramid);
}
/* ... and to its parent's state */
sstate->parent = parent;
+#ifdef XCP
+ /* subplan is referenced on local node, finish initialization */
+ ExecFinishInitProcNode(sstate->planstate);
+#endif
+
/* Initialize subexpressions */
sstate->testexpr = ExecInitExpr((Expr *) subplan->testexpr, parent);
- sstate->args = (List *) ExecInitExpr((Expr *) subplan->args, parent);
+ sstate->args = ExecInitExprList(subplan->args, parent);
/*
* initialize my state
prm->value = ExecEvalExprSwitchContext((ExprState *) lfirst(pvar),
econtext,
- &(prm->isnull),
- NULL);
+ &(prm->isnull));
+ prm->done = true;
planstate->chgParam = bms_add_member(planstate->chgParam, paramid);
}
* aggregate function over all rows in the current row's window frame.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
}
- /* Pushes SPI stack to allow recursive SPI calls */
- void
- SPI_push(void)
- {
- _SPI_curid++;
- }
-
- /* Pops SPI stack to allow recursive SPI calls */
- void
- SPI_pop(void)
- {
- _SPI_curid--;
- }
-
- /* Conditional push: push only if we're inside a SPI procedure */
- bool
- SPI_push_conditional(void)
- {
- bool pushed = (_SPI_curid != _SPI_connected);
-
- if (pushed)
- {
- _SPI_curid++;
- /* We should now be in a state where SPI_connect would succeed */
- Assert(_SPI_curid == _SPI_connected);
- }
- return pushed;
- }
-
- /* Conditional pop: pop only if SPI_push_conditional pushed */
- void
- SPI_pop_conditional(bool pushed)
- {
- /* We should be in a state where SPI_connect would succeed */
- Assert(_SPI_curid == _SPI_connected);
- if (pushed)
- _SPI_curid--;
- }
-
- /* Restore state of SPI stack after aborting a subtransaction */
- void
- SPI_restore_connection(void)
- {
- Assert(_SPI_connected >= 0);
- _SPI_curid = _SPI_connected - 1;
- }
-
+#ifdef PGXC
+/* SPI_execute_direct:
+ * Runs the 'remote_sql' query string on the node 'nodename'.
+ * Builds an ExecDirectStmt parse tree node from remote_sql, then prepares
+ * and executes it using the SPI interface.
+ * This function is meant for internal exec-direct operations and should not
+ * require superuser privileges. We cannot simply run an EXEC DIRECT query
+ * string because that command is restricted to superusers, so this function
+ * bypasses the parse stage by calling _SPI_pgxc_prepare_plan, which accepts
+ * a ready-made parse tree.
+ *
+ * Returns an SPI result code (SPI_OK_* on success, negative on error).
+ */
+int
+SPI_execute_direct(const char *remote_sql, char *nodename)
+{
+ _SPI_plan plan;
+ int res;
+ ExecDirectStmt *stmt = makeNode(ExecDirectStmt);
+ StringInfoData execdirect;
+
+ initStringInfo(&execdirect);
+
+ /* This string is never used. It is just passed to fill up spierrcontext.arg */
+ appendStringInfo(&execdirect, "EXECUTE DIRECT ON (%s) '%s'",
+ nodename, remote_sql);
+
+ stmt->node_names = list_make1(makeString(nodename));
+
+ /*
+ * Copy the query with pstrdup() (palloc) rather than malloc-based
+ * strdup(): the rest of the parse tree is palloc'd in the current memory
+ * context, and a strdup'd string would be leaked permanently because
+ * nothing ever free()s it.
+ */
+ stmt->query = pstrdup(remote_sql);
+
+ res = _SPI_begin_call(true);
+ if (res < 0)
+ return res;
+
+ memset(&plan, 0, sizeof(_SPI_plan));
+ plan.magic = _SPI_PLAN_MAGIC;
+ plan.cursor_options = 0;
+
+ /* Now pass the ExecDirectStmt parsetree node */
+ _SPI_pgxc_prepare_plan(execdirect.data, list_make1(stmt),
+ list_make1(execdirect.data), &plan);
+
+ res = _SPI_execute_plan(&plan, NULL,
+ InvalidSnapshot, InvalidSnapshot, false, true, 0);
+
+ _SPI_end_call(true);
+ return res;
+}
+#endif
+
/* Parse, plan, and execute a query string */
int
SPI_execute(const char *src, bool read_only, long tcount)
*/
plancache_list = NIL;
- foreach(list_item, raw_parsetree_list)
+ forboth(list_item, raw_parsetree_list, list_item2, querysource_list)
{
- Node *parsetree = (Node *) lfirst(list_item);
+ RawStmt *parsetree = lfirst_node(RawStmt, list_item);
+ char *querysource = (char *) lfirst (list_item2);
List *stmt_list;
CachedPlanSource *plansource;
* needs to see the unmodified raw parse tree.
*/
plansource = CreateCachedPlan(parsetree,
- src,
+ querysource,
+#ifdef PGXC
+ NULL,
+#endif
- CreateCommandTag(parsetree));
+ CreateCommandTag(parsetree->stmt));
/*
* Parameter datatypes are driven by parserSetup hook if provided,
{
Assert(plan->nargs == 0);
stmt_list = pg_analyze_and_rewrite_params(parsetree,
- src,
+ querysource,
plan->parserSetup,
- plan->parserSetupArg);
+ plan->parserSetupArg,
+ _SPI_current->queryEnv);
}
else
{
stmt_list = pg_analyze_and_rewrite(parsetree,
- src,
+ querysource,
plan->argtypes,
- plan->nargs);
+ plan->nargs,
+ _SPI_current->queryEnv);
}
/* Finish filling in the CachedPlanSource */
*/
plancache_list = NIL;
- foreach(list_item, raw_parsetree_list)
+ forboth(list_item, raw_parsetree_list, list_item2, querysource_list)
{
- Node *parsetree = (Node *) lfirst(list_item);
+ RawStmt *parsetree = lfirst_node(RawStmt, list_item);
+ char *querysource = (char *) lfirst (list_item2);
CachedPlanSource *plansource;
++
plansource = CreateOneShotCachedPlan(parsetree,
- src,
+ querysource,
- CreateCommandTag(parsetree));
+ CreateCommandTag(parsetree->stmt));
plancache_list = lappend(plancache_list, plansource);
}
plansource->query_string,
PROCESS_UTILITY_QUERY,
paramLI,
+ _SPI_current->queryEnv,
dest,
+#ifdef PGXC
+ false,
+#endif /* PGXC */
completionTag);
/* Update "processed" if stmt returned tuples */
int bytestowrite;
int totalwritten;
-
- bytestowrite = VARSIZE(wbuf) - VARHDRSZ;
- totalwritten = lo_write(fd, VARDATA(wbuf), bytestowrite);
+#ifdef PGXC
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("Postgres-XL does not yet support large objects"),
+ errdetail("The feature is not currently supported")));
+#endif
+ bytestowrite = VARSIZE_ANY_EXHDR(wbuf);
+ totalwritten = lo_write(fd, VARDATA_ANY(wbuf), bytestowrite);
PG_RETURN_INT32(totalwritten);
}
* be handled easily in a simple depth-first traversal.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
* IDENTIFICATION
* src/backend/nodes/copyfuncs.c
COPY_NODE_FIELD(relationOids);
COPY_NODE_FIELD(invalItems);
COPY_SCALAR_FIELD(nParamExec);
+#ifdef XCP
+ COPY_SCALAR_FIELD(nParamRemote);
+ COPY_POINTER_FIELD(remoteparams,
+ newnode->nParamRemote * sizeof(RemoteParam));
+ COPY_STRING_FIELD(pname);
+ COPY_SCALAR_FIELD(distributionType);
+ COPY_SCALAR_FIELD(distributionKey);
+ COPY_NODE_FIELD(distributionNodes);
+ COPY_NODE_FIELD(distributionRestrict);
+#endif
+ COPY_NODE_FIELD(utilityStmt);
+ COPY_LOCATION_FIELD(stmt_location);
+ COPY_LOCATION_FIELD(stmt_len);
return newnode;
}
return newnode;
}
+/* ****************************************************************
+ * poolutils.h copy functions
+ * ****************************************************************
+ */
+
+/*
+ * _copyCleanConnStmt
+ * Deep-copy a CleanConnStmt node (CLEAN CONNECTION statement) using the
+ * standard COPY_* field macros; every field of the struct is copied.
+ */
+static CleanConnStmt *
+_copyCleanConnStmt(const CleanConnStmt *from)
+{
+ CleanConnStmt *newnode = makeNode(CleanConnStmt);
+
+ COPY_NODE_FIELD(nodes);
+ COPY_STRING_FIELD(dbname);
+ COPY_STRING_FIELD(username);
+ COPY_SCALAR_FIELD(is_coord);
+ COPY_SCALAR_FIELD(is_force);
+
+ return newnode;
+}
+#endif
/*
- * copyObject
+ * copyObjectImpl -- implementation of copyObject(); see nodes/nodes.h
*
* Create a copy of a Node tree or list. This is a "deep" copy: all
* substructure is copied too, recursively.
* "x" to be considered equal() to another reference to "x" in the query.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
* IDENTIFICATION
* src/backend/nodes/equalfuncs.c
/* we don't bother to copy eref, aliases, etc; OK? */
break;
case RTE_CTE:
+#ifdef PGXC
+ case RTE_REMOTE_DUMMY:
+#endif /* PGXC */
+ case RTE_NAMEDTUPLESTORE:
/* nothing to do */
break;
case RTE_SUBQUERY:
* outfuncs.c
* Output functions for Postgres tree nodes.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
/* Write a bitmapset field */
#define WRITE_BITMAPSET_FIELD(fldname) \
(appendStringInfo(str, " :" CppAsString(fldname) " "), \
- _outBitmapset(str, node->fldname))
+ outBitmapset(str, node->fldname))
- (_outToken(str, OidIsValid((relid)) ? NSP_NAME(get_rel_namespace((relid))) : NULL), \
+#ifdef XCP
+#define NSP_NAME(oid) \
+ isTempNamespace(oid) ? "pg_temp" : get_namespace_name(oid)
+/*
+ * Macros to encode OIDs for sending to other nodes. Objects on other nodes
+ * may have different OIDs, so instead send a unique identifier that allows
+ * the OID to be looked up on the target node. The identifier depends on the
+ */
+
+#define WRITE_RELID_INTERNAL(relid) \
- _outToken(str, OidIsValid((relid)) ? get_rel_name((relid)) : NULL))
++ (outToken(str, OidIsValid((relid)) ? NSP_NAME(get_rel_namespace((relid))) : NULL), \
+ appendStringInfoChar(str, ' '), \
- _outToken(str, OidIsValid(node->fldname) ? NSP_NAME(get_typ_namespace(node->fldname)) : NULL), \
++ outToken(str, OidIsValid((relid)) ? get_rel_name((relid)) : NULL))
+
+/* write an OID which is a relation OID */
+#define WRITE_RELID_FIELD(fldname) \
+ (appendStringInfo(str, " :" CppAsString(fldname) " "), \
+ WRITE_RELID_INTERNAL(node->fldname))
+
+#define WRITE_RELID_LIST_FIELD(fldname) \
+ do { \
+ ListCell *lc; \
+ char *sep = ""; \
+ appendStringInfo(str, " :" CppAsString(fldname) " "); \
+ if (node->fldname == NIL || list_length(node->fldname) == 0) \
+ appendStringInfoString(str, "<>"); \
+ else \
+ { \
+ appendStringInfoChar(str, '('); \
+ foreach (lc, node->fldname) \
+ { \
+ Oid relid = lfirst_oid(lc); \
+ appendStringInfoString(str, sep); \
+ WRITE_RELID_INTERNAL(relid); \
+ sep = ","; \
+ } \
+ appendStringInfoChar(str, ')'); \
+ } \
+ } while (0)
+
+/* write an OID which is a data type OID */
+#define WRITE_TYPID_FIELD(fldname) \
+ (appendStringInfo(str, " :" CppAsString(fldname) " "), \
- _outToken(str, OidIsValid(node->fldname) ? get_typ_name(node->fldname) : NULL))
++ outToken(str, OidIsValid(node->fldname) ? NSP_NAME(get_typ_namespace(node->fldname)) : NULL), \
+ appendStringInfoChar(str, ' '), \
- _outToken(str, NSP_NAME(get_func_namespace(node->fldname))); \
++ outToken(str, OidIsValid(node->fldname) ? get_typ_name(node->fldname) : NULL))
+
+/* write an OID which is a function OID */
+#define WRITE_FUNCID_FIELD(fldname) \
+ do { \
+ appendStringInfo(str, " :" CppAsString(fldname) " "); \
+ if (OidIsValid(node->fldname)) \
+ { \
+ Oid *argtypes; \
+ int i, nargs; \
- _outToken(str, get_func_name(node->fldname)); \
++ outToken(str, NSP_NAME(get_func_namespace(node->fldname))); \
+ appendStringInfoChar(str, ' '); \
- _outToken(str, NSP_NAME(get_typ_namespace(argtypes[i]))); \
++ outToken(str, get_func_name(node->fldname)); \
+ appendStringInfoChar(str, ' '); \
+ get_func_signature(node->fldname, &argtypes, &nargs); \
+ appendStringInfo(str, "%d", nargs); \
+ for (i = 0; i < nargs; i++) \
+ { \
+ appendStringInfoChar(str, ' '); \
- _outToken(str, get_typ_name(argtypes[i])); \
++ outToken(str, NSP_NAME(get_typ_namespace(argtypes[i]))); \
+ appendStringInfoChar(str, ' '); \
- _outToken(str, NSP_NAME(get_opnamespace(node->fldname))); \
++ outToken(str, get_typ_name(argtypes[i])); \
+ } \
+ } \
+ else \
+ appendStringInfo(str, "<> <> 0"); \
+ } while (0)
+
+/* write an OID which is an operator OID */
+#define WRITE_OPERID_FIELD(fldname) \
+ do { \
+ appendStringInfo(str, " :" CppAsString(fldname) " "); \
+ if (OidIsValid(node->fldname)) \
+ { \
+ Oid oprleft, oprright; \
- _outToken(str, get_opname(node->fldname)); \
++ outToken(str, NSP_NAME(get_opnamespace(node->fldname))); \
+ appendStringInfoChar(str, ' '); \
- _outToken(str, OidIsValid(oprleft) ? \
++ outToken(str, get_opname(node->fldname)); \
+ appendStringInfoChar(str, ' '); \
+ op_input_types(node->fldname, &oprleft, &oprright); \
- _outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL); \
++ outToken(str, OidIsValid(oprleft) ? \
+ NSP_NAME(get_typ_namespace(oprleft)) : NULL); \
+ appendStringInfoChar(str, ' '); \
- _outToken(str, OidIsValid(oprright) ? \
++ outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL); \
+ appendStringInfoChar(str, ' '); \
- _outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL); \
++ outToken(str, OidIsValid(oprright) ? \
+ NSP_NAME(get_typ_namespace(oprright)) : NULL); \
+ appendStringInfoChar(str, ' '); \
- _outToken(str, NSP_NAME(get_collation_namespace(node->fldname))); \
++ outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL); \
+ appendStringInfoChar(str, ' '); \
+ } \
+ else \
+ appendStringInfo(str, "<> <> <> <> <> <>"); \
+ } while (0)
+
+/* write an OID which is a collation OID */
+#define WRITE_COLLID_FIELD(fldname) \
+ do { \
+ appendStringInfo(str, " :" CppAsString(fldname) " "); \
+ if (OidIsValid(node->fldname)) \
+ { \
- _outToken(str, get_collation_name(node->fldname)); \
++ outToken(str, NSP_NAME(get_collation_namespace(node->fldname))); \
+ appendStringInfoChar(str, ' '); \
++ outToken(str, get_collation_name(node->fldname)); \
+ appendStringInfo(str, " %d", get_collation_encoding(node->fldname)); \
+ } \
+ else \
+ appendStringInfo(str, "<> <> -1"); \
+ } while (0)
+
+
+#endif
#define booltostr(x) ((x) ? "true" : "false")
}
- _outToken(str, textvalue);
+#ifdef XCP
+/*
+ * Output value in text format
+ */
+static void
+_printDatum(StringInfo str, Datum value, Oid typid)
+{
+ Oid typOutput;
+ bool typIsVarlena;
+ FmgrInfo finfo;
+ Datum tmpval;
+ char *textvalue;
+ int saveDateStyle;
+
+ /* Get output function for the type */
+ getTypeOutputInfo(typid, &typOutput, &typIsVarlena);
+ fmgr_info(typOutput, &finfo);
+
+ /* Detoast value if needed */
+ if (typIsVarlena)
+ tmpval = PointerGetDatum(PG_DETOAST_DATUM(value));
+ else
+ tmpval = value;
+
+ /*
+ * It was found that if configuration setting for date style is
+ * "postgres,ymd", the output dates have format DD-MM-YYYY and they cannot
+ * be parsed correctly by the receiving party. So force ISO format YYYY-MM-DD
+ * in internal cluster communications, these values are always parsed
+ * correctly.
+ */
+ saveDateStyle = DateStyle;
+ DateStyle = USE_ISO_DATES;
+
+ textvalue = DatumGetCString(FunctionCall1(&finfo, tmpval));
++ outToken(str, textvalue);
+
+ DateStyle = saveDateStyle;
+}
+#endif
+
+
/*
* Stuff from plannodes.h
*/
appendStringInfoString(str, " :sortOperators");
for (i = 0; i < node->numCols; i++)
- _outToken(str, NSP_NAME(get_opnamespace(oper)));
+#ifdef XCP
+ if (portable_output)
+ {
+ Oid oper = node->sortOperators[i];
+ Oid oprleft, oprright;
+ /* Sort operator is always valid */
+ Assert(OidIsValid(oper));
+ appendStringInfoChar(str, ' ');
- _outToken(str, get_opname(oper));
++ outToken(str, NSP_NAME(get_opnamespace(oper)));
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprleft) ?
++ outToken(str, get_opname(oper));
+ appendStringInfoChar(str, ' ');
+ op_input_types(oper, &oprleft, &oprright);
- _outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
++ outToken(str, OidIsValid(oprleft) ?
+ NSP_NAME(get_typ_namespace(oprleft)) : NULL);
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprright) ?
++ outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
++ outToken(str, OidIsValid(oprright) ?
+ NSP_NAME(get_typ_namespace(oprright)) : NULL);
+ appendStringInfoChar(str, ' ');
++ outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
+ }
+ else
+#endif
appendStringInfo(str, " %u", node->sortOperators[i]);
appendStringInfoString(str, " :collations");
for (i = 0; i < node->numCols; i++)
- _outToken(str, NSP_NAME(get_collation_namespace(coll)));
+#ifdef XCP
+ if (portable_output)
+ {
+ Oid coll = node->collations[i];
+ if (OidIsValid(coll))
+ {
+ appendStringInfoChar(str, ' ');
- _outToken(str, get_collation_name(coll));
++ outToken(str, NSP_NAME(get_collation_namespace(coll)));
+ appendStringInfoChar(str, ' ');
++ outToken(str, get_collation_name(coll));
+ appendStringInfo(str, " %d", get_collation_encoding(coll));
+ }
+ else
+ appendStringInfo(str, " <> <> -1");
+ }
+ else
+#endif
appendStringInfo(str, " %u", node->collations[i]);
appendStringInfoString(str, " :nullsFirst");
appendStringInfoString(str, " :dupOperators");
for (i = 0; i < node->numCols; i++)
- _outToken(str, NSP_NAME(get_opnamespace(oper)));
+#ifdef XCP
+ if (portable_output)
+ {
+ Oid oper = node->dupOperators[i];
+ Oid oprleft, oprright;
+ /* Duplicate-check operator is always valid */
+ Assert(OidIsValid(oper));
+ appendStringInfoChar(str, ' ');
- _outToken(str, get_opname(oper));
++ outToken(str, NSP_NAME(get_opnamespace(oper)));
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprleft) ?
++ outToken(str, get_opname(oper));
+ appendStringInfoChar(str, ' ');
+ op_input_types(oper, &oprleft, &oprright);
- _outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
++ outToken(str, OidIsValid(oprleft) ?
+ NSP_NAME(get_typ_namespace(oprleft)) : NULL);
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprright) ?
++ outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
++ outToken(str, OidIsValid(oprright) ?
+ NSP_NAME(get_typ_namespace(oprright)) : NULL);
+ appendStringInfoChar(str, ' ');
++ outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
+ appendStringInfoChar(str, ' ');
+ }
+ else
+#endif
appendStringInfo(str, " %u", node->dupOperators[i]);
WRITE_LONG_FIELD(numGroups);
_outScanInfo(str, (const Scan *) node);
+#ifdef XCP
+ if (portable_output)
+ WRITE_RELID_FIELD(indexid);
+ else
+#endif
WRITE_OID_FIELD(indexid);
+ WRITE_BOOL_FIELD(isshared);
WRITE_NODE_FIELD(indexqual);
WRITE_NODE_FIELD(indexqualorig);
}
appendStringInfoString(str, " :mergeCollations");
for (i = 0; i < numCols; i++)
- _outToken(str, NSP_NAME(get_collation_namespace(coll)));
+#ifdef XCP
+ if (portable_output)
+ {
+ Oid coll = node->mergeCollations[i];
+ if (OidIsValid(coll))
+ {
+ appendStringInfoChar(str, ' ');
- _outToken(str, get_collation_name(coll));
++ outToken(str, NSP_NAME(get_collation_namespace(coll)));
+ appendStringInfoChar(str, ' ');
++ outToken(str, get_collation_name(coll));
+ appendStringInfo(str, " %d", get_collation_encoding(coll));
+ }
+ else
+ appendStringInfo(str, " <> <> -1");
+ }
+ else
+#endif
appendStringInfo(str, " %u", node->mergeCollations[i]);
appendStringInfoString(str, " :mergeStrategies");
appendStringInfoString(str, " :grpOperators");
for (i = 0; i < node->numCols; i++)
- _outToken(str, NSP_NAME(get_opnamespace(oper)));
+#ifdef XCP
+ if (portable_output)
+ {
+ Oid oper = node->grpOperators[i];
+ Oid oprleft, oprright;
+ /* Group operator is always valid */
+ Assert(OidIsValid(oper));
+ appendStringInfoChar(str, ' ');
- _outToken(str, get_opname(oper));
++ outToken(str, NSP_NAME(get_opnamespace(oper)));
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprleft) ?
++ outToken(str, get_opname(oper));
+ appendStringInfoChar(str, ' ');
+ op_input_types(oper, &oprleft, &oprright);
- _outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
++ outToken(str, OidIsValid(oprleft) ?
+ NSP_NAME(get_typ_namespace(oprleft)) : NULL);
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprright) ?
++ outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
++ outToken(str, OidIsValid(oprright) ?
+ NSP_NAME(get_typ_namespace(oprright)) : NULL);
+ appendStringInfoChar(str, ' ');
++ outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
+ appendStringInfoChar(str, ' ');
+ }
+ else
+#endif
appendStringInfo(str, " %u", node->grpOperators[i]);
WRITE_LONG_FIELD(numGroups);
for (i = 0; i < node->partNumCols; i++)
appendStringInfo(str, " %d", node->partColIdx[i]);
- appendStringInfoString(str, " :partOperations");
+ appendStringInfoString(str, " :partOperators");
for (i = 0; i < node->partNumCols; i++)
- _outToken(str, NSP_NAME(get_opnamespace(oper)));
+#ifdef XCP
+ if (portable_output)
+ {
+ Oid oper = node->partOperators[i];
+ Oid oprleft, oprright;
+ /* The operator is always valid */
+ Assert(OidIsValid(oper));
+ appendStringInfoChar(str, ' ');
- _outToken(str, get_opname(oper));
++ outToken(str, NSP_NAME(get_opnamespace(oper)));
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprleft) ?
++ outToken(str, get_opname(oper));
+ appendStringInfoChar(str, ' ');
+ op_input_types(oper, &oprleft, &oprright);
- _outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
++ outToken(str, OidIsValid(oprleft) ?
+ NSP_NAME(get_typ_namespace(oprleft)) : NULL);
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprright) ?
++ outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
++ outToken(str, OidIsValid(oprright) ?
+ NSP_NAME(get_typ_namespace(oprright)) : NULL);
+ appendStringInfoChar(str, ' ');
++ outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
+ appendStringInfoChar(str, ' ');
+ }
+ else
+#endif
appendStringInfo(str, " %u", node->partOperators[i]);
WRITE_INT_FIELD(ordNumCols);
for (i = 0; i < node->ordNumCols; i++)
appendStringInfo(str, " %d", node->ordColIdx[i]);
- appendStringInfoString(str, " :ordOperations");
+ appendStringInfoString(str, " :ordOperators");
for (i = 0; i < node->ordNumCols; i++)
- _outToken(str, NSP_NAME(get_opnamespace(oper)));
+#ifdef XCP
+ if (portable_output)
+ {
+ Oid oper = node->ordOperators[i];
+ Oid oprleft, oprright;
+ /* Ordering operator is always valid */
+ Assert(OidIsValid(oper));
+ appendStringInfoChar(str, ' ');
- _outToken(str, get_opname(oper));
++ outToken(str, NSP_NAME(get_opnamespace(oper)));
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprleft) ?
++ outToken(str, get_opname(oper));
+ appendStringInfoChar(str, ' ');
+ op_input_types(oper, &oprleft, &oprright);
- _outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
++ outToken(str, OidIsValid(oprleft) ?
+ NSP_NAME(get_typ_namespace(oprleft)) : NULL);
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprright) ?
++ outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
++ outToken(str, OidIsValid(oprright) ?
+ NSP_NAME(get_typ_namespace(oprright)) : NULL);
+ appendStringInfoChar(str, ' ');
++ outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
+ appendStringInfoChar(str, ' ');
+ }
+ else
+#endif
appendStringInfo(str, " %u", node->ordOperators[i]);
WRITE_INT_FIELD(frameOptions);
appendStringInfoString(str, " :grpOperators");
for (i = 0; i < node->numCols; i++)
- _outToken(str, NSP_NAME(get_opnamespace(oper)));
+#ifdef XCP
+ if (portable_output)
+ {
+ Oid oper = node->grpOperators[i];
+ Oid oprleft, oprright;
+ /* Group operator is always valid */
+ Assert(OidIsValid(oper));
+ appendStringInfoChar(str, ' ');
- _outToken(str, get_opname(oper));
++ outToken(str, NSP_NAME(get_opnamespace(oper)));
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprleft) ?
++ outToken(str, get_opname(oper));
+ appendStringInfoChar(str, ' ');
+ op_input_types(oper, &oprleft, &oprright);
- _outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
++ outToken(str, OidIsValid(oprleft) ?
+ NSP_NAME(get_typ_namespace(oprleft)) : NULL);
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprright) ?
++ outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
++ outToken(str, OidIsValid(oprright) ?
+ NSP_NAME(get_typ_namespace(oprright)) : NULL);
+ appendStringInfoChar(str, ' ');
++ outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
+ appendStringInfoChar(str, ' ');
+ }
+ else
+#endif
appendStringInfo(str, " %u", node->grpOperators[i]);
}
appendStringInfoString(str, " :sortOperators");
for (i = 0; i < node->numCols; i++)
- _outToken(str, NSP_NAME(get_opnamespace(oper)));
+#ifdef XCP
+ if (portable_output)
+ {
+ Oid oper = node->sortOperators[i];
+ Oid oprleft, oprright;
+ /* Sort operator is always valid */
+ Assert(OidIsValid(oper));
+ appendStringInfoChar(str, ' ');
- _outToken(str, get_opname(oper));
++ outToken(str, NSP_NAME(get_opnamespace(oper)));
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprleft) ?
++ outToken(str, get_opname(oper));
+ appendStringInfoChar(str, ' ');
+ op_input_types(oper, &oprleft, &oprright);
- _outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
++ outToken(str, OidIsValid(oprleft) ?
+ NSP_NAME(get_typ_namespace(oprleft)) : NULL);
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprright) ?
++ outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
++ outToken(str, OidIsValid(oprright) ?
+ NSP_NAME(get_typ_namespace(oprright)) : NULL);
+ appendStringInfoChar(str, ' ');
++ outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
+ }
+ else
+#endif
appendStringInfo(str, " %u", node->sortOperators[i]);
appendStringInfoString(str, " :collations");
for (i = 0; i < node->numCols; i++)
- _outToken(str, NSP_NAME(get_collation_namespace(coll)));
+#ifdef XCP
+ if (portable_output)
+ {
+ Oid coll = node->collations[i];
+ if (OidIsValid(coll))
+ {
+ appendStringInfoChar(str, ' ');
- _outToken(str, get_collation_name(coll));
++ outToken(str, NSP_NAME(get_collation_namespace(coll)));
+ appendStringInfoChar(str, ' ');
++ outToken(str, get_collation_name(coll));
+ appendStringInfo(str, " %d", get_collation_encoding(coll));
+ }
+ else
+ appendStringInfo(str, " <> <> -1");
+ }
+ else
+#endif
appendStringInfo(str, " %u", node->collations[i]);
appendStringInfoString(str, " :nullsFirst");
appendStringInfoString(str, " :uniqOperators");
for (i = 0; i < node->numCols; i++)
- _outToken(str, NSP_NAME(get_opnamespace(oper)));
+#ifdef XCP
+ if (portable_output)
+ {
+ Oid oper = node->uniqOperators[i];
+ Oid oprleft, oprright;
+ /* Unique operator is always valid */
+ Assert(OidIsValid(oper));
+ appendStringInfoChar(str, ' ');
- _outToken(str, get_opname(oper));
++ outToken(str, NSP_NAME(get_opnamespace(oper)));
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprleft) ?
++ outToken(str, get_opname(oper));
+ appendStringInfoChar(str, ' ');
+ op_input_types(oper, &oprleft, &oprright);
- _outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
++ outToken(str, OidIsValid(oprleft) ?
+ NSP_NAME(get_typ_namespace(oprleft)) : NULL);
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprright) ?
++ outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
++ outToken(str, OidIsValid(oprright) ?
+ NSP_NAME(get_typ_namespace(oprright)) : NULL);
+ appendStringInfoChar(str, ' ');
++ outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
+ appendStringInfoChar(str, ' ');
+ }
+ else
+#endif
appendStringInfo(str, " %u", node->uniqOperators[i]);
}
appendStringInfoString(str, " :dupOperators");
for (i = 0; i < node->numCols; i++)
- _outToken(str, NSP_NAME(get_opnamespace(oper)));
+#ifdef XCP
+ if (portable_output)
+ {
+ Oid oper = node->dupOperators[i];
+ Oid oprleft, oprright;
+ /* Duplicate-check operator is always valid */
+ Assert(OidIsValid(oper));
+ appendStringInfoChar(str, ' ');
- _outToken(str, get_opname(oper));
++ outToken(str, NSP_NAME(get_opnamespace(oper)));
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprleft) ?
++ outToken(str, get_opname(oper));
+ appendStringInfoChar(str, ' ');
+ op_input_types(oper, &oprleft, &oprright);
- _outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
++ outToken(str, OidIsValid(oprleft) ?
+ NSP_NAME(get_typ_namespace(oprleft)) : NULL);
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprright) ?
++ outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
++ outToken(str, OidIsValid(oprright) ?
+ NSP_NAME(get_typ_namespace(oprright)) : NULL);
+ appendStringInfoChar(str, ' ');
++ outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
+ }
+ else
+#endif
appendStringInfo(str, " %u", node->dupOperators[i]);
WRITE_INT_FIELD(flagColIdx);
WRITE_NODE_FIELD(limitCount);
}
- _outToken(str, NSP_NAME(get_typ_namespace(ptype)));
+#ifdef XCP
+static void
+_outRemoteSubplan(StringInfo str, const RemoteSubplan *node)
+{
+ WRITE_NODE_TYPE("REMOTESUBPLAN");
+
+ _outScanInfo(str, (Scan *) node);
+
+ WRITE_CHAR_FIELD(distributionType);
+ WRITE_INT_FIELD(distributionKey);
+ WRITE_NODE_FIELD(distributionNodes);
+ WRITE_NODE_FIELD(distributionRestrict);
+ WRITE_NODE_FIELD(nodeList);
+ WRITE_BOOL_FIELD(execOnAll);
+ WRITE_NODE_FIELD(sort);
+ WRITE_STRING_FIELD(cursor);
+ WRITE_INT_FIELD(unique);
+}
+
+static void
+_outRemoteStmt(StringInfo str, const RemoteStmt *node)
+{
+ int i;
+
+ WRITE_NODE_TYPE("REMOTESTMT");
+
+ WRITE_ENUM_FIELD(commandType, CmdType);
+ WRITE_BOOL_FIELD(hasReturning);
+ WRITE_NODE_FIELD(planTree);
+ WRITE_NODE_FIELD(rtable);
+ WRITE_NODE_FIELD(resultRelations);
+ WRITE_NODE_FIELD(subplans);
+ WRITE_INT_FIELD(nParamExec);
+ WRITE_INT_FIELD(nParamRemote);
+
+ for (i = 0; i < node->nParamRemote; i++)
+ {
+ RemoteParam *rparam = &(node->remoteparams[i]);
+ appendStringInfo(str, " :paramkind");
+ appendStringInfo(str, " %d", (int) rparam->paramkind);
+
+ appendStringInfo(str, " :paramid");
+ appendStringInfo(str, " %d", rparam->paramid);
+
+ appendStringInfo(str, " :paramused");
+ appendStringInfo(str, " %d", rparam->paramused);
+
+ appendStringInfo(str, " :paramtype");
+ if (portable_output)
+ {
+ Oid ptype = rparam->paramtype;
+ Assert(OidIsValid(ptype));
+ appendStringInfoChar(str, ' ');
- _outToken(str, get_typ_name(ptype));
++ outToken(str, NSP_NAME(get_typ_namespace(ptype)));
+ appendStringInfoChar(str, ' ');
- _outToken(str, NSP_NAME(get_opnamespace(oper)));
++ outToken(str, get_typ_name(ptype));
+ }
+ else
+ appendStringInfo(str, " %u", rparam->paramtype);
+ }
+ WRITE_NODE_FIELD(rowMarks);
+ WRITE_CHAR_FIELD(distributionType);
+ WRITE_INT_FIELD(distributionKey);
+ WRITE_NODE_FIELD(distributionNodes);
+ WRITE_NODE_FIELD(distributionRestrict);
+}
+
+static void
+_outSimpleSort(StringInfo str, const SimpleSort *node)
+{
+ int i;
+
+ WRITE_NODE_TYPE("SIMPLESORT");
+
+ WRITE_INT_FIELD(numCols);
+
+ appendStringInfo(str, " :sortColIdx");
+ for (i = 0; i < node->numCols; i++)
+ appendStringInfo(str, " %d", node->sortColIdx[i]);
+
+ appendStringInfo(str, " :sortOperators");
+ for (i = 0; i < node->numCols; i++)
+ if (portable_output)
+ {
+ Oid oper = node->sortOperators[i];
+ Oid oprleft, oprright;
+ /* Sort operator is always valid */
+ Assert(OidIsValid(oper));
+ appendStringInfoChar(str, ' ');
- _outToken(str, get_opname(oper));
++ outToken(str, NSP_NAME(get_opnamespace(oper)));
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprleft) ?
++ outToken(str, get_opname(oper));
+ appendStringInfoChar(str, ' ');
+ op_input_types(oper, &oprleft, &oprright);
- _outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
++ outToken(str, OidIsValid(oprleft) ?
+ NSP_NAME(get_typ_namespace(oprleft)) : NULL);
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprright) ?
++ outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
++ outToken(str, OidIsValid(oprright) ?
+ NSP_NAME(get_typ_namespace(oprright)) : NULL);
+ appendStringInfoChar(str, ' ');
- _outToken(str, NSP_NAME(get_collation_namespace(coll)));
++ outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
+ }
+ else
+ appendStringInfo(str, " %u", node->sortOperators[i]);
+
+ appendStringInfo(str, " :sortCollations");
+ for (i = 0; i < node->numCols; i++)
+ if (portable_output)
+ {
+ Oid coll = node->sortCollations[i];
+ if (OidIsValid(coll))
+ {
+ appendStringInfoChar(str, ' ');
- _outToken(str, get_collation_name(coll));
++ outToken(str, NSP_NAME(get_collation_namespace(coll)));
+ appendStringInfoChar(str, ' ');
++ outToken(str, get_collation_name(coll));
+ appendStringInfo(str, " %d", get_collation_encoding(coll));
+ }
+ else
+ appendStringInfo(str, " <> <> -1");
+ }
+ else
+ appendStringInfo(str, " %u", node->sortCollations[i]);
+
+ appendStringInfo(str, " :nullsFirst");
+ for (i = 0; i < node->numCols; i++)
+ appendStringInfo(str, " %s", booltostr(node->nullsFirst[i]));
+}
+#endif
+
static void
_outNestLoopParam(StringInfo str, const NestLoopParam *node)
{
WRITE_STRING_FIELD(ctename);
WRITE_UINT_FIELD(ctelevelsup);
WRITE_BOOL_FIELD(self_reference);
- WRITE_NODE_FIELD(ctecoltypes);
- WRITE_NODE_FIELD(ctecoltypmods);
- WRITE_NODE_FIELD(ctecolcollations);
+ WRITE_NODE_FIELD(coltypes);
+ WRITE_NODE_FIELD(coltypmods);
+ WRITE_NODE_FIELD(colcollations);
+ break;
+ case RTE_NAMEDTUPLESTORE:
+ WRITE_STRING_FIELD(enrname);
+ WRITE_OID_FIELD(relid);
+ WRITE_NODE_FIELD(coltypes);
+ WRITE_NODE_FIELD(coltypmods);
+ WRITE_NODE_FIELD(colcollations);
break;
+#ifdef PGXC
+ case RTE_REMOTE_DUMMY:
+ /* Everything relevant already copied */
+ break;
+#endif /* PGXC */
default:
elog(ERROR, "unrecognized RTE kind: %d", (int) node->rtekind);
break;
case T_ForeignKeyCacheInfo:
_outForeignKeyCacheInfo(str, obj);
break;
+#ifdef PGXC
+ case T_ExecNodes:
+ _outExecNodes(str, obj);
+ break;
+#endif
+ case T_TriggerTransition:
+ _outTriggerTransition(str, obj);
+ break;
+ case T_PartitionElem:
+ _outPartitionElem(str, obj);
+ break;
+ case T_PartitionSpec:
+ _outPartitionSpec(str, obj);
+ break;
+ case T_PartitionBoundSpec:
+ _outPartitionBoundSpec(str, obj);
+ break;
+ case T_PartitionRangeDatum:
+ _outPartitionRangeDatum(str, obj);
+ break;
default:
* readfuncs.c
* Reader functions for Postgres tree nodes.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
*
* IDENTIFICATION
READ_STRING_FIELD(ctename);
READ_UINT_FIELD(ctelevelsup);
READ_BOOL_FIELD(self_reference);
- READ_NODE_FIELD(ctecoltypes);
- READ_NODE_FIELD(ctecoltypmods);
- READ_NODE_FIELD(ctecolcollations);
+ READ_NODE_FIELD(coltypes);
+ READ_NODE_FIELD(coltypmods);
+ READ_NODE_FIELD(colcollations);
+ break;
+ case RTE_NAMEDTUPLESTORE:
+ READ_STRING_FIELD(enrname);
+ READ_OID_FIELD(relid);
+ READ_NODE_FIELD(coltypes);
+ READ_NODE_FIELD(coltypmods);
+ READ_NODE_FIELD(colcollations);
break;
+#ifdef PGXC
+ case RTE_REMOTE_DUMMY:
+ /* Nothing to do */
+ break;
+#endif /* PGXC */
default:
elog(ERROR, "unrecognized RTE kind: %d",
(int) local_node->rtekind);
ReadCommonScan(&local_node->scan);
- READ_OID_FIELD(indexid);
+ if (portable_input)
+ READ_RELID_FIELD(indexid);
+ else
+ READ_OID_FIELD(indexid);
+ READ_BOOL_FIELD(isshared);
READ_NODE_FIELD(indexqual);
READ_NODE_FIELD(indexqualorig);
READ_ENUM_FIELD(aggsplit, AggSplit);
READ_INT_FIELD(numCols);
READ_ATTRNUMBER_ARRAY(grpColIdx, local_node->numCols);
+
+#ifdef PGXC
+ token = pg_strtok(&length); /* skip :grpOperators */
+ local_node->grpOperators = (Oid *) palloc(local_node->numCols * sizeof(Oid));
+ for (i = 0; i < local_node->numCols; i++)
+ {
+ token = pg_strtok(&length);
+ if (portable_input)
+ {
+ char *nspname; /* namespace name */
+ char *oprname; /* operator name */
+ char *leftnspname; /* left type namespace */
+ char *leftname; /* left type name */
+ Oid oprleft; /* left type */
+ char *rightnspname; /* right type namespace */
+ char *rightname; /* right type name */
+ Oid oprright; /* right type */
+ /* token is already set to nspname */
+ nspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* get operator name */
+ oprname = nullable_string(token, length);
+ token = pg_strtok(&length); /* left type namespace */
+ leftnspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* left type name */
+ leftname = nullable_string(token, length);
+ token = pg_strtok(&length); /* right type namespace */
+ rightnspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* right type name */
+ rightname = nullable_string(token, length);
+ if (leftname)
+ oprleft = get_typname_typid(leftname,
+ NSP_OID(leftnspname));
+ else
+ oprleft = InvalidOid;
+ if (rightname)
+ oprright = get_typname_typid(rightname,
+ NSP_OID(rightnspname));
+ else
+ oprright = InvalidOid;
+ local_node->grpOperators[i] = get_operid(oprname,
+ oprleft,
+ oprright,
+ NSP_OID(nspname));
+ }
+ else
+ local_node->grpOperators[i] = atooid(token);
+ }
+#else
READ_OID_ARRAY(grpOperators, local_node->numCols);
+#endif
+
READ_LONG_FIELD(numGroups);
+ READ_BITMAPSET_FIELD(aggParams);
READ_NODE_FIELD(groupingSets);
READ_NODE_FIELD(chain);
ReadCommonPlan(&local_node->plan);
- READ_OID_FIELD(skewTable);
+ if (portable_input)
+ READ_RELID_FIELD(skewTable);
+ else
+ READ_OID_FIELD(skewTable);
READ_INT_FIELD(skewColumn);
READ_BOOL_FIELD(skewInherit);
- if (portable_input)
- READ_TYPID_FIELD(skewColType);
- else
- READ_OID_FIELD(skewColType);
- READ_INT_FIELD(skewColTypmod);
READ_DONE();
}
READ_NODE_FIELD(paramIds);
READ_INT_FIELD(plan_id);
READ_STRING_FIELD(plan_name);
- READ_OID_FIELD(firstColType);
+ if (portable_input)
+ READ_TYPID_FIELD(firstColType);
+ else
+ READ_OID_FIELD(firstColType);
READ_INT_FIELD(firstColTypmod);
- READ_OID_FIELD(firstColCollation);
+ if (portable_input)
+ READ_COLLID_FIELD(firstColCollation);
+ else
+ READ_OID_FIELD(firstColCollation);
READ_BOOL_FIELD(useHashTable);
READ_BOOL_FIELD(unknownEqFalse);
+ READ_BOOL_FIELD(parallel_safe);
READ_NODE_FIELD(setParam);
READ_NODE_FIELD(parParam);
READ_NODE_FIELD(args);
READ_DONE();
}
+
+/*
+ * _readRemoteSubplan
+ */
+static RemoteSubplan *
+_readRemoteSubplan(void)
+{
+ READ_SCAN_FIELDS(RemoteSubplan);
+
+ READ_CHAR_FIELD(distributionType);
+ READ_INT_FIELD(distributionKey);
+ READ_NODE_FIELD(distributionNodes);
+ READ_NODE_FIELD(distributionRestrict);
+ READ_NODE_FIELD(nodeList);
+ READ_BOOL_FIELD(execOnAll);
+ READ_NODE_FIELD(sort);
+ READ_STRING_FIELD(cursor);
+ READ_INT_FIELD(unique);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readRemoteStmt
+ */
+static RemoteStmt *
+_readRemoteStmt(void)
+{
+ int i;
+ READ_LOCALS(RemoteStmt);
+
+ READ_ENUM_FIELD(commandType, CmdType);
+ READ_BOOL_FIELD(hasReturning);
+ READ_NODE_FIELD(planTree);
+ READ_NODE_FIELD(rtable);
+ READ_NODE_FIELD(resultRelations);
+ READ_NODE_FIELD(subplans);
+ READ_INT_FIELD(nParamExec);
+ READ_INT_FIELD(nParamRemote);
+ if (local_node->nParamRemote > 0)
+ {
+ local_node->remoteparams = (RemoteParam *) palloc(
+ local_node->nParamRemote * sizeof(RemoteParam));
+ for (i = 0; i < local_node->nParamRemote; i++)
+ {
+ RemoteParam *rparam = &(local_node->remoteparams[i]);
+ token = pg_strtok(&length); /* skip :paramkind */
+ token = pg_strtok(&length);
+ rparam->paramkind = (ParamKind) atoi(token);
+
+ token = pg_strtok(&length); /* skip :paramid */
+ token = pg_strtok(&length);
+ rparam->paramid = atoi(token);
+
+ token = pg_strtok(&length); /* skip :paramused */
+ token = pg_strtok(&length);
+ rparam->paramused = atoi(token);
+
+ token = pg_strtok(&length); /* skip :paramtype */
+ if (portable_input)
+ {
+ char *nspname; /* namespace name */
+ char *typname; /* data type name */
+ token = pg_strtok(&length); /* get nspname */
+ nspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* get typname */
+ typname = nullable_string(token, length);
+ if (typname)
+ rparam->paramtype = get_typname_typid(typname,
+ NSP_OID(nspname));
+ else
+ rparam->paramtype = InvalidOid;
+ }
+ else
+ {
+ token = pg_strtok(&length);
+ rparam->paramtype = atooid(token);
+ }
+ }
+ }
+ else
+ local_node->remoteparams = NULL;
+
+ READ_NODE_FIELD(rowMarks);
+ READ_CHAR_FIELD(distributionType);
+ READ_INT_FIELD(distributionKey);
+ READ_NODE_FIELD(distributionNodes);
+ READ_NODE_FIELD(distributionRestrict);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readSimpleSort
+ */
+static SimpleSort *
+_readSimpleSort(void)
+{
+ int i;
+ READ_LOCALS(SimpleSort);
+
+ READ_INT_FIELD(numCols);
+
+ token = pg_strtok(&length); /* skip :sortColIdx */
+ local_node->sortColIdx = (AttrNumber *) palloc(local_node->numCols * sizeof(AttrNumber));
+ for (i = 0; i < local_node->numCols; i++)
+ {
+ token = pg_strtok(&length);
+ local_node->sortColIdx[i] = atoi(token);
+ }
+
+ token = pg_strtok(&length); /* skip :sortOperators */
+ local_node->sortOperators = (Oid *) palloc(local_node->numCols * sizeof(Oid));
+ for (i = 0; i < local_node->numCols; i++)
+ {
+ token = pg_strtok(&length);
+ if (portable_input)
+ {
+ char *nspname; /* namespace name */
+ char *oprname; /* operator name */
+ char *leftnspname; /* left type namespace */
+ char *leftname; /* left type name */
+ Oid oprleft; /* left type */
+ char *rightnspname; /* right type namespace */
+ char *rightname; /* right type name */
+ Oid oprright; /* right type */
+ /* token is already set to nspname */
+ nspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* get operator name */
+ oprname = nullable_string(token, length);
+ token = pg_strtok(&length); /* left type namespace */
+ leftnspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* left type name */
+ leftname = nullable_string(token, length);
+ token = pg_strtok(&length); /* right type namespace */
+ rightnspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* right type name */
+ rightname = nullable_string(token, length);
+ if (leftname)
+ oprleft = get_typname_typid(leftname,
+ NSP_OID(leftnspname));
+ else
+ oprleft = InvalidOid;
+ if (rightname)
+ oprright = get_typname_typid(rightname,
+ NSP_OID(rightnspname));
+ else
+ oprright = InvalidOid;
+ local_node->sortOperators[i] = get_operid(oprname,
+ oprleft,
+ oprright,
+ NSP_OID(nspname));
+ }
+ else
+ local_node->sortOperators[i] = atooid(token);
+ }
+
+ token = pg_strtok(&length); /* skip :sortCollations */
+ local_node->sortCollations = (Oid *) palloc(local_node->numCols * sizeof(Oid));
+ for (i = 0; i < local_node->numCols; i++)
+ {
+ token = pg_strtok(&length);
+ if (portable_input)
+ {
+ char *nspname; /* namespace name */
+ char *collname; /* collation name */
+ int collencoding; /* collation encoding */
+ /* the token is already read */
+ nspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* get collname */
+ collname = nullable_string(token, length);
+ token = pg_strtok(&length); /* get collation encoding */
+ collencoding = atoi(token);
+ if (collname)
+ local_node->sortCollations[i] = get_collid(collname,
+ collencoding,
+ NSP_OID(nspname));
+ else
+ local_node->sortCollations[i] = InvalidOid;
+ }
+ else
+ local_node->sortCollations[i] = atooid(token);
+ }
+
+ token = pg_strtok(&length); /* skip :nullsFirst */
+ local_node->nullsFirst = (bool *) palloc(local_node->numCols * sizeof(bool));
+ for (i = 0; i < local_node->numCols; i++)
+ {
+ token = pg_strtok(&length);
+ local_node->nullsFirst[i] = strtobool(token);
+ }
+
+ READ_DONE();
+}
+
+
+ /*
+ * _readPartitionBoundSpec
+ */
+ static PartitionBoundSpec *
+ _readPartitionBoundSpec(void)
+ {
+ READ_LOCALS(PartitionBoundSpec);
+
+ READ_CHAR_FIELD(strategy);
+ READ_NODE_FIELD(listdatums);
+ READ_NODE_FIELD(lowerdatums);
+ READ_NODE_FIELD(upperdatums);
+ /* XXX somebody forgot location field; too late to change for v10 */
+ local_node->location = -1;
+
+ READ_DONE();
+ }
+
+ /*
+ * _readPartitionRangeDatum
+ */
+ static PartitionRangeDatum *
+ _readPartitionRangeDatum(void)
+ {
+ READ_LOCALS(PartitionRangeDatum);
+
+ READ_BOOL_FIELD(infinite);
+ READ_NODE_FIELD(value);
+ /* XXX somebody forgot location field; too late to change for v10 */
+ local_node->location = -1;
+
+ READ_DONE();
+ }
+
/*
* parseNodeString
*
return_value = _readAlternativeSubPlan();
else if (MATCH("EXTENSIBLENODE", 14))
return_value = _readExtensibleNode();
+ else if (MATCH("REMOTESUBPLAN", 13))
+ return_value = _readRemoteSubplan();
+ else if (MATCH("REMOTESTMT", 10))
+ return_value = _readRemoteStmt();
+ else if (MATCH("SIMPLESORT", 10))
+ return_value = _readSimpleSort();
+ else if (MATCH("PARTITIONBOUND", 14))
+ return_value = _readPartitionBoundSpec();
+ else if (MATCH("PARTRANGEDATUM", 14))
+ return_value = _readPartitionRangeDatum();
else
{
elog(ERROR, "badly formatted node string \"%.32s\"...", token);
* allpaths.c
* Routines to find possible search paths for processing a query
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* executed only once.
*/
return;
+
+ case RTE_NAMEDTUPLESTORE:
+
+ /*
+ * tuplestore cannot be shared, at least without more
+ * infrastructure to support that.
+ */
+ return;
++
++ case RTE_REMOTE_DUMMY:
++ return;
}
/*
/* Generate a partial append path. */
appendpath = create_append_path(rel, partial_subpaths, NULL,
- parallel_workers);
+ parallel_workers, partitioned_rels);
- add_partial_path(rel, (Path *) appendpath);
+
+ /*
+ * XL: In case we had to re-distribute the child relations, don't
+ * do anything. Otherwise create_gather_path hits an Assert etc.
+ */
+ if (appendpath->path.parallel_safe)
+ add_partial_path(rel, (Path *) appendpath);
}
/*
* values.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
bool enable_material = true;
bool enable_mergejoin = true;
bool enable_hashjoin = true;
+bool enable_fast_query_shipping = true;
+ bool enable_gathermerge = true;
typedef struct
{
return ceil(relation_byte_size(tuples, width) / BLCKSZ);
}
-
+#ifdef XCP
+/*
+ * cost_remote_subplan
+ *	  Fill in the cost and row-count fields of a Path that wraps a subplan
+ *	  executed on remote data nodes.
+ *
+ * input_startup_cost/input_total_cost - costs of the wrapped subplan
+ * tuples, width - estimated row count and average row width of the result
+ * replication - multiplier applied to the network transfer estimate;
+ * presumably the number of nodes the result is shipped to - TODO confirm
+ * against callers
+ *
+ * remote_query_cost and network_byte_cost are NOTE(review): presumably
+ * XL-specific cost GUCs declared elsewhere - not visible in this chunk.
+ */
+void
+cost_remote_subplan(Path *path,
+ Cost input_startup_cost, Cost input_total_cost,
+ double tuples, int width, int replication)
+{
+ /* every remote subplan pays a fixed setup charge before the first row */
+ Cost startup_cost = input_startup_cost + remote_query_cost;
+ Cost run_cost = input_total_cost - input_startup_cost;
+
+ path->rows = tuples;
+
+ /*
+ * Charge 2x cpu_operator_cost per tuple to reflect bookkeeping overhead.
+ */
+ run_cost += 2 * cpu_operator_cost * tuples;
+
+ /*
+ * Estimate cost of sending data over network
+ */
+ run_cost += network_byte_cost * tuples * width * replication;
+
+ path->startup_cost = startup_cost;
+ path->total_cost = startup_cost + run_cost;
+}
+#endif
++
+ /*
+ * Estimate the fraction of the work that each worker will do given the
+ * number of workers budgeted for the path.
+ *
+ * Returns a divisor to apply to the path's total work: the budgeted
+ * worker count, plus the leader's own (diminishing) contribution.
+ */
+ static double
+ get_parallel_divisor(Path *path)
+ {
+ double parallel_divisor = path->parallel_workers;
+ double leader_contribution;
+
+ /*
+ * Early experience with parallel query suggests that when there is only
+ * one worker, the leader often makes a very substantial contribution to
+ * executing the parallel portion of the plan, but as more workers are
+ * added, it does less and less, because it's busy reading tuples from the
+ * workers and doing whatever non-parallel post-processing is needed. By
+ * the time we reach 4 workers, the leader no longer makes a meaningful
+ * contribution. Thus, for now, estimate that the leader spends 30% of
+ * its time servicing each worker, and the remainder executing the
+ * parallel plan.
+ */
+ leader_contribution = 1.0 - (0.3 * path->parallel_workers);
+ /* with 4 or more workers, 1.0 - 0.3*n goes nonpositive: leader adds nothing */
+ if (leader_contribution > 0)
+ parallel_divisor += leader_contribution;
+
+ return parallel_divisor;
+ }
+
+ /*
+ * compute_bitmap_pages
+ *
+ * compute number of pages fetched from heap in bitmap heap scan.
+ *
+ * loop_count is the number of times the scan is expected to be repeated
+ * (e.g. on the inside of a nestloop); values > 1 amortize page fetches
+ * across scans via the Mackert-Lohman formula.
+ *
+ * cost and tuple are optional output parameters (either may be NULL):
+ * *cost receives the total cost of obtaining the bitmap, *tuple the
+ * estimated number of tuples fetched.
+ */
+ double
+ compute_bitmap_pages(PlannerInfo *root, RelOptInfo *baserel, Path *bitmapqual,
+ int loop_count, Cost *cost, double *tuple)
+ {
+ Cost indexTotalCost;
+ Selectivity indexSelectivity;
+ double T;
+ double pages_fetched;
+ double tuples_fetched;
+
+ /*
+ * Fetch total cost of obtaining the bitmap, as well as its total
+ * selectivity.
+ */
+ cost_bitmap_tree_node(bitmapqual, &indexTotalCost, &indexSelectivity);
+
+ /*
+ * Estimate number of main-table pages fetched.
+ */
+ tuples_fetched = clamp_row_est(indexSelectivity * baserel->tuples);
+
+ /* T = # pages in table, clamped to at least 1 to avoid zero division */
+ T = (baserel->pages > 1) ? (double) baserel->pages : 1.0;
+
+ if (loop_count > 1)
+ {
+ /*
+ * For repeated bitmap scans, scale up the number of tuples fetched in
+ * the Mackert and Lohman formula by the number of scans, so that we
+ * estimate the number of pages fetched by all the scans. Then
+ * pro-rate for one scan.
+ */
+ pages_fetched = index_pages_fetched(tuples_fetched * loop_count,
+ baserel->pages,
+ get_indexpath_pages(bitmapqual),
+ root);
+ pages_fetched /= loop_count;
+ }
+ else
+ {
+ /*
+ * For a single scan, the number of heap pages that need to be fetched
+ * is the same as the Mackert and Lohman formula for the case T <= b
+ * (ie, no re-reads needed).
+ */
+ pages_fetched =
+ (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched);
+ }
+
+ /* can't fetch more pages than the table has; round partial pages up */
+ if (pages_fetched >= T)
+ pages_fetched = T;
+ else
+ pages_fetched = ceil(pages_fetched);
+
+ if (cost)
+ *cost = indexTotalCost;
+ if (tuple)
+ *tuple = tuples_fetched;
+
+ return pages_fetched;
+ }
* Planning is complete, we just need to convert the selected
* Path into a Plan.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
static Plan *create_append_plan(PlannerInfo *root, AppendPath *best_path);
static Plan *create_merge_append_plan(PlannerInfo *root, MergeAppendPath *best_path);
static Result *create_result_plan(PlannerInfo *root, ResultPath *best_path);
+#ifdef XCP
+static void adjust_subplan_distribution(PlannerInfo *root, Distribution *pathd,
+ Distribution *subd);
+static RemoteSubplan *create_remotescan_plan(PlannerInfo *root,
+ RemoteSubPath *best_path);
+static char *get_internal_cursor(void);
+#endif
+ static ProjectSet *create_project_set_plan(PlannerInfo *root, ProjectSetPath *best_path);
static Material *create_material_plan(PlannerInfo *root, MaterialPath *best_path,
int flags);
static Plan *create_unique_plan(PlannerInfo *root, UniquePath *best_path,
Index scanrelid, List *functions, bool funcordinality);
static ValuesScan *make_valuesscan(List *qptlist, List *qpqual,
Index scanrelid, List *values_lists);
+ static TableFuncScan *make_tablefuncscan(List *qptlist, List *qpqual,
+ Index scanrelid, TableFunc *tablefunc);
static CteScan *make_ctescan(List *qptlist, List *qpqual,
Index scanrelid, int ctePlanId, int cteParam);
+ static NamedTuplestoreScan *make_namedtuplestorescan(List *qptlist, List *qpqual,
+ Index scanrelid, char *enrname);
static WorkTableScan *make_worktablescan(List *qptlist, List *qpqual,
Index scanrelid, int wtParam);
- static Append *make_append(List *appendplans, List *tlist);
+ static Append *make_append(List *appendplans, List *tlist, List *partitioned_rels);
-static RecursiveUnion *make_recursive_union(List *tlist,
+static RecursiveUnion *make_recursive_union(PlannerInfo *root,
+ List *tlist,
Plan *lefttree,
Plan *righttree,
int wtParam,
List *resultRelations, List *subplans,
List *withCheckOptionLists, List *returningLists,
List *rowMarks, OnConflictExpr *onconflict, int epqParam);
+ static GatherMerge *create_gather_merge_plan(PlannerInfo *root,
+ GatherMergePath *best_path);
+#ifdef XCP
+static int add_sort_column(AttrNumber colIdx, Oid sortOp, Oid coll,
+ bool nulls_first,int numCols, AttrNumber *sortColIdx,
+ Oid *sortOperators, Oid *collations, bool *nullsFirst);
+#endif
+
+static RemoteSubplan *find_push_down_plan(Plan *plan, bool force);
/*
* create_plan
case T_Limit:
plan = (Plan *) create_limit_plan(root,
(LimitPath *) best_path,
- flags);
+ flags, 0, 1);
break;
+ case T_GatherMerge:
+ plan = (Plan *) create_gather_merge_plan(root,
+ (GatherMergePath *) best_path);
+ break;
default:
elog(ERROR, "unrecognized node type: %d",
(int) best_path->pathtype);
*/
if (!is_projection_capable_plan(subplan) &&
!tlist_same_exprs(newtlist, subplan->targetlist))
- subplan = inject_projection_plan(subplan, newtlist);
+ subplan = inject_projection_plan(subplan, newtlist,
+ best_path->path.parallel_safe);
else
subplan->targetlist = newtlist;
+#ifdef XCP
+ /*
+ * RemoteSubplan is conditionally projection capable - it is pushing
+ * projection to the data nodes
+ */
+ if (IsA(subplan, RemoteSubplan))
+ subplan->lefttree->targetlist = newtlist;
+#endif
}
/*
plan->plan_rows = 1;
plan->plan_width = mminfo->path->pathtarget->width;
plan->parallel_aware = false;
+ plan->parallel_safe = mminfo->path->parallel_safe;
+ /*
+ * XL: Add a remote subplan, splitting the LIMIT into a remote part
+ * and a local part.
+ *
+ * XXX This should probably happen when constructing the path in
+ * create_minmaxagg_path(), not this late.
+ *
+ * XXX The costing in here is mostly bogus. Not that it'd matter
+ * this late, though.
+ */
+ if (mminfo->path->distribution)
+ {
+ plan = (Plan *) make_remotesubplan(root, plan,
+ NULL,
+ mminfo->path->distribution,
+ mminfo->path->pathkeys);
+
+ plan = (Plan *) make_limit(plan,
+ subparse->limitOffset,
+ subparse->limitCount,
+ 0, 1);
+
+ plan->startup_cost = mminfo->path->startup_cost;
+ plan->total_cost = mminfo->pathcost;
+ plan->plan_rows = 1;
+ plan->plan_width = mminfo->path->pathtarget->width;
+ plan->parallel_aware = false;
+ }
+
/* Convert the plan into an InitPlan in the outer query. */
SS_make_initplan_from_plan(root, subroot, plan, mminfo->param);
}
return node;
}
- tle = tlist_member((Node *) em->em_expr, tlist);
+#ifdef XCP
+/*
+ * make_remotesubplan
+ * Create a RemoteSubplan node to execute subplan on remote nodes.
+ * leftree - the subplan which we want to push down to remote node.
+ * resultDistribution - the distribution of the remote result. May be NULL -
+ * results are coming to the invoking node
+ * execDistribution - determines how source data of the subplan are
+ * distributed, where we should send the subplan and how combine results.
+ * pathkeys - the remote subplan is sorted according to these keys, executor
+ * should perform merge sort of incoming tuples
+ */
+RemoteSubplan *
+make_remotesubplan(PlannerInfo *root,
+ Plan *lefttree,
+ Distribution *resultDistribution,
+ Distribution *execDistribution,
+ List *pathkeys)
+{
+ RemoteSubplan *node = makeNode(RemoteSubplan);
+ Plan *plan = &node->scan.plan;
+ Bitmapset *tmpset;
+ int nodenum;
+
+ /* Sanity checks */
+ Assert(!equal(resultDistribution, execDistribution));
+ Assert(!IsA(lefttree, RemoteSubplan));
+
+ if (resultDistribution)
+ {
+ node->distributionType = resultDistribution->distributionType;
+ node->distributionKey = InvalidAttrNumber;
+ if (resultDistribution->distributionExpr)
+ {
+ ListCell *lc;
+ Expr *expr;
+
+ /* XXX Is that correct to reference a column of different type? */
+ if (IsA(resultDistribution->distributionExpr, RelabelType))
+ expr = ((RelabelType *) resultDistribution->distributionExpr)->arg;
+ else
+ expr = (Expr *) resultDistribution->distributionExpr;
+
+ /* Find distribution expression in the target list */
+ foreach(lc, lefttree->targetlist)
+ {
+ TargetEntry *tle = (TargetEntry *) lfirst(lc);
+
+ if (equal(tle->expr, expr))
+ {
+ node->distributionKey = tle->resno;
+ break;
+ }
+ }
+
+ if (node->distributionKey == InvalidAttrNumber)
+ {
+ TargetEntry *newtle;
+
+ /* The expression is not found, need to add junk */
+ newtle = makeTargetEntry(expr,
+ list_length(lefttree->targetlist) + 1,
+ NULL,
+ true);
+
+ if (is_projection_capable_plan(lefttree))
+ {
+ /* Ok to modify subplan's target list */
+ lefttree->targetlist = lappend(lefttree->targetlist, newtle);
+ }
+ else
+ {
+ /* Use Result node to calculate expression */
+ List *newtlist = list_copy(lefttree->targetlist);
+ newtlist = lappend(newtlist, newtle);
+ lefttree = (Plan *) make_result(newtlist, NULL, lefttree);
+ }
+
+ node->distributionKey = newtle->resno;
+ }
+ }
+ /*
+ * The distributionNodes describes result distribution
+ */
+ tmpset = bms_copy(resultDistribution->nodes);
+ node->distributionNodes = NIL;
+ while ((nodenum = bms_first_member(tmpset)) >= 0)
+ node->distributionNodes = lappend_int(node->distributionNodes,
+ nodenum);
+ bms_free(tmpset);
+ /*
+ * The distributionRestrict defines the set of nodes where results are
+ * actually shipped. These are the nodes where upper level step
+ * is executed.
+ */
+ if (resultDistribution->restrictNodes)
+ {
+ tmpset = bms_copy(resultDistribution->restrictNodes);
+ node->distributionRestrict = NIL;
+ while ((nodenum = bms_first_member(tmpset)) >= 0)
+ node->distributionRestrict =
+ lappend_int(node->distributionRestrict, nodenum);
+ bms_free(tmpset);
+ }
+ else
+ node->distributionRestrict = list_copy(node->distributionNodes);
+ }
+ else
+ {
+ node->distributionType = LOCATOR_TYPE_NONE;
+ node->distributionKey = InvalidAttrNumber;
+ node->distributionNodes = NIL;
+ }
+
+ /* determine where subplan will be executed */
+ if (execDistribution)
+ {
+ if (execDistribution->restrictNodes)
+ tmpset = bms_copy(execDistribution->restrictNodes);
+ else
+ tmpset = bms_copy(execDistribution->nodes);
+ node->nodeList = NIL;
+ while ((nodenum = bms_first_member(tmpset)) >= 0)
+ node->nodeList = lappend_int(node->nodeList, nodenum);
+ bms_free(tmpset);
+ node->execOnAll = list_length(node->nodeList) == 1 ||
+ !IsLocatorReplicated(execDistribution->distributionType);
+ }
+ else
+ {
+ /*
+ * Prepare a single execution of a replicated subplan. Choose one node
+ * from the execution node list, preferably one that is also a member
+ * of the list of result nodes, so that later all node executors
+ * contact the same node to get tuples.
+ */
+ tmpset = NULL;
+ if (!bms_is_empty(resultDistribution->restrictNodes))
+ tmpset = bms_copy(resultDistribution->restrictNodes);
+ else
+ tmpset = bms_copy(resultDistribution->nodes);
+ /*
+ * If result goes on single node execute subplan locally
+ */
+ if (bms_num_members(tmpset) > 1)
+ {
+ /* get one execution node TODO: load balancing */
+ nodenum = bms_any_member(tmpset);
+ node->nodeList = list_make1_int(nodenum);
+ node->execOnAll = true;
+ }
+ else
+ {
+ node->nodeList = NIL;
+ node->execOnAll = false;
+ }
+ bms_free(tmpset);
+ }
+
+ /* We do not need to merge sort if only one node is yielding tuples */
+ if (pathkeys && node->execOnAll && list_length(node->nodeList) > 1)
+ {
+ List *tlist = lefttree->targetlist;
+ ListCell *i;
+ int numsortkeys;
+ AttrNumber *sortColIdx;
+ Oid *sortOperators;
+ Oid *collations;
+ bool *nullsFirst;
+
+ /*
+ * We will need at most list_length(pathkeys) sort columns; possibly less
+ */
+ numsortkeys = list_length(pathkeys);
+ sortColIdx = (AttrNumber *) palloc(numsortkeys * sizeof(AttrNumber));
+ sortOperators = (Oid *) palloc(numsortkeys * sizeof(Oid));
+ collations = (Oid *) palloc(numsortkeys * sizeof(Oid));
+ nullsFirst = (bool *) palloc(numsortkeys * sizeof(bool));
+
+ numsortkeys = 0;
+
+ foreach(i, pathkeys)
+ {
+ PathKey *pathkey = (PathKey *) lfirst(i);
+ EquivalenceClass *ec = pathkey->pk_eclass;
+ TargetEntry *tle = NULL;
+ Oid pk_datatype = InvalidOid;
+ Oid sortop;
+ ListCell *j;
+
+ if (ec->ec_has_volatile)
+ {
+ /*
+ * If the pathkey's EquivalenceClass is volatile, then it must
+ * have come from an ORDER BY clause, and we have to match it to
+ * that same targetlist entry.
+ */
+ if (ec->ec_sortref == 0) /* can't happen */
+ elog(ERROR, "volatile EquivalenceClass has no sortref");
+ tle = get_sortgroupref_tle(ec->ec_sortref, tlist);
+ Assert(tle);
+ Assert(list_length(ec->ec_members) == 1);
+ pk_datatype = ((EquivalenceMember *) linitial(ec->ec_members))->em_datatype;
+ }
+ else
+ {
+ /*
+ * Otherwise, we can sort by any non-constant expression listed in
+ * the pathkey's EquivalenceClass. For now, we take the first one
+ * that corresponds to an available item in the tlist. If there
+ * isn't any, use the first one that is an expression in the
+ * input's vars. (The non-const restriction only matters if the
+ * EC is below_outer_join; but if it isn't, it won't contain
+ * consts anyway, else we'd have discarded the pathkey as
+ * redundant.)
+ *
+ * XXX if we have a choice, is there any way of figuring out which
+ * might be cheapest to execute? (For example, int4lt is likely
+ * much cheaper to execute than numericlt, but both might appear
+ * in the same equivalence class...) Not clear that we ever will
+ * have an interesting choice in practice, so it may not matter.
+ */
+ foreach(j, ec->ec_members)
+ {
+ EquivalenceMember *em = (EquivalenceMember *) lfirst(j);
+
+ if (em->em_is_const)
+ continue;
+
- tle = tlist_member_ignore_relabel((Node *) em->em_expr, tlist);
++ tle = tlist_member(em->em_expr, tlist);
+ if (tle)
+ {
+ pk_datatype = em->em_datatype;
+ break; /* found expr already in tlist */
+ }
+
+ /*
+ * We can also use it if the pathkey expression is a relabel
+ * of the tlist entry, or vice versa. This is needed for
+ * binary-compatible cases (cf. make_pathkey_from_sortinfo).
+ * We prefer an exact match, though, so we do the basic search
+ * first.
+ */
++ tle = tlist_member_ignore_relabel(em->em_expr, tlist);
+ if (tle)
+ {
+ pk_datatype = em->em_datatype;
+ break; /* found expr already in tlist */
+ }
+ }
+
+ if (!tle)
+ {
+ /* No matching tlist item; look for a computable expression */
+ Expr *sortexpr = NULL;
+
+ foreach(j, ec->ec_members)
+ {
+ EquivalenceMember *em = (EquivalenceMember *) lfirst(j);
+ List *exprvars;
+ ListCell *k;
+
+ if (em->em_is_const)
+ continue;
+ sortexpr = em->em_expr;
+ exprvars = pull_var_clause((Node *) sortexpr,
+ PVC_INCLUDE_AGGREGATES |
+ PVC_INCLUDE_PLACEHOLDERS);
+ foreach(k, exprvars)
+ {
+ if (!tlist_member_ignore_relabel(lfirst(k), tlist))
+ break;
+ }
+ list_free(exprvars);
+ if (!k)
+ {
+ pk_datatype = em->em_datatype;
+ break; /* found usable expression */
+ }
+ }
+ if (!j)
+ elog(ERROR, "could not find pathkey item to sort");
+
+ /*
+ * Do we need to insert a Result node?
+ */
+ if (!is_projection_capable_plan(lefttree))
+ {
+ /* copy needed so we don't modify input's tlist below */
+ tlist = copyObject(tlist);
+ lefttree = (Plan *) make_result(tlist, NULL, lefttree);
+ }
+
+ /*
+ * Add resjunk entry to input's tlist
+ */
+ tle = makeTargetEntry(sortexpr,
+ list_length(tlist) + 1,
+ NULL,
+ true);
+ tlist = lappend(tlist, tle);
+ lefttree->targetlist = tlist; /* just in case NIL before */
+ }
+ }
+
+ /*
+ * Look up the correct sort operator from the PathKey's slightly
+ * abstracted representation.
+ */
+ sortop = get_opfamily_member(pathkey->pk_opfamily,
+ pk_datatype,
+ pk_datatype,
+ pathkey->pk_strategy);
+ if (!OidIsValid(sortop)) /* should not happen */
+ elog(ERROR, "could not find member %d(%u,%u) of opfamily %u",
+ pathkey->pk_strategy, pk_datatype, pk_datatype,
+ pathkey->pk_opfamily);
+
+ /*
+ * The column might already be selected as a sort key, if the pathkeys
+ * contain duplicate entries. (This can happen in scenarios where
+ * multiple mergejoinable clauses mention the same var, for example.)
+ * So enter it only once in the sort arrays.
+ */
+ numsortkeys = add_sort_column(tle->resno,
+ sortop,
+ pathkey->pk_eclass->ec_collation,
+ pathkey->pk_nulls_first,
+ numsortkeys,
+ sortColIdx, sortOperators,
+ collations, nullsFirst);
+ }
+ Assert(numsortkeys > 0);
+
+ node->sort = makeNode(SimpleSort);
+ node->sort->numCols = numsortkeys;
+ node->sort->sortColIdx = sortColIdx;
+ node->sort->sortOperators = sortOperators;
+ node->sort->sortCollations = collations;
+ node->sort->nullsFirst = nullsFirst;
+ }
+
+ plan->qual = NIL;
+ plan->targetlist = lefttree->targetlist;
+ plan->lefttree = lefttree;
+ plan->righttree = NULL;
+ copy_plan_costsize(plan, lefttree);
+
+ node->cursor = get_internal_cursor();
+ node->unique = 0;
+ return node;
+}
+#endif /* XCP */
+
+
ForeignScan *
make_foreignscan(List *qptlist,
List *qpqual,
case T_MergeAppend:
case T_RecursiveUnion:
return false;
+#ifdef XCP
+ /*
+ * A remote subplan may push down the projection to the data nodes if
+ * it does not perform a merge sort
+ */
+ case T_RemoteSubplan:
+ return ((RemoteSubplan *) plan)->sort == NULL &&
+ is_projection_capable_plan(plan->lefttree);
+#endif
+ case T_ProjectSet:
+
+ /*
+ * Although ProjectSet certainly projects, say "no" because we
+ * don't want the planner to randomly replace its tlist with
+ * something else; the SRFs have to stay at top level. This might
+ * get relaxed later.
+ */
+ return false;
default:
break;
}
* scan all the rows anyway.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* planner.c
* The query optimizer external interface.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
static PathTarget *make_sort_input_target(PlannerInfo *root,
PathTarget *final_target,
bool *have_postponed_srfs);
+static bool equal_distributions(PlannerInfo *root, Distribution *dst1,
+ Distribution *dst2);
+static bool grouping_distribution_match(PlannerInfo *root, Query *parse,
+ Path *path, List *clauses);
+static bool groupingsets_distribution_match(PlannerInfo *root, Query *parse,
+ Path *path);
+static Path *adjust_path_distribution(PlannerInfo *root, Query *parse,
+ Path *path);
+static bool can_push_down_grouping(PlannerInfo *root, Query *parse, Path *path);
+static bool can_push_down_window(PlannerInfo *root, Path *path);
+ static void adjust_paths_for_srfs(PlannerInfo *root, RelOptInfo *rel,
+ List *targets, List *targets_contain_srfs);
+
/*****************************************************************************
*
ListCell *lp,
*lr;
-
- /* Cursor options may come from caller or from DECLARE CURSOR stmt */
- if (parse->utilityStmt &&
- IsA(parse->utilityStmt, DeclareCursorStmt))
- cursorOptions |= ((DeclareCursorStmt *) parse->utilityStmt)->options;
-
+#ifdef XCP
+ if (IS_PGXC_LOCAL_COORDINATOR && parse->utilityStmt &&
+ IsA(parse->utilityStmt, RemoteQuery))
+ return pgxc_direct_planner(parse, cursorOptions, boundParams);
+#endif
/*
* Set up global state for this planner invocation. This data is needed
* across all levels of sub-Query that might exist in the given command,
result->rowMarks = glob->finalrowmarks;
result->relationOids = glob->relationOids;
result->invalItems = glob->invalItems;
+#ifdef XCP
+ result->distributionType = LOCATOR_TYPE_NONE;
+ result->distributionKey = InvalidAttrNumber;
+ result->distributionNodes = NULL;
+#endif
result->nParamExec = glob->nParamExec;
+ /* utilityStmt should be null, but we might as well copy it */
+ result->utilityStmt = parse->utilityStmt;
+ result->stmt_location = parse->stmt_location;
+ result->stmt_len = parse->stmt_len;
return result;
}
memset(root->upper_targets, 0, sizeof(root->upper_targets));
root->processed_tlist = NIL;
root->grouping_map = NULL;
+ root->recursiveOk = true;
+
root->minmax_aggs = NIL;
+ root->qual_security_level = 0;
root->hasInheritedTarget = false;
root->hasRecursion = hasRecursion;
if (hasRecursion)
*/
CheckSelectLocking(parse, ((RowMarkClause *)
linitial(parse->rowMarks))->strength);
- Bitmapset *baserels = get_base_rel_indexes((Node *) parse->jointree);
+
+ if (parse->jointree)
+ {
++ Bitmapset *baserels = get_relids_in_jointree((Node *)
++ parse->jointree, false);
+ int x, num_rels = 0;
+ bool dist_found = false;
+
+ while ((x = bms_first_member(baserels)) >= 0)
+ {
+ RangeTblEntry *rte = rt_fetch(x, parse->rtable);
+ RelationLocInfo *locinfo = NULL;
+ if (OidIsValid(rte->relid))
+ locinfo = GetRelationLocInfo(rte->relid);
+ if (locinfo && !IsRelationReplicated(locinfo))
+ dist_found = true;
+ num_rels++;
+ }
+
+ if (dist_found && num_rels > 1)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("%s is not allowed with joins",
+ LCS_asString(((RowMarkClause *)
+ linitial(parse->rowMarks))->strength))));
+ }
}
else
{
if (can_hash)
{
- hashaggtablesize = estimate_hashagg_tablesize(cheapest_path,
- agg_costs,
- dNumGroups);
-
- /*
- * Provided that the estimated size of the hashtable does not exceed
- * work_mem, we'll generate a HashAgg Path, although if we were unable
- * to sort above, then we'd better generate a Path, so that we at
- * least have one.
- */
- if (hashaggtablesize < work_mem * 1024L ||
- grouped_rel->pathlist == NIL)
+ if (parse->groupingSets)
{
- /* Don't mess with the cheapest path directly. */
- Path *path = cheapest_path;
-
/*
- * If the grouping can't be fully pushed down, we'll push down the
- * first phase of the aggregate, and redistribute only the partial
- * results.
- *
- * If if can be pushed down, disable construction of complex
- * distributed paths.
+ * Try for a hash-only groupingsets path over unsorted input.
*/
- if (! can_push_down_grouping(root, parse, path))
- path = create_remotesubplan_path(root, path, NULL);
- else
- try_distributed_aggregation = false;
+ consider_groupingsets_paths(root, grouped_rel,
+ cheapest_path, false, true, target,
+ gd, agg_costs, dNumGroups);
+ }
+ else
+ {
+ hashaggtablesize = estimate_hashagg_tablesize(cheapest_path,
+ agg_costs,
+ dNumGroups);
/*
- * We just need an Agg over the cheapest-total input path, since
- * input order won't matter.
+ * Provided that the estimated size of the hashtable does not
+ * exceed work_mem, we'll generate a HashAgg Path, although if we
+ * were unable to sort above, then we'd better generate a Path, so
+ * that we at least have one.
*/
- add_path(grouped_rel, (Path *)
- create_agg_path(root, grouped_rel,
- path,
- target,
- AGG_HASHED,
- AGGSPLIT_SIMPLE,
- parse->groupClause,
- (List *) parse->havingQual,
- agg_costs,
- dNumGroups));
+ if (hashaggtablesize < work_mem * 1024L ||
+ grouped_rel->pathlist == NIL)
+ {
++ /* Don't mess with the cheapest path directly. */
++ Path *path = cheapest_path;
++
++ /*
++ * If the grouping can't be fully pushed down, we'll push down the
++ * first phase of the aggregate, and redistribute only the partial
++ * results.
++ *
++ * If if can be pushed down, disable construction of complex
++ * distributed paths.
++ */
++ if (! can_push_down_grouping(root, parse, path))
++ path = create_remotesubplan_path(root, path, NULL);
++ else
++ try_distributed_aggregation = false;
++
+ /*
+ * We just need an Agg over the cheapest-total input path,
+ * since input order won't matter.
+ */
+ add_path(grouped_rel, (Path *)
+ create_agg_path(root, grouped_rel,
- cheapest_path,
++ path,
+ target,
+ AGG_HASHED,
+ AGGSPLIT_SIMPLE,
+ parse->groupClause,
+ (List *) parse->havingQual,
+ agg_costs,
+ dNumGroups));
+ }
}
/*
}
}
- /* Give a helpful error if we failed to find any implementation */
- if (grouped_rel->pathlist == NIL)
- ereport(ERROR,
- (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
- errmsg("could not implement GROUP BY"),
- errdetail("Some of the datatypes only support hashing, while others only support sorting.")));
-
- /*
- * If there is an FDW that's responsible for all baserels of the query,
- * let it consider adding ForeignPaths.
- */
- if (grouped_rel->fdwroutine &&
- grouped_rel->fdwroutine->GetForeignUpperPaths)
- grouped_rel->fdwroutine->GetForeignUpperPaths(root, UPPERREL_GROUP_AGG,
- input_rel, grouped_rel);
-
- /* Let extensions possibly add some more paths */
- if (create_upper_paths_hook)
- (*create_upper_paths_hook) (root, UPPERREL_GROUP_AGG,
- input_rel, grouped_rel);
-
- /* Now choose the best path(s) */
- set_cheapest(grouped_rel);
-
- /*
- * We've been using the partial pathlist for the grouped relation to hold
- * partially aggregated paths, but that's actually a little bit bogus
- * because it's unsafe for later planning stages -- like ordered_rel ---
- * to get the idea that they can use these partial paths as if they didn't
- * need a FinalizeAggregate step. Zap the partial pathlist at this stage
- * so we don't get confused.
- */
- grouped_rel->partial_pathlist = NIL;
-
- return grouped_rel;
-}
-
-
-/*
- * For a given input path, consider the possible ways of doing grouping sets on
- * it, by combinations of hashing and sorting. This can be called multiple
- * times, so it's important that it not scribble on input. No result is
- * returned, but any generated paths are added to grouped_rel.
- */
-static void
-consider_groupingsets_paths(PlannerInfo *root,
- RelOptInfo *grouped_rel,
- Path *path,
- bool is_sorted,
- bool can_hash,
- PathTarget *target,
- grouping_sets_data *gd,
- const AggClauseCosts *agg_costs,
- double dNumGroups)
-{
- Query *parse = root->parse;
+ /* Generate XL aggregate paths, with distributed 2-phase aggregation. */
/*
- * If we're not being offered sorted input, then only consider plans that
- * can be done entirely by hashing.
+ * If there were no partial paths, we did not initialize any of the
+ * partial paths above. If that's the case, initialize here.
*
- * We can hash everything if it looks like it'll fit in work_mem. But if
- * the input is actually sorted despite not being advertised as such, we
- * prefer to make use of that in order to use less memory.
+ * XXX The reason why the initialization block at the beginning is not
+ * simply performed unconditionally is that we may skip it if we've been
+ * successful in fully pushing down any of the aggregates, and entirely
+ * skip generating the XL paths.
*
- * If none of the grouping sets are sortable, then ignore the work_mem
- * limit and generate a path anyway, since otherwise we'll just fail.
+ * XXX Can we simply use the same estimates as regular partial aggregates,
+ * or do we need to invent something else? It might be a better idea to
+ * use estimates for the whole result here (e.g. total number of groups)
+ * instead of the partial ones. Underestimates often have more severe
+ * consequences (e.g. OOM with HashAggregate) than overestimates, so this
+ * seems like a more defensive approach.
+ *
+ * XXX After thinking a bit more about the estimation, it may depend on
+ * pushdown - if the aggregate is fully pushed down (as above), we can
+ * probably use dNumGroups/numberOfNodes as a cardinality estimate, as
+ * we know the per-node groupings won't overlap. But here we need to be
+ * more careful.
*/
- if (!is_sorted)
+ if (try_distributed_aggregation)
{
- List *new_rollups = NIL;
- RollupData *unhashed_rollup = NULL;
- List *sets_data;
- List *empty_sets_data = NIL;
- List *empty_sets = NIL;
- ListCell *lc;
- ListCell *l_start = list_head(gd->rollups);
- AggStrategy strat = AGG_HASHED;
- Size hashsize;
- double exclude_groups = 0.0;
-
- Assert(can_hash);
-
- if (pathkeys_contained_in(root->group_pathkeys, path->pathkeys))
- {
- unhashed_rollup = lfirst(l_start);
- exclude_groups = unhashed_rollup->numGroups;
- l_start = lnext(l_start);
- }
+ partial_grouping_target = make_partial_grouping_target(root, target);
- hashsize = estimate_hashagg_tablesize(path,
- agg_costs,
- dNumGroups - exclude_groups);
+ /* Estimate number of partial groups. */
+ dNumPartialGroups = get_number_of_groups(root,
+ cheapest_path->rows,
- NIL,
- NIL);
++ gd);
/*
- * gd->rollups is empty if we have only unsortable columns to work
- * with. Override work_mem in that case; otherwise, we'll rely on the
- * sorted-input case to generate usable mixed paths.
+ * Collect statistics about aggregates for estimating costs of
+ * performing aggregation in parallel.
*/
- if (hashsize > work_mem * 1024L && gd->rollups)
- return; /* nope, won't fit */
+ MemSet(&agg_partial_costs, 0, sizeof(AggClauseCosts));
+ MemSet(&agg_final_costs, 0, sizeof(AggClauseCosts));
+ if (parse->hasAggs)
+ {
+ /* partial phase */
+ get_agg_clause_costs(root, (Node *) partial_grouping_target->exprs,
+ AGGSPLIT_INITIAL_SERIAL,
+ &agg_partial_costs);
+
+ /* final phase */
+ get_agg_clause_costs(root, (Node *) target->exprs,
+ AGGSPLIT_FINAL_DESERIAL,
+ &agg_final_costs);
+ get_agg_clause_costs(root, parse->havingQual,
+ AGGSPLIT_FINAL_DESERIAL,
+ &agg_final_costs);
+ }
+ }
+ /* Build final XL grouping paths */
+ if (can_sort && try_distributed_aggregation)
+ {
/*
- * We need to burst the existing rollups list into individual grouping
- * sets and recompute a groupClause for each set.
+ * Use any available suitably-sorted path as input, and also consider
+ * sorting the cheapest-total path.
*/
- sets_data = list_copy(gd->unsortable_sets);
-
- for_each_cell(lc, l_start)
+ foreach(lc, input_rel->pathlist)
{
- RollupData *rollup = lfirst(lc);
+ Path *path = (Path *) lfirst(lc);
+ bool is_sorted;
+
+ is_sorted = pathkeys_contained_in(root->group_pathkeys,
- path->pathkeys);
++ path->pathkeys);
+
+ /*
+ * XL: Can it happen that the cheapest path can't be pushed down,
+ * while some other path could be? Perhaps we should move the check
+ * if a path can be pushed down up, and add another OR condition
+ * to consider all paths that can be pushed down?
+ *
+ * if (path == cheapest_path || is_sorted || can_push_down)
+ */
+ if (path == cheapest_path || is_sorted)
+ {
+ /*
+ * We can't really beat paths that we managed to fully push
+ * down above, so we can skip them entirely.
+ *
+ * XXX Not constructing any paths, so we can do this before
+ * adding the Sort path.
+ */
+ if (can_push_down_grouping(root, parse, path))
+ continue;
+
+ /* Sort the cheapest-total path if it isn't already sorted */
+ if (!is_sorted)
+ path = (Path *) create_sort_path(root,
+ grouped_rel,
+ path,
+ root->group_pathkeys,
+ -1.0);
+
+ /* Now decide what to stick atop it */
+ if (parse->groupingSets)
+ {
+ /*
+ * TODO 2-phase aggregation for grouping sets paths not
+ * supported yet, but this is the place where such paths
+ * should be constructed.
+ */
+ }
+ else if (parse->hasAggs)
+ {
+ /*
+ * We have aggregation, possibly with plain GROUP BY. Make
+ * an AggPath.
+ */
+
+ path = (Path *) create_agg_path(root,
+ grouped_rel,
+ path,
+ partial_grouping_target,
+ parse->groupClause ? AGG_SORTED : AGG_PLAIN,
+ AGGSPLIT_INITIAL_SERIAL,
+ parse->groupClause,
+ NIL,
+ &agg_partial_costs,
+ dNumPartialGroups);
+
+ path = create_remotesubplan_path(root, path, NULL);
+
+ /*
+ * We generate two paths, differing in the second phase
+ * implementation (sort and hash).
+ */
+
+ add_path(grouped_rel, (Path *)
+ create_agg_path(root,
+ grouped_rel,
+ path,
+ target,
+ parse->groupClause ? AGG_SORTED : AGG_PLAIN,
+ AGGSPLIT_FINAL_DESERIAL,
+ parse->groupClause,
+ (List *) parse->havingQual,
+ &agg_final_costs,
+ dNumGroups));
+
+ if (can_hash)
+ add_path(grouped_rel, (Path *)
+ create_agg_path(root,
+ grouped_rel,
+ path,
+ target,
+ AGG_HASHED,
+ AGGSPLIT_FINAL_DESERIAL,
+ parse->groupClause,
+ (List *) parse->havingQual,
+ &agg_final_costs,
+ dNumGroups));
+ }
+ else if (parse->groupClause)
+ {
+ /*
+ * We have GROUP BY without aggregation or grouping sets.
+ * Make a GroupPath.
+ */
+ path = (Path *) create_group_path(root,
+ grouped_rel,
+ path,
+ partial_grouping_target,
+ parse->groupClause,
+ NIL,
+ dNumPartialGroups);
+
+ path = create_remotesubplan_path(root, path, NULL);
+
+ add_path(grouped_rel, (Path *)
+ create_group_path(root,
+ grouped_rel,
+ path,
+ target,
+ parse->groupClause,
+ (List *) parse->havingQual,
+ dNumGroups));
+
+ }
+ else
+ {
+ /* Other cases should have been handled above */
+ Assert(false);
+ }
+ }
+ }
+
+ /*
+ * So far we've only constructed simple paths combining partial and
+ * distributed aggregate paths, i.e.
+ *
+ * Finalize -> RemoteSubplan -> Gather -> Partial
+ *
+ * It may however be more efficient to reduce the amount of data
+ * transferred over the network by generating paths like this:
+ *
+ * Finalize -> RemoteSubplan -> Combine -> Gather -> Partial
+ *
+ * where Combine deserializes the aggstates, combines them and then
+ * serializes them again. This AggSplit case is not defined yet, but
+ * should not be hard to add.
+ *
+ * We only want to do this for partial paths with RemoteSubplan on
+ * top of them, i.e. when the whole aggregate was not pushed down.
+ *
+ * XXX Gather output is never sorted, so we can only bother with the
+ * cheapest partial path here (just like above).
+ *
+ * XXX This only generates paths with both the combine and finalize
+ * steps using the same implementation (sort+sort or hash+hash). Maybe
+ * we should relax that, and allow hash+sort or sort+hash?
+ *
+ * XXX grouped_rel->partial_pathlist may be empty here, if the planner
+ * did not consider parallel paths (try_parallel_aggregation=false).
+ * But that's OK - we only want to put the combine on top of a Gather,
+ * so if there's none we're done.
+ *
+ * XXX The "combine" paths seem not to be picked up, most likely
+ * because of bad costing, not reflecting the reduction in number of
+ * rows transferred over the network.
+ */
+ if (grouped_rel->partial_pathlist)
+ {
+ Path *path = (Path *) linitial(grouped_rel->partial_pathlist);
+ double total_groups = path->rows * path->parallel_workers;
+
+ /* We don't care about paths that were fully pushed down. */
+ if (! can_push_down_grouping(root, parse, path))
+ {
+ path = (Path *) create_gather_path(root,
+ grouped_rel,
+ path,
+ partial_grouping_target,
+ NULL,
+ &total_groups);
+
+ /*
+ * Gather is always unsorted, so we'll need to sort, unless
+ * there's no GROUP BY clause, in which case there will only be a
+ * single group.
+ */
+ if (parse->groupClause)
+ path = (Path *) create_sort_path(root,
+ grouped_rel,
+ path,
+ root->group_pathkeys,
+ -1.0);
+
+ /* Intermediate combine phase. */
+ if (parse->hasAggs)
+ {
+ path = (Path *) create_agg_path(root,
+ grouped_rel,
+ path,
+ target,
+ parse->groupClause ? AGG_SORTED : AGG_PLAIN,
+ AGGSPLIT_COMBINE,
+ parse->groupClause,
+ (List *) parse->havingQual,
+ &agg_final_costs,
+ dNumGroups);
+
+ path = create_remotesubplan_path(root, path, NULL);
+
+ add_path(grouped_rel, (Path *)
+ create_agg_path(root,
+ grouped_rel,
+ path,
+ target,
+ parse->groupClause ? AGG_SORTED : AGG_PLAIN,
+ AGGSPLIT_FINAL_DESERIAL,
+ parse->groupClause,
+ (List *) parse->havingQual,
+ &agg_final_costs,
+ dNumGroups));
+ }
+ else
+ {
+ path = (Path *) create_group_path(root,
+ grouped_rel,
+ path,
+ target,
+ parse->groupClause,
+ (List *) parse->havingQual,
+ dNumGroups);
+
+ path = create_remotesubplan_path(root, path, NULL);
+
+ add_path(grouped_rel, (Path *)
+ create_group_path(root,
+ grouped_rel,
+ path,
+ target,
+ parse->groupClause,
+ (List *) parse->havingQual,
+ dNumGroups));
+ }
+ }
+ }
+ }
+
+ if (can_hash && try_distributed_aggregation)
+ {
+ hashaggtablesize = estimate_hashagg_tablesize(cheapest_path,
+ agg_costs,
+ dNumGroups);
+
+ /*
+ * Provided that the estimated size of the hashtable does not exceed
+ * work_mem, we'll generate a HashAgg Path, although if we were unable
+ * to sort above, then we'd better generate a Path, so that we at
+ * least have one.
+ */
+ if (hashaggtablesize < work_mem * 1024L ||
+ grouped_rel->pathlist == NIL)
+ {
+ /* If the whole aggregate was pushed down, we're done. */
+ if (! can_push_down_grouping(root, parse, cheapest_path))
+ {
+ Path *path, *agg_path;
+
+ path = (Path *) create_agg_path(root,
+ grouped_rel,
+ cheapest_path,
+ partial_grouping_target,
+ AGG_HASHED,
+ AGGSPLIT_INITIAL_SERIAL,
+ parse->groupClause,
+ NIL,
+ &agg_partial_costs,
+ dNumPartialGroups);
+
+ /* keep partially aggregated path for the can_sort branch */
+ agg_path = path;
+
+ path = create_remotesubplan_path(root, path, NULL);
+
+ /* Generate paths with both hash and sort second phase. */
+
+ add_path(grouped_rel, (Path *)
+ create_agg_path(root,
+ grouped_rel,
+ path,
+ target,
+ AGG_HASHED,
+ AGGSPLIT_FINAL_DESERIAL,
+ parse->groupClause,
+ (List *) parse->havingQual,
+ &agg_final_costs,
+ dNumGroups));
+
+ if (can_sort)
+ {
+ /*
+ * AGG_HASHED aggregate paths are always unsorted, so add
+ * a Sort node for the final AGG_SORTED step.
+ */
+ path = (Path *) create_sort_path(root,
+ grouped_rel,
+ agg_path,
+ root->group_pathkeys,
+ -1.0);
+
+ path = create_remotesubplan_path(root, path, NULL);
+
+ add_path(grouped_rel, (Path *)
+ create_agg_path(root,
+ grouped_rel,
+ path,
+ target,
+ parse->groupClause ? AGG_SORTED : AGG_PLAIN,
+ AGGSPLIT_FINAL_DESERIAL,
+ parse->groupClause,
+ (List *) parse->havingQual,
+ &agg_final_costs,
+ dNumGroups));
+ }
+ }
+ }
+
+ /*
+ * Generate a path with the extra combine phase.
+ *
+ * XXX See the comments in the block generating combine paths for
+ * the sorted case.
+ */
+ if (grouped_rel->partial_pathlist)
+ {
+ Path *path = (Path *) linitial(grouped_rel->partial_pathlist);
+
+ hashaggtablesize = estimate_hashagg_tablesize(path,
+ &agg_final_costs,
+ dNumGroups);
+
+ /*
+ * Ignore the path if the hash table won't fit into memory, or
+ * if we managed to push down the whole aggregation.
+ */
+ if ((hashaggtablesize < work_mem * 1024L) &&
+ (! can_push_down_grouping(root, parse, path)))
+ {
+ double total_groups = path->rows * path->parallel_workers;
+
+ path = (Path *) create_gather_path(root,
+ grouped_rel,
+ path,
+ partial_grouping_target,
+ NULL,
+ &total_groups);
+
+ path = (Path *) create_agg_path(root,
+ grouped_rel,
+ path,
+ target,
+ AGG_HASHED,
+ AGGSPLIT_COMBINE,
+ parse->groupClause,
+ (List *) parse->havingQual,
+ &agg_final_costs,
+ dNumGroups);
+
+ /* We know the full push down can't happen, so redistribute. */
+ path = create_remotesubplan_path(root, path, NULL);
+
+ add_path(grouped_rel, (Path *)
+ create_agg_path(root,
+ grouped_rel,
+ path,
+ target,
+ AGG_HASHED,
+ AGGSPLIT_FINAL_DESERIAL,
+ parse->groupClause,
+ (List *) parse->havingQual,
+ &agg_final_costs,
+ dNumGroups));
+ }
+ }
+ }
+
+ /* Give a helpful error if we failed to find any implementation */
+ if (grouped_rel->pathlist == NIL)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("could not implement GROUP BY"),
+ errdetail("Some of the datatypes only support hashing, while others only support sorting.")));
+
+ /*
+ * If there is an FDW that's responsible for all baserels of the query,
+ * let it consider adding ForeignPaths.
+ */
+ if (grouped_rel->fdwroutine &&
+ grouped_rel->fdwroutine->GetForeignUpperPaths)
+ grouped_rel->fdwroutine->GetForeignUpperPaths(root, UPPERREL_GROUP_AGG,
+ input_rel, grouped_rel);
+
+ /* Let extensions possibly add some more paths */
+ if (create_upper_paths_hook)
+ (*create_upper_paths_hook) (root, UPPERREL_GROUP_AGG,
+ input_rel, grouped_rel);
+
+ /* Now choose the best path(s) */
+ set_cheapest(grouped_rel);
++ /*
++ * We've been using the partial pathlist for the grouped relation to hold
++ * partially aggregated paths, but that's actually a little bit bogus
++ * because it's unsafe for later planning stages -- like ordered_rel --
++ * to get the idea that they can use these partial paths as if they didn't
++ * need a FinalizeAggregate step. Zap the partial pathlist at this stage
++ * so we don't get confused.
++ */
++ grouped_rel->partial_pathlist = NIL;
+
+ return grouped_rel;
+}
+
++
++/*
++ * For a given input path, consider the possible ways of doing grouping sets on
++ * it, by combinations of hashing and sorting. This can be called multiple
++ * times, so it's important that it not scribble on input. No result is
++ * returned, but any generated paths are added to grouped_rel.
++ */
++static void
++consider_groupingsets_paths(PlannerInfo *root,
++ RelOptInfo *grouped_rel,
++ Path *path,
++ bool is_sorted,
++ bool can_hash,
++ PathTarget *target,
++ grouping_sets_data *gd,
++ const AggClauseCosts *agg_costs,
++ double dNumGroups)
++{
++ Query *parse = root->parse;
++
++ /*
++ * If we're not being offered sorted input, then only consider plans that
++ * can be done entirely by hashing.
++ *
++ * We can hash everything if it looks like it'll fit in work_mem. But if
++ * the input is actually sorted despite not being advertised as such, we
++ * prefer to make use of that in order to use less memory.
++ *
++ * If none of the grouping sets are sortable, then ignore the work_mem
++ * limit and generate a path anyway, since otherwise we'll just fail.
++ */
++ if (!is_sorted)
++ {
++ List *new_rollups = NIL;
++ RollupData *unhashed_rollup = NULL;
++ List *sets_data;
++ List *empty_sets_data = NIL;
++ List *empty_sets = NIL;
++ ListCell *lc;
++ ListCell *l_start = list_head(gd->rollups);
++ AggStrategy strat = AGG_HASHED;
++ Size hashsize;
++ double exclude_groups = 0.0;
++
++ Assert(can_hash);
++
++ if (pathkeys_contained_in(root->group_pathkeys, path->pathkeys))
++ {
++ unhashed_rollup = lfirst(l_start);
++ exclude_groups = unhashed_rollup->numGroups;
++ l_start = lnext(l_start);
++ }
++
++ hashsize = estimate_hashagg_tablesize(path,
++ agg_costs,
++ dNumGroups - exclude_groups);
++
++ /*
++ * gd->rollups is empty if we have only unsortable columns to work
++ * with. Override work_mem in that case; otherwise, we'll rely on the
++ * sorted-input case to generate usable mixed paths.
++ */
++ if (hashsize > work_mem * 1024L && gd->rollups)
++ return; /* nope, won't fit */
++
++ /*
++ * We need to burst the existing rollups list into individual grouping
++ * sets and recompute a groupClause for each set.
++ */
++ sets_data = list_copy(gd->unsortable_sets);
++
++ for_each_cell(lc, l_start)
++ {
++ RollupData *rollup = lfirst(lc);
+
+ /*
+ * If we find an unhashable rollup that's not been skipped by the
+ * "actually sorted" check above, we can't cope; we'd need sorted
+ * input (with a different sort order) but we can't get that here.
+ * So bail out; we'll get a valid path from the is_sorted case
+ * instead.
+ *
+ * The mere presence of empty grouping sets doesn't make a rollup
+ * unhashable (see preprocess_grouping_sets), we handle those
+ * specially below.
+ */
+ if (!rollup->hashable)
+ return;
+ else
+ sets_data = list_concat(sets_data, list_copy(rollup->gsets_data));
+ }
+ foreach(lc, sets_data)
+ {
+ GroupingSetData *gs = lfirst(lc);
+ List *gset = gs->set;
+ RollupData *rollup;
+
+ if (gset == NIL)
+ {
+ /* Empty grouping sets can't be hashed. */
+ empty_sets_data = lappend(empty_sets_data, gs);
+ empty_sets = lappend(empty_sets, NIL);
+ }
+ else
+ {
+ rollup = makeNode(RollupData);
+
+ rollup->groupClause = preprocess_groupclause(root, gset);
+ rollup->gsets_data = list_make1(gs);
+ rollup->gsets = remap_to_groupclause_idx(rollup->groupClause,
+ rollup->gsets_data,
+ gd->tleref_to_colnum_map);
+ rollup->numGroups = gs->numGroups;
+ rollup->hashable = true;
+ rollup->is_hashed = true;
+ new_rollups = lappend(new_rollups, rollup);
+ }
+ }
+
+ /*
+ * If we didn't find anything nonempty to hash, then bail. We'll
+ * generate a path from the is_sorted case.
+ */
+ if (new_rollups == NIL)
+ return;
+
+ /*
+ * If there were empty grouping sets they should have been in the
+ * first rollup.
+ */
+ Assert(!unhashed_rollup || !empty_sets);
+
+ if (unhashed_rollup)
+ {
+ new_rollups = lappend(new_rollups, unhashed_rollup);
+ strat = AGG_MIXED;
+ }
+ else if (empty_sets)
+ {
+ RollupData *rollup = makeNode(RollupData);
+
+ rollup->groupClause = NIL;
+ rollup->gsets_data = empty_sets_data;
+ rollup->gsets = empty_sets;
+ rollup->numGroups = list_length(empty_sets);
+ rollup->hashable = false;
+ rollup->is_hashed = false;
+ new_rollups = lappend(new_rollups, rollup);
+ strat = AGG_MIXED;
+ }
+
+ add_path(grouped_rel, (Path *)
+ create_groupingsets_path(root,
+ grouped_rel,
+ path,
+ target,
+ (List *) parse->havingQual,
+ strat,
+ new_rollups,
+ agg_costs,
+ dNumGroups));
+ return;
+ }
+
+ /*
+ * If we have sorted input but nothing we can do with it, bail.
+ */
+ if (list_length(gd->rollups) == 0)
+ return;
+
+ /*
+ * Given sorted input, we try and make two paths: one sorted and one mixed
+ * sort/hash. (We need to try both because hashagg might be disabled, or
+ * some columns might not be sortable.)
+ *
+ * can_hash is passed in as false if some obstacle elsewhere (such as
+ * ordered aggs) means that we shouldn't consider hashing at all.
+ */
+ if (can_hash && gd->any_hashable)
+ {
+ List *rollups = NIL;
+ List *hash_sets = list_copy(gd->unsortable_sets);
+ double availspace = (work_mem * 1024.0);
+ ListCell *lc;
+
+ /*
+ * Account first for space needed for groups we can't sort at all.
+ */
+ availspace -= (double) estimate_hashagg_tablesize(path,
+ agg_costs,
+ gd->dNumHashGroups);
+
+ if (availspace > 0 && list_length(gd->rollups) > 1)
+ {
+ double scale;
+ int num_rollups = list_length(gd->rollups);
+ int k_capacity;
+ int *k_weights = palloc(num_rollups * sizeof(int));
+ Bitmapset *hash_items = NULL;
+ int i;
+
+ /*
+ * We treat this as a knapsack problem: the knapsack capacity
+ * represents work_mem, the item weights are the estimated memory
+ * usage of the hashtables needed to implement a single rollup,
+ * and we really ought to use the cost saving as the item value;
+ * however, currently the costs assigned to sort nodes don't
+ * reflect the comparison costs well, and so we treat all items as
+ * of equal value (each rollup we hash instead saves us one sort).
+ *
+ * To use the discrete knapsack, we need to scale the values to a
+ * reasonably small bounded range. We choose to allow a 5% error
+ * margin; we have no more than 4096 rollups in the worst possible
+ * case, which with a 5% error margin will require a bit over 42MB
+ * of workspace. (Anyone wanting to plan queries that complex had
+ * better have the memory for it. In more reasonable cases, with
+ * no more than a couple of dozen rollups, the memory usage will
+ * be negligible.)
+ *
+ * k_capacity is naturally bounded, but we clamp the values for
+ * scale and weight (below) to avoid overflows or underflows (or
+ * uselessly trying to use a scale factor less than 1 byte).
+ */
+ scale = Max(availspace / (20.0 * num_rollups), 1.0);
+ k_capacity = (int) floor(availspace / scale);
+
+ /*
+ * We leave the first rollup out of consideration since it's the
+ * one that matches the input sort order. We assign indexes "i"
+ * to only those entries considered for hashing; the second loop,
+ * below, must use the same condition.
+ */
+ i = 0;
+ for_each_cell(lc, lnext(list_head(gd->rollups)))
+ {
+ RollupData *rollup = lfirst(lc);
+
+ if (rollup->hashable)
+ {
+ double sz = estimate_hashagg_tablesize(path,
+ agg_costs,
+ rollup->numGroups);
+
+ /*
+ * If sz is enormous, but work_mem (and hence scale) is
+ * small, avoid integer overflow here.
+ */
+ k_weights[i] = (int) Min(floor(sz / scale),
+ k_capacity + 1.0);
+ ++i;
+ }
+ }
+
+ /*
+ * Apply knapsack algorithm; compute the set of items which
+ * maximizes the value stored (in this case the number of sorts
+ * saved) while keeping the total size (approximately) within
+ * capacity.
+ */
+ if (i > 0)
+ hash_items = DiscreteKnapsack(k_capacity, i, k_weights, NULL);
+
+ if (!bms_is_empty(hash_items))
+ {
+ rollups = list_make1(linitial(gd->rollups));
+
+ i = 0;
+ for_each_cell(lc, lnext(list_head(gd->rollups)))
+ {
+ RollupData *rollup = lfirst(lc);
+
+ if (rollup->hashable)
+ {
+ if (bms_is_member(i, hash_items))
+ hash_sets = list_concat(hash_sets,
+ list_copy(rollup->gsets_data));
+ else
+ rollups = lappend(rollups, rollup);
+ ++i;
+ }
+ else
+ rollups = lappend(rollups, rollup);
+ }
+ }
+ }
+
+ if (!rollups && hash_sets)
+ rollups = list_copy(gd->rollups);
+
+ foreach(lc, hash_sets)
+ {
+ GroupingSetData *gs = lfirst(lc);
+ RollupData *rollup = makeNode(RollupData);
+
+ Assert(gs->set != NIL);
+
+ rollup->groupClause = preprocess_groupclause(root, gs->set);
+ rollup->gsets_data = list_make1(gs);
+ rollup->gsets = remap_to_groupclause_idx(rollup->groupClause,
+ rollup->gsets_data,
+ gd->tleref_to_colnum_map);
+ rollup->numGroups = gs->numGroups;
+ rollup->hashable = true;
+ rollup->is_hashed = true;
+ rollups = lcons(rollup, rollups);
+ }
+
+ if (rollups)
+ {
+ add_path(grouped_rel, (Path *)
+ create_groupingsets_path(root,
+ grouped_rel,
+ path,
+ target,
+ (List *) parse->havingQual,
+ AGG_MIXED,
+ rollups,
+ agg_costs,
+ dNumGroups));
+ }
+ }
+
+ /*
+ * Now try the simple sorted case.
+ */
+ if (!gd->unsortable_sets)
+ add_path(grouped_rel, (Path *)
+ create_groupingsets_path(root,
+ grouped_rel,
+ path,
+ target,
+ (List *) parse->havingQual,
+ AGG_SORTED,
+ gd->rollups,
+ agg_costs,
+ dNumGroups));
+ }
+
/*
* create_window_paths
*
return (seqScanAndSortPath.total_cost < indexScanPath->path.total_cost);
}
+
+/*
+ * grouping_distribution_match
+ * Check if the path distribution matches grouping distribution.
+ *
+ * Grouping preserves distribution if the distribution key is one of the
+ * grouping keys (arbitrary one). In that case it's guaranteed that groups
+ * on different nodes do not overlap, and we can push the aggregation to
+ * remote nodes as a whole.
+ *
+ * Otherwise we need to either fetch all the data to the coordinator and
+ * perform the aggregation there, or use two-phase aggregation, with the
+ * first phase (partial aggregation) pushed down, and the second phase
+ * (combining and finalizing the results) executed on the coordinator.
+ *
+ * XXX This is used not only for plain aggregation, but also for various
+ * other paths, relying on grouping infrastructure (DISTINCT ON, UNIQUE).
+ */
+static bool
+grouping_distribution_match(PlannerInfo *root, Query *parse, Path *path,
+ List *clauses)
+{
+ int i;
+ bool matches_key = false;
+ Distribution *distribution = path->distribution;
+
+ int numGroupCols = list_length(clauses);
+ AttrNumber *groupColIdx = extract_grouping_cols(clauses,
+ parse->targetList);
+
+ /*
+ * With no explicit data distribution or replicated tables, we can simply
+ * push down the whole aggregation to the remote node, without any sort
+ * of redistribution. So consider this to be a match.
+ */
+ if ((distribution == NULL) ||
+ IsLocatorReplicated(distribution->distributionType))
+ return true;
+
+ /* But no distribution expression means 'no match'. */
+ if (distribution->distributionExpr == NULL)
+ return false;
+
+ /*
+ * With distributed data and table distributed using an expression, we
+ * need to check if the distribution expression matches one of the
+ * grouping keys (arbitrary one).
+ */
+ for (i = 0; i < numGroupCols; i++)
+ {
+ TargetEntry *te = (TargetEntry *)list_nth(parse->targetList,
+ groupColIdx[i]-1);
+
+ if (equal(te->expr, distribution->distributionExpr))
+ {
+ matches_key = true;
+ break;
+ }
+ }
+
+ return matches_key;
+}
+
+ /*
+ * get_partitioned_child_rels
+ * Returns a list of the RT indexes of the partitioned child relations
+ * with rti as the root parent RT index.
+ *
+ * Note: Only call this function on RTEs known to be partitioned tables.
+ */
+ List *
+ get_partitioned_child_rels(PlannerInfo *root, Index rti)
+ {
+ List *result = NIL;
+ ListCell *l;
+
+ foreach(l, root->pcinfo_list)
+ {
+ PartitionedChildRelInfo *pc = lfirst(l);
+
+ if (pc->parent_relid == rti)
+ {
+ result = pc->child_rels;
+ break;
+ }
+ }
+
+ /* The root partitioned table is included as a child rel */
+ Assert(list_length(result) >= 1);
+
+ return result;
+ }
++
++
+static bool
+groupingsets_distribution_match(PlannerInfo *root, Query *parse, Path *path)
+{
+ Distribution *distribution = path->distribution;
+
+ /*
+ * With no explicit data distribution or replicated tables, we can simply
+ * push down the whole grouping sets to the remote node, without any sort
+ * of redistribution. So consider this to be a match.
+ */
+ if ((distribution == NULL) ||
+ IsLocatorReplicated(distribution->distributionType))
+ return true;
+
+ return false;
+}
+
+/*
+ * equal_distributions
+ * Check that two distributions are equal.
+ *
+ * Distributions are considered equal if they are of the same type, on the
+ * same set of nodes, and if the distribution expressions are known to be equal
+ * (either the same expressions or members of the same equivalence class).
+ */
+static bool
+equal_distributions(PlannerInfo *root, Distribution *dst1,
+ Distribution *dst2)
+{
+ /* fast path */
+ if (dst1 == dst2)
+ return true;
+
+ if (dst1 == NULL || dst2 == NULL)
+ return false;
+
+ /* conditions easier to check go first */
+ if (dst1->distributionType != dst2->distributionType)
+ return false;
+
+ if (!bms_equal(dst1->nodes, dst2->nodes))
+ return false;
+
+ if (equal(dst1->distributionExpr, dst2->distributionExpr))
+ return true;
+
+ /*
+ * For more thorough expression check we need to ensure they both are
+ * defined
+ */
+ if (dst1->distributionExpr == NULL || dst2->distributionExpr == NULL)
+ return false;
+
+ /*
+ * More thorough check, but allows some important cases, like if
+ * distribution column is not updated (implicit set distcol=distcol) or
+ * set distcol = CONST, ... WHERE distcol = CONST - pattern used by many
+ * applications.
+ */
+ if (exprs_known_equal(root, dst1->distributionExpr, dst2->distributionExpr))
+ return true;
+
+ /* The restrictNodes field does not matter for distribution equality */
+ return false;
+}
+
+static Path *
+adjust_path_distribution(PlannerInfo *root, Query *parse, Path *path)
+{
+ /* if the root distribution is NULL, set it to path distribution */
+ if (!root->distribution)
+ {
+ root->distribution = path->distribution;
+ return path;
+ }
+
+ /* don't touch paths without distribution attached (catalogs etc.) */
+ if ((path->distribution == NULL) && (root->distribution == NULL))
+ return path;
+
+ if (equal_distributions(root, root->distribution, path->distribution))
+ {
+ if (IsLocatorReplicated(path->distribution->distributionType) &&
+ contain_volatile_functions((Node *) parse->targetList))
+ ereport(ERROR,
+ (errcode(ERRCODE_STATEMENT_TOO_COMPLEX),
+ errmsg("can not update replicated table with result of volatile function")));
+
+ /*
+ * Source tuple will be consumed on the same node where it is
+ * produced, so if it is known that some node does not yield tuples
+ * we do not want to send subquery for execution on these nodes
+ * at all. So copy the restriction to the external distribution.
+ *
+ * XXX Is that ever possible if external restriction is already
+ * defined? If yes we probably should use intersection of the sets,
+ * and if resulting set is empty create dummy plan and set it as
+ * the result_plan. Need to think this over
+ */
+ root->distribution->restrictNodes =
+ bms_copy(path->distribution->restrictNodes);
+ }
+ else
+ {
+ /*
+ * If the planned statement is either UPDATE or DELETE, different
+ * distributions here mean the ModifyTable node will be placed on
+ * top of RemoteSubquery.
+ *
+ * UPDATE and DELETE versions of ModifyTable use TID of incoming
+ * tuple to apply the changes, but the RemoteSubquery plan supplies
+ * RemoteTuples, without such field. Therefore we can't execute
+ * such plan and error-out.
+ *
+ * Most common example is when the UPDATE statement modifies the
+ * distribution column, or when a complex UPDATE or DELETE statement
+ * involves a join. It's difficult to determine the exact reason,
+ * but we assume the first one (correlated UPDATE) is more likely.
+ *
+ * There are two ways of fixing the UPDATE ambiguity:
+ *
+ * 1. Modify the planner to never consider redistribution of the
+ * target table. In this case the planner would find there's no way
+ * to plan the query, and it would throw error somewhere else, and
+ * we'd only be dealing with updates of distribution columns.
+ *
+ * 2. Modify executor to allow distribution column updates. However
+ * there are a lot of issues behind the scene when implementing that
+ * approach, and so it's unlikely to happen soon.
+ *
+ * DELETE statements may only fail because of complex joins.
+ */
+
+ if (parse->commandType == CMD_UPDATE)
+ ereport(ERROR,
+ (errcode(ERRCODE_STATEMENT_TOO_COMPLEX),
+ errmsg("could not plan this distributed update"),
+ errdetail("correlated UPDATE or updating distribution column currently not supported in Postgres-XL.")));
+
+ if (parse->commandType == CMD_DELETE)
+ ereport(ERROR,
+ (errcode(ERRCODE_STATEMENT_TOO_COMPLEX),
+ errmsg("could not plan this distributed delete"),
+ errdetail("correlated or complex DELETE is currently not supported in Postgres-XL.")));
+
+ /*
+ * We already know the distributions are not equal, but let's see if
+ * the redistribution is actually necessary. We can skip it if we
+ * already have Result path, and if the distribution is one of
+ *
+ * a) 'hash' restricted to a single node
+ * b) 'replicate' without volatile functions in the target list
+ *
+ * In those cases we don't need the RemoteSubplan.
+ *
+ * XXX Not sure what the (result_plan->lefttree == NULL) does.
+ * See planner.c:2730 in 9.5.
+ */
+ if (!(IsA(path, ResultPath) && /* FIXME missing (result_plan->lefttree == NULL) condition */
+ ((root->distribution->distributionType == 'H' && bms_num_members(root->distribution->restrictNodes) == 1) ||
+ (root->distribution->distributionType == 'R' && !contain_mutable_functions((Node *)parse->targetList)))))
+
+ path = create_remotesubplan_path(root, path, root->distribution);
+ }
+
+ return path;
+}
+
+static bool
+can_push_down_grouping(PlannerInfo *root, Query *parse, Path *path)
+{
+ /* only called when constructing grouping paths */
+ Assert(parse->hasAggs || parse->groupClause);
+
+ if (parse->groupingSets)
+ return groupingsets_distribution_match(root, parse, path);
+
+ return grouping_distribution_match(root, parse, path, parse->groupClause);
+}
+
+static bool
+can_push_down_window(PlannerInfo *root, Path *path)
+{
+ /* */
+ if (! path->distribution)
+ return true;
+
+ return false;
+}
* Post-processing of a completed plan tree: fix references to subplan
* vars, compute regproc values for operators, etc
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* subselect.c
* Planning routines for subselects and parameters.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
break;
case RTE_JOIN:
case RTE_CTE:
+#ifdef XCP
+ case RTE_REMOTE_DUMMY:
+#endif
+ case RTE_NAMEDTUPLESTORE:
/* these can't contain any lateral references */
break;
}
break;
case RTE_JOIN:
case RTE_CTE:
+#ifdef XCP
+ case RTE_REMOTE_DUMMY:
+#endif
+ case RTE_NAMEDTUPLESTORE:
/* these shouldn't be marked LATERAL */
Assert(false);
break;
* the tlists for child tables to keep expand_targetlist happy. We do it like
* that because it's faster in typical non-inherited cases.
*
- *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* append relations, and thenceforth share code with the UNION ALL case.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* pathnode.c
* Routines to manipulate pathlists and create path nodes
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
pathnode->indexorderbycols = indexorderbycols;
pathnode->indexscandir = indexscandir;
- cost_index(pathnode, root, loop_count);
+#ifdef XCP
+ set_scanpath_distribution(root, rel, (Path *) pathnode);
+ if (indexclauses)
+ {
+ ListCell *lc;
+ foreach (lc, indexclauses)
+ {
+ RestrictInfo *ri = (RestrictInfo *) lfirst(lc);
+ restrict_distribution(root, ri, (Path *) pathnode);
+ }
+ }
+#endif
+ cost_index(pathnode, root, loop_count, partial_path);
return pathnode;
}
pathnode->path.parallel_workers = parallel_workers;
pathnode->path.pathkeys = NIL; /* result is always considered
* unsorted */
+#ifdef XCP
+ /*
+ * Append path is used to implement scans of inherited tables and some
+ * "set" operations, like UNION ALL. While all inherited tables should
+ * have the same distribution, UNION'ed queries may have different.
+ * When paths being appended have the same distribution it is OK to push
+ * Append down to the data nodes. If not, perform "coordinator" Append.
+ */
+
+ /* Special case of the dummy relation, if the subpaths list is empty */
+ if (subpaths)
+ {
+ /* Take distribution of the first node */
+ l = list_head(subpaths);
+ subpath = (Path *) lfirst(l);
+ distribution = copyObject(subpath->distribution);
+ /*
+ * Check remaining subpaths, if all distributions equal to the first set
+ * it as a distribution of the Append path; otherwise make up coordinator
+ * Append
+ */
+ while ((l = lnext(l)))
+ {
+ subpath = (Path *) lfirst(l);
+
+ /*
+ * For Append and MergeAppend paths, we are most often dealing with
+ * different relations, appended together. So its very likely that
+ * the distribution for each relation will have a different varno.
+ * But we should be able to push down Append and MergeAppend as
+ * long as rest of the distribution information matches.
+ *
+ * equalDistribution() compares everything except the varnos
+ */
+ if (equalDistribution(distribution, subpath->distribution))
+ {
+ /*
+ * Both distribution and subpath->distribution may be NULL at
+ * this point, or they both are not null.
+ */
+ if (distribution && subpath->distribution->restrictNodes)
+ distribution->restrictNodes = bms_union(
+ distribution->restrictNodes,
+ subpath->distribution->restrictNodes);
+ }
+ else
+ {
+ break;
+ }
+ }
+ if (l)
+ {
+ List *newsubpaths = NIL;
+ foreach(l, subpaths)
+ {
+ subpath = (Path *) lfirst(l);
+ if (subpath->distribution)
+ subpath = redistribute_path(NULL, subpath, NIL,
+ LOCATOR_TYPE_NONE, NULL,
+ NULL, NULL);
+ newsubpaths = lappend(newsubpaths, subpath);
+ }
+ subpaths = newsubpaths;
+ pathnode->path.distribution = NULL;
+ }
+ else
+ pathnode->path.distribution = distribution;
+ }
+#endif
++
+ pathnode->partitioned_rels = list_copy(partitioned_rels);
pathnode->subpaths = subpaths;
/*
required_outer);
pathnode->path.parallel_aware = false;
pathnode->path.parallel_safe = false;
- pathnode->path.parallel_workers = subpath->parallel_workers;
+ pathnode->path.parallel_workers = 0;
pathnode->path.pathkeys = NIL; /* Gather has unordered result */
+ /* distribution is the same as in the subpath */
+ pathnode->path.distribution = (Distribution *) copyObject(subpath->distribution);
+
pathnode->subpath = subpath;
+ pathnode->num_workers = subpath->parallel_workers;
pathnode->single_copy = false;
- if (pathnode->path.parallel_workers == 0)
+ if (pathnode->num_workers == 0)
{
- pathnode->path.parallel_workers = 1;
pathnode->path.pathkeys = subpath->pathkeys;
+ pathnode->num_workers = 1;
pathnode->single_copy = true;
}
pathnode->innerjoinpath = inner_path;
pathnode->joinrestrictinfo = restrict_clauses;
- final_cost_nestloop(root, pathnode, workspace, sjinfo, semifactors);
+#ifdef XCP
+ pathnode->movedrestrictinfo = mclauses;
+
+ alternate = set_joinpath_distribution(root, pathnode);
+#endif
+ final_cost_nestloop(root, pathnode, workspace, extra);
- final_cost_nestloop(root, altpath, workspace, sjinfo, semifactors);
+#ifdef XCP
+ /*
+ * Also calculate costs of all alternates and return cheapest path
+ */
+ foreach(lc, alternate)
+ {
+ NestPath *altpath = (NestPath *) lfirst(lc);
++ final_cost_nestloop(root, altpath, workspace, extra);
+ if (altpath->path.total_cost < pathnode->path.total_cost)
+ pathnode = altpath;
+ }
+#endif
+
return pathnode;
}
pathnode->path_mergeclauses = mergeclauses;
pathnode->outersortkeys = outersortkeys;
pathnode->innersortkeys = innersortkeys;
+#ifdef XCP
+ alternate = set_joinpath_distribution(root, (JoinPath *) pathnode);
+#endif
+ /* pathnode->skip_mark_restore will be set by final_cost_mergejoin */
/* pathnode->materialize_inner will be set by final_cost_mergejoin */
- final_cost_mergejoin(root, pathnode, workspace, sjinfo);
+
+ final_cost_mergejoin(root, pathnode, workspace, extra);
- final_cost_mergejoin(root, altpath, workspace, sjinfo);
+#ifdef XCP
+ /*
+ * Also calculate costs of all alternates and return cheapest path
+ */
+ foreach(lc, alternate)
+ {
+ MergePath *altpath = (MergePath *) lfirst(lc);
++ final_cost_mergejoin(root, altpath, workspace, extra);
+ if (altpath->jpath.path.total_cost < pathnode->jpath.path.total_cost)
+ pathnode = altpath;
+ }
+#endif
+
return pathnode;
}
pathnode->jpath.innerjoinpath = inner_path;
pathnode->jpath.joinrestrictinfo = restrict_clauses;
pathnode->path_hashclauses = hashclauses;
+#ifdef XCP
+ alternate = set_joinpath_distribution(root, (JoinPath *) pathnode);
+#endif
/* final_cost_hashjoin will fill in pathnode->num_batches */
- final_cost_hashjoin(root, pathnode, workspace, sjinfo, semifactors);
+
+ final_cost_hashjoin(root, pathnode, workspace, extra);
- final_cost_hashjoin(root, altpath, workspace, sjinfo, semifactors);
+#ifdef XCP
+ /*
+ * Calculate costs of all alternates and return cheapest path
+ */
+ foreach(lc, alternate)
+ {
+ HashPath *altpath = (HashPath *) lfirst(lc);
++ final_cost_hashjoin(root, altpath, workspace, extra);
+ if (altpath->jpath.path.total_cost < pathnode->jpath.path.total_cost)
+ pathnode = altpath;
+ }
+#endif
+
return pathnode;
}
pathnode->path.parallel_workers = subpath->parallel_workers;
pathnode->subpath = subpath;
+ /* distribution is the same as in the subpath */
+ pathnode->path.distribution = (Distribution *) copyObject(subpath->distribution);
+
+ /*
+ * Simplify callers by downgrading AGG_SORTED to AGG_PLAIN, and AGG_MIXED
+ * to AGG_HASHED, here if possible.
+ */
+ if (aggstrategy == AGG_SORTED &&
+ list_length(rollups) == 1 &&
+ ((RollupData *) linitial(rollups))->groupClause == NIL)
+ aggstrategy = AGG_PLAIN;
+
+ if (aggstrategy == AGG_MIXED &&
+ list_length(rollups) == 1)
+ aggstrategy = AGG_HASHED;
+
/*
* Output will be in sorted order by group_pathkeys if, and only if, there
* is a single rollup operation on a non-empty list of grouping
rel,
bpath->bitmapqual,
required_outer,
- loop_count);
+ loop_count, 0);
}
case T_SubqueryScan:
+#ifdef XCP
+ {
+ SubqueryScanPath *spath = (SubqueryScanPath *) path;
+
+ return (Path *) create_subqueryscan_path(root,
+ rel,
+ spath->subpath,
+ spath->path.pathkeys,
+ required_outer,
+ path->distribution);
+ }
+#else
{
SubqueryScanPath *spath = (SubqueryScanPath *) path;
* routines for accessing the system catalogs
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
#include "parser/parse_relation.h"
#include "parser/parsetree.h"
#include "rewrite/rewriteManip.h"
+ #include "statistics/statistics.h"
#include "storage/bufmgr.h"
+ #include "utils/builtins.h"
#include "utils/lsyscache.h"
+ #include "utils/syscache.h"
#include "utils/rel.h"
#include "utils/snapmgr.h"
-
+#ifdef PGXC
+#include "pgxc/pgxc.h"
+#endif
/* GUC parameter */
int constraint_exclusion = CONSTRAINT_EXCLUSION_PARTITION;
* contain optimizable statements, which we should transform.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/backend/parser/analyze.c
return result;
}
- Node *parsetree = (Node *) lfirst(raw_parsetree_item);
- result = parse_analyze(parsetree, query, NULL, 0);
+#ifdef PGXC
+/*
+ * transformExecDirectStmt -
+ * transform an EXECUTE DIRECT Statement
+ *
+ * Handling depends on whether we should execute on the nodes or on the
+ * Coordinator.
+ * To execute on nodes we return CMD_UTILITY query having one T_RemoteQuery node
+ * with the inner statement as a sql_command.
+ * If statement is to run on Coordinator we should parse inner statement and
+ * analyze resulting query tree.
+ */
+static Query *
+transformExecDirectStmt(ParseState *pstate, ExecDirectStmt *stmt)
+{
+ Query *result = makeNode(Query);
+ char *query = stmt->query;
+ List *nodelist = stmt->node_names;
+ RemoteQuery *step = makeNode(RemoteQuery);
+ bool is_local = false;
+ List *raw_parsetree_list;
+ ListCell *raw_parsetree_item;
+ char *nodename;
+ int nodeIndex;
+ char nodetype;
+
+ /* Support not available on Datanodes */
+ if (IS_PGXC_DATANODE)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("EXECUTE DIRECT cannot be executed on a Datanode")));
+
+ /* Only a single target node is supported for now */
+ if (list_length(nodelist) > 1)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("Support for EXECUTE DIRECT on multiple nodes is not available yet")));
+
+ Assert(list_length(nodelist) == 1);
+ Assert(IS_PGXC_COORDINATOR);
+
+ /* There is a single element here */
+ nodename = strVal(linitial(nodelist));
+#ifdef XCP
+ /* Resolve node name to (type, index); PGXC_NODE_NONE means no such node */
+ nodetype = PGXC_NODE_NONE;
+ nodeIndex = PGXCNodeGetNodeIdFromName(nodename, &nodetype);
+ if (nodetype == PGXC_NODE_NONE)
+ ereport(ERROR,
+ (errcode(ERRCODE_UNDEFINED_OBJECT),
+ errmsg("PGXC Node %s: object not defined",
+ nodename)));
+#else
+ /*
+ * NOTE(review): 'nodeoid' is used here but is not declared anywhere in
+ * this function — confirm this non-XCP branch is never compiled, or add
+ * the missing 'Oid nodeoid;' declaration.
+ */
+ nodeoid = get_pgxc_nodeoid(nodename);
+
+ if (!OidIsValid(nodeoid))
+ ereport(ERROR,
+ (errcode(ERRCODE_UNDEFINED_OBJECT),
+ errmsg("PGXC Node %s: object not defined",
+ nodename)));
+
+ /* Get node type and index */
+ nodetype = get_pgxc_nodetype(nodeoid);
+ nodeIndex = PGXCNodeGetNodeId(nodeoid, get_pgxc_nodetype(nodeoid));
+#endif
+
+ /* Check whether the requested node is the self-node (this Coordinator) */
+ if (nodetype == PGXC_NODE_COORDINATOR && nodeIndex == PGXCNodeId - 1)
+ is_local = true;
+
+ /* Transform the query into a raw parse list */
+ raw_parsetree_list = pg_parse_query(query);
+
+ /* EXECUTE DIRECT can just be executed with a single query */
+ if (list_length(raw_parsetree_list) > 1)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("EXECUTE DIRECT cannot execute multiple queries")));
+
+ /*
+ * Analyze the Raw parse tree
+ * EXECUTE DIRECT is restricted to one-step usage
+ *
+ * The check above guarantees the list has exactly one element, so this
+ * loop runs once and 'result' ends up as the analyzed form of the single
+ * inner statement.
+ */
+ foreach(raw_parsetree_item, raw_parsetree_list)
+ {
++ RawStmt *parsetree = lfirst_node(RawStmt, raw_parsetree_item);
++ List *result_list = pg_analyze_and_rewrite(parsetree, query, NULL, 0, NULL);
++ result = linitial_node(Query, result_list);
+ }
+
+ /*
+ * Default list of parameters to set.
+ * NOTE(review): sql_statement is set to NULL here and unconditionally
+ * overwritten with pstrdup(query) just below; the NULL assignment is
+ * redundant but kept as-is.
+ */
+ step->sql_statement = NULL;
+ step->exec_nodes = makeNode(ExecNodes);
+ step->combine_type = COMBINE_TYPE_NONE;
+ step->sort = NULL;
+ step->read_only = true;
+ step->force_autocommit = false;
+ step->cursor = NULL;
+
+ /* This is needed by executor */
+ step->sql_statement = pstrdup(query);
+ if (nodetype == PGXC_NODE_COORDINATOR)
+ step->exec_type = EXEC_ON_COORDS;
+ else
+ step->exec_type = EXEC_ON_DATANODES;
+
+ /* Clear join/reduction bookkeeping; unused for a one-step direct query */
+ step->reduce_level = 0;
+ step->base_tlist = NIL;
+ step->outer_alias = NULL;
+ step->inner_alias = NULL;
+ step->outer_reduce_level = 0;
+ step->inner_reduce_level = 0;
+ step->outer_relids = NULL;
+ step->inner_relids = NULL;
+ step->inner_statement = NULL;
+ step->outer_statement = NULL;
+ step->join_condition = NULL;
+
+ /* Change the list of nodes that will be executed for the query and others */
+ step->force_autocommit = false;
+ step->combine_type = COMBINE_TYPE_SAME;
+ step->read_only = true;
+ step->exec_direct_type = EXEC_DIRECT_NONE;
+
+ /*
+ * Set up EXECUTE DIRECT flag: classify by locality first, then by the
+ * analyzed command type of the inner statement.
+ */
+ if (is_local)
+ {
+ if (result->commandType == CMD_UTILITY)
+ step->exec_direct_type = EXEC_DIRECT_LOCAL_UTILITY;
+ else
+ step->exec_direct_type = EXEC_DIRECT_LOCAL;
+ }
+ else
+ {
+ switch(result->commandType)
+ {
+ case CMD_UTILITY:
+ step->exec_direct_type = EXEC_DIRECT_UTILITY;
+ break;
+ case CMD_SELECT:
+ step->exec_direct_type = EXEC_DIRECT_SELECT;
+ break;
+ case CMD_INSERT:
+ step->exec_direct_type = EXEC_DIRECT_INSERT;
+ break;
+ case CMD_UPDATE:
+ step->exec_direct_type = EXEC_DIRECT_UPDATE;
+ break;
+ case CMD_DELETE:
+ step->exec_direct_type = EXEC_DIRECT_DELETE;
+ break;
+ default:
+ /* all analyzable command types are covered above */
+ Assert(0);
+ }
+ }
+
+ /* Build Execute Node list, there is a unique node for the time being */
+ step->exec_nodes->nodeList = lappend_int(step->exec_nodes->nodeList, nodeIndex);
+
+ /*
+ * For remote execution, wrap the RemoteQuery step as the utility
+ * statement of the returned Query; for local execution the analyzed
+ * query is returned unchanged and 'step' is left unattached.
+ */
+ if (!is_local)
+ result->utilityStmt = (Node *) step;
+
+ /*
+ * Reset the queryId since the caller would do that anyways.
+ */
+ result->queryId = 0;
+
+ return result;
+}
+
+#endif
/*
* Produce a string representation of a LockClauseStrength value.
* gram.y
* POSTGRESQL BISON rules/actions
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
*
* IDENTIFICATION
struct ImportQual *importqual;
InsertStmt *istmt;
VariableSetStmt *vsetstmt;
+/* PGXC_BEGIN */
+ struct StmtMulti *stmtmulti;
+ DistributeBy *distby;
+ PGXCSubCluster *subclus;
+/* PGXC_END */
+ PartitionElem *partelem;
+ PartitionSpec *partspec;
+ PartitionBoundSpec *partboundspec;
+ RoleSpec *rolespec;
}
%type <node> stmt schema_stmt
CreateUserStmt CreateUserMappingStmt CreateRoleStmt CreatePolicyStmt
CreatedbStmt DeclareCursorStmt DefineStmt DeleteStmt DiscardStmt DoStmt
DropGroupStmt DropOpClassStmt DropOpFamilyStmt DropPLangStmt DropStmt
- DropAssertStmt DropTrigStmt DropRuleStmt DropCastStmt DropRoleStmt
- DropPolicyStmt DropUserStmt DropdbStmt DropTableSpaceStmt DropFdwStmt
+ DropAssertStmt DropCastStmt DropRoleStmt
+ DropUserStmt DropdbStmt DropTableSpaceStmt
DropTransformStmt
- DropForeignServerStmt DropUserMappingStmt ExplainStmt ExecDirectStmt FetchStmt
- DropUserMappingStmt ExplainStmt FetchStmt
++ DropUserMappingStmt ExplainStmt ExecDirectStmt FetchStmt
GrantStmt GrantRoleStmt ImportForeignSchemaStmt IndexStmt InsertStmt
ListenStmt LoadStmt LockStmt NotifyStmt ExplainableStmt PreparableStmt
CreateFunctionStmt AlterFunctionStmt ReindexStmt RemoveAggrStmt
DeallocateStmt PrepareStmt ExecuteStmt
DropOwnedStmt ReassignOwnedStmt
AlterTSConfigurationStmt AlterTSDictionaryStmt
+ BarrierStmt PauseStmt AlterNodeStmt CreateNodeStmt DropNodeStmt
+ CreateNodeGroupStmt DropNodeGroupStmt
CreateMatViewStmt RefreshMatViewStmt CreateAmStmt
+ CreatePublicationStmt AlterPublicationStmt
+ CreateSubscriptionStmt AlterSubscriptionStmt DropSubscriptionStmt
%type <node> select_no_parens select_with_parens select_clause
simple_select values_clause
%type <windef> window_definition over_clause window_specification
opt_frame_clause frame_extent frame_bound
%type <str> opt_existing_window_name
+/* PGXC_BEGIN */
+%type <str> opt_barrier_id OptDistributeType DistributeStyle OptDistKey
+%type <distby> OptDistributeBy OptDistributeByInternal
+%type <subclus> OptSubCluster OptSubClusterInternal
+/* PGXC_END */
%type <boolean> opt_if_not_exists
+ %type <ival> generated_when override_kind
+ %type <partspec> PartitionSpec OptPartitionSpec
+ %type <str> part_strategy
+ %type <partelem> part_elem
+ %type <list> part_params
+ %type <partboundspec> ForValues
+ %type <node> partbound_datum PartitionRangeDatum
+ %type <list> partbound_datum_list range_datum_list
/*
* Non-keyword token types. These are hard-wired into the "flex" lexer.
*/
/* ordinary key words in alphabetical order */
+/* PGXC - added DISTRIBUTE, DISTRIBUTED, DISTSTYLE, DISTKEY, RANDOMLY, DIRECT, COORDINATOR, CLEAN, NODE, BARRIER */
%token <keyword> ABORT_P ABSOLUTE_P ACCESS ACTION ADD_P ADMIN AFTER
AGGREGATE ALL ALSO ALTER ALWAYS ANALYSE ANALYZE AND ANY ARRAY AS ASC
- ASSERTION ASSIGNMENT ASYMMETRIC AT ATTRIBUTE AUTHORIZATION
+ ASSERTION ASSIGNMENT ASYMMETRIC AT ATTACH ATTRIBUTE AUTHORIZATION
- BACKWARD BEFORE BEGIN_P BETWEEN BIGINT BINARY BIT
+ BACKWARD BARRIER BEFORE BEGIN_P BETWEEN BIGINT BINARY BIT
BOOLEAN_P BOTH BY
CACHE CALLED CASCADE CASCADED CASE CAST CATALOG_P CHAIN CHAR_P
- CHARACTER CHARACTERISTICS CHECK CHECKPOINT CLASS CLOSE
+ CHARACTER CHARACTERISTICS CHECK CHECKPOINT CLASS CLEAN CLOSE
- CLUSTER COALESCE COLLATE COLLATION COLUMN COMMENT COMMENTS COMMIT
- COMMITTED CONCURRENTLY CONFIGURATION CONFLICT CONNECTION CONSTRAINT CONSTRAINTS
- CONTENT_P CONTINUE_P CONVERSION_P COORDINATOR COPY COST CREATE
+ CLUSTER COALESCE COLLATE COLLATION COLUMN COLUMNS COMMENT COMMENTS COMMIT
+ COMMITTED CONCURRENTLY CONFIGURATION CONFLICT CONNECTION CONSTRAINT
- CONSTRAINTS CONTENT_P CONTINUE_P CONVERSION_P COPY COST CREATE
++ CONSTRAINTS CONTENT_P CONTINUE_P CONVERSION_P COORDINATOR COPY COST CREATE
CROSS CSV CUBE CURRENT_P
CURRENT_CATALOG CURRENT_DATE CURRENT_ROLE CURRENT_SCHEMA
CURRENT_TIME CURRENT_TIMESTAMP CURRENT_USER CURSOR CYCLE
DATA_P DATABASE DAY_P DEALLOCATE DEC DECIMAL_P DECLARE DEFAULT DEFAULTS
DEFERRABLE DEFERRED DEFINER DELETE_P DELIMITER DELIMITERS DEPENDS DESC
- /* PGXC_BEGIN */
- DICTIONARY DIRECT DISABLE_P DISCARD DISTINCT DISTKEY DISTRIBUTE DISTRIBUTED
- DISTSTYLE DO DOCUMENT_P DOMAIN_P DOUBLE_P
- /* PGXC_END */
- DROP
- DETACH DICTIONARY DISABLE_P DISCARD DISTINCT DO DOCUMENT_P DOMAIN_P
++ DETACH DICTIONARY DIRECT DISABLE_P DISCARD DISTINCT DISTKEY DISTRIBUTE DISTRIBUTED DISTSTYLE DO DOCUMENT_P DOMAIN_P
+ DOUBLE_P DROP
EACH ELSE ENABLE_P ENCODING ENCRYPTED END_P ENUM_P ESCAPE EVENT EXCEPT
EXCLUDE EXCLUDING EXCLUSIVE EXECUTE EXISTS EXPLAIN
MAPPING MATCH MATERIALIZED MAXVALUE METHOD MINUTE_P MINVALUE MODE MONTH_P MOVE
- NAME_P NAMES NATIONAL NATURAL NCHAR NEXT NO NODE NONE
- NAME_P NAMES NATIONAL NATURAL NCHAR NEW NEXT NO NONE
++ NAME_P NAMES NATIONAL NATURAL NCHAR NEW NEXT NO NODE NONE
NOT NOTHING NOTIFY NOTNULL NOWAIT NULL_P NULLIF
NULLS_P NUMERIC
- OBJECT_P OF OFF OFFSET OIDS ON ONLY OPERATOR OPTION OPTIONS OR
- ORDER ORDINALITY OUT_P OUTER_P OVER OVERLAPS OVERLAY OWNED OWNER
+ OBJECT_P OF OFF OFFSET OIDS OLD ON ONLY OPERATOR OPTION OPTIONS OR
+ ORDER ORDINALITY OUT_P OUTER_P OVER OVERLAPS OVERLAY OVERRIDING OWNED OWNER
- PARALLEL PARSER PARTIAL PARTITION PASSING PASSWORD PLACING PLANS POLICY
- POSITION PRECEDING PRECISION PRESERVE PREPARE PREPARED PRIMARY
+ PARALLEL PARSER PARTIAL PARTITION PASSING PASSWORD PAUSE PLACING PLANS POLICY
+ POSITION PRECEDING PRECISION PREFERRED PRESERVE PREPARE PREPARED PRIMARY
- PRIOR PRIVILEGES PROCEDURAL PROCEDURE PROGRAM
+ PRIOR PRIVILEGES PROCEDURAL PROCEDURE PROGRAM PUBLICATION
QUOTE
- RANDOMLY RANGE READ REAL REASSIGN RECHECK RECURSIVE REF REFERENCES REFRESH REINDEX
- RELATIVE_P RELEASE RENAME REPEATABLE REPLACE REPLICA
- RANGE READ REAL REASSIGN RECHECK RECURSIVE REF REFERENCES REFERENCING
++ RANDOMLY RANGE READ REAL REASSIGN RECHECK RECURSIVE REF REFERENCES REFERENCING
+ REFRESH REINDEX RELATIVE_P RELEASE RENAME REPEATABLE REPLACE REPLICA
RESET RESTART RESTRICT RETURNING RETURNS REVOKE RIGHT ROLE ROLLBACK ROLLUP
ROW ROWS RULE
}
;
- /* the thrashing around here is to discard "empty" statements... */
+ /*
+ * At top level, we wrap each stmt with a RawStmt node carrying start location
+ * and length of the stmt's text. Notice that the start loc/len are driven
+ * entirely from semicolon locations (@2). It would seem natural to use
+ * @1 or @3 to get the true start location of a stmt, but that doesn't work
+ * for statements that can start with empty nonterminals (opt_with_clause is
+ * the main offender here); as noted in the comments for YYLLOC_DEFAULT,
+ * we'd get -1 for the location in such cases.
+ * We also take care to discard empty statements entirely.
+ */
stmtmulti: stmtmulti ';' stmt
{
++ /*
++ * XXX PG10MERGE: Looks like support for obtaining raw
++ * query string for individual commands is added in PG10.
++ * If so, we can make use of the same infrastructure.
++ *
++ * XXX The following gives a compilation WARNING because
++ * stmtmulti is defined as a List in PG10, but we have our
++ * own definition.
++ */
+ if ($1 != NIL)
+ {
+ /* update length of previous stmt */
+ updateRawStmtEnd(llast_node(RawStmt, $1), @2);
+ }
+ if ($3 != NULL)
+ {
+ char *query;
+ ListCell *last;
+ /*
+ * Because of the way multi-commands are parsed by the
+ * parser, when the earlier command was parsed and
+ * reduced to a 'stmtmulti', we did not have the
+ * end-of-the-query marker. But now that we have seen
+ * the ';' token, add '\0' at the corresponding offset
+ * to get a separated command.
+ */
+ if ($1 != NULL)
+ {
+ last = list_tail($1->queries);
+ query = palloc(@2 - $1->offset + 1);
+ memcpy(query, lfirst(last), @2 - $1->offset);
+ query[@2 - $1->offset] = '\0';
+ lfirst(last) = query;
+
+ query = scanner_get_query(@3, -1, yyscanner);
+ $1->offset = @2;
+ $1->parsetrees = lappend($1->parsetrees, $3);
+ $1->queries = lappend($1->queries, query);
+ $$ = $1;
+ }
+ /*
+ *
+ * If the earlier statements were all null, then we
+ * must initialise the StmtMulti structure and make
+ * singleton lists
+ */
+ else
+ {
+ StmtMulti *n = (StmtMulti *) palloc0(sizeof (StmtMulti));
+ query = scanner_get_query(@3, -1, yyscanner);
+ n->offset = @2;
+ n->parsetrees = list_make1($3);
+ n->queries = list_make1(query);
+ $$ = n;
+ }
+ }
+ if ($3 != NULL)
+ $$ = lappend($1, makeRawStmt($3, @2 + 1));
else
$$ = $1;
}
| stmt
{
+ if ($1 != NULL)
+ {
+ StmtMulti *n = (StmtMulti *) palloc0(sizeof (StmtMulti));
+ char *query = scanner_get_query(@1, -1, yyscanner);
+
+ /*
+ * Keep track of the offset where $1 started. We don't
+ * have the offset where it ends so we copy the entire
+ * query to the end. If later, we find a ';' followed
+ * by another command, we'll add the '\0' at the
+ * appropriate offset
+ *
+ * XXX May be there is a better way to get the matching
+ * portion of the query string, but this does the trick
+ * for regression as well as the problem we are trying
+ * to solve with multi-command queries
+ */
+ n->offset = @1;
+
+ /*
+ * Collect both parsetree as well as the original query
+ * that resulted in the parsetree
+ */
+ n->parsetrees = list_make1($1);
+ n->queries = list_make1(query);
+ $$ = n;
+ }
+ if ($1 != NULL)
+ $$ = list_make1(makeRawStmt($1, 0));
else
- $$ = NIL;
+ $$ = NULL;
}
;
| CreateFunctionStmt
| CreateGroupStmt
| CreateMatViewStmt
+ | CreateNodeGroupStmt
+ | CreateNodeStmt
| CreateOpClassStmt
| CreateOpFamilyStmt
+ | CreatePublicationStmt
| AlterOpFamilyStmt
| CreatePolicyStmt
| CreatePLangStmt
| DoStmt
| DropAssertStmt
| DropCastStmt
- | DropFdwStmt
- | DropForeignServerStmt
| DropGroupStmt
+ | DropNodeGroupStmt
+ | DropNodeStmt
| DropOpClassStmt
| DropOpFamilyStmt
| DropOwnedStmt
*****************************************************************************/
CreateStmt: CREATE OptTemp TABLE qualified_name '(' OptTableElementList ')'
- OptInherit OptWith OnCommitOption OptTableSpace
+ OptInherit OptPartitionSpec OptWith OnCommitOption OptTableSpace
+/* PGXC_BEGIN */
+ OptDistributeBy OptSubCluster
+/* PGXC_END */
{
CreateStmt *n = makeNode(CreateStmt);
$4->relpersistence = $2;
n->relation = $4;
n->tableElts = $6;
n->inhRelations = $8;
+ n->partspec = $9;
n->ofTypename = NULL;
n->constraints = NIL;
- n->options = $9;
- n->oncommit = $10;
- n->tablespacename = $11;
+ n->options = $10;
+ n->oncommit = $11;
+ n->tablespacename = $12;
n->if_not_exists = false;
- n->distributeby = $12;
- n->subcluster = $13;
+/* PGXC_BEGIN */
+ if ($2 == RELPERSISTENCE_LOCAL_TEMP)
+ {
+ $4->relpersistence = RELPERSISTENCE_TEMP;
+ n->islocal = true;
+ }
+ n->relkind = RELKIND_RELATION;
++ n->distributeby = $13;
++ n->subcluster = $14;
+/* PGXC_END */
$$ = (Node *)n;
}
| CREATE OptTemp TABLE IF_P NOT EXISTS qualified_name '('
- OptTableElementList ')' OptInherit OptWith OnCommitOption
- OptTableSpace
+ OptTableElementList ')' OptInherit OptPartitionSpec OptWith
+ OnCommitOption OptTableSpace
+/* PGXC_BEGIN */
+ OptDistributeBy OptSubCluster
+/* PGXC_END */
{
CreateStmt *n = makeNode(CreateStmt);
$7->relpersistence = $2;
n->relation = $7;
n->tableElts = $9;
n->inhRelations = $11;
+ n->partspec = $12;
n->ofTypename = NULL;
n->constraints = NIL;
- n->options = $12;
- n->oncommit = $13;
- n->tablespacename = $14;
+ n->options = $13;
+ n->oncommit = $14;
+ n->tablespacename = $15;
n->if_not_exists = true;
- n->distributeby = $15;
- n->subcluster = $16;
+/* PGXC_BEGIN */
+ if ($2 == RELPERSISTENCE_LOCAL_TEMP)
+ {
+ $7->relpersistence = RELPERSISTENCE_TEMP;
+ n->islocal = true;
+ }
+ n->relkind = RELKIND_RELATION;
++ n->distributeby = $16;
++ n->subcluster = $17;
+ if (n->inhRelations != NULL && n->distributeby != NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("CREATE TABLE cannot contains both an INHERITS and a DISTRIBUTE BY clause"),
+ parser_errposition(exprLocation((Node *) n->distributeby))));
+/* PGXC_END */
$$ = (Node *)n;
}
| CREATE OptTemp TABLE qualified_name OF any_name
- OptTypedTableElementList OptWith OnCommitOption OptTableSpace
+ OptTypedTableElementList OptPartitionSpec OptWith OnCommitOption
+ OptTableSpace
+/* PGXC_BEGIN */
+ OptDistributeBy OptSubCluster
+/* PGXC_END */
{
CreateStmt *n = makeNode(CreateStmt);
$4->relpersistence = $2;
n->ofTypename = makeTypeNameFromNameList($6);
n->ofTypename->location = @6;
n->constraints = NIL;
- n->options = $8;
- n->oncommit = $9;
- n->tablespacename = $10;
+ n->options = $9;
+ n->oncommit = $10;
+ n->tablespacename = $11;
n->if_not_exists = false;
- n->distributeby = $11;
- n->subcluster = $12;
+/* PGXC_BEGIN */
+ if ($2 == RELPERSISTENCE_LOCAL_TEMP)
+ {
+ $4->relpersistence = RELPERSISTENCE_TEMP;
+ n->islocal = true;
+ }
+ n->relkind = RELKIND_RELATION;
++ n->distributeby = $12;
++ n->subcluster = $13;
+ if (n->inhRelations != NULL && n->distributeby != NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("CREATE TABLE cannot contains both an INHERITS and a DISTRIBUTE BY clause"),
+ parser_errposition(exprLocation((Node *) n->distributeby))));
+/* PGXC_END */
$$ = (Node *)n;
}
| CREATE OptTemp TABLE IF_P NOT EXISTS qualified_name OF any_name
- OptTypedTableElementList OptWith OnCommitOption OptTableSpace
+ OptTypedTableElementList OptPartitionSpec OptWith OnCommitOption
+ OptTableSpace
+/* PGXC_BEGIN */
+ OptDistributeBy OptSubCluster
+/* PGXC_END */
{
CreateStmt *n = makeNode(CreateStmt);
$7->relpersistence = $2;
n->ofTypename = makeTypeNameFromNameList($9);
n->ofTypename->location = @9;
n->constraints = NIL;
- n->options = $11;
- n->oncommit = $12;
- n->tablespacename = $13;
+ n->options = $12;
+ n->oncommit = $13;
+ n->tablespacename = $14;
n->if_not_exists = true;
- n->distributeby = $14;
- n->subcluster = $15;
+/* PGXC_BEGIN */
+ if ($2 == RELPERSISTENCE_LOCAL_TEMP)
+ {
+ $7->relpersistence = RELPERSISTENCE_TEMP;
+ n->islocal = true;
+ }
+ n->relkind = RELKIND_RELATION;
++ n->distributeby = $15;
++ n->subcluster = $16;
+ if (n->inhRelations != NULL && n->distributeby != NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("CREATE TABLE cannot contains both an INHERITS and a DISTRIBUTE BY clause"),
+ parser_errposition(exprLocation((Node *) n->distributeby))));
+/* PGXC_END */
$$ = (Node *)n;
}
+ | CREATE OptTemp TABLE qualified_name PARTITION OF qualified_name
+ OptTypedTableElementList ForValues OptPartitionSpec OptWith
+ OnCommitOption OptTableSpace
+ {
+ CreateStmt *n = makeNode(CreateStmt);
+ $4->relpersistence = $2;
+ n->relation = $4;
+ n->tableElts = $8;
+ n->inhRelations = list_make1($7);
+ n->partbound = $9;
+ n->partspec = $10;
+ n->ofTypename = NULL;
+ n->constraints = NIL;
+ n->options = $11;
+ n->oncommit = $12;
+ n->tablespacename = $13;
+ n->if_not_exists = false;
+ $$ = (Node *)n;
+ }
+ | CREATE OptTemp TABLE IF_P NOT EXISTS qualified_name PARTITION OF
+ qualified_name OptTypedTableElementList ForValues OptPartitionSpec
+ OptWith OnCommitOption OptTableSpace
+ {
+ CreateStmt *n = makeNode(CreateStmt);
+ $7->relpersistence = $2;
+ n->relation = $7;
+ n->tableElts = $11;
+ n->inhRelations = list_make1($10);
+ n->partbound = $12;
+ n->partspec = $13;
+ n->ofTypename = NULL;
+ n->constraints = NIL;
+ n->options = $14;
+ n->oncommit = $15;
+ n->tablespacename = $16;
+ n->if_not_exists = true;
+ $$ = (Node *)n;
+ }
;
/*
| ASSERTION
| ASSIGNMENT
| AT
+ | ATTACH
| ATTRIBUTE
| BACKWARD
+/* PGXC_BEGIN */
+ | BARRIER
+/* PGXC_END */
| BEFORE
| BEGIN_P
| BY
| CHARACTERISTICS
| CHECKPOINT
| CLASS
+ | CLEAN
| CLOSE
| CLUSTER
+ | COLUMNS
| COMMENT
| COMMENTS
| COMMIT
| DELIMITER
| DELIMITERS
| DEPENDS
+ | DETACH
| DICTIONARY
+ | DIRECT
| DISABLE_P
| DISCARD
+/* PGXC_BEGIN */
+ | DISTKEY
+ | DISTRIBUTE
+ | DISTRIBUTED
+ | DISTSTYLE
+/* PGXC_END */
| DOCUMENT_P
| DOMAIN_P
| DOUBLE_P
| MOVE
| NAME_P
| NAMES
+ | NEW
| NEXT
| NO
+ | NODE
| NOTHING
| NOTIFY
| NOWAIT
| PROCEDURAL
| PROCEDURE
| PROGRAM
+ | PUBLICATION
| QUOTE
+/* PGXC_BEGIN */
+ | RANDOMLY
+/* PGXC_END */
| RANGE
| READ
| REASSIGN
* parse_agg.c
* handle aggregates and window functions in parser
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* parse_relation.c
* parser support routines dealing with relations
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
#include "utils/lsyscache.h"
#include "utils/rel.h"
#include "utils/syscache.h"
+#ifdef XCP
+#include "utils/guc.h"
+#include "catalog/pg_statistic.h"
+#include "catalog/pg_namespace.h"
+#include "pgxc/pgxc.h"
+#include "miscadmin.h"
+#endif
+ #include "utils/varlena.h"
#define MAX_FUZZY_DISTANCE 3
* a quick copyObject() call before manipulating the query tree.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
* src/backend/parser/parse_utilcmd.c
*
#include "catalog/pg_opclass.h"
#include "catalog/pg_operator.h"
#include "catalog/pg_type.h"
+#ifdef XCP
+#include "catalog/pgxc_node.h"
+#endif
#include "commands/comment.h"
#include "commands/defrem.h"
+ #include "commands/sequence.h"
#include "commands/tablecmds.h"
#include "commands/tablespace.h"
#include "miscadmin.h"
List *alist; /* "after list" of things to do after creating
* the table */
IndexStmt *pkey; /* PRIMARY KEY index, if any */
+#ifdef PGXC
+ FallbackSrc fallback_source;
+ List *fallback_dist_cols;
+ DistributeBy *distributeby; /* original distribute by column of CREATE TABLE */
+ PGXCSubCluster *subcluster; /* original subcluster option of CREATE TABLE */
+#endif
+ bool ispartitioned; /* true if table is partitioned */
+ PartitionBoundSpec *partbound; /* transformed FOR VALUES */
} CreateStmtContext;
/* State shared by transformCreateSchemaStmt and its subroutines */
List *constraintList);
static void transformColumnType(CreateStmtContext *cxt, ColumnDef *column);
static void setSchemaName(char *context_schema, char **stmt_schema_name);
+#ifdef PGXC
+static void checkLocalFKConstraints(CreateStmtContext *cxt);
+#endif
+#ifdef XCP
+static List *transformSubclusterNodes(PGXCSubCluster *subcluster);
+static PGXCSubCluster *makeSubCluster(List *nodelist);
+#endif
+ static void transformPartitionCmd(CreateStmtContext *cxt, PartitionCmd *cmd);
+ static Const *transformPartitionBoundValue(ParseState *pstate, A_Const *con,
+ const char *colName, Oid colType, int32 colTypmod);
+
/*
* transformCreateStmt -
cxt.blist = NIL;
cxt.alist = NIL;
cxt.pkey = NULL;
+#ifdef PGXC
+ cxt.fallback_source = FBS_NONE;
+ cxt.fallback_dist_cols = NIL;
+ cxt.distributeby = stmt->distributeby;
+ cxt.subcluster = stmt->subcluster;
+#endif
+ cxt.ispartitioned = stmt->partspec != NULL;
/*
* Notice that we allow OIDs here only for plain tables, even though
char *snamespace;
char *sname;
char *qstring;
- A_Const *snamenode;
+ A_Const *snamenode;
TypeCast *castnode;
FuncCall *funccallnode;
- CreateSeqStmt *seqstmt;
- AlterSeqStmt *altseqstmt;
- List *attnamelist;
-
- /*
- * Determine namespace and name to use for the sequence.
- *
- * Although we use ChooseRelationName, it's not guaranteed that the
- * selected sequence name won't conflict; given sufficiently long
- * field names, two different serial columns in the same table could
- * be assigned the same sequence name, and we'd not notice since we
- * aren't creating the sequence quite yet. In practice this seems
- * quite unlikely to be a problem, especially since few people would
- * need two serial columns in one table.
- */
- if (cxt->rel)
- snamespaceid = RelationGetNamespace(cxt->rel);
- else
- {
- snamespaceid = RangeVarGetCreationNamespace(cxt->relation);
- RangeVarAdjustRelationPersistence(cxt->relation, snamespaceid);
- }
- snamespace = get_namespace_name(snamespaceid);
- sname = ChooseRelationName(cxt->relation->relname,
- column->colname,
- "seq",
- snamespaceid);
-
- ereport(DEBUG1,
- (errmsg("%s will create implicit sequence \"%s\" for serial column \"%s.%s\"",
- cxt->stmtType, sname,
- cxt->relation->relname, column->colname)));
-
- /*
- * Build a CREATE SEQUENCE command to create the sequence object, and
- * add it to the list of things to be done before this CREATE/ALTER
- * TABLE.
- */
- seqstmt = makeNode(CreateSeqStmt);
- seqstmt->sequence = makeRangeVar(snamespace, sname, -1);
- seqstmt->options = NIL;
- #ifdef PGXC
- seqstmt->is_serial = true;
- #endif
-
- /*
- * If this is ALTER ADD COLUMN, make sure the sequence will be owned
- * by the table's owner. The current user might be someone else
- * (perhaps a superuser, or someone who's only a member of the owning
- * role), but the SEQUENCE OWNED BY mechanisms will bleat unless table
- * and sequence have exactly the same owning role.
- */
- if (cxt->rel)
- seqstmt->ownerId = cxt->rel->rd_rel->relowner;
- else
- seqstmt->ownerId = InvalidOid;
-
- cxt->blist = lappend(cxt->blist, seqstmt);
-
- /*
- * Build an ALTER SEQUENCE ... OWNED BY command to mark the sequence
- * as owned by this column, and add it to the list of things to be
- * done after this CREATE/ALTER TABLE.
- */
- altseqstmt = makeNode(AlterSeqStmt);
- altseqstmt->sequence = makeRangeVar(snamespace, sname, -1);
- #ifdef PGXC
- altseqstmt->is_serial = true;
- #endif
- attnamelist = list_make3(makeString(snamespace),
- makeString(cxt->relation->relname),
- makeString(column->colname));
- altseqstmt->options = list_make1(makeDefElem("owned_by",
- (Node *) attnamelist));
+ Constraint *constraint;
- cxt->alist = lappend(cxt->alist, altseqstmt);
++ /* XXX XL 9.6 was setting stmt->is_serial. CHECK */
+ generateSerialExtraStmts(cxt, column,
+ column->typeName->typeOid, NIL, false,
+ &snamespace, &sname);
/*
* Create appropriate constraints for SERIAL. We do this in full,
cxt.blist = NIL;
cxt.alist = NIL;
cxt.pkey = NULL;
+#ifdef PGXC
+ cxt.fallback_source = FBS_NONE;
+ cxt.fallback_dist_cols = NIL;
+ cxt.distributeby = NULL;
+ cxt.subcluster = NULL;
+#endif
+ cxt.ispartitioned = (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE);
+ cxt.partbound = NULL;
/*
* The only subtypes that currently require parse transformation handling
*stmt_schema_name, context_schema)));
}
+#ifdef PGXC
+/*
+ * CheckLocalIndexColumn
+ *
+ * Checks whether or not the index can be safely enforced locally
+ */
+bool
+CheckLocalIndexColumn (char loctype, char *partcolname, char *indexcolname)
+{
+ if (IsLocatorReplicated(loctype))
+ /* always safe */
+ return true;
+ if (loctype == LOCATOR_TYPE_RROBIN)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_COLUMN_REFERENCE),
+ errmsg("Cannot locally enforce a unique index on round robin distributed table.")));
+ else if (loctype == LOCATOR_TYPE_HASH || loctype == LOCATOR_TYPE_MODULO)
+ {
+ if (partcolname && indexcolname && strcmp(partcolname, indexcolname) == 0)
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Given a relation, find the index of the attribute in the primary key
+ * which is the distribution key. Returns -1 if the table is not Hash/Modulo
+ * distributed, does not have a primary key, or the distribution key is not
+ * in the primary key (the last should not happen).
+ */
+static int
+find_relation_pk_dist_index(Relation rel)
+{
+ int result = -1;
+ List *indexoidlist;
+ ListCell *indexoidscan;
+ int partAttNum = InvalidAttrNumber;
+ bool pk_found = false;
+
+ if (rel->rd_locator_info)
+ partAttNum = rel->rd_locator_info->partAttrNum;
+
+ if (partAttNum == InvalidAttrNumber)
+ return -1;
+
+ /*
+ * Look up the primary key
+ */
+ indexoidlist = RelationGetIndexList(rel);
+
+ foreach(indexoidscan, indexoidlist)
+ {
+ Oid indexoid = lfirst_oid(indexoidscan);
+ HeapTuple indexTuple;
+ Form_pg_index indexForm;
+
+ indexTuple = SearchSysCache1(INDEXRELID,
+ ObjectIdGetDatum(indexoid));
+ if (!HeapTupleIsValid(indexTuple)) /* should not happen */
+ elog(ERROR, "cache lookup failed for index %u", indexoid);
+ indexForm = ((Form_pg_index) GETSTRUCT(indexTuple));
+ if (indexForm->indisprimary)
+ {
+ int i;
+
+ pk_found = true;
+
+ /*
+ * Loop over index attributes to find
+ * the distribution key
+ */
+ for (i = 0; i < indexForm->indnatts; i++)
+ {
+ if (indexForm->indkey.values[i] == partAttNum)
+ {
+ result = i;
+ break;
+ }
+ }
+ }
+ ReleaseSysCache(indexTuple);
+ if (pk_found)
+ break;
+ }
+
+ list_free(indexoidlist);
+
+ return result;
+}
+
+/*
+ * check to see if the constraint can be enforced locally
+ * if not, an error will be thrown
+ */
+static void
+checkLocalFKConstraints(CreateStmtContext *cxt)
+{
+ ListCell *fkclist;
+ List *nodelist = NIL;
+
+ if (cxt->subcluster)
+ nodelist = transformSubclusterNodes(cxt->subcluster);
+
+ foreach(fkclist, cxt->fkconstraints)
+ {
+ Constraint *constraint;
+ Oid pk_rel_id;
+ RelationLocInfo *rel_loc_info;
+ constraint = (Constraint *) lfirst(fkclist);
+
+ /*
+		 * If the constraint references the table itself, it is safe.
+ * Check if relation name is the same
+ * XCTODO: NO! It is only safe if table is replicated
+ * or distributed on primary key
+ */
+ if (constraint->pktable &&
+ strcmp(constraint->pktable->relname,cxt->relation->relname) == 0)
+ {
+ /* Is namespace also the same ? */
+ char *fkcon_schemaname = NULL;
+
+ if (!cxt->relation->schemaname &&
+ !constraint->pktable->schemaname)
+ continue;
+
+ if (!constraint->pktable->schemaname)
+ {
+ /* Schema name is not defined, look for current one */
+ List *search_path = fetch_search_path(false);
+ fkcon_schemaname = get_namespace_name(linitial_oid(search_path));
+ list_free(search_path);
+ }
+ else
+ fkcon_schemaname = constraint->pktable->schemaname;
+
+ /*
+ * If schema name and relation name are the same, table
+ * references to itself, so constraint is safe
+ */
+ if (fkcon_schemaname &&
+ strcmp(fkcon_schemaname,
+ cxt->relation->schemaname) == 0)
+ {
+ /* check if bad distribution is already defined */
+ if ((cxt->distributeby && cxt->distributeby->disttype != DISTTYPE_REPLICATION) ||
+ (cxt->isalter && cxt->rel->rd_locator_info != NULL && !IsLocatorReplicated(cxt->rel->rd_locator_info->locatorType)))
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("only replicated table can reference itself")));
+ /* Record that replication is required */
+ cxt->fallback_source = FBS_REPLICATE;
+ if (cxt->fallback_dist_cols)
+ {
+ list_free_deep(cxt->fallback_dist_cols);
+ cxt->fallback_dist_cols = NULL;
+ }
+ continue;
+ }
+ }
+
+ pk_rel_id = RangeVarGetRelid(constraint->pktable, NoLock, false);
+ rel_loc_info = GetRelationLocInfo(pk_rel_id);
+ /* If referenced table is replicated, the constraint is safe */
+ if (rel_loc_info == NULL || IsLocatorReplicated(rel_loc_info->locatorType))
+ {
+ List *common;
+
+ if (cxt->subcluster)
+ {
+ /*
+ * Distribution nodes are defined, they must be a subset of
+ * the referenced relation's nodes
+ */
+ common = list_intersection_int(nodelist, rel_loc_info->rl_nodeList);
+ if (list_length(common) < list_length(nodelist))
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("referenced table is not defined on all target nodes")));
+ list_free(common);
+ }
+ else
+ {
+ /* suggest distribution */
+ if (nodelist)
+ {
+ common = list_intersection_int(nodelist, rel_loc_info->rl_nodeList);
+ if (list_length(common) == 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("referenced tables is defined on different nodes")));
+ list_free(nodelist);
+ nodelist = common;
+ }
+ else
+ nodelist = rel_loc_info? list_copy(rel_loc_info->rl_nodeList):NIL;
+ }
+ }
+ else if (rel_loc_info->locatorType == LOCATOR_TYPE_RROBIN)
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("Cannot reference a round robin table in a foreign key constraint")));
+ }
+ else if (IsLocatorDistributedByValue(rel_loc_info->locatorType))
+ {
+ ListCell *fklc;
+ ListCell *pklc;
+ char ltype;
+ char *lattr;
+ bool found = false;
+ List *common;
+
+ /*
+ * First check nodes, they must be the same as in
+ * the referenced relation
+ */
+ if (cxt->subcluster)
+ {
+ common = list_intersection_int(nodelist, rel_loc_info->rl_nodeList);
+ if (list_length(common) != list_length(rel_loc_info->rl_nodeList) ||
+ list_length(common) != list_length(nodelist))
+ {
+ if (list_length(common) == 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("referenced HASH/MODULO table must be defined on same nodes")));
+ }
+ list_free(common);
+ }
+ else
+ {
+ if (nodelist)
+ {
+ common = list_intersection_int(nodelist, rel_loc_info->rl_nodeList);
+ if (list_length(common) != list_length(rel_loc_info->rl_nodeList))
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("referenced HASH/MODULO table must be defined on same nodes")));
+ list_free(nodelist);
+ nodelist = common;
+ }
+ else
+ nodelist = list_copy(rel_loc_info->rl_nodeList);
+ /* Now define the subcluster */
+ cxt->subcluster = makeSubCluster(nodelist);
+ }
+
+ if (cxt->distributeby)
+ {
+ ltype = ConvertToLocatorType(cxt->distributeby->disttype);
+ lattr = cxt->distributeby->colname;
+ }
+ else if (cxt->isalter)
+ {
+ if (cxt->rel->rd_locator_info == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("Hash/Modulo distribution column does not refer"
+ " to hash/modulo distribution column in referenced table.")));
+ ltype = cxt->rel->rd_locator_info->locatorType;
+ lattr = cxt->rel->rd_locator_info->partAttrName;
+ }
+ else
+ {
+ /*
+			 * Distribution is not defined yet, but we can define it now.
+ * The distribution must be the same as in referenced table,
+ * distribution keys must be matching fk/pk
+ */
+ /*
+ * Can not define distribution by value already
+ */
+ if (cxt->fallback_source == FBS_REPLICATE)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("Hash/Modulo distribution column does not refer"
+ " to hash/modulo distribution column in referenced table.")));
+ /* find the fk attribute matching the distribution column */
+ lattr = NULL;
+ if (list_length(constraint->pk_attrs) == 0)
+ {
+ /*
+ * PK attribute list may be missing, so FK must reference
+ * the primary table's primary key. The primary key may
+ * consist of multiple attributes, one of them is a
+ * distribution key. We should find the foreign attribute
+ * referencing that primary attribute and set it as the
+ * distribution key of the table.
+ */
+ int pk_attr_idx;
+ Relation rel;
+
+ rel = relation_open(pk_rel_id, AccessShareLock);
+ pk_attr_idx = find_relation_pk_dist_index(rel);
+ relation_close(rel, AccessShareLock);
+
+ if (pk_attr_idx >= 0 &&
+ pk_attr_idx < list_length(constraint->fk_attrs))
+ {
+ lattr = strVal(list_nth(constraint->fk_attrs, pk_attr_idx));
+ }
+ }
+ else
+ {
+ /*
+ * One of the primary attributes must be the primary
+				 * table's distribution key. We should find the foreign
+ * attribute referencing that primary attribute and set it
+ * as the distribution key of the table.
+ */
+ forboth(fklc, constraint->fk_attrs,
+ pklc, constraint->pk_attrs)
+ {
+ if (strcmp(rel_loc_info->partAttrName,
+ strVal(lfirst(pklc))) == 0)
+ {
+ lattr = strVal(lfirst(fklc));
+ break;
+ }
+ }
+ }
+ /* distribution column is not referenced? */
+ if (lattr == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("Hash/Modulo distribution column does not refer"
+ " to hash/modulo distribution column in referenced table.")));
+ foreach(fklc, cxt->fallback_dist_cols)
+ {
+ if (strcmp(lattr, (char *) lfirst(fklc)) == 0)
+ {
+ found = true;
+ break;
+ }
+ }
+ if (found)
+ {
+ list_free_deep(cxt->fallback_dist_cols);
+ cxt->fallback_dist_cols = NIL;
+ cxt->fallback_source = FBS_NONE;
+ cxt->distributeby = makeNode(DistributeBy);
+ switch (rel_loc_info->locatorType)
+ {
+ case LOCATOR_TYPE_HASH:
+ cxt->distributeby->disttype = DISTTYPE_HASH;
+ cxt->distributeby->colname = pstrdup(lattr);
+ break;
+ case LOCATOR_TYPE_MODULO:
+ cxt->distributeby->disttype = DISTTYPE_MODULO;
+ cxt->distributeby->colname = pstrdup(lattr);
+ break;
+ default:
+						/* cannot happen? */
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("Hash/Modulo distribution column does not refer"
+ " to hash/modulo distribution column in referenced table.")));
+ }
+ }
+ else /* dist attr is not found */
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("Hash/Modulo distribution column does not refer"
+ " to hash/modulo distribution column in referenced table.")));
+ continue;
+ }
+ /*
+ * Here determine if already defined distribution is matching
+ * to distribution of primary table.
+ */
+ if (ltype != rel_loc_info->locatorType || lattr == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("Hash/Modulo distribution column does not refer"
+ " to hash/modulo distribution column in referenced table.")));
+ if (list_length(constraint->pk_attrs) == 0)
+ {
+ /*
+ * PK attribute list may be missing, so FK must reference
+ * the primary table's primary key. The primary key may
+ * consist of multiple attributes, one of them is a
+ * distribution key. We should find the foreign attribute
+ * referencing that primary attribute and make sure it is a
+ * distribution key of the table.
+ */
+ int pk_attr_idx;
+ Relation rel;
+
+ rel = relation_open(pk_rel_id, AccessShareLock);
+ pk_attr_idx = find_relation_pk_dist_index(rel);
+ relation_close(rel, AccessShareLock);
+
+ /*
+			 * The first two conditions just avoid an assertion failure in
+			 * list_nth. The first should never happen, because the primary key
+ * of hash/modulo distributed table must contain distribution
+ * key. Second may only happen if list of foreign columns is
+			 * shorter than the primary key. In that case the statement would
+ * probably fail later, but no harm if it fails here.
+ */
+ if (pk_attr_idx >= 0 &&
+ pk_attr_idx < list_length(constraint->fk_attrs) &&
+ strcmp(lattr, strVal(list_nth(constraint->fk_attrs,
+ pk_attr_idx))) == 0)
+ {
+ found = true;
+ }
+ }
+ else
+ {
+ forboth(fklc, constraint->fk_attrs, pklc, constraint->pk_attrs)
+ {
+ if (strcmp(lattr, strVal(lfirst(fklc))) == 0)
+ {
+ found = true;
+ if (strcmp(rel_loc_info->partAttrName,
+ strVal(lfirst(pklc))) == 0)
+ break;
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("Hash/Modulo distribution column does not refer"
+ " to hash/modulo distribution column in referenced table.")));
+ }
+ }
+ }
+ if (!found)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("Hash/Modulo distribution column does not refer"
+ " to hash/modulo distribution column in referenced table.")));
+ }
+ else /* Unsupported distribution */
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("Cannot reference a table with distribution type \"%c\"",
+ rel_loc_info->locatorType)));
+ }
+ }
+ /*
+ * If presence of a foreign constraint suggested a set of nodes, fix it here
+ */
+ if (nodelist && cxt->subcluster == NULL)
+ cxt->subcluster = makeSubCluster(nodelist);
+}
+#endif
+
+
+#ifdef XCP
+/*
+ * Convert SubCluster definition to a list of Datanode indexes, to compare to
+ * relation nodes
+ */
+static List *
+transformSubclusterNodes(PGXCSubCluster *subcluster)
+{
+ List *result = NIL;
+ Oid *nodeoids;
+ int numnodes;
+ int i;
+ char nodetype = PGXC_NODE_DATANODE;
+
+ nodeoids = GetRelationDistributionNodes(subcluster, &numnodes);
+ for (i = 0; i < numnodes; i++)
+ result = lappend_int(result, PGXCNodeGetNodeId(nodeoids[i], &nodetype));
+
+ return result;
+}
+
+
+/*
+ * Create a SubCluster definition from a list of node indexes.
+ */
+static PGXCSubCluster *
+makeSubCluster(List *nodelist)
+{
+ PGXCSubCluster *result;
+ ListCell *lc;
+ result = makeNode(PGXCSubCluster);
+ result->clustertype = SUBCLUSTER_NODE;
+ foreach (lc, nodelist)
+ {
+ int nodeidx = lfirst_int(lc);
+ char *nodename = get_pgxc_nodename(
+ PGXCNodeGetNodeOid(nodeidx, PGXC_NODE_DATANODE));
+ result->members = lappend(result->members, makeString(nodename));
+ }
+ return result;
+}
+#endif
++
+ /*
+ * transformPartitionCmd
+ * Analyze the ATTACH/DETACH PARTITION command
+ *
+ * In case of the ATTACH PARTITION command, cxt->partbound is set to the
+ * transformed value of cmd->bound.
+ */
+ static void
+ transformPartitionCmd(CreateStmtContext *cxt, PartitionCmd *cmd)
+ {
+ Relation parentRel = cxt->rel;
+
+ /* the table must be partitioned */
+ if (parentRel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("\"%s\" is not partitioned",
+ RelationGetRelationName(parentRel))));
+
+ /* transform the partition bound, if any */
+ Assert(RelationGetPartitionKey(parentRel) != NULL);
+ if (cmd->bound != NULL)
+ cxt->partbound = transformPartitionBound(cxt->pstate, parentRel,
+ cmd->bound);
+ }
+
+ /*
+ * transformPartitionBound
+ *
+ * Transform a partition bound specification
+ */
+ PartitionBoundSpec *
+ transformPartitionBound(ParseState *pstate, Relation parent,
+ PartitionBoundSpec *spec)
+ {
+ PartitionBoundSpec *result_spec;
+ PartitionKey key = RelationGetPartitionKey(parent);
+ char strategy = get_partition_strategy(key);
+ int partnatts = get_partition_natts(key);
+ List *partexprs = get_partition_exprs(key);
+
+ /* Avoid scribbling on input */
+ result_spec = copyObject(spec);
+
+ if (strategy == PARTITION_STRATEGY_LIST)
+ {
+ ListCell *cell;
+ char *colname;
+ Oid coltype;
+ int32 coltypmod;
+
+ if (spec->strategy != PARTITION_STRATEGY_LIST)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_TABLE_DEFINITION),
+ errmsg("invalid bound specification for a list partition"),
+ parser_errposition(pstate, exprLocation((Node *) spec))));
+
+ /* Get the only column's name in case we need to output an error */
+ if (key->partattrs[0] != 0)
+ colname = get_relid_attribute_name(RelationGetRelid(parent),
+ key->partattrs[0]);
+ else
+ colname = deparse_expression((Node *) linitial(partexprs),
+ deparse_context_for(RelationGetRelationName(parent),
+ RelationGetRelid(parent)),
+ false, false);
+ /* Need its type data too */
+ coltype = get_partition_col_typid(key, 0);
+ coltypmod = get_partition_col_typmod(key, 0);
+
+ result_spec->listdatums = NIL;
+ foreach(cell, spec->listdatums)
+ {
+ A_Const *con = castNode(A_Const, lfirst(cell));
+ Const *value;
+ ListCell *cell2;
+ bool duplicate;
+
+ value = transformPartitionBoundValue(pstate, con,
+ colname, coltype, coltypmod);
+
+ /* Don't add to the result if the value is a duplicate */
+ duplicate = false;
+ foreach(cell2, result_spec->listdatums)
+ {
+ Const *value2 = castNode(Const, lfirst(cell2));
+
+ if (equal(value, value2))
+ {
+ duplicate = true;
+ break;
+ }
+ }
+ if (duplicate)
+ continue;
+
+ result_spec->listdatums = lappend(result_spec->listdatums,
+ value);
+ }
+ }
+ else if (strategy == PARTITION_STRATEGY_RANGE)
+ {
+ ListCell *cell1,
+ *cell2;
+ int i,
+ j;
+ bool seen_unbounded;
+
+ if (spec->strategy != PARTITION_STRATEGY_RANGE)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_TABLE_DEFINITION),
+ errmsg("invalid bound specification for a range partition"),
+ parser_errposition(pstate, exprLocation((Node *) spec))));
+
+ if (list_length(spec->lowerdatums) != partnatts)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_TABLE_DEFINITION),
+ errmsg("FROM must specify exactly one value per partitioning column")));
+ if (list_length(spec->upperdatums) != partnatts)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_TABLE_DEFINITION),
+ errmsg("TO must specify exactly one value per partitioning column")));
+
+ /*
+ * Check that no finite value follows an UNBOUNDED item in either of
+ * lower and upper bound lists.
+ */
+ seen_unbounded = false;
+ foreach(cell1, spec->lowerdatums)
+ {
+ PartitionRangeDatum *ldatum = castNode(PartitionRangeDatum,
+ lfirst(cell1));
+
+ if (ldatum->infinite)
+ seen_unbounded = true;
+ else if (seen_unbounded)
+ ereport(ERROR,
+ (errcode(ERRCODE_DATATYPE_MISMATCH),
+ errmsg("cannot specify finite value after UNBOUNDED"),
+ parser_errposition(pstate, exprLocation((Node *) ldatum))));
+ }
+ seen_unbounded = false;
+ foreach(cell1, spec->upperdatums)
+ {
+ PartitionRangeDatum *rdatum = castNode(PartitionRangeDatum,
+ lfirst(cell1));
+
+ if (rdatum->infinite)
+ seen_unbounded = true;
+ else if (seen_unbounded)
+ ereport(ERROR,
+ (errcode(ERRCODE_DATATYPE_MISMATCH),
+ errmsg("cannot specify finite value after UNBOUNDED"),
+ parser_errposition(pstate, exprLocation((Node *) rdatum))));
+ }
+
+ /* Transform all the constants */
+ i = j = 0;
+ result_spec->lowerdatums = result_spec->upperdatums = NIL;
+ forboth(cell1, spec->lowerdatums, cell2, spec->upperdatums)
+ {
+ PartitionRangeDatum *ldatum = (PartitionRangeDatum *) lfirst(cell1);
+ PartitionRangeDatum *rdatum = (PartitionRangeDatum *) lfirst(cell2);
+ char *colname;
+ Oid coltype;
+ int32 coltypmod;
+ A_Const *con;
+ Const *value;
+
+ /* Get the column's name in case we need to output an error */
+ if (key->partattrs[i] != 0)
+ colname = get_relid_attribute_name(RelationGetRelid(parent),
+ key->partattrs[i]);
+ else
+ {
+ colname = deparse_expression((Node *) list_nth(partexprs, j),
+ deparse_context_for(RelationGetRelationName(parent),
+ RelationGetRelid(parent)),
+ false, false);
+ ++j;
+ }
+ /* Need its type data too */
+ coltype = get_partition_col_typid(key, i);
+ coltypmod = get_partition_col_typmod(key, i);
+
+ if (ldatum->value)
+ {
+ con = castNode(A_Const, ldatum->value);
+ value = transformPartitionBoundValue(pstate, con,
+ colname,
+ coltype, coltypmod);
+ if (value->constisnull)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("cannot specify NULL in range bound")));
+ ldatum = copyObject(ldatum); /* don't scribble on input */
+ ldatum->value = (Node *) value;
+ }
+
+ if (rdatum->value)
+ {
+ con = castNode(A_Const, rdatum->value);
+ value = transformPartitionBoundValue(pstate, con,
+ colname,
+ coltype, coltypmod);
+ if (value->constisnull)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("cannot specify NULL in range bound")));
+ rdatum = copyObject(rdatum); /* don't scribble on input */
+ rdatum->value = (Node *) value;
+ }
+
+ result_spec->lowerdatums = lappend(result_spec->lowerdatums,
+ ldatum);
+ result_spec->upperdatums = lappend(result_spec->upperdatums,
+ rdatum);
+
+ ++i;
+ }
+ }
+ else
+ elog(ERROR, "unexpected partition strategy: %d", (int) strategy);
+
+ return result_spec;
+ }
+
+ /*
+ * Transform one constant in a partition bound spec
+ */
+ static Const *
+ transformPartitionBoundValue(ParseState *pstate, A_Const *con,
+ const char *colName, Oid colType, int32 colTypmod)
+ {
+ Node *value;
+
+ /* Make it into a Const */
+ value = (Node *) make_const(pstate, &con->val, con->location);
+
+ /* Coerce to correct type */
+ value = coerce_to_target_type(pstate,
+ value, exprType(value),
+ colType,
+ colTypmod,
+ COERCION_ASSIGNMENT,
+ COERCE_IMPLICIT_CAST,
+ -1);
+
+ if (value == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_DATATYPE_MISMATCH),
+ errmsg("specified value cannot be cast to type %s for column \"%s\"",
+ format_type_be(colType), colName),
+ parser_errposition(pstate, con->location)));
+
+ /* Simplify the expression, in case we had a coercion */
+ if (!IsA(value, Const))
+ value = (Node *) expression_planner((Expr *) value);
+
+ /* Fail if we don't have a constant (i.e., non-immutable coercion) */
+ if (!IsA(value, Const))
+ ereport(ERROR,
+ (errcode(ERRCODE_DATATYPE_MISMATCH),
+ errmsg("specified value cannot be cast to type %s for column \"%s\"",
+ format_type_be(colType), colName),
+ errdetail("The cast requires a non-immutable conversion."),
+ errhint("Try putting the literal value in single quotes."),
+ parser_errposition(pstate, con->location)));
+
+ return (Const *) value;
+ }
* raw_parser
* Given a query in string form, do lexical and grammatical analysis.
*
- * Returns a list of raw (un-analyzed) parse trees.
+ * Returns a list of raw (un-analyzed) parse trees. The immediate elements
+ * of the list are always RawStmt nodes.
*/
List *
-raw_parser(const char *str)
+raw_parser(const char *str, List **queries)
{
core_yyscan_t yyscanner;
base_yy_extra_type yyextra;
--- /dev/null
+/*-------------------------------------------------------------------------
+ *
+ * pause.c
+ *
+ * Cluster Pause/Unpause handling
+ *
+ * IDENTIFICATION
+ * $$
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifdef XCP
+#include "postgres.h"
+#include "pgxc/execRemote.h"
+#include "pgxc/pause.h"
+#include "pgxc/pgxc.h"
++#include "storage/shmem.h"
+#include "storage/spin.h"
+#include "miscadmin.h"
+
+/* globals */
+bool cluster_lock_held;
+bool cluster_ex_lock_held;
+
+static void HandleClusterPause(bool pause, bool initiator);
+static void ProcessClusterPauseRequest(bool pause);
+
+ClusterLockInfo *ClustLinfo = NULL;
+
+/*
+ * ProcessClusterPauseRequest:
+ *
+ * Carry out PAUSE/UNPAUSE request on a coordinator node
+ */
+static void
+ProcessClusterPauseRequest(bool pause)
+{
+ char *action = pause? "PAUSE":"UNPAUSE";
+
+ if (!IS_PGXC_COORDINATOR || !IsConnFromCoord())
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("The %s CLUSTER message is expected to "
+ "arrive at a coordinator from another coordinator",
+ action)));
+
+ elog(DEBUG2, "Received %s CLUSTER from a coordinator", action);
+
+ /*
+ * If calling UNPAUSE, ensure that the cluster lock has already been held
+ * in exclusive mode
+ */
+ if (!pause && !cluster_ex_lock_held)
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Received an UNPAUSE request when cluster not PAUSED!")));
+
+ /*
+ * Enable/Disable local queries. We need to release the lock first
+ *
+ * TODO: Think of some timeout mechanism here, if the locking takes too
+ * much time...
+ */
+ ReleaseClusterLock(pause? false:true);
+ AcquireClusterLock(pause? true:false);
+
+ if (pause)
+ cluster_ex_lock_held = true;
+ else
+ cluster_ex_lock_held = false;
+
+ elog(DEBUG2, "%s queries at the coordinator", pause? "Paused":"Resumed");
+
+ return;
+}
+
+/*
+ * HandleClusterPause:
+ *
+ * Any errors will be reported via ereport.
+ */
+static void
+HandleClusterPause(bool pause, bool initiator)
+{
+ PGXCNodeAllHandles *coord_handles;
+ int conn;
+ int response;
+ char *action = pause? "PAUSE":"UNPAUSE";
+
+ elog(DEBUG2, "Preparing coordinators for %s CLUSTER", action);
+
+ if (pause && cluster_ex_lock_held)
+ {
+ ereport(NOTICE, (errmsg("CLUSTER already PAUSED")));
+
+ /* Nothing to do */
+ return;
+ }
+
+ if (!pause && !cluster_ex_lock_held)
+ {
+ ereport(NOTICE, (errmsg("Issue PAUSE CLUSTER before calling UNPAUSE")));
+
+ /* Nothing to do */
+ return;
+ }
+
+ /*
+ * If we are one of the participating coordinators, just do the action
+ * locally and return
+ */
+ if (!initiator)
+ {
+ ProcessClusterPauseRequest(pause);
+ return;
+ }
+
+ /*
+ * Send a PAUSE/UNPAUSE CLUSTER message to all the coordinators. We should send an
+ * asynchronous request, update the local ClusterLock and then wait for the remote
+ * coordinators to respond back
+ */
+
+ coord_handles = get_handles(NIL, GetAllCoordNodes(), true, true);
+
+ for (conn = 0; conn < coord_handles->co_conn_count; conn++)
+ {
+ PGXCNodeHandle *handle = coord_handles->coord_handles[conn];
+
+ if (pgxc_node_send_query(handle, pause? "PAUSE CLUSTER" : "UNPAUSE CLUSTER") != 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to send %s CLUSTER request to some coordinator nodes",action)));
+ }
+
+ /*
+ * Disable/Enable local queries. We need to release the SHARED mode first
+ *
+ * TODO: Start a timer to cancel the request in case of a timeout
+ */
+ ReleaseClusterLock(pause? false:true);
+ AcquireClusterLock(pause? true:false);
+
+ if (pause)
+ cluster_ex_lock_held = true;
+ else
+ cluster_ex_lock_held = false;
+
+
+ elog(DEBUG2, "%s queries at the driving coordinator", pause? "Paused":"Resumed");
+
+ /*
+ * Local queries are paused/enabled. Check status of the remote coordinators
+ * now. We need a TRY/CATCH block here, so that if one of the coordinator
+ * fails for some reason, we can try best-effort to salvage the situation
+ * at others
+ *
+ * We hope that errors in the earlier loop generally do not occur (out of
+ * memory and improper handles..) or we can have a similar TRY/CATCH block
+ * there too
+ *
+ * To repeat: All the salvaging is best effort really...
+ */
+ PG_TRY();
+ {
+ ResponseCombiner combiner;
+
+ InitResponseCombiner(&combiner, coord_handles->co_conn_count, COMBINE_TYPE_NONE);
+ for (conn = 0; conn < coord_handles->co_conn_count; conn++)
+ {
+ PGXCNodeHandle *handle;
+
+ handle = coord_handles->coord_handles[conn];
+
+ while (true)
+ {
+ if (pgxc_node_receive(1, &handle, NULL))
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to receive a response from the remote coordinator node")));
+
+ response = handle_response(handle, &combiner);
+ if (response == RESPONSE_EOF)
+ continue;
+ else if (response == RESPONSE_COMPLETE)
+ break;
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("%s CLUSTER command failed "
+ "with error %s", action, handle->error)));
+ }
+ }
+
+ if (combiner.errorMessage)
+ {
+ char *code = combiner.errorCode;
+ if (combiner.errorDetail != NULL)
+ ereport(ERROR,
+ (errcode(MAKE_SQLSTATE(code[0], code[1], code[2], code[3], code[4])),
+ errmsg("%s", combiner.errorMessage), errdetail("%s", combiner.errorDetail) ));
+ else
+ ereport(ERROR,
+ (errcode(MAKE_SQLSTATE(code[0], code[1], code[2], code[3], code[4])),
+ errmsg("%s", combiner.errorMessage)));
+ }
+
+ CloseCombiner(&combiner);
+ }
+ PG_CATCH();
+ {
+ /*
+ * If PAUSE CLUSTER, issue UNPAUSE on the reachable nodes. For failure
+ * in cases of UNPAUSE, might need manual intervention at the offending
+ * coordinator node (maybe do a pg_cancel_backend() on the backend
+ * that's holding the exclusive lock or something..)
+ */
+ if (!pause)
+ ereport(WARNING,
+ (errmsg("UNPAUSE CLUSTER command failed on one or more coordinator nodes."
+ " Manual intervention may be required!")));
+ else
+ ereport(WARNING,
+ (errmsg("PAUSE CLUSTER command failed on one or more coordinator nodes."
+ " Trying to UNPAUSE reachable nodes now")));
+
+ for (conn = 0; conn < coord_handles->co_conn_count && pause; conn++)
+ {
+ PGXCNodeHandle *handle = coord_handles->coord_handles[conn];
+
+ (void) pgxc_node_send_query(handle, "UNPAUSE CLUSTER");
+
+ /*
+ * The incoming data should hopefully be discarded as part of
+ * cleanup..
+ */
+ }
+
+ /* cleanup locally.. */
+ ReleaseClusterLock(pause? true:false);
+ AcquireClusterLock(pause? false:true);
+ cluster_ex_lock_held = false;
+ PG_RE_THROW();
+ }
+ PG_END_TRY();
+
+ elog(DEBUG2, "Successfully completed %s CLUSTER command on "
+ "all coordinator nodes", action);
+
+ return;
+}
+
+/*
+ * RequestClusterPause
+ *
+ * Entry point for the PAUSE/UNPAUSE CLUSTER utility command. Performs
+ * permission and node-type checks, determines whether this coordinator
+ * is the command initiator, and delegates to HandleClusterPause.
+ */
+void
+RequestClusterPause(bool pause, char *completionTag)
+{
+	char	   *action = pause ? "PAUSE" : "UNPAUSE";
+	bool		initiator;
+
+	elog(DEBUG2, "%s CLUSTER request received", action);
+
+	/* Only a superuser can perform this activity on a cluster */
+	if (!superuser())
+		ereport(ERROR,
+				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+				 errmsg("%s CLUSTER command: must be a superuser", action)));
+
+	/* Ensure that we are a coordinator */
+	if (!IS_PGXC_COORDINATOR)
+		ereport(ERROR,
+				(errcode(ERRCODE_INTERNAL_ERROR),
+				 errmsg("%s CLUSTER command must be sent to a coordinator", action)));
+
+	/*
+	 * We are the initiator unless the command arrived via another
+	 * coordinator.
+	 */
+	initiator = !IsConnFromCoord();
+
+	HandleClusterPause(pause, initiator);
+
+	if (completionTag)
+		snprintf(completionTag, COMPLETION_TAG_BUFSIZE, "%s CLUSTER", action);
+}
+
+/*
+ * PGXCCleanClusterLock
+ *
+ * Backend-exit callback: release any PAUSE CLUSTER lock state this
+ * backend holds. Runs before shmem shutdown because it needs LWLocks
+ * and other live infrastructure.
+ */
+void
+PGXCCleanClusterLock(int code, Datum arg)
+{
+	PGXCNodeAllHandles *handles;
+	int			i;
+
+	/* Drop a plain (shared) hold on the cluster lock, if we have one */
+	if (cluster_lock_held && !cluster_ex_lock_held)
+	{
+		ReleaseClusterLock(false);
+		cluster_lock_held = false;
+	}
+
+	/* Nothing more to do unless we hold the exclusive (PAUSE) lock */
+	if (!cluster_ex_lock_held)
+		return;
+
+	/* Only the initiating coordinator propagates the UNPAUSE */
+	if (IsConnFromCoord())
+		return;
+
+	/* Best-effort attempt to UNPAUSE every other coordinator */
+	handles = get_handles(NIL, GetAllCoordNodes(), true, true);
+	for (i = 0; i < handles->co_conn_count; i++)
+	{
+		/* Errors are deliberately ignored here */
+		(void) pgxc_node_send_query(handles->coord_handles[i],
+									"UNPAUSE CLUSTER");
+	}
+
+	/* Release locally too. We do not want a dangling value in cl_holder_pid! */
+	ReleaseClusterLock(true);
+	cluster_ex_lock_held = false;
+}
+
+/* Report shared memory space needed by ClusterLockShmemInit */
+Size
+ClusterLockShmemSize(void)
+{
+	/* Just the single shared ClusterLockInfo struct */
+	return add_size((Size) 0, sizeof(ClusterLockInfo));
+}
+
+/* Allocate and initialize cluster locking related shared memory */
+void
+ClusterLockShmemInit(void)
+{
+	bool		found;
+
+	ClustLinfo = (ClusterLockInfo *)
+		ShmemInitStruct("Cluster Lock Info", ClusterLockShmemSize(), &found);
+
+	/* Later backends attach to the already-initialized struct */
+	if (found)
+		return;
+
+	/* First time through, so initialize */
+	MemSet(ClustLinfo, 0, ClusterLockShmemSize());
+	SpinLockInit(&ClustLinfo->cl_mutex);
+}
+
+/*
+ * AcquireClusterLock
+ *
+ * Based on the argument passed in, try to update the shared memory
+ * appropriately. In case the conditions cannot be satisfied immediately this
+ * function resorts to a simple sleep. We don't envision PAUSE CLUSTER to
+ * occur that frequently so most of the calls will come out immediately here
+ * without any sleeps at all
+ *
+ * We could have used a semaphore to allow the processes to sleep while the
+ * cluster lock is held. But again we are really not worried about performance
+ * and immediate wakeups around PAUSE CLUSTER functionality. Using the sleep
+ * in an infinite loop keeps things simple yet correct
+ *
+ * exclusive == false: take a shared hold (bump cl_process_count), allowed
+ * only while no PAUSE CLUSTER holder is registered in cl_holder_pid.
+ * exclusive == true: register this backend as the PAUSE CLUSTER holder,
+ * allowed only once all shared holds have drained.
+ */
+void
+AcquireClusterLock(bool exclusive)
+{
+	volatile ClusterLockInfo *clinfo = ClustLinfo;
+
+	/* Re-entrant PAUSE CLUSTER: we already hold the exclusive lock */
+	if (exclusive && cluster_ex_lock_held)
+	{
+		return;
+	}
+
+	/*
+	 * In the normal case, none of the backends will ask for exclusive lock, so
+	 * they will just update the cl_process_count value and exit immediately
+	 * from the below loop
+	 */
+	for (;;)
+	{
+		bool wait = false;
+
+		SpinLockAcquire(&clinfo->cl_mutex);
+
+		if (!exclusive)
+		{
+			/* Shared hold: only while nobody holds the PAUSE lock */
+			if (clinfo->cl_holder_pid == 0)
+				clinfo->cl_process_count++;
+			else
+				wait = true;
+		}
+		else /* PAUSE CLUSTER handling */
+		{
+			/* A concurrent PAUSE CLUSTER is already in flight: error out */
+			if (clinfo->cl_holder_pid != 0)
+			{
+				SpinLockRelease(&clinfo->cl_mutex);
+				ereport(ERROR,
+						(errcode(ERRCODE_INTERNAL_ERROR),
+						 errmsg("PAUSE CLUSTER already in progress")));
+			}
+
+			/*
+			 * There should be no other process
+			 * holding the lock including ourself
+			 */
+			if (clinfo->cl_process_count > 0)
+				wait = true;
+			else
+				clinfo->cl_holder_pid = MyProcPid;
+		}
+		SpinLockRelease(&clinfo->cl_mutex);
+
+		/*
+		 * We use a simple sleep mechanism. If PAUSE CLUSTER has been invoked,
+		 * we are not worried about immediate performance characteristics..
+		 */
+		if (wait)
+		{
+			CHECK_FOR_INTERRUPTS();
+			pg_usleep(100000L);	/* 100ms between retries */
+		}
+		else /* Got the proper semantic read/write lock.. */
+			break;
+	}
+}
+
+/*
+ * ReleaseClusterLock
+ *
+ * Update the shared memory appropriately across the release call. We
+ * really do not need the bool argument, but it's there for some
+ * additional sanity checking
+ *
+ * exclusive == true: clear cl_holder_pid (UNPAUSE CLUSTER).
+ * exclusive == false: drop one shared hold from cl_process_count.
+ */
+void
+ReleaseClusterLock(bool exclusive)
+{
+	volatile ClusterLockInfo *clinfo = ClustLinfo;
+
+	SpinLockAcquire(&clinfo->cl_mutex);
+	if (exclusive)
+	{
+		/*
+		 * Sanity check: an UNPAUSE with shared holders still registered,
+		 * or no recorded holder at all, indicates corrupted shared state.
+		 * NOTE(review): the count test uses "> 1" rather than "> 0" —
+		 * presumably to tolerate one hold owned by the pausing backend
+		 * itself; confirm against callers.
+		 */
+		if (clinfo->cl_process_count > 1 ||
+			clinfo->cl_holder_pid == 0)
+		{
+			SpinLockRelease(&clinfo->cl_mutex);
+			ereport(ERROR,
+					(errcode(ERRCODE_INTERNAL_ERROR),
+					 errmsg("Inconsistent state while doing UNPAUSE CLUSTER")));
+		}
+
+		/*
+		 * Reset the holder pid. Any waiters in AcquireClusterLock will
+		 * eventually come out of their sleep and notice this new value and
+		 * move ahead
+		 */
+		clinfo->cl_holder_pid = 0;
+	}
+	else
+	{
+		/* A shared release must never happen while a PAUSE holder is set */
+		if (clinfo->cl_holder_pid != 0)
+		{
+			SpinLockRelease(&clinfo->cl_mutex);
+			ereport(ERROR,
+					(errcode(ERRCODE_INTERNAL_ERROR),
+					 errmsg("Inconsistent state while releasing CLUSTER lock")));
+		}
+		/*
+		 * Decrement our count. If a PAUSE is waiting inside AcquireClusterLock
+		 * elsewhere, it will wake out of sleep and do the needful
+		 */
+		if (clinfo->cl_process_count > 0)
+			clinfo->cl_process_count--;
+	}
+	SpinLockRelease(&clinfo->cl_mutex);
+}
+#endif
--- /dev/null
- case INT2VECTOROID:
- return hashint2vector;
+/*-------------------------------------------------------------------------
+ *
+ * locator.c
+ * Functions that help manage table location information such as
+ * partitioning and replication information.
+ *
+ *
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
+ *
+ *
+ * IDENTIFICATION
+ * $$
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <time.h>
+
+#include "postgres.h"
+#include "access/skey.h"
+#include "access/gtm.h"
+#include "access/relscan.h"
+#include "catalog/indexing.h"
+#include "catalog/pg_type.h"
+#include "nodes/pg_list.h"
+#include "nodes/nodeFuncs.h"
+#include "utils/builtins.h"
+#include "utils/catcache.h"
+#include "utils/fmgroids.h"
+#include "utils/lsyscache.h"
+#include "utils/rel.h"
+#include "utils/relcache.h"
+#include "utils/tqual.h"
+#include "utils/syscache.h"
+#include "nodes/nodes.h"
+#include "optimizer/clauses.h"
+#include "parser/parse_coerce.h"
+#include "pgxc/nodemgr.h"
+#include "pgxc/locator.h"
+#include "pgxc/pgxc.h"
+#include "pgxc/pgxcnode.h"
+
+#include "catalog/pgxc_class.h"
+#include "catalog/pgxc_node.h"
+#include "catalog/namespace.h"
+#include "access/hash.h"
+#ifdef XCP
+#include "utils/date.h"
+#include "utils/memutils.h"
+
+/*
+ * Locator details are private
+ *
+ * A Locator maps a distribution-column value onto one or more target
+ * nodes; createLocator() fills in the fields for the chosen locator type.
+ */
+struct _Locator
+{
+	/*
+	 * Determine target nodes for value.
+	 * Resulting nodes are stored to the results array.
+	 * Function returns number of node references written to the array.
+	 */
+	int			(*locatefunc) (Locator *self, Datum value, bool isnull,
+						   bool *hasprimary);
+	Oid			dataType;		/* values of that type are passed to locateNodes function */
+	LocatorListType listType;	/* element type of nodeMap/results entries */
+	bool		primary;		/* NOTE(review): presumably set when a primary node
+								 * participates — confirm against createLocator callers */
+	/* locator-specific data */
+	/* XXX: move them into union ? */
+	int			roundRobinNode; /* for LOCATOR_TYPE_RROBIN */
+	LocatorHashFunc	hashfunc; /* for LOCATOR_TYPE_HASH */
+	int			valuelen; /* 1, 2 or 4 for LOCATOR_TYPE_MODULO */
+
+	int			nodeCount; /* How many nodes are in the map */
+	void		*nodeMap; /* map index to node reference according to listType */
+	void		*results; /* array to output results */
+};
+#endif
+
+/* OID of the primary data node; InvalidOid when none is configured */
+Oid primary_data_node = InvalidOid;
+/* Number of valid entries in preferred_data_node[] */
+int num_preferred_data_nodes = 0;
+/* OIDs of preferred data nodes, used to bias replicated-read balancing */
+Oid preferred_data_node[MAX_PREFERRED_NODES];
+
+#ifdef XCP
+static int modulo_value_len(Oid dataType);
+static LocatorHashFunc hash_func_ptr(Oid dataType);
+static int locate_static(Locator *self, Datum value, bool isnull,
+ bool *hasprimary);
+static int locate_roundrobin(Locator *self, Datum value, bool isnull,
+ bool *hasprimary);
+static int locate_modulo_random(Locator *self, Datum value, bool isnull,
+ bool *hasprimary);
+static int locate_hash_insert(Locator *self, Datum value, bool isnull,
+ bool *hasprimary);
+static int locate_hash_select(Locator *self, Datum value, bool isnull,
+ bool *hasprimary);
+static int locate_modulo_insert(Locator *self, Datum value, bool isnull,
+ bool *hasprimary);
+static int locate_modulo_select(Locator *self, Datum value, bool isnull,
+ bool *hasprimary);
+static Expr * pgxc_find_distcol_expr(Index varno,
+ AttrNumber attrNum,
+ Node *quals);
+#endif
+
+/*
+ * Lookup tables used by compute_modulo() for denominators of the form
+ * 2^s - 1 (3, 7, 15, 31, ...). All three are indexed by s, the bit width
+ * of the modulus.
+ */
+
+/* xc_mod_m[s]: mask keeping the low s bits of each folded "digit" */
+static const unsigned int xc_mod_m[] =
+{
+	0x00000000, 0x55555555, 0x33333333, 0xc71c71c7,
+	0x0f0f0f0f, 0xc1f07c1f, 0x3f03f03f, 0xf01fc07f,
+	0x00ff00ff, 0x07fc01ff, 0x3ff003ff, 0xffc007ff,
+	0xff000fff, 0xfc001fff, 0xf0003fff, 0xc0007fff,
+	0x0000ffff, 0x0001ffff, 0x0003ffff, 0x0007ffff,
+	0x000fffff, 0x001fffff, 0x003fffff, 0x007fffff,
+	0x00ffffff, 0x01ffffff, 0x03ffffff, 0x07ffffff,
+	0x0fffffff, 0x1fffffff, 0x3fffffff, 0x7fffffff
+};
+
+/* xc_mod_q[s][i]: shift amounts for successive folding passes */
+static const unsigned int xc_mod_q[][6] =
+{
+	{ 0,  0,  0,  0,  0,  0}, {16,  8,  4,  2,  1,  1}, {16,  8,  4,  2,  2,  2},
+	{15,  6,  3,  3,  3,  3}, {16,  8,  4,  4,  4,  4}, {15,  5,  5,  5,  5,  5},
+	{12,  6,  6,  6 ,  6,  6}, {14,  7,  7,  7,  7,  7}, {16,  8,  8,  8,  8,  8},
+	{ 9,  9,  9,  9,  9,  9}, {10, 10, 10, 10, 10, 10}, {11, 11, 11, 11, 11, 11},
+	{12, 12, 12, 12, 12, 12}, {13, 13, 13, 13, 13, 13}, {14, 14, 14, 14, 14, 14},
+	{15, 15, 15, 15, 15, 15}, {16, 16, 16, 16, 16, 16}, {17, 17, 17, 17, 17, 17},
+	{18, 18, 18, 18, 18, 18}, {19, 19, 19, 19, 19, 19}, {20, 20, 20, 20, 20, 20},
+	{21, 21, 21, 21, 21, 21}, {22, 22, 22, 22, 22, 22}, {23, 23, 23, 23, 23, 23},
+	{24, 24, 24, 24, 24, 24}, {25, 25, 25, 25, 25, 25}, {26, 26, 26, 26, 26, 26},
+	{27, 27, 27, 27, 27, 27}, {28, 28, 28, 28, 28, 28}, {29, 29, 29, 29, 29, 29},
+	{30, 30, 30, 30, 30, 30}, {31, 31, 31, 31, 31, 31}
+};
+
+/* xc_mod_r[s][i]: masks matching the shifts in xc_mod_q for each pass */
+static const unsigned int xc_mod_r[][6] =
+{
+	{0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000},
+	{0x0000ffff, 0x000000ff, 0x0000000f, 0x00000003, 0x00000001, 0x00000001},
+	{0x0000ffff, 0x000000ff, 0x0000000f, 0x00000003, 0x00000003, 0x00000003},
+	{0x00007fff, 0x0000003f, 0x00000007, 0x00000007, 0x00000007, 0x00000007},
+	{0x0000ffff, 0x000000ff, 0x0000000f, 0x0000000f, 0x0000000f, 0x0000000f},
+	{0x00007fff, 0x0000001f, 0x0000001f, 0x0000001f, 0x0000001f, 0x0000001f},
+	{0x00000fff, 0x0000003f, 0x0000003f, 0x0000003f, 0x0000003f, 0x0000003f},
+	{0x00003fff, 0x0000007f, 0x0000007f, 0x0000007f, 0x0000007f, 0x0000007f},
+	{0x0000ffff, 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff},
+	{0x000001ff, 0x000001ff, 0x000001ff, 0x000001ff, 0x000001ff, 0x000001ff},
+	{0x000003ff, 0x000003ff, 0x000003ff, 0x000003ff, 0x000003ff, 0x000003ff},
+	{0x000007ff, 0x000007ff, 0x000007ff, 0x000007ff, 0x000007ff, 0x000007ff},
+	{0x00000fff, 0x00000fff, 0x00000fff, 0x00000fff, 0x00000fff, 0x00000fff},
+	{0x00001fff, 0x00001fff, 0x00001fff, 0x00001fff, 0x00001fff, 0x00001fff},
+	{0x00003fff, 0x00003fff, 0x00003fff, 0x00003fff, 0x00003fff, 0x00003fff},
+	{0x00007fff, 0x00007fff, 0x00007fff, 0x00007fff, 0x00007fff, 0x00007fff},
+	{0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff},
+	{0x0001ffff, 0x0001ffff, 0x0001ffff, 0x0001ffff, 0x0001ffff, 0x0001ffff},
+	{0x0003ffff, 0x0003ffff, 0x0003ffff, 0x0003ffff, 0x0003ffff, 0x0003ffff},
+	{0x0007ffff, 0x0007ffff, 0x0007ffff, 0x0007ffff, 0x0007ffff, 0x0007ffff},
+	{0x000fffff, 0x000fffff, 0x000fffff, 0x000fffff, 0x000fffff, 0x000fffff},
+	{0x001fffff, 0x001fffff, 0x001fffff, 0x001fffff, 0x001fffff, 0x001fffff},
+	{0x003fffff, 0x003fffff, 0x003fffff, 0x003fffff, 0x003fffff, 0x003fffff},
+	{0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff},
+	{0x00ffffff, 0x00ffffff, 0x00ffffff, 0x00ffffff, 0x00ffffff, 0x00ffffff},
+	{0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff},
+	{0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff},
+	{0x07ffffff, 0x07ffffff, 0x07ffffff, 0x07ffffff, 0x07ffffff, 0x07ffffff},
+	{0x0fffffff, 0x0fffffff, 0x0fffffff, 0x0fffffff, 0x0fffffff, 0x0fffffff},
+	{0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff},
+	{0x3fffffff, 0x3fffffff, 0x3fffffff, 0x3fffffff, 0x3fffffff, 0x3fffffff},
+	{0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff}
+};
+
+/*
+ * GetPreferredReplicationNode
+ *	Pick any Datanode from given list, however fetch a preferred node first.
+ */
+List *
+GetPreferredReplicationNode(List *relNodes)
+{
+	ListCell   *cell;
+	int			chosen = -1;
+
+	if (list_length(relNodes) <= 0)
+		elog(ERROR, "a list of nodes should have at least one node");
+
+	/* Scan the candidates for the first one that is a preferred node */
+	foreach(cell, relNodes)
+	{
+		int			i;
+		char		nodetype = PGXC_NODE_DATANODE;
+
+		for (i = 0;
+			 i < num_preferred_data_nodes && chosen < 0;
+			 i++)
+		{
+			if (PGXCNodeGetNodeId(preferred_data_node[i],
+								  &nodetype) == lfirst_int(cell))
+				chosen = lfirst_int(cell);
+		}
+		if (chosen >= 0)
+			break;
+	}
+
+	/* No preferred node present: fall back to a random member */
+	if (chosen < 0)
+		chosen = list_nth_int(relNodes,
+							  ((unsigned int) random()) % list_length(relNodes));
+
+	return list_make1_int(chosen);
+}
+
+/*
+ * GetAnyDataNode
+ *	Pick any data node from given set, but try a preferred node
+ */
+int
+GetAnyDataNode(Bitmapset *nodes)
+{
+	Bitmapset  *candidates = NULL;
+	int			members[NumDataNodes];
+	int			count = 0;
+	int			node;
+	int			i;
+
+	/* Restrict to preferred nodes that are actually in the given set */
+	for (i = 0; i < num_preferred_data_nodes; i++)
+	{
+		char		ntype = PGXC_NODE_DATANODE;
+
+		node = PGXCNodeGetNodeId(preferred_data_node[i], &ntype);
+		if (bms_is_member(node, nodes))
+			candidates = bms_add_member(candidates, node);
+	}
+
+	/*
+	 * If no preferred data nodes or they are not in the desired set, pick
+	 * from the original set.
+	 */
+	if (bms_is_empty(candidates))
+		candidates = bms_copy(nodes);
+
+	/* Flatten the set into an array (bms_first_member is destructive) */
+	while ((node = bms_first_member(candidates)) >= 0)
+		members[count++] = node;
+	bms_free(candidates);
+
+	/* If there is a single member nothing to balance */
+	if (count == 1)
+		return members[0];
+
+	/*
+	 * Pick a fresh random index rather than remembering the last one:
+	 * with sets of varying size a saved index would be reset by small
+	 * sets and skew the distribution towards low indexes.
+	 */
+	return members[((unsigned int) random()) % count];
+}
+
+/*
+ * compute_modulo
+ *	This function performs modulo in an optimized way
+ *	It optimizes modulo of any positive number by
+ *	1,2,3,4,7,8,15,16,31,32,63,64 and so on
+ *	for the rest of the denominators it uses % operator
+ *	The optimized algos have been taken from
+ *	http://www-graphics.stanford.edu/~seander/bithacks.html
+ */
+static int
+compute_modulo(unsigned int numerator, unsigned int denominator)
+{
+	unsigned int d;
+	unsigned int m;
+	unsigned int s;
+	unsigned int mask;
+	int k;
+	unsigned int q, r;
+
+	if (numerator == 0)
+		return 0;
+
+	/* Check if denominator is a power of 2 */
+	if ((denominator & (denominator - 1)) == 0)
+		return numerator & (denominator - 1);
+
+	/* Check if (denominator+1) is a power of 2 */
+	d = denominator + 1;
+	if ((d & (d - 1)) == 0)
+	{
+		/* Which power of 2 is this number */
+		s = 0;
+		mask = 0x01;
+		for (k = 0; k < 32; k++)
+		{
+			if ((d & mask) == mask)
+				break;
+			s++;
+			mask = mask << 1;
+		}
+
+		/* Fold the numerator into s-bit chunks via the lookup tables */
+		m = (numerator & xc_mod_m[s]) + ((numerator >> s) & xc_mod_m[s]);
+
+		/* Keep folding until the partial sum drops to <= denominator */
+		for (q = 0, r = 0; m > denominator; q++, r++)
+			m = (m >> xc_mod_q[s][q]) + (m & xc_mod_r[s][r]);
+
+		/* m == denominator means remainder 0 for a 2^s - 1 modulus */
+		m = m == denominator ? 0 : m;
+
+		return m;
+	}
+	/* General case: plain modulo operator */
+	return numerator % denominator;
+}
+
+/*
+ * GetRelationDistColumn - Returns the name of the hash or modulo distribution column
+ * First hash distribution is checked
+ * Returns NULL if the table is neither hash nor modulo distributed
+ */
+char *
+GetRelationDistColumn(RelationLocInfo * rel_loc_info)
+{
+	char	   *colname = GetRelationHashColumn(rel_loc_info);
+
+	/* Not hash distributed: try modulo */
+	if (colname == NULL)
+		colname = GetRelationModuloColumn(rel_loc_info);
+
+	return colname;
+}
+
+/*
+ * IsTypeHashDistributable
+ *	True when PG-XC knows how to hash-distribute values of this type.
+ * PGXCTODO - expand support for other data types!
+ */
+bool
+IsTypeHashDistributable(Oid col_type)
+{
+	return hash_func_ptr(col_type) != NULL;
+}
+
+/*
+ * GetRelationHashColumn - return hash column for relation.
+ *
+ * Returns NULL if the relation is not hash partitioned.
+ */
+char *
+GetRelationHashColumn(RelationLocInfo * rel_loc_info)
+{
+	size_t		len;
+	char	   *result;
+
+	if (rel_loc_info == NULL ||
+		rel_loc_info->locatorType != LOCATOR_TYPE_HASH)
+		return NULL;
+
+	/* Hand back a palloc'd copy of the distribution column name */
+	len = strlen(rel_loc_info->partAttrName);
+	result = (char *) palloc(len + 1);
+	memcpy(result, rel_loc_info->partAttrName, len + 1);
+
+	return result;
+}
+
+/*
+ * IsHashColumn
+ *	Return whether the named column is the hash distribution column
+ *	of the given relation.
+ */
+bool
+IsHashColumn(RelationLocInfo *rel_loc_info, char *part_col_name)
+{
+	if (rel_loc_info == NULL || part_col_name == NULL)
+		return false;
+	if (rel_loc_info->locatorType != LOCATOR_TYPE_HASH)
+		return false;
+
+	return strcmp(part_col_name, rel_loc_info->partAttrName) == 0;
+}
+
+
+/*
+ * IsHashColumnForRelId
+ *	As IsHashColumn, but looks up the locator info by relation OID.
+ */
+bool
+IsHashColumnForRelId(Oid relid, char *part_col_name)
+{
+	RelationLocInfo *locinfo = GetRelationLocInfo(relid);
+
+	return IsHashColumn(locinfo, part_col_name);
+}
+
+/*
+ * IsDistColumnForRelId
+ *	Return whether the column is used for hash or modulo distribution
+ *	of the relation identified by relid.
+ */
+bool
+IsDistColumnForRelId(Oid relid, char *part_col_name)
+{
+	RelationLocInfo *rel_loc_info = GetRelationLocInfo(relid);
+	bool		bRet;
+
+	bRet = IsHashColumn(rel_loc_info, part_col_name);
+
+	/*
+	 * BUGFIX: the result of the modulo check was previously discarded
+	 * ("IsModuloColumn(...);" on its own line), so this function always
+	 * returned false for modulo-distributed columns. Capture it.
+	 */
+	if (!bRet)
+		bRet = IsModuloColumn(rel_loc_info, part_col_name);
+
+	return bRet;
+}
+
+
+/*
+ * Returns whether or not the data type is modulo distributable with PG-XC
+ * PGXCTODO - expand support for other data types!
+ */
+bool
+IsTypeModuloDistributable(Oid col_type)
+{
+ return (modulo_value_len(col_type) != -1);
+}
+
+/*
+ * GetRelationModuloColumn - return modulo column for relation.
+ *
+ * Returns NULL if the relation is not modulo partitioned.
+ */
+char *
+GetRelationModuloColumn(RelationLocInfo * rel_loc_info)
+{
+	size_t		len;
+	char	   *result;
+
+	if (rel_loc_info == NULL ||
+		rel_loc_info->locatorType != LOCATOR_TYPE_MODULO)
+		return NULL;
+
+	/* Hand back a palloc'd copy of the distribution column name */
+	len = strlen(rel_loc_info->partAttrName);
+	result = (char *) palloc(len + 1);
+	memcpy(result, rel_loc_info->partAttrName, len + 1);
+
+	return result;
+}
+
+/*
+ * IsModuloColumn
+ *	Return whether the named column is the modulo distribution column
+ *	of the given relation.
+ */
+bool
+IsModuloColumn(RelationLocInfo *rel_loc_info, char *part_col_name)
+{
+	if (rel_loc_info == NULL || part_col_name == NULL)
+		return false;
+	if (rel_loc_info->locatorType != LOCATOR_TYPE_MODULO)
+		return false;
+
+	return strcmp(part_col_name, rel_loc_info->partAttrName) == 0;
+}
+
+
+/*
+ * IsModuloColumnForRelId - return whether or not column for relation is used for modulo distribution.
+ */
+bool
+IsModuloColumnForRelId(Oid relid, char *part_col_name)
+{
+ RelationLocInfo *rel_loc_info = GetRelationLocInfo(relid);
+
+ return IsModuloColumn(rel_loc_info, part_col_name);
+}
+
+/*
+ * GetRoundRobinNode
+ *	Return the current round robin node for the relation and advance the
+ *	relcache indicator to the next node (wrapping to the head of the
+ *	node list). Also used for replicated tables to balance reads.
+ *
+ * PGXCTODO - may not want to bother with locking here, we could track
+ * these in the session memory context instead...
+ */
+int
+GetRoundRobinNode(Oid relid)
+{
+	int ret_node;
+	Relation rel = relation_open(relid, AccessShareLock);
+
+	/* Only meaningful for replicated or round robin tables */
+	Assert (IsLocatorReplicated(rel->rd_locator_info->locatorType) ||
+			rel->rd_locator_info->locatorType == LOCATOR_TYPE_RROBIN);
+
+	ret_node = lfirst_int(rel->rd_locator_info->roundRobinNode);
+
+	/* Move round robin indicator to next node */
+	if (rel->rd_locator_info->roundRobinNode->next != NULL)
+		rel->rd_locator_info->roundRobinNode = rel->rd_locator_info->roundRobinNode->next;
+	else
+		/* reset to first one */
+		rel->rd_locator_info->roundRobinNode = rel->rd_locator_info->rl_nodeList->head;
+
+	relation_close(rel, AccessShareLock);
+
+	return ret_node;
+}
+
+/*
+ * IsTableDistOnPrimary
+ *
+ * Does the table distribution list include the primary node?
+ */
+bool
+IsTableDistOnPrimary(RelationLocInfo *rel_loc_info)
+{
+	ListCell   *item;
+
+	/*
+	 * BUGFIX: the emptiness test was written "rl_nodeList = 0" — an
+	 * assignment that clobbered the relation's node list and always
+	 * evaluated to an empty length. Use a proper comparison.
+	 */
+	if (!OidIsValid(primary_data_node) ||
+		rel_loc_info == NULL ||
+		list_length(rel_loc_info->rl_nodeList) == 0)
+		return false;
+
+	foreach(item, rel_loc_info->rl_nodeList)
+	{
+		char		ntype = PGXC_NODE_DATANODE;
+
+		/* Is the primary node one of the relation's nodes? */
+		if (PGXCNodeGetNodeId(primary_data_node, &ntype) == lfirst_int(item))
+			return true;
+	}
+	return false;
+}
+
+
+/*
+ * IsLocatorInfoEqual
+ *	Check equality of given locator information
+ */
+bool
+IsLocatorInfoEqual(RelationLocInfo *rel_loc_info1, RelationLocInfo *rel_loc_info2)
+{
+	List	   *list1,
+			   *list2;
+
+	Assert(rel_loc_info1 && rel_loc_info2);
+
+	/* Must describe the same relation, locator type and attribute */
+	if (rel_loc_info1->relid != rel_loc_info2->relid ||
+		rel_loc_info1->locatorType != rel_loc_info2->locatorType ||
+		rel_loc_info1->partAttrNum != rel_loc_info2->partAttrNum)
+		return false;
+
+	/* Node lists must contain the same members (order-insensitive) */
+	list1 = rel_loc_info1->rl_nodeList;
+	list2 = rel_loc_info2->rl_nodeList;
+	if (list_difference_int(list1, list2) != NIL ||
+		list_difference_int(list2, list1) != NIL)
+		return false;
+
+	/* Everything is equal */
+	return true;
+}
+
+/*
+ * ConvertToLocatorType
+ *	get locator distribution type
+ * We really should just have pgxc_class use disttype instead...
+ */
+char
+ConvertToLocatorType(int disttype)
+{
+	switch (disttype)
+	{
+		case DISTTYPE_HASH:
+			return LOCATOR_TYPE_HASH;
+		case DISTTYPE_ROUNDROBIN:
+			return LOCATOR_TYPE_RROBIN;
+		case DISTTYPE_REPLICATION:
+			return LOCATOR_TYPE_REPLICATED;
+		case DISTTYPE_MODULO:
+			return LOCATOR_TYPE_MODULO;
+		default:
+			ereport(ERROR,
+					(errcode(ERRCODE_WRONG_OBJECT_TYPE),
+					 errmsg("Invalid distribution type")));
+	}
+	return LOCATOR_TYPE_NONE;	/* not reached; keep compiler quiet */
+}
+
+
+/*
+ * GetLocatorType - Returns the locator type of the table,
+ * or '\0' when the relation has no locator information.
+ */
+char
+GetLocatorType(Oid relid)
+{
+	RelationLocInfo *locinfo = GetRelationLocInfo(relid);
+
+	return (locinfo != NULL) ? locinfo->locatorType : '\0';
+}
+
+
+/*
+ * GetAllDataNodes
+ *	Return a list of all Datanode indexes (0 .. NumDataNodes - 1).
+ * We assume all tables use all nodes in the prototype, so just return a
+ * list from first one.
+ */
+List *
+GetAllDataNodes(void)
+{
+	List	   *result = NIL;
+	int			node;
+
+	for (node = 0; node < NumDataNodes; node++)
+		result = lappend_int(result, node);
+
+	return result;
+}
+
+/*
+ * GetAllCoordNodes
+ *	Return a list of all Coordinators except the local one.
+ * This is used to send DDL to all nodes and to clean up pooler connections.
+ */
+List *
+GetAllCoordNodes(void)
+{
+	List	   *result = NIL;
+	int			node;
+
+	for (node = 0; node < NumCoords; node++)
+	{
+		/* Skip ourselves: connecting to the local Coordinator is pointless */
+		if (node == PGXCNodeId - 1)
+			continue;
+
+		result = lappend_int(result, node);
+	}
+
+	return result;
+}
+
+
+/*
+ * RelationBuildLocator
+ *	Build locator information associated with the specified relation.
+ *
+ * Looks up the pgxc_class tuple for the relation; when none exists the
+ * relation is assumed local-only and rd_locator_info is set to NULL.
+ * The RelationLocInfo is allocated in CacheMemoryContext since it lives
+ * with the relcache entry.
+ */
+void
+RelationBuildLocator(Relation rel)
+{
+	Relation	pcrel;
+	ScanKeyData skey;
+	SysScanDesc pcscan;
+	HeapTuple	htup;
+	MemoryContext oldContext;
+	RelationLocInfo *relationLocInfo;
+	int			j;
+	Form_pgxc_class pgxc_class;
+
+	ScanKeyInit(&skey,
+				Anum_pgxc_class_pcrelid,
+				BTEqualStrategyNumber, F_OIDEQ,
+				ObjectIdGetDatum(RelationGetRelid(rel)));
+
+	pcrel = heap_open(PgxcClassRelationId, AccessShareLock);
+	pcscan = systable_beginscan(pcrel, PgxcClassPgxcRelIdIndexId, true,
+								SnapshotSelf, 1, &skey);
+	htup = systable_getnext(pcscan);
+
+	if (!HeapTupleIsValid(htup))
+	{
+		/* Assume local relation only */
+		rel->rd_locator_info = NULL;
+		systable_endscan(pcscan);
+		heap_close(pcrel, AccessShareLock);
+		return;
+	}
+
+	pgxc_class = (Form_pgxc_class) GETSTRUCT(htup);
+
+	/* Locator info must survive with the relcache entry */
+	oldContext = MemoryContextSwitchTo(CacheMemoryContext);
+
+	relationLocInfo = (RelationLocInfo *) palloc(sizeof(RelationLocInfo));
+	rel->rd_locator_info = relationLocInfo;
+
+	relationLocInfo->relid = RelationGetRelid(rel);
+	relationLocInfo->locatorType = pgxc_class->pclocatortype;
+	relationLocInfo->partAttrNum = pgxc_class->pcattnum;
+	relationLocInfo->partAttrName = get_attname(relationLocInfo->relid, pgxc_class->pcattnum);
+	relationLocInfo->rl_nodeList = NIL;
+
+	/* Translate node OIDs stored in pgxc_class into node indexes */
+	for (j = 0; j < pgxc_class->nodeoids.dim1; j++)
+	{
+		char		ntype = PGXC_NODE_DATANODE;
+		int			nid = PGXCNodeGetNodeId(pgxc_class->nodeoids.values[j], &ntype);
+
+		relationLocInfo->rl_nodeList = lappend_int(relationLocInfo->rl_nodeList, nid);
+	}
+
+	/*
+	 * If the locator type is round robin, we set a node to
+	 * use next time. In addition, if it is replicated,
+	 * we choose a node to use for balancing reads.
+	 */
+	if (relationLocInfo->locatorType == LOCATOR_TYPE_RROBIN
+		|| IsLocatorReplicated(relationLocInfo->locatorType))
+	{
+		int			offset;
+
+		/*
+		 * Pick a random node to start with, since each process will do
+		 * this independently.
+		 *
+		 * BUGFIX: srand() was previously called *after* rand(), so the
+		 * seed never influenced the offset actually used. Seed first.
+		 */
+		srand(time(NULL));
+		offset = compute_modulo(abs(rand()), list_length(relationLocInfo->rl_nodeList));
+
+		relationLocInfo->roundRobinNode = relationLocInfo->rl_nodeList->head;	/* initialize */
+		for (j = 0; j < offset && relationLocInfo->roundRobinNode->next != NULL; j++)
+			relationLocInfo->roundRobinNode = relationLocInfo->roundRobinNode->next;
+	}
+
+	systable_endscan(pcscan);
+	heap_close(pcrel, AccessShareLock);
+
+	MemoryContextSwitchTo(oldContext);
+}
+
+/*
+ * GetRelationLocInfo - Returns the locator information for relation,
+ * in a copy of the RelationLocatorInfo struct in relcache
+ */
+RelationLocInfo *
+GetRelationLocInfo(Oid relid)
+{
+	RelationLocInfo *result = NULL;
+	Relation	rel = relation_open(relid, AccessShareLock);
+
+	/* Relation needs to be valid */
+	Assert(rel->rd_isvalid);
+
+	if (rel->rd_locator_info != NULL)
+		result = CopyRelationLocInfo(rel->rd_locator_info);
+
+	relation_close(rel, AccessShareLock);
+
+	return result;
+}
+
+/*
+ * GetRelationLocType
+ *	Get the distribution type of relation, or LOCATOR_TYPE_NONE when
+ *	it has no locator information.
+ */
+char
+GetRelationLocType(Oid relid)
+{
+	RelationLocInfo *locinfo = GetRelationLocInfo(relid);
+
+	return locinfo ? locinfo->locatorType : LOCATOR_TYPE_NONE;
+}
+
+/*
+ * CopyRelationLocInfo
+ *	Deep-copy a RelationLocInfo struct.
+ */
+RelationLocInfo *
+CopyRelationLocInfo(RelationLocInfo * src_info)
+{
+	RelationLocInfo *copy;
+
+	Assert(src_info);
+
+	copy = (RelationLocInfo *) palloc0(sizeof(RelationLocInfo));
+	copy->relid = src_info->relid;
+	copy->locatorType = src_info->locatorType;
+	copy->partAttrNum = src_info->partAttrNum;
+	if (src_info->partAttrName)
+		copy->partAttrName = pstrdup(src_info->partAttrName);
+	if (src_info->rl_nodeList)
+		copy->rl_nodeList = list_copy(src_info->rl_nodeList);
+	/* Note, for round robin, we use the relcache entry */
+
+	return copy;
+}
+
+
+/*
+ * FreeRelationLocInfo
+ *	Release a RelationLocInfo struct and the column name it owns.
+ */
+void
+FreeRelationLocInfo(RelationLocInfo *relationLocInfo)
+{
+	if (relationLocInfo == NULL)
+		return;
+
+	if (relationLocInfo->partAttrName)
+		pfree(relationLocInfo->partAttrName);
+	pfree(relationLocInfo);
+}
+
+
+/*
+ * FreeExecNodes
+ *	Free the contents of the ExecNodes expression and reset the pointer.
+ */
+void
+FreeExecNodes(ExecNodes **exec_nodes)
+{
+	ExecNodes  *en = *exec_nodes;
+
+	/* Nothing to do */
+	if (en == NULL)
+		return;
+
+	list_free(en->primarynodelist);
+	list_free(en->nodeList);
+	pfree(en);
+	*exec_nodes = NULL;
+}
+
+
+#ifdef XCP
+/*
+ * modulo_value_len
+ *	Value length in bytes for the given type when used with a modulo
+ *	locator; -1 when the type is not supported.
+ */
+static int
+modulo_value_len(Oid dataType)
+{
+	switch (dataType)
+	{
+		/* one-byte types */
+		case BOOLOID:
+		case CHAROID:
+			return 1;
+		/* two-byte types */
+		case INT2OID:
+			return 2;
+		/* four-byte types */
+		case INT4OID:
+		case ABSTIMEOID:
+		case RELTIMEOID:
+		case DATEOID:
+			return 4;
+	}
+	return -1;
+}
+
+
+/*
+ * hash_func_ptr
+ *	Hashing function to use for values of the given type, or NULL when
+ *	the type is not hash-distributable.
+ */
+static LocatorHashFunc
+hash_func_ptr(Oid dataType)
+{
+	switch (dataType)
+	{
+		case INT8OID:
+		case CASHOID:
+			return hashint8;
+		case INT2OID:
+			return hashint2;
+		case OIDOID:
+			return hashoid;
+		case INT4OID:
+		case ABSTIMEOID:
+		case RELTIMEOID:
+		case DATEOID:
+			return hashint4;
+		case BOOLOID:
+		case CHAROID:
+			return hashchar;
+		case NAMEOID:
+			return hashname;
+		case VARCHAROID:
+		case TEXTOID:
+			return hashtext;
+		case OIDVECTOROID:
+			return hashoidvector;
+		case BPCHAROID:
+			return hashbpchar;
+		case BYTEAOID:
+			return hashvarlena;
+		case TIMEOID:
+			return time_hash;
+		case TIMESTAMPOID:
+		case TIMESTAMPTZOID:
+			return timestamp_hash;
+		case INTERVALOID:
+			return interval_hash;
+		case TIMETZOID:
+			return timetz_hash;
+		case NUMERICOID:
+			return hash_numeric;
+		case UUIDOID:
+			return uuid_hash;
+		default:
+			return NULL;
+	}
+}
+
+
+Locator *
+createLocator(char locatorType, RelationAccessType accessType,
+ Oid dataType, LocatorListType listType, int nodeCount,
+ void *nodeList, void **result, bool primary)
+{
+ Locator *locator;
+ ListCell *lc;
+ void *nodeMap = NULL;
+ int i;
+
+ locator = (Locator *) palloc(sizeof(Locator));
+ locator->dataType = dataType;
+ locator->listType = listType;
+ locator->nodeCount = nodeCount;
+ /* Create node map */
+ switch (listType)
+ {
+ case LOCATOR_LIST_NONE:
+ /* No map, return indexes */
+ break;
+ case LOCATOR_LIST_INT:
+ /* Copy integer array */
+ nodeMap = palloc(nodeCount * sizeof(int));
+ memcpy(nodeMap, nodeList, nodeCount * sizeof(int));
+ break;
+ case LOCATOR_LIST_OID:
+ /* Copy array of Oids */
+ nodeMap = palloc(nodeCount * sizeof(Oid));
+ memcpy(nodeMap, nodeList, nodeCount * sizeof(Oid));
+ break;
+ case LOCATOR_LIST_POINTER:
+ /* Copy array of Oids */
+ nodeMap = palloc(nodeCount * sizeof(void *));
+ memcpy(nodeMap, nodeList, nodeCount * sizeof(void *));
+ break;
+ case LOCATOR_LIST_LIST:
+ /* Create map from list */
+ {
+ List *l = (List *) nodeList;
+ locator->nodeCount = list_length(l);
+ if (IsA(l, IntList))
+ {
+ int *intptr;
+ nodeMap = palloc(locator->nodeCount * sizeof(int));
+ intptr = (int *) nodeMap;
+ foreach(lc, l)
+ *intptr++ = lfirst_int(lc);
+ locator->listType = LOCATOR_LIST_INT;
+ }
+ else if (IsA(l, OidList))
+ {
+ Oid *oidptr;
+ nodeMap = palloc(locator->nodeCount * sizeof(Oid));
+ oidptr = (Oid *) nodeMap;
+ foreach(lc, l)
+ *oidptr++ = lfirst_oid(lc);
+ locator->listType = LOCATOR_LIST_OID;
+ }
+ else if (IsA(l, List))
+ {
+ void **voidptr;
+ nodeMap = palloc(locator->nodeCount * sizeof(void *));
+ voidptr = (void **) nodeMap;
+ foreach(lc, l)
+ *voidptr++ = lfirst(lc);
+ locator->listType = LOCATOR_LIST_POINTER;
+ }
+ else
+ {
+ /* can not get here */
+ Assert(false);
+ }
+ break;
+ }
+ }
+ /*
+ * Determine locatefunc, allocate results, set up parameters
+ * specific to locator type
+ */
+ switch (locatorType)
+ {
+ case LOCATOR_TYPE_REPLICATED:
+ if (accessType == RELATION_ACCESS_INSERT ||
+ accessType == RELATION_ACCESS_UPDATE ||
+ accessType == RELATION_ACCESS_READ_FQS)
+ {
+ locator->locatefunc = locate_static;
+ if (nodeMap == NULL)
+ {
+ /* no map, prepare array with indexes */
+ int *intptr;
+ nodeMap = palloc(locator->nodeCount * sizeof(int));
+ intptr = (int *) nodeMap;
+ for (i = 0; i < locator->nodeCount; i++)
+ *intptr++ = i;
+ }
+ locator->nodeMap = nodeMap;
+ locator->results = nodeMap;
+ }
+ else
+ {
+ /* SELECT, use random node.. */
+ locator->locatefunc = locate_modulo_random;
+ locator->nodeMap = nodeMap;
+ switch (locator->listType)
+ {
+ case LOCATOR_LIST_NONE:
+ case LOCATOR_LIST_INT:
+ locator->results = palloc(sizeof(int));
+ break;
+ case LOCATOR_LIST_OID:
+ locator->results = palloc(sizeof(Oid));
+ break;
+ case LOCATOR_LIST_POINTER:
+ locator->results = palloc(sizeof(void *));
+ break;
+ case LOCATOR_LIST_LIST:
+ /* Should never happen */
+ Assert(false);
+ break;
+ }
+ locator->roundRobinNode = -1;
+ }
+ break;
+ case LOCATOR_TYPE_RROBIN:
+ if (accessType == RELATION_ACCESS_INSERT)
+ {
+ locator->locatefunc = locate_roundrobin;
+ locator->nodeMap = nodeMap;
+ switch (locator->listType)
+ {
+ case LOCATOR_LIST_NONE:
+ case LOCATOR_LIST_INT:
+ locator->results = palloc(sizeof(int));
+ break;
+ case LOCATOR_LIST_OID:
+ locator->results = palloc(sizeof(Oid));
+ break;
+ case LOCATOR_LIST_POINTER:
+ locator->results = palloc(sizeof(void *));
+ break;
+ case LOCATOR_LIST_LIST:
+ /* Should never happen */
+ Assert(false);
+ break;
+ }
+ locator->roundRobinNode = -1;
+ }
+ else
+ {
+ locator->locatefunc = locate_static;
+ if (nodeMap == NULL)
+ {
+ /* no map, prepare array with indexes */
+ int *intptr;
+ nodeMap = palloc(locator->nodeCount * sizeof(int));
+ intptr = (int *) nodeMap;
+ for (i = 0; i < locator->nodeCount; i++)
+ *intptr++ = i;
+ }
+ locator->nodeMap = nodeMap;
+ locator->results = nodeMap;
+ }
+ break;
+ case LOCATOR_TYPE_HASH:
+ if (accessType == RELATION_ACCESS_INSERT)
+ {
+ locator->locatefunc = locate_hash_insert;
+ locator->nodeMap = nodeMap;
+ switch (locator->listType)
+ {
+ case LOCATOR_LIST_NONE:
+ case LOCATOR_LIST_INT:
+ locator->results = palloc(sizeof(int));
+ break;
+ case LOCATOR_LIST_OID:
+ locator->results = palloc(sizeof(Oid));
+ break;
+ case LOCATOR_LIST_POINTER:
+ locator->results = palloc(sizeof(void *));
+ break;
+ case LOCATOR_LIST_LIST:
+ /* Should never happen */
+ Assert(false);
+ break;
+ }
+ }
+ else
+ {
+ locator->locatefunc = locate_hash_select;
+ locator->nodeMap = nodeMap;
+ switch (locator->listType)
+ {
+ case LOCATOR_LIST_NONE:
+ case LOCATOR_LIST_INT:
+ locator->results = palloc(locator->nodeCount * sizeof(int));
+ break;
+ case LOCATOR_LIST_OID:
+ locator->results = palloc(locator->nodeCount * sizeof(Oid));
+ break;
+ case LOCATOR_LIST_POINTER:
+ locator->results = palloc(locator->nodeCount * sizeof(void *));
+ break;
+ case LOCATOR_LIST_LIST:
+ /* Should never happen */
+ Assert(false);
+ break;
+ }
+ }
+
+ locator->hashfunc = hash_func_ptr(dataType);
+ if (locator->hashfunc == NULL)
+ ereport(ERROR, (errmsg("Error: unsupported data type for HASH locator: %d\n",
+ dataType)));
+ break;
+ case LOCATOR_TYPE_MODULO:
+ if (accessType == RELATION_ACCESS_INSERT)
+ {
+ locator->locatefunc = locate_modulo_insert;
+ locator->nodeMap = nodeMap;
+ switch (locator->listType)
+ {
+ case LOCATOR_LIST_NONE:
+ case LOCATOR_LIST_INT:
+ locator->results = palloc(sizeof(int));
+ break;
+ case LOCATOR_LIST_OID:
+ locator->results = palloc(sizeof(Oid));
+ break;
+ case LOCATOR_LIST_POINTER:
+ locator->results = palloc(sizeof(void *));
+ break;
+ case LOCATOR_LIST_LIST:
+ /* Should never happen */
+ Assert(false);
+ break;
+ }
+ }
+ else
+ {
+ locator->locatefunc = locate_modulo_select;
+ locator->nodeMap = nodeMap;
+ switch (locator->listType)
+ {
+ case LOCATOR_LIST_NONE:
+ case LOCATOR_LIST_INT:
+ locator->results = palloc(locator->nodeCount * sizeof(int));
+ break;
+ case LOCATOR_LIST_OID:
+ locator->results = palloc(locator->nodeCount * sizeof(Oid));
+ break;
+ case LOCATOR_LIST_POINTER:
+ locator->results = palloc(locator->nodeCount * sizeof(void *));
+ break;
+ case LOCATOR_LIST_LIST:
+ /* Should never happen */
+ Assert(false);
+ break;
+ }
+ }
+
+ locator->valuelen = modulo_value_len(dataType);
+ if (locator->valuelen == -1)
+ ereport(ERROR, (errmsg("Error: unsupported data type for MODULO locator: %d\n",
+ dataType)));
+ break;
+ default:
+ ereport(ERROR, (errmsg("Error: no such supported locator type: %c\n",
+ locatorType)));
+ }
+
+ if (result)
+ *result = locator->results;
+
+ return locator;
+}
+
+
+void
+freeLocator(Locator *locator)
+{
+ pfree(locator->nodeMap);
+ /*
+ * locator->nodeMap and locator->results may point to the same memory,
+ * do not free it twice
+ */
+ if (locator->results != locator->nodeMap)
+ pfree(locator->results);
+ pfree(locator);
+}
+
+
+/*
+ * Each time return the same predefined results
+ */
+static int
+locate_static(Locator *self, Datum value, bool isnull,
+ bool *hasprimary)
+{
+ /* TODO */
+ if (hasprimary)
+ *hasprimary = false;
+ return self->nodeCount;
+}
+
+
+/*
+ * Each time return one next node, in round robin manner
+ */
+static int
+locate_roundrobin(Locator *self, Datum value, bool isnull,
+ bool *hasprimary)
+{
+ /* TODO */
+ if (hasprimary)
+ *hasprimary = false;
+ if (++self->roundRobinNode >= self->nodeCount)
+ self->roundRobinNode = 0;
+ switch (self->listType)
+ {
+ case LOCATOR_LIST_NONE:
+ ((int *) self->results)[0] = self->roundRobinNode;
+ break;
+ case LOCATOR_LIST_INT:
+ ((int *) self->results)[0] =
+ ((int *) self->nodeMap)[self->roundRobinNode];
+ break;
+ case LOCATOR_LIST_OID:
+ ((Oid *) self->results)[0] =
+ ((Oid *) self->nodeMap)[self->roundRobinNode];
+ break;
+ case LOCATOR_LIST_POINTER:
+ ((void **) self->results)[0] =
+ ((void **) self->nodeMap)[self->roundRobinNode];
+ break;
+ case LOCATOR_LIST_LIST:
+ /* Should never happen */
+ Assert(false);
+ break;
+ }
+ return 1;
+}
+
+/*
+ * Each time return one node, in a random manner
+ * This is similar to locate_modulo_select, but that
+ * function does not use a random modulo..
+ */
+static int
+locate_modulo_random(Locator *self, Datum value, bool isnull,
+ bool *hasprimary)
+{
+ int offset;
+
+ if (hasprimary)
+ *hasprimary = false;
+
+ Assert(self->nodeCount > 0);
+ offset = compute_modulo(abs(rand()), self->nodeCount);
+ switch (self->listType)
+ {
+ case LOCATOR_LIST_NONE:
+ ((int *) self->results)[0] = offset;
+ break;
+ case LOCATOR_LIST_INT:
+ ((int *) self->results)[0] =
+ ((int *) self->nodeMap)[offset];
+ break;
+ case LOCATOR_LIST_OID:
+ ((Oid *) self->results)[0] =
+ ((Oid *) self->nodeMap)[offset];
+ break;
+ case LOCATOR_LIST_POINTER:
+ ((void **) self->results)[0] =
+ ((void **) self->nodeMap)[offset];
+ break;
+ case LOCATOR_LIST_LIST:
+ /* Should never happen */
+ Assert(false);
+ break;
+ }
+ return 1;
+}
+
+/*
+ * Calculate hash from supplied value and use modulo by nodeCount as an index
+ */
+static int
+locate_hash_insert(Locator *self, Datum value, bool isnull,
+ bool *hasprimary)
+{
+ int index;
+ if (hasprimary)
+ *hasprimary = false;
+ if (isnull)
+ index = 0;
+ else
+ {
+ unsigned int hash32;
+
+ hash32 = (unsigned int) DatumGetInt32(DirectFunctionCall1(self->hashfunc, value));
+
+ index = compute_modulo(hash32, self->nodeCount);
+ }
+ switch (self->listType)
+ {
+ case LOCATOR_LIST_NONE:
+ ((int *) self->results)[0] = index;
+ break;
+ case LOCATOR_LIST_INT:
+ ((int *) self->results)[0] = ((int *) self->nodeMap)[index];
+ break;
+ case LOCATOR_LIST_OID:
+ ((Oid *) self->results)[0] = ((Oid *) self->nodeMap)[index];
+ break;
+ case LOCATOR_LIST_POINTER:
+ ((void **) self->results)[0] = ((void **) self->nodeMap)[index];
+ break;
+ case LOCATOR_LIST_LIST:
+ /* Should never happen */
+ Assert(false);
+ break;
+ }
+ return 1;
+}
+
+
+/*
+ * Calculate hash from supplied value and use modulo by nodeCount as an index
+ * if value is NULL assume no hint and return all the nodes.
+ */
+static int
+locate_hash_select(Locator *self, Datum value, bool isnull,
+ bool *hasprimary)
+{
+ if (hasprimary)
+ *hasprimary = false;
+ if (isnull)
+ {
+ int i;
+ switch (self->listType)
+ {
+ case LOCATOR_LIST_NONE:
+ for (i = 0; i < self->nodeCount; i++)
+ ((int *) self->results)[i] = i;
+ break;
+ case LOCATOR_LIST_INT:
+ memcpy(self->results, self->nodeMap,
+ self->nodeCount * sizeof(int));
+ break;
+ case LOCATOR_LIST_OID:
+ memcpy(self->results, self->nodeMap,
+ self->nodeCount * sizeof(Oid));
+ break;
+ case LOCATOR_LIST_POINTER:
+ memcpy(self->results, self->nodeMap,
+ self->nodeCount * sizeof(void *));
+ break;
+ case LOCATOR_LIST_LIST:
+ /* Should never happen */
+ Assert(false);
+ break;
+ }
+ return self->nodeCount;
+ }
+ else
+ {
+ unsigned int hash32;
+ int index;
+
+ hash32 = (unsigned int) DatumGetInt32(DirectFunctionCall1(self->hashfunc, value));
+
+ index = compute_modulo(hash32, self->nodeCount);
+ switch (self->listType)
+ {
+ case LOCATOR_LIST_NONE:
+ ((int *) self->results)[0] = index;
+ break;
+ case LOCATOR_LIST_INT:
+ ((int *) self->results)[0] = ((int *) self->nodeMap)[index];
+ break;
+ case LOCATOR_LIST_OID:
+ ((Oid *) self->results)[0] = ((Oid *) self->nodeMap)[index];
+ break;
+ case LOCATOR_LIST_POINTER:
+ ((void **) self->results)[0] = ((void **) self->nodeMap)[index];
+ break;
+ case LOCATOR_LIST_LIST:
+ /* Should never happen */
+ Assert(false);
+ break;
+ }
+ return 1;
+ }
+}
+
+
+/*
+ * Use modulo of supplied value by nodeCount as an index
+ */
+static int
+locate_modulo_insert(Locator *self, Datum value, bool isnull,
+ bool *hasprimary)
+{
+ int index;
+ if (hasprimary)
+ *hasprimary = false;
+ if (isnull)
+ index = 0;
+ else
+ {
+ unsigned int mod32;
+
+ if (self->valuelen == 4)
+ mod32 = (unsigned int) (GET_4_BYTES(value));
+ else if (self->valuelen == 2)
+ mod32 = (unsigned int) (GET_2_BYTES(value));
+ else if (self->valuelen == 1)
+ mod32 = (unsigned int) (GET_1_BYTE(value));
+ else
+ mod32 = 0;
+
+ index = compute_modulo(mod32, self->nodeCount);
+ }
+ switch (self->listType)
+ {
+ case LOCATOR_LIST_NONE:
+ ((int *) self->results)[0] = index;
+ break;
+ case LOCATOR_LIST_INT:
+ ((int *) self->results)[0] = ((int *) self->nodeMap)[index];
+ break;
+ case LOCATOR_LIST_OID:
+ ((Oid *) self->results)[0] = ((Oid *) self->nodeMap)[index];
+ break;
+ case LOCATOR_LIST_POINTER:
+ ((void **) self->results)[0] = ((void **) self->nodeMap)[index];
+ break;
+ case LOCATOR_LIST_LIST:
+ /* Should never happen */
+ Assert(false);
+ break;
+ }
+ return 1;
+}
+
+
+/*
+ * Use modulo of supplied value by nodeCount as an index
+ * if value is NULL assume no hint and return all the nodes.
+ */
+static int
+locate_modulo_select(Locator *self, Datum value, bool isnull,
+ bool *hasprimary)
+{
+ if (hasprimary)
+ *hasprimary = false;
+ if (isnull)
+ {
+ int i;
+ switch (self->listType)
+ {
+ case LOCATOR_LIST_NONE:
+ for (i = 0; i < self->nodeCount; i++)
+ ((int *) self->results)[i] = i;
+ break;
+ case LOCATOR_LIST_INT:
+ memcpy(self->results, self->nodeMap,
+ self->nodeCount * sizeof(int));
+ break;
+ case LOCATOR_LIST_OID:
+ memcpy(self->results, self->nodeMap,
+ self->nodeCount * sizeof(Oid));
+ break;
+ case LOCATOR_LIST_POINTER:
+ memcpy(self->results, self->nodeMap,
+ self->nodeCount * sizeof(void *));
+ break;
+ case LOCATOR_LIST_LIST:
+ /* Should never happen */
+ Assert(false);
+ break;
+ }
+ return self->nodeCount;
+ }
+ else
+ {
+ unsigned int mod32;
+ int index;
+
+ if (self->valuelen == 4)
+ mod32 = (unsigned int) (GET_4_BYTES(value));
+ else if (self->valuelen == 2)
+ mod32 = (unsigned int) (GET_2_BYTES(value));
+ else if (self->valuelen == 1)
+ mod32 = (unsigned int) (GET_1_BYTE(value));
+ else
+ mod32 = 0;
+
+ index = compute_modulo(mod32, self->nodeCount);
+
+ switch (self->listType)
+ {
+ case LOCATOR_LIST_NONE:
+ ((int *) self->results)[0] = index;
+ break;
+ case LOCATOR_LIST_INT:
+ ((int *) self->results)[0] = ((int *) self->nodeMap)[index];
+ break;
+ case LOCATOR_LIST_OID:
+ ((Oid *) self->results)[0] = ((Oid *) self->nodeMap)[index];
+ break;
+ case LOCATOR_LIST_POINTER:
+ ((void **) self->results)[0] = ((void **) self->nodeMap)[index];
+ break;
+ case LOCATOR_LIST_LIST:
+ /* Should never happen */
+ Assert(false);
+ break;
+ }
+ return 1;
+ }
+}
+
+
+int
+GET_NODES(Locator *self, Datum value, bool isnull, bool *hasprimary)
+{
+ return (*self->locatefunc) (self, value, isnull, hasprimary);
+}
+
+
+void *
+getLocatorResults(Locator *self)
+{
+ return self->results;
+}
+
+
+void *
+getLocatorNodeMap(Locator *self)
+{
+ return self->nodeMap;
+}
+
+
+int
+getLocatorNodeCount(Locator *self)
+{
+ return self->nodeCount;
+}
+#endif
+
+/*
+ * GetRelationNodes
+ *
+ * Get list of relation nodes
+ * If the table is replicated and we are reading, we can just pick one.
+ * If the table is partitioned, we apply partitioning column value, if possible.
+ *
+ * If the relation is partitioned, partValue will be applied if present
+ * (indicating a value appears for partitioning column), otherwise it
+ * is ignored.
+ *
+ * preferredNodes is only used for replicated tables. If set, it will
+ * use one of the nodes specified if the table is replicated on it.
+ * This helps optimize for avoiding introducing additional nodes into the
+ * transaction.
+ *
+ * The returned List is a copy, so it should be freed when finished.
+ */
+ExecNodes *
+GetRelationNodes(RelationLocInfo *rel_loc_info, Datum valueForDistCol,
+ bool isValueNull,
+ RelationAccessType accessType)
+{
+ ExecNodes *exec_nodes;
+ int *nodenums;
+ int i, count;
+ Locator *locator;
+ Oid typeOfValueForDistCol = InvalidOid;
+
+ if (rel_loc_info == NULL)
+ return NULL;
+
+
+ if (IsLocatorDistributedByValue(rel_loc_info->locatorType))
+ {
+ /* A sufficient lock level needs to be taken at a higher level */
+ Relation rel = relation_open(rel_loc_info->relid, NoLock);
+ TupleDesc tupDesc = RelationGetDescr(rel);
+ Form_pg_attribute *attr = tupDesc->attrs;
+ /* Get the hash type of relation */
+ typeOfValueForDistCol = attr[rel_loc_info->partAttrNum - 1]->atttypid;
+ relation_close(rel, NoLock);
+ }
+
+ exec_nodes = makeNode(ExecNodes);
+ exec_nodes->baselocatortype = rel_loc_info->locatorType;
+ exec_nodes->accesstype = accessType;
+
+ locator = createLocator(rel_loc_info->locatorType,
+ accessType,
+ typeOfValueForDistCol,
+ LOCATOR_LIST_LIST,
+ 0,
+ (void *)rel_loc_info->rl_nodeList,
+ (void **)&nodenums,
+ false);
+ count = GET_NODES(locator, valueForDistCol, isValueNull, NULL);
+
+ for (i = 0; i < count; i++)
+ exec_nodes->nodeList = lappend_int(exec_nodes->nodeList, nodenums[i]);
+
+ freeLocator(locator);
+ return exec_nodes;
+}
+
+/*
+ * GetRelationNodesByQuals
+ * A wrapper around GetRelationNodes to reduce the node list by looking at the
+ * quals. varno is assumed to be the varno of reloid inside the quals. No check
+ * is made to see if that's correct.
+ */
+ExecNodes *
+GetRelationNodesByQuals(Oid reloid, RelationLocInfo *rel_loc_info,
+ Index varno, Node *quals, RelationAccessType relaccess)
+{
+ Expr *distcol_expr = NULL;
+ ExecNodes *exec_nodes;
+ Datum distcol_value;
+ bool distcol_isnull;
+
+ if (!rel_loc_info)
+ return NULL;
+ /*
+ * If the table distributed by value, check if we can reduce the Datanodes
+ * by looking at the qualifiers for this relation
+ */
+ if (IsRelationDistributedByValue(rel_loc_info))
+ {
+ Oid disttype = get_atttype(reloid, rel_loc_info->partAttrNum);
+ int32 disttypmod = get_atttypmod(reloid, rel_loc_info->partAttrNum);
+ distcol_expr = pgxc_find_distcol_expr(varno, rel_loc_info->partAttrNum,
+ quals);
+ /*
+ * If the type of expression used to find the Datanode, is not same as
+ * the distribution column type, try casting it. This is same as what
+ * will happen in case of inserting that type of expression value as the
+ * distribution column value.
+ */
+ if (distcol_expr)
+ {
+ distcol_expr = (Expr *)coerce_to_target_type(NULL,
+ (Node *)distcol_expr,
+ exprType((Node *)distcol_expr),
+ disttype, disttypmod,
+ COERCION_ASSIGNMENT,
+ COERCE_IMPLICIT_CAST, -1);
+ /*
+ * PGXC_FQS_TODO: We should set the bound parameters here, but we don't have
+ * PlannerInfo struct and we don't handle them right now.
+ * Even if constant expression mutator changes the expression, it will
+ * only simplify it, keeping the semantics same
+ */
+ distcol_expr = (Expr *)eval_const_expressions(NULL,
+ (Node *)distcol_expr);
+ }
+ }
+
+ if (distcol_expr && IsA(distcol_expr, Const))
+ {
+ Const *const_expr = (Const *)distcol_expr;
+ distcol_value = const_expr->constvalue;
+ distcol_isnull = const_expr->constisnull;
+ }
+ else
+ {
+ distcol_value = (Datum) 0;
+ distcol_isnull = true;
+ }
+
+ exec_nodes = GetRelationNodes(rel_loc_info, distcol_value,
+ distcol_isnull,
+ relaccess);
+ return exec_nodes;
+}
+
+/*
+ * GetRelationDistribColumn
+ * Return hash column name for relation or NULL if relation is not distributed.
+ */
+char *
+GetRelationDistribColumn(RelationLocInfo *locInfo)
+{
+ /* No relation, so simply leave */
+ if (!locInfo)
+ return NULL;
+
+ /* No distribution column if relation is not distributed with a key */
+ if (!IsRelationDistributedByValue(locInfo))
+ return NULL;
+
+ /* Return column name */
+ return get_attname(locInfo->relid, locInfo->partAttrNum);
+}
+
+/*
+ * pgxc_find_distcol_expr
+ * Search through the quals provided and find out an expression which will give
+ * us value of distribution column if exists in the quals. Say for a table
+ * tab1 (val int, val2 int) distributed by hash(val), a query "SELECT * FROM
+ * tab1 WHERE val = fn(x, y, z) and val2 = 3", fn(x,y,z) is the expression which
+ * decides the distribution column value in the rows qualified by this query.
+ * Hence return fn(x, y, z). But for a query "SELECT * FROM tab1 WHERE val =
+ * fn(x, y, z) || val2 = 3", there is no expression which decides the values
+ * distribution column val can take in the qualified rows. So, in such cases
+ * this function returns NULL.
+ */
+static Expr *
+pgxc_find_distcol_expr(Index varno,
+ AttrNumber attrNum,
+ Node *quals)
+{
+ List *lquals;
+ ListCell *qual_cell;
+
+ /* If no quals, no distribution column expression */
+ if (!quals)
+ return NULL;
+
+ /* Convert the qualification into List if it's not already so */
+ if (!IsA(quals, List))
+ lquals = make_ands_implicit((Expr *)quals);
+ else
+ lquals = (List *)quals;
+
+ /*
+ * For every ANDed expression, check if that expression is of the form
+ * <distribution_col> = <expr>. If so return expr.
+ */
+ foreach(qual_cell, lquals)
+ {
+ Expr *qual_expr = (Expr *)lfirst(qual_cell);
+ OpExpr *op;
+ Expr *lexpr;
+ Expr *rexpr;
+ Var *var_expr;
+ Expr *distcol_expr;
+
+ if (!IsA(qual_expr, OpExpr))
+ continue;
+ op = (OpExpr *)qual_expr;
+ /* If not a binary operator, it can not be '='. */
+ if (list_length(op->args) != 2)
+ continue;
+
+ lexpr = linitial(op->args);
+ rexpr = lsecond(op->args);
+
+ /*
+ * If either of the operands is a RelabelType, extract the Var in the RelabelType.
+ * A RelabelType represents a "dummy" type coercion between two binary compatible datatypes.
+ * If we do not handle these then our optimization does not work in case of varchar
+ * For example if col is of type varchar and is the dist key then
+ * select * from vc_tab where col = 'abcdefghijklmnopqrstuvwxyz';
+ * should be shipped to one of the nodes only
+ */
+ if (IsA(lexpr, RelabelType))
+ lexpr = ((RelabelType*)lexpr)->arg;
+ if (IsA(rexpr, RelabelType))
+ rexpr = ((RelabelType*)rexpr)->arg;
+
+ /*
+ * If either of the operands is a Var expression, assume the other
+ * one is distribution column expression. If none is Var check next
+ * qual.
+ */
+ if (IsA(lexpr, Var))
+ {
+ var_expr = (Var *)lexpr;
+ distcol_expr = rexpr;
+ }
+ else if (IsA(rexpr, Var))
+ {
+ var_expr = (Var *)rexpr;
+ distcol_expr = lexpr;
+ }
+ else
+ continue;
+ /*
+ * If Var found is not the distribution column of required relation,
+ * check next qual
+ */
+ if (var_expr->varno != varno || var_expr->varattno != attrNum)
+ continue;
+ /*
+ * If the operator is not an assignment operator, check next
+ * constraint. An operator is an assignment operator if it's
+ * mergejoinable or hashjoinable. Beware that not every assignment
+ * operator is mergejoinable or hashjoinable, so we might leave some
+	 * opportunity. But then we have to rely on the opname which may not
+ * be something we know to be equality operator as well.
+ */
+ if (!op_mergejoinable(op->opno, exprType((Node *)lexpr)) &&
+ !op_hashjoinable(op->opno, exprType((Node *)lexpr)))
+ continue;
+ /* Found the distribution column expression return it */
+ return distcol_expr;
+ }
+ /* Exhausted all quals, but no distribution column expression */
+ return NULL;
+}
--- /dev/null
- /* Do the insertion */
- (void) simple_heap_insert(rel, tup);
-
- CatalogUpdateIndexes(rel, tup);
+/*-------------------------------------------------------------------------
+ *
+ * groupmgr.c
+ * Routines to support manipulation of the pgxc_group catalog
+ * This includes support for DDL on objects NODE GROUP
+ *
+ * Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+#include "miscadmin.h"
+
+#include "access/heapam.h"
+#include "access/htup_details.h"
+#include "catalog/catalog.h"
+#include "catalog/indexing.h"
+#include "catalog/pg_type.h"
+#include "catalog/pgxc_node.h"
+#include "catalog/pgxc_group.h"
+#include "nodes/parsenodes.h"
+#include "nodes/pg_list.h"
+#include "utils/builtins.h"
+#include "utils/rel.h"
+#include "utils/syscache.h"
+#include "utils/lsyscache.h"
+#include "utils/array.h"
+#include "pgxc/groupmgr.h"
+
+/*
+ * PgxcGroupCreate
+ *
+ * Create a PGXC node group
+ */
+void
+PgxcGroupCreate(CreateGroupStmt *stmt)
+{
+ const char *group_name = stmt->group_name;
+ List *nodes = stmt->nodes;
+ oidvector *nodes_array;
+ Oid *inTypes;
+ Relation rel;
+ HeapTuple tup;
+ bool nulls[Natts_pgxc_group];
+ Datum values[Natts_pgxc_group];
+ int member_count = list_length(stmt->nodes);
+ ListCell *lc;
+ int i = 0;
+
+ /* Only a DB administrator can add cluster node groups */
+ if (!superuser())
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+ errmsg("must be superuser to create cluster node groups")));
+
+ /* Check if given group already exists */
+ if (OidIsValid(get_pgxc_groupoid(group_name)))
+ ereport(ERROR,
+ (errcode(ERRCODE_DUPLICATE_OBJECT),
+ errmsg("PGXC Group %s: group already defined",
+ group_name)));
+
+ inTypes = (Oid *) palloc(member_count * sizeof(Oid));
+
+ /* Build list of Oids for each node listed */
+ foreach(lc, nodes)
+ {
+ char *node_name = strVal(lfirst(lc));
+ Oid noid = get_pgxc_nodeoid(node_name);
+
+ if (!OidIsValid(noid))
+ ereport(ERROR,
+ (errcode(ERRCODE_UNDEFINED_OBJECT),
+ errmsg("PGXC Node %s: object not defined",
+ node_name)));
+
+ if (get_pgxc_nodetype(noid) != PGXC_NODE_DATANODE)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("PGXC node %s: only Datanodes can be group members",
+ node_name)));
+
+ /* OK to pick up Oid of this node */
+ inTypes[i] = noid;
+ i++;
+ }
+
+ /* Build array of Oids to be inserted */
+ nodes_array = buildoidvector(inTypes, member_count);
+
+ /* Iterate through all attributes initializing nulls and values */
+ for (i = 0; i < Natts_pgxc_group; i++)
+ {
+ nulls[i] = false;
+ values[i] = (Datum) 0;
+ }
+
+ /* Insert Data correctly */
+ values[Anum_pgxc_group_name - 1] =
+ DirectFunctionCall1(namein, CStringGetDatum(group_name));
+ values[Anum_pgxc_group_members - 1] = PointerGetDatum(nodes_array);
+
+ /* Open the relation for insertion */
+ rel = heap_open(PgxcGroupRelationId, RowExclusiveLock);
+ tup = heap_form_tuple(rel->rd_att, values, nulls);
+
++ CatalogTupleInsert(rel, tup);
+
+ heap_close(rel, RowExclusiveLock);
+}
+
+
+/*
+ * PgxcGroupRemove():
+ *
+ * Remove a PGXC node group
+ */
+void
+PgxcGroupRemove(DropGroupStmt *stmt)
+{
+ Relation relation;
+ HeapTuple tup;
+ const char *group_name = stmt->group_name;
+ Oid group_oid = get_pgxc_groupoid(group_name);
+
+ /* Only a DB administrator can remove cluster node groups */
+ if (!superuser())
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+ errmsg("must be superuser to remove cluster node groups")));
+
+ /* Check if group exists */
+ if (!OidIsValid(group_oid))
+ ereport(ERROR,
+				(errcode(ERRCODE_UNDEFINED_OBJECT),
+ errmsg("PGXC Group %s: group not defined",
+ group_name)));
+
+ /* Delete the pgxc_group tuple */
+ relation = heap_open(PgxcGroupRelationId, RowExclusiveLock);
+ tup = SearchSysCache(PGXCGROUPOID, ObjectIdGetDatum(group_oid), 0, 0, 0);
+
+ if (!HeapTupleIsValid(tup)) /* should not happen */
+ elog(ERROR, "PGXC Group %s: group not defined", group_name);
+
+ simple_heap_delete(relation, &tup->t_self);
+
+ ReleaseSysCache(tup);
+
+ heap_close(relation, RowExclusiveLock);
+}
--- /dev/null
- /* Insert tuple in catalog */
- simple_heap_insert(pgxcnodesrel, htup);
-
- CatalogUpdateIndexes(pgxcnodesrel, htup);
+/*-------------------------------------------------------------------------
+ *
+ * nodemgr.c
+ * Routines to support manipulation of the pgxc_node catalog
+ * Support concerns CREATE/ALTER/DROP on NODE object.
+ *
+ * Copyright (c) 2010-2012 Postgres-XC Development Group
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+#include "miscadmin.h"
+
+#include "access/hash.h"
+#include "access/heapam.h"
+#include "access/htup_details.h"
+#include "catalog/catalog.h"
+#include "catalog/indexing.h"
+#include "catalog/pgxc_node.h"
+#include "commands/defrem.h"
+#include "nodes/parsenodes.h"
+#include "utils/builtins.h"
+#include "utils/rel.h"
+#include "utils/syscache.h"
+#include "utils/lsyscache.h"
+#include "utils/tqual.h"
+#include "pgxc/locator.h"
+#include "pgxc/nodemgr.h"
+#include "pgxc/pgxc.h"
++#include "storage/lwlock.h"
++#include "storage/shmem.h"
+
+/*
+ * How many times should we try to find a unique identifier
+ * in case hash of the node name comes out to be duplicate
+ */
+
+#define MAX_TRIES_FOR_NID 200
+
+static Datum generate_node_id(const char *node_name);
+static void count_coords_datanodes(Relation rel, int *num_coord, int *num_dns);
+
+/*
+ * GUC parameters.
+ * Shared memory block can not be resized dynamically, so we should have some
+ * limits set at startup time to calculate amount of shared memory to store
+ * node table. Nodes can be added to running cluster until that limit is reached
+ * if cluster needs grow beyond the configuration value should be changed and
+ * if the cluster needs to grow beyond it, the configuration value should be changed and
+ */
+int MaxCoords = 16;
+int MaxDataNodes = 16;
+
+/* Global number of nodes. Point to a shared memory block */
+static int *shmemNumCoords;
+static int *shmemNumDataNodes;
+
+/* Shared memory tables of node definitions */
+NodeDefinition *coDefs;
+NodeDefinition *dnDefs;
+
+/*
+ * NodeTablesInit
+ * Initializes shared memory tables of Coordinators and Datanodes.
+ */
+void
+NodeTablesShmemInit(void)
+{
+ bool found;
+ int i;
+
+ /*
+ * Initialize the table of Coordinators: first sizeof(int) bytes are to
+ * store actual number of Coordinators, remaining data in the structure is
+ * array of NodeDefinition that can contain up to MaxCoords entries.
+ * That is a bit weird and probably it would be better to have these in
+ * separate structures, but I am unsure about cost of having shmem structure
+ * containing just single integer.
+ */
+ shmemNumCoords = ShmemInitStruct("Coordinator Table",
+ sizeof(int) +
+ sizeof(NodeDefinition) * MaxCoords,
+ &found);
+
+ /* Have coDefs pointing right behind shmemNumCoords */
+ coDefs = (NodeDefinition *) (shmemNumCoords + 1);
+
+ /* Mark it empty upon creation */
+ if (!found)
+ {
+ *shmemNumCoords = 0;
+ /* Mark nodeishealthy true at init time for all */
+ for (i = 0; i < MaxCoords; i++)
+ coDefs[i].nodeishealthy = true;
+ }
+
+ /* Same for Datanodes */
+ shmemNumDataNodes = ShmemInitStruct("Datanode Table",
+ sizeof(int) +
+ sizeof(NodeDefinition) * MaxDataNodes,
+ &found);
+
+ /* Have dnDefs pointing right behind shmemNumDataNodes */
+ dnDefs = (NodeDefinition *) (shmemNumDataNodes + 1);
+
+ /* Mark it empty upon creation */
+ if (!found)
+ {
+ *shmemNumDataNodes = 0;
+ /* Mark nodeishealthy true at init time for all */
+ for (i = 0; i < MaxDataNodes; i++)
+ dnDefs[i].nodeishealthy = true;
+ }
+}
+
+
+/*
+ * NodeTablesShmemSize
+ * Get the size of shared memory dedicated to node definitions
+ */
+Size
+NodeTablesShmemSize(void)
+{
+ Size co_size;
+ Size dn_size;
+
+ co_size = mul_size(sizeof(NodeDefinition), MaxCoords);
+ co_size = add_size(co_size, sizeof(int));
+ dn_size = mul_size(sizeof(NodeDefinition), MaxDataNodes);
+ dn_size = add_size(dn_size, sizeof(int));
+
+ return add_size(co_size, dn_size);
+}
+
+/*
+ * Check list of options and return things filled.
+ * This includes check on option values.
+ */
+static void
+check_node_options(const char *node_name, List *options, char **node_host,
+ int *node_port, char *node_type,
+ bool *is_primary, bool *is_preferred)
+{
+ ListCell *option;
+
+ if (!options)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("No options specified")));
+
+ /* Filter options */
+ foreach(option, options)
+ {
+ DefElem *defel = (DefElem *) lfirst(option);
+
+ if (strcmp(defel->defname, "port") == 0)
+ {
+ *node_port = defGetTypeLength(defel);
+
+ if (*node_port < 1 || *node_port > 65535)
+ ereport(ERROR,
+ (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
+ errmsg("port value is out of range")));
+ }
+ else if (strcmp(defel->defname, "host") == 0)
+ {
+ *node_host = defGetString(defel);
+ }
+ else if (strcmp(defel->defname, "type") == 0)
+ {
+ char *type_loc;
+
+ type_loc = defGetString(defel);
+
+ if (strcmp(type_loc, "coordinator") != 0 &&
+ strcmp(type_loc, "datanode") != 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("type value is incorrect, specify 'coordinator' or 'datanode'")));
+
+ if (strcmp(type_loc, "coordinator") == 0)
+ *node_type = PGXC_NODE_COORDINATOR;
+ else
+ *node_type = PGXC_NODE_DATANODE;
+ }
+ else if (strcmp(defel->defname, "primary") == 0)
+ {
+ *is_primary = defGetBoolean(defel);
+ }
+ else if (strcmp(defel->defname, "preferred") == 0)
+ {
+ *is_preferred = defGetBoolean(defel);
+ }
+ else
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("incorrect option: %s", defel->defname)));
+ }
+ }
+
+ /* A primary node has to be a Datanode */
+ if (*is_primary && *node_type != PGXC_NODE_DATANODE)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("PGXC node %s: cannot be a primary node, it has to be a Datanode",
+ node_name)));
+
+ /* A preferred node has to be a Datanode */
+ if (*is_preferred && *node_type != PGXC_NODE_DATANODE)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("PGXC node %s: cannot be a preferred node, it has to be a Datanode",
+ node_name)));
+
+ /* Node type check */
+ if (*node_type == PGXC_NODE_NONE)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("PGXC node %s: Node type not specified",
+ node_name)));
+
+#ifdef XCP
+ if (*node_type == PGXC_NODE_DATANODE && NumDataNodes >= MaxDataNodes)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("Too many datanodes, current value of max_datanodes is %d",
+ MaxDataNodes)));
+
+#endif
+}
+
+/*
+ * generate_node_id
+ *
+ * Given a node name, compute its hash to generate the identifier.
+ * If the hash collides with an existing node identifier, probe nearby
+ * values; give up after MAX_TRIES_FOR_NID attempts.
+ */
+static Datum
+generate_node_id(const char *node_name)
+{
+	Datum		node_id;
+	uint32		n;
+	bool		inc;
+	int			i;
+
+	/* Compute node identifier by computing hash of node name */
+	node_id = hash_any((unsigned char *)node_name, strlen(node_name));
+
+	/*
+	 * If the hash is close to the overflow limit probe downwards,
+	 * otherwise probe upwards, so the probe sequence cannot wrap.
+	 */
+	inc = true;
+	n = DatumGetUInt32(node_id);
+	if (n >= UINT_MAX - MAX_TRIES_FOR_NID)
+		inc = false;
+
+	/*
+	 * Check if the identifier clashes with an existing one,
+	 * and if it does, try another value.
+	 */
+	for (i = 0; i < MAX_TRIES_FOR_NID; i++)
+	{
+		HeapTuple	tup;
+
+		tup = SearchSysCache1(PGXCNODEIDENTIFIER, node_id);
+		if (tup == NULL)
+			break;
+
+		ReleaseSysCache(tup);
+
+		n = DatumGetUInt32(node_id);
+		if (inc)
+			n++;
+		else
+			n--;
+
+		node_id = UInt32GetDatum(n);
+	}
+
+	/*
+	 * This has really few chances to happen, but inform backend that node
+	 * has not been registered correctly in this case.
+	 * (The errdetail format previously carried a stray trailing
+	 * "node_name" literal.)
+	 */
+	if (i >= MAX_TRIES_FOR_NID)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+				 errmsg("Please choose different node name."),
+				 errdetail("Name \"%s\" produces a duplicate node identifier",
+						   node_name)));
+
+	return node_id;
+}
+
+/* --------------------------------
+ * cmp_nodes
+ *
+ * qsort comparator ordering XC nodes (referenced by their leading Oid)
+ * in ascending order by node name.  Each node name is looked up only
+ * once per call instead of once per comparison operator, halving the
+ * syscache lookups.
+ * --------------------------------
+ */
+static int
+cmp_nodes(const void *p1, const void *p2)
+{
+	Oid			n1 = *((Oid *)p1);
+	Oid			n2 = *((Oid *)p2);
+	int			res = strcmp(get_pgxc_nodename(n1), get_pgxc_nodename(n2));
+
+	/* Normalize to -1/0/1 as the original coding did */
+	if (res < 0)
+		return -1;
+	if (res == 0)
+		return 0;
+	return 1;
+}
+
+/*
+ * Count the number of coordinators and datanodes configured so far.
+ * Scans pgxc_node with SnapshotSelf so tuples inserted earlier in the
+ * same command are visible; rows of any other node type are ignored.
+ */
+static void
+count_coords_datanodes(Relation rel, int *num_coord, int *num_dns)
+{
+	int			nCoords = 0;
+	int			nDatanodes = 0;
+	HeapScanDesc scan;
+	HeapTuple	tup;
+
+	scan = heap_beginscan(rel, SnapshotSelf, 0, NULL);
+	for (;;)
+	{
+		Form_pgxc_node nodeForm;
+
+		tup = heap_getnext(scan, ForwardScanDirection);
+		if (tup == NULL)
+			break;
+
+		nodeForm = (Form_pgxc_node) GETSTRUCT(tup);
+		if (nodeForm->node_type == PGXC_NODE_COORDINATOR)
+			nCoords++;
+		else if (nodeForm->node_type == PGXC_NODE_DATANODE)
+			nDatanodes++;
+	}
+	heap_endscan(scan);
+
+	*num_coord = nCoords;
+	*num_dns = nDatanodes;
+}
+
+/*
+ * PgxcNodeListAndCount
+ *
+ * Update node definitions in the shared memory tables from the catalog.
+ * Health status values of nodes that exist both before and after the
+ * refresh are carried over; new nodes start out marked healthy.  The
+ * whole refresh runs with NodeTableLock held exclusively.
+ */
+void
+PgxcNodeListAndCount(void)
+{
+	Relation rel;
+	HeapScanDesc scan;
+	HeapTuple tuple;
+	NodeDefinition *nodes = NULL;	/* snapshot of the pre-refresh entries */
+	int	numNodes;
+
+	LWLockAcquire(NodeTableLock, LW_EXCLUSIVE);
+
+	numNodes = *shmemNumCoords + *shmemNumDataNodes;
+
+	Assert((*shmemNumCoords >= 0) && (*shmemNumDataNodes >= 0));
+
+	/*
+	 * Save the existing health status values because nodes
+	 * might get added or deleted here. We will save
+	 * nodeoid, status. No need to differentiate between
+	 * coords and datanodes since oids will be unique anyways.
+	 */
+	if (numNodes > 0)
+	{
+		nodes = (NodeDefinition*)palloc(numNodes * sizeof(NodeDefinition));
+
+		/* Coordinators first, then Datanodes, mirroring the shmem layout */
+		if (*shmemNumCoords > 0)
+			memcpy(nodes, coDefs, *shmemNumCoords * sizeof(NodeDefinition));
+
+		if (*shmemNumDataNodes > 0)
+			memcpy(nodes + *shmemNumCoords, dnDefs,
+				   *shmemNumDataNodes * sizeof(NodeDefinition));
+	}
+
+	/* Reset the counters; the catalog scan below repopulates both tables */
+	*shmemNumCoords = 0;
+	*shmemNumDataNodes = 0;
+
+	/*
+	 * Node information initialization is made in one scan:
+	 * 1) Scan pgxc_node catalog to find the number of nodes for
+	 *	  each node type and make proper allocations
+	 * 2) Then extract the node Oid
+	 * 3) Complete primary/preferred node information
+	 */
+	rel = heap_open(PgxcNodeRelationId, AccessShareLock);
+	scan = heap_beginscan(rel, SnapshotSelf, 0, NULL);
+	while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
+	{
+		Form_pgxc_node  nodeForm = (Form_pgxc_node) GETSTRUCT(tuple);
+		NodeDefinition *node;
+		int i;
+
+		/* Take definition for given node type */
+		switch (nodeForm->node_type)
+		{
+			case PGXC_NODE_COORDINATOR:
+				node = &coDefs[(*shmemNumCoords)++];
+				break;
+			case PGXC_NODE_DATANODE:
+			default:
+				/* any non-coordinator entry lands in the datanode table */
+				node = &dnDefs[(*shmemNumDataNodes)++];
+				break;
+		}
+
+		/* Populate the definition */
+		node->nodeoid = HeapTupleGetOid(tuple);
+		memcpy(&node->nodename, &nodeForm->node_name, NAMEDATALEN);
+		memcpy(&node->nodehost, &nodeForm->node_host, NAMEDATALEN);
+		node->nodeport = nodeForm->node_port;
+		node->nodeisprimary = nodeForm->nodeis_primary;
+		node->nodeispreferred = nodeForm->nodeis_preferred;
+		/*
+		 * Copy over the health status from above for nodes that
+		 * existed before and after the refresh. If we do not find
+		 * entry for a nodeoid, we mark it as healthy
+		 */
+		node->nodeishealthy = true;
+		for (i = 0; i < numNodes; i++)
+		{
+			if (nodes[i].nodeoid == node->nodeoid)
+			{
+				node->nodeishealthy = nodes[i].nodeishealthy;
+				break;
+			}
+		}
+	}
+	heap_endscan(scan);
+	heap_close(rel, AccessShareLock);
+
+	elog(DEBUG1, "Done pgxc_nodes scan: %d coordinators and %d datanodes",
+		 *shmemNumCoords, *shmemNumDataNodes);
+
+	/* free the saved snapshot, if any */
+	if (numNodes)
+		pfree(nodes);
+
+	/* Finally sort the lists in ascending order by node name */
+	if (*shmemNumCoords > 1)
+		qsort(coDefs, *shmemNumCoords, sizeof(NodeDefinition), cmp_nodes);
+	if (*shmemNumDataNodes > 1)
+		qsort(dnDefs, *shmemNumDataNodes, sizeof(NodeDefinition), cmp_nodes);
+
+	LWLockRelease(NodeTableLock);
+}
+
+
+/*
+ * PgxcNodeGetOids
+ *
+ * (Header previously said "PgxcNodeGetIds", which does not exist.)
+ * List into palloc'ed arrays Oids of Coordinators and Datanodes currently
+ * presented in the node table, as well as number of Coordinators and Datanodes.
+ * Any parameter may be NULL if caller is not interested in receiving
+ * appropriate results. Preferred and primary node information can be updated
+ * in session if requested.
+ */
+void
+PgxcNodeGetOids(Oid **coOids, Oid **dnOids,
+				int *num_coords, int *num_dns, bool update_preferred)
+{
+	LWLockAcquire(NodeTableLock, LW_SHARED);
+
+	/* The counters live in shared memory and are read under the lock */
+	elog(DEBUG1, "Get OIDs from table: %d coordinators and %d datanodes",
+		 *shmemNumCoords, *shmemNumDataNodes);
+
+	if (num_coords)
+		*num_coords = *shmemNumCoords;
+	if (num_dns)
+		*num_dns = *shmemNumDataNodes;
+
+	if (coOids)
+	{
+		int i;
+
+		*coOids = (Oid *) palloc(*shmemNumCoords * sizeof(Oid));
+		for (i = 0; i < *shmemNumCoords; i++)
+			(*coOids)[i] = coDefs[i].nodeoid;
+	}
+
+	if (dnOids)
+	{
+		int i;
+
+		*dnOids = (Oid *) palloc(*shmemNumDataNodes * sizeof(Oid));
+		for (i = 0; i < *shmemNumDataNodes; i++)
+			(*dnOids)[i] = dnDefs[i].nodeoid;
+	}
+
+	/* Update also preferred and primary node informations if requested */
+	if (update_preferred)
+	{
+		int i;
+
+		/* Initialize primary and preferred node information */
+		primary_data_node = InvalidOid;
+		num_preferred_data_nodes = 0;
+
+		for (i = 0; i < *shmemNumDataNodes; i++)
+		{
+			if (dnDefs[i].nodeisprimary)
+				primary_data_node = dnDefs[i].nodeoid;
+
+			if (dnDefs[i].nodeispreferred)
+			{
+				/* NOTE(review): no bounds check against the capacity of preferred_data_node — confirm array size elsewhere */
+				preferred_data_node[num_preferred_data_nodes] = dnDefs[i].nodeoid;
+				num_preferred_data_nodes++;
+			}
+		}
+	}
+
+	LWLockRelease(NodeTableLock);
+}
+
+/*
+ * PgxcNodeGetHealthMap
+ *
+ * List into caller-provided arrays the Oids of Coordinators and Datanodes
+ * currently presented in the node table, together with their health status,
+ * as well as number of Coordinators and Datanodes.  Any parameter may be
+ * NULL if caller is not interested in receiving appropriate results for
+ * either the Coordinators or Datanodes.
+ */
+void
+PgxcNodeGetHealthMap(Oid *coOids, Oid *dnOids,
+				int *num_coords, int *num_dns, bool *coHealthMap,
+				bool *dnHealthMap)
+{
+	LWLockAcquire(NodeTableLock, LW_SHARED);
+
+	/*
+	 * Read the shared counters only after NodeTableLock is taken; the
+	 * previous coding logged them before acquiring the lock, unlike
+	 * every sibling accessor.
+	 */
+	elog(DEBUG1, "Get HealthMap from table: %d coordinators and %d datanodes",
+		 *shmemNumCoords, *shmemNumDataNodes);
+
+	if (num_coords)
+		*num_coords = *shmemNumCoords;
+	if (num_dns)
+		*num_dns = *shmemNumDataNodes;
+
+	if (coOids)
+	{
+		int i;
+		for (i = 0; i < *shmemNumCoords; i++)
+		{
+			coOids[i] = coDefs[i].nodeoid;
+			if (coHealthMap)
+				coHealthMap[i] = coDefs[i].nodeishealthy;
+		}
+	}
+
+	if (dnOids)
+	{
+		int i;
+
+		for (i = 0; i < *shmemNumDataNodes; i++)
+		{
+			dnOids[i] = dnDefs[i].nodeoid;
+			if (dnHealthMap)
+				dnHealthMap[i] = dnDefs[i].nodeishealthy;
+		}
+	}
+
+	LWLockRelease(NodeTableLock);
+}
+
+/*
+ * Consult the shared memory NodeDefinition structures and
+ * fetch the nodeishealthy value and return it back.
+ *
+ * nodeList carries integer indexes into the shared datanode table
+ * (not Oids, despite the wording of the error message below).
+ *
+ * We will probably need a similar function for coordinators
+ * in the future..
+ */
+void
+PgxcNodeDnListHealth(List *nodeList, bool *healthmap)
+{
+	ListCell   *lc;
+	int			index = 0;
+
+	elog(DEBUG1, "Get healthmap from datanodeList");
+
+	if (!nodeList || !list_length(nodeList))
+		ereport(ERROR,
+				(errcode(ERRCODE_UNDEFINED_OBJECT),
+				 errmsg("NIL or empty nodeList passed")));
+
+	LWLockAcquire(NodeTableLock, LW_SHARED);
+	foreach(lc, nodeList)
+	{
+		int			node = lfirst_int(lc);
+
+		/*
+		 * Reject negative indexes too: the previous check only guarded
+		 * the upper bound, so a negative entry would read before dnDefs.
+		 */
+		if (node < 0 || node >= *shmemNumDataNodes)
+		{
+			LWLockRelease(NodeTableLock);
+			ereport(ERROR,
+					(errcode(ERRCODE_UNDEFINED_OBJECT),
+					 errmsg("PGXC health status not found for datanode with oid (%d)",
+							node)));
+		}
+		healthmap[index++] = dnDefs[node].nodeishealthy;
+	}
+	LWLockRelease(NodeTableLock);
+}
+
+/*
+ * Find node definition in the shared memory node table.
+ * The structure is a copy palloc'ed in current memory context;
+ * NULL is returned when the Oid is not in the table.
+ */
+NodeDefinition *
+PgxcNodeGetDefinition(Oid node)
+{
+	NodeDefinition *def = NULL;
+	int			idx;
+
+	LWLockAcquire(NodeTableLock, LW_SHARED);
+
+	/* Datanodes are searched first, then Coordinators */
+	for (idx = 0; def == NULL && idx < *shmemNumDataNodes; idx++)
+	{
+		if (dnDefs[idx].nodeoid == node)
+		{
+			def = (NodeDefinition *) palloc(sizeof(NodeDefinition));
+			memcpy(def, dnDefs + idx, sizeof(NodeDefinition));
+		}
+	}
+
+	for (idx = 0; def == NULL && idx < *shmemNumCoords; idx++)
+	{
+		if (coDefs[idx].nodeoid == node)
+		{
+			def = (NodeDefinition *) palloc(sizeof(NodeDefinition));
+			memcpy(def, coDefs + idx, sizeof(NodeDefinition));
+		}
+	}
+
+	/* def stays NULL when nothing matched */
+	LWLockRelease(NodeTableLock);
+	return def;
+}
+
+/*
+ * Update health status of a node in the shared memory node table and
+ * report whether the Oid was found.
+ *
+ * We could try to optimize this by checking if the ishealthy value
+ * is already the same as the passed in one.. but if the cluster is
+ * impaired, dunno how much such optimizations are worth. So keeping
+ * it simple for now
+ */
+bool
+PgxcNodeUpdateHealth(Oid node, bool status)
+{
+	bool		found = false;
+	int			idx;
+
+	LWLockAcquire(NodeTableLock, LW_EXCLUSIVE);
+
+	/* search through the Datanodes first */
+	for (idx = 0; !found && idx < *shmemNumDataNodes; idx++)
+	{
+		if (dnDefs[idx].nodeoid == node)
+		{
+			dnDefs[idx].nodeishealthy = status;
+			found = true;
+		}
+	}
+
+	/* if not found, search through the Coordinators */
+	for (idx = 0; !found && idx < *shmemNumCoords; idx++)
+	{
+		if (coDefs[idx].nodeoid == node)
+		{
+			coDefs[idx].nodeishealthy = status;
+			found = true;
+		}
+	}
+
+	LWLockRelease(NodeTableLock);
+	return found;
+}
+
+/*
+ * PgxcNodeCreate
+ *
+ * Add a PGXC node: validate the statement options and the configured
+ * cluster limits, then insert the new pgxc_node catalog tuple.
+ */
+void
+PgxcNodeCreate(CreateNodeStmt *stmt)
+{
+	Relation	pgxcnodesrel;
+	HeapTuple	htup;
+	bool		nulls[Natts_pgxc_node];
+	Datum		values[Natts_pgxc_node];
+	const char *node_name = stmt->node_name;
+	int			i;
+	/* Options with default values */
+	char	   *node_host = NULL;
+	char		node_type = PGXC_NODE_NONE;
+	int			node_port = 0;
+	bool		is_primary = false;
+	bool		is_preferred = false;
+	Datum		node_id;
+	int			coordCount = 0, dnCount = 0;
+
+	/* Only a DB administrator can add nodes */
+	if (!superuser())
+		ereport(ERROR,
+				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+				 errmsg("must be superuser to create cluster nodes")));
+
+	/* Check that node name is not in use */
+	if (OidIsValid(get_pgxc_nodeoid(node_name)))
+		ereport(ERROR,
+				(errcode(ERRCODE_DUPLICATE_OBJECT),
+				 errmsg("PGXC Node %s: object already defined",
+						node_name)));
+
+	/* Check length of node name */
+	if (strlen(node_name) > PGXC_NODENAME_LENGTH)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+				 errmsg("Node name \"%s\" is too long",
+						node_name)));
+
+	/* Filter options */
+	check_node_options(node_name, stmt->options, &node_host,
+				&node_port, &node_type,
+				&is_primary, &is_preferred);
+
+	/* Compute node identifier */
+	node_id = generate_node_id(node_name);
+
+	/*
+	 * Check that this node is not created as a primary if one already
+	 * exists.
+	 */
+	if (is_primary && OidIsValid(primary_data_node))
+		ereport(ERROR,
+				(errcode(ERRCODE_SYNTAX_ERROR),
+				 errmsg("PGXC node %s: two nodes cannot be primary",
+						node_name)));
+
+	/*
+	 * Then assign default values if necessary
+	 * First for port.
+	 */
+	if (node_port == 0)
+	{
+		node_port = 5432;
+		elog(DEBUG1, "PGXC node %s: Applying default port value: %d",
+			 node_name, node_port);
+	}
+
+	/* Then apply default value for host */
+	if (!node_host)
+	{
+		/*
+		 * Use pstrdup, not strdup: the copy is only consumed below within
+		 * this memory context, and a malloc'ed copy would never be freed.
+		 */
+		node_host = pstrdup("localhost");
+		elog(DEBUG1, "PGXC node %s: Applying default host value: %s",
+			 node_name, node_host);
+	}
+
+	/* Iterate through all attributes initializing nulls and values */
+	for (i = 0; i < Natts_pgxc_node; i++)
+	{
+		nulls[i] = false;
+		values[i] = (Datum) 0;
+	}
+
+	/*
+	 * Open the relation for insertion
+	 * This is necessary to generate a unique Oid for the new node
+	 * There could be a relation race here if a similar Oid
+	 * being created before the heap is inserted.
+	 */
+	pgxcnodesrel = heap_open(PgxcNodeRelationId, AccessExclusiveLock);
+
+	/*
+	 * Get the count of datanodes and coordinators added so far and make sure
+	 * we're not exceeding the configured limits
+	 *
+	 * XXX This is not full proof because someone may first set
+	 * max_coordinators or max_datanodes to a high value, add nodes and then
+	 * lower the value again.
+	 */
+	count_coords_datanodes(pgxcnodesrel, &coordCount, &dnCount);
+
+	if ((node_type == PGXC_NODE_DATANODE && dnCount >= MaxDataNodes) ||
+		(node_type == PGXC_NODE_COORDINATOR && coordCount >= MaxCoords))
+	{
+		ereport(ERROR,
+				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+				 errmsg("cannot add more than %d %s",
+						node_type == PGXC_NODE_COORDINATOR ?
+						MaxCoords : MaxDataNodes,
+						node_type == PGXC_NODE_COORDINATOR ?
+						"coordinators" : "datanodes"),
+				 errhint("increase the value of %s GUC and restart the cluster",
+						node_type == PGXC_NODE_COORDINATOR ?
+						"max_coordinators" : "max_datanodes"
+						)));
+
+	}
+
+	/* Build entry tuple */
+	values[Anum_pgxc_node_name - 1] = DirectFunctionCall1(namein, CStringGetDatum(node_name));
+	values[Anum_pgxc_node_type - 1] = CharGetDatum(node_type);
+	values[Anum_pgxc_node_port - 1] = Int32GetDatum(node_port);
+	values[Anum_pgxc_node_host - 1] = DirectFunctionCall1(namein, CStringGetDatum(node_host));
+	values[Anum_pgxc_node_is_primary - 1] = BoolGetDatum(is_primary);
+	values[Anum_pgxc_node_is_preferred - 1] = BoolGetDatum(is_preferred);
+	values[Anum_pgxc_node_id - 1] = node_id;
+
+	htup = heap_form_tuple(pgxcnodesrel->rd_att, values, nulls);
+
- simple_heap_update(rel, &oldtup->t_self, newtup);
-
- /* Update indexes */
- CatalogUpdateIndexes(rel, newtup);
++	CatalogTupleInsert(pgxcnodesrel, htup);
+
+	heap_close(pgxcnodesrel, AccessExclusiveLock);
+}
+
+/*
+ * PgxcNodeAlter
+ *
+ * Alter a PGXC node.  The node's current catalog values are loaded first
+ * and act as defaults; options supplied in the ALTER statement then
+ * overwrite them before the tuple is rewritten in place.
+ */
+void
+PgxcNodeAlter(AlterNodeStmt *stmt)
+{
+	const char *node_name = stmt->node_name;
+	char *node_host;
+	char node_type;
+	int node_port;
+	bool is_preferred;
+	bool is_primary;
+	HeapTuple oldtup, newtup;
+	Oid nodeOid = get_pgxc_nodeoid(node_name);
+	Relation rel;
+	Datum new_record[Natts_pgxc_node];
+	bool new_record_nulls[Natts_pgxc_node];
+	bool new_record_repl[Natts_pgxc_node];
+	uint32 node_id;
+	int coordCount = 0, dnCount = 0;
+
+	/* Only a DB administrator can alter cluster nodes */
+	if (!superuser())
+		ereport(ERROR,
+				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+				 errmsg("must be superuser to change cluster nodes")));
+
+	/* Look at the node tuple, and take exclusive lock on it */
+	rel = heap_open(PgxcNodeRelationId, AccessExclusiveLock);
+
+	/* Check that node exists */
+	if (!OidIsValid(nodeOid))
+		ereport(ERROR,
+				(errcode(ERRCODE_UNDEFINED_OBJECT),
+				 errmsg("PGXC Node %s: object not defined",
+						node_name)));
+
+	/* Open new tuple, checks are performed on it and new values */
+	oldtup = SearchSysCacheCopy1(PGXCNODEOID, ObjectIdGetDatum(nodeOid));
+	if (!HeapTupleIsValid(oldtup))
+		elog(ERROR, "cache lookup failed for object %u", nodeOid);
+
+	/*
+	 * check_options performs some internal checks on option values
+	 * so set up values.  The current catalog values serve as defaults
+	 * for any option absent from the statement.
+	 */
+	node_host = get_pgxc_nodehost(nodeOid);
+	node_port = get_pgxc_nodeport(nodeOid);
+	is_preferred = is_pgxc_nodepreferred(nodeOid);
+	is_primary = is_pgxc_nodeprimary(nodeOid);
+	node_type = get_pgxc_nodetype(nodeOid);
+	node_id = get_pgxc_node_id(nodeOid);
+
+	/* Filter options */
+	check_node_options(node_name, stmt->options, &node_host,
+				&node_port, &node_type,
+				&is_primary, &is_preferred);
+
+	/*
+	 * Two nodes cannot be primary at the same time. If the primary
+	 * node is this node itself, well there is no point in having an
+	 * error.
+	 */
+	if (is_primary &&
+		OidIsValid(primary_data_node) &&
+		nodeOid != primary_data_node)
+		ereport(ERROR,
+				(errcode(ERRCODE_SYNTAX_ERROR),
+				 errmsg("PGXC node %s: two nodes cannot be primary",
+						node_name)));
+
+	/*
+	 * Get the count of datanodes and coordinators added so far and make sure
+	 * we're not exceeding the configured limits.
+	 *
+	 * NOTE(review): the counts include the node being altered itself, so an
+	 * ALTER at the limit may error even without adding a node — confirm
+	 * this is intended.
+	 */
+	count_coords_datanodes(rel, &coordCount, &dnCount);
+
+	if ((node_type == PGXC_NODE_DATANODE && dnCount >= MaxDataNodes) ||
+		(node_type == PGXC_NODE_COORDINATOR && coordCount >= MaxCoords))
+	{
+		ereport(ERROR,
+				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+				 errmsg("cannot add more than %d %s",
+						node_type == PGXC_NODE_COORDINATOR ?
+						MaxCoords : MaxDataNodes,
+						node_type == PGXC_NODE_COORDINATOR ?
+						"coordinators" : "datanodes"),
+				 errhint("increase the value of %s GUC and restart the cluster",
+						node_type == PGXC_NODE_COORDINATOR ?
+						"max_coordinators" : "max_datanodes"
+						)));
+
+	}
+
+	/* Update values for catalog entry: every mutable column is replaced */
+	MemSet(new_record, 0, sizeof(new_record));
+	MemSet(new_record_nulls, false, sizeof(new_record_nulls));
+	MemSet(new_record_repl, false, sizeof(new_record_repl));
+	new_record[Anum_pgxc_node_port - 1] = Int32GetDatum(node_port);
+	new_record_repl[Anum_pgxc_node_port - 1] = true;
+	new_record[Anum_pgxc_node_host - 1] =
+		DirectFunctionCall1(namein, CStringGetDatum(node_host));
+	new_record_repl[Anum_pgxc_node_host - 1] = true;
+	new_record[Anum_pgxc_node_type - 1] = CharGetDatum(node_type);
+	new_record_repl[Anum_pgxc_node_type - 1] = true;
+	new_record[Anum_pgxc_node_is_primary - 1] = BoolGetDatum(is_primary);
+	new_record_repl[Anum_pgxc_node_is_primary - 1] = true;
+	new_record[Anum_pgxc_node_is_preferred - 1] = BoolGetDatum(is_preferred);
+	new_record_repl[Anum_pgxc_node_is_preferred - 1] = true;
+	new_record[Anum_pgxc_node_id - 1] = UInt32GetDatum(node_id);
+	new_record_repl[Anum_pgxc_node_id - 1] = true;
+
+	/* Update relation */
+	newtup = heap_modify_tuple(oldtup, RelationGetDescr(rel),
+							   new_record,
+							   new_record_nulls, new_record_repl);
++	CatalogTupleUpdate(rel, &oldtup->t_self, newtup);
+
+	/* Release lock at Commit */
+	heap_close(rel, NoLock);
+}
+
+
+/*
+ * PgxcNodeRemove
+ *
+ * Drop a PGXC node: validate the request, then delete the matching
+ * pgxc_node catalog tuple.
+ */
+void
+PgxcNodeRemove(DropNodeStmt *stmt)
+{
+	const char *node_name = stmt->node_name;
+	Oid			noid = get_pgxc_nodeoid(node_name);
+	Relation	rel;
+	HeapTuple	tup;
+
+	/* Only a DB administrator can remove cluster nodes */
+	if (!superuser())
+		ereport(ERROR,
+				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+				 errmsg("must be superuser to remove cluster nodes")));
+
+	/* The node must be known */
+	if (!OidIsValid(noid))
+		ereport(ERROR,
+				(errcode(ERRCODE_UNDEFINED_OBJECT),
+				 errmsg("PGXC Node %s: object not defined",
+						node_name)));
+
+	/* Refuse to drop the node this backend is running on */
+	if (strcmp(node_name, PGXCNodeName) == 0)
+		ereport(ERROR,
+				(errcode(ERRCODE_SYNTAX_ERROR),
+				 errmsg("PGXC Node %s: cannot drop local node",
+						node_name)));
+
+	/* PGXCTODO:
+	 * Is there any group which has this node as member
+	 * XC Tables will also have this as a member in their array
+	 * Do this search in the local data structure.
+	 * If a node is removed, it is necessary to check if there is a distributed
+	 * table on it. If there are only replicated table it is OK.
+	 * However, we have to be sure that there are no pooler agents in the cluster pointing to it.
+	 */
+
+	/* Delete the pgxc_node tuple */
+	rel = heap_open(PgxcNodeRelationId, RowExclusiveLock);
+	tup = SearchSysCache1(PGXCNODEOID, ObjectIdGetDatum(noid));
+	if (!HeapTupleIsValid(tup)) /* should not happen */
+		ereport(ERROR,
+				(errcode(ERRCODE_UNDEFINED_OBJECT),
+				 errmsg("PGXC Node %s: object not defined",
+						node_name)));
+
+	simple_heap_delete(rel, &tup->t_self);
+
+	ReleaseSysCache(tup);
+
+	heap_close(rel, RowExclusiveLock);
+}
--- /dev/null
- &isnull,
- NULL);
+/*-------------------------------------------------------------------------
+ *
+ * execRemote.c
+ *
+ * Functions to execute commands on remote Datanodes
+ *
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/pgxc/pool/execRemote.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include <time.h>
+#include "postgres.h"
+#include "access/twophase.h"
+#include "access/gtm.h"
+#include "access/sysattr.h"
+#include "access/transam.h"
+#include "access/xact.h"
+#include "access/relscan.h"
+#include "catalog/pg_type.h"
+#include "catalog/pgxc_node.h"
+#include "commands/prepare.h"
+#include "executor/executor.h"
+#include "gtm/gtm_c.h"
+#include "libpq/libpq.h"
+#include "miscadmin.h"
+#include "pgxc/execRemote.h"
+#include "tcop/tcopprot.h"
+#include "executor/nodeSubplan.h"
+#include "nodes/nodeFuncs.h"
+#include "pgstat.h"
+#include "nodes/nodes.h"
+#include "nodes/nodeFuncs.h"
+#include "optimizer/var.h"
+#include "pgxc/copyops.h"
+#include "pgxc/nodemgr.h"
+#include "pgxc/poolmgr.h"
+#include "storage/ipc.h"
+#include "storage/proc.h"
+#include "utils/datum.h"
+#include "utils/lsyscache.h"
+#include "utils/memutils.h"
+#include "utils/pg_rusage.h"
+#include "utils/tuplesort.h"
+#include "utils/snapmgr.h"
+#include "utils/builtins.h"
+#include "pgxc/locator.h"
+#include "pgxc/pgxc.h"
+#include "parser/parse_type.h"
+#include "parser/parsetree.h"
+#include "pgxc/xc_maintenance_mode.h"
+
+/*
+ * We do not want this timeout to be too long: when a query is terminating
+ * abnormally we just want to read in the data that is already available.
+ * If the datanode connection reaches a consistent state after that, we
+ * follow the normal clean-up procedure (send down ABORT etc.); if the
+ * datanode is not responding, we signal the pooler to drop the connection.
+ * It is better to drop and recreate a datanode connection than to wait
+ * several seconds while it is cleaned up when, for example, cancelling a
+ * query.
+ */
+#define END_QUERY_TIMEOUT 1000
+
+/* Declarations used by guc.c */
+int PGXLRemoteFetchSize;
+
+typedef struct
+{
+ xact_callback function;
+ void *fparams;
+} abort_callback_type;
+
+/*
+ * Buffer size does not affect performance significantly, just do not allow
+ * connection buffer grows infinitely
+ */
+#define COPY_BUFFER_SIZE 8192
+#define PRIMARY_NODE_WRITEAHEAD 1024 * 1024
+
+/*
+ * Flag to track if a temporary object is accessed by the current transaction
+ */
+static bool temp_object_included = false;
+static abort_callback_type dbcleanup_info = { NULL, NULL };
+
+static int pgxc_node_begin(int conn_count, PGXCNodeHandle ** connections,
+ GlobalTransactionId gxid, bool need_tran_block,
+ bool readOnly, char node_type);
+
+static PGXCNodeAllHandles *get_exec_connections(RemoteQueryState *planstate,
+ ExecNodes *exec_nodes,
+ RemoteQueryExecType exec_type,
+ bool is_global_session);
+
+
+static bool pgxc_start_command_on_connection(PGXCNodeHandle *connection,
+ RemoteQueryState *remotestate, Snapshot snapshot);
+
+static void pgxc_node_remote_count(int *dnCount, int dnNodeIds[],
+ int *coordCount, int coordNodeIds[]);
+static char *pgxc_node_remote_prepare(char *prepareGID, bool localNode);
+static bool pgxc_node_remote_finish(char *prepareGID, bool commit,
+ char *nodestring, GlobalTransactionId gxid,
+ GlobalTransactionId prepare_gxid);
+static void pgxc_node_remote_commit(void);
+static void pgxc_node_remote_abort(void);
+static void pgxc_connections_cleanup(ResponseCombiner *combiner);
+
+static void pgxc_node_report_error(ResponseCombiner *combiner);
+
+#define REMOVE_CURR_CONN(combiner) \
+ if ((combiner)->current_conn < --((combiner)->conn_count)) \
+ { \
+ (combiner)->connections[(combiner)->current_conn] = \
+ (combiner)->connections[(combiner)->conn_count]; \
+ } \
+ else \
+ (combiner)->current_conn = 0
+
+#define MAX_STATEMENTS_PER_TRAN 10
+
+/* Variables to collect statistics */
+static int total_transactions = 0;
+static int total_statements = 0;
+static int total_autocommit = 0;
+static int nonautocommit_2pc = 0;
+static int autocommit_2pc = 0;
+static int current_tran_statements = 0;
+static int *statements_per_transaction = NULL;
+static int *nodes_per_transaction = NULL;
+
+/*
+ * statistics collection: account for one executed statement, both in the
+ * global counter and in the counter for the current transaction.
+ */
+static void
+stat_statement()
+{
+	++total_statements;
+	++current_tran_statements;
+}
+
+/*
+ * To collect statistics: count a transaction.
+ *
+ * Histogram arrays are allocated lazily with malloc (they must survive
+ * memory context resets).  Allocation failure is tolerated: statistics
+ * are best-effort, so on OOM the sample is simply skipped instead of
+ * dereferencing a NULL pointer as the previous coding did.
+ */
+static void
+stat_transaction(int node_count)
+{
+	total_transactions++;
+
+	if (!statements_per_transaction)
+	{
+		statements_per_transaction = (int *) malloc((MAX_STATEMENTS_PER_TRAN + 1) * sizeof(int));
+		if (statements_per_transaction)
+			memset(statements_per_transaction, 0, (MAX_STATEMENTS_PER_TRAN + 1) * sizeof(int));
+	}
+	if (statements_per_transaction)
+	{
+		/* transactions with too many statements land in the overflow bucket */
+		if (current_tran_statements > MAX_STATEMENTS_PER_TRAN)
+			statements_per_transaction[MAX_STATEMENTS_PER_TRAN]++;
+		else
+			statements_per_transaction[current_tran_statements]++;
+	}
+	current_tran_statements = 0;
+	if (node_count > 0 && node_count <= NumDataNodes)
+	{
+		if (!nodes_per_transaction)
+		{
+			nodes_per_transaction = (int *) malloc(NumDataNodes * sizeof(int));
+			if (nodes_per_transaction)
+				memset(nodes_per_transaction, 0, NumDataNodes * sizeof(int));
+		}
+		if (nodes_per_transaction)
+			nodes_per_transaction[node_count - 1]++;
+	}
+}
+
+
+/*
+ * Output collected statistics to the log (DEBUG1 level).
+ */
+static void
+stat_log()
+{
+	elog(DEBUG1, "Total Transactions: %d Total Statements: %d", total_transactions, total_statements);
+	elog(DEBUG1, "Autocommit: %d 2PC for Autocommit: %d 2PC for non-Autocommit: %d",
+		 total_autocommit, autocommit_2pc, nonautocommit_2pc);
+	if (total_transactions)
+	{
+		if (statements_per_transaction)
+		{
+			int i;
+
+			for (i = 0; i < MAX_STATEMENTS_PER_TRAN; i++)
+				elog(DEBUG1, "%d Statements per Transaction: %d (%d%%)",
+					 i, statements_per_transaction[i], statements_per_transaction[i] * 100 / total_transactions);
+
+			/*
+			 * Report the overflow bucket here, under the NULL guard: the
+			 * previous coding emitted this line outside the guard and
+			 * would dereference a NULL statements_per_transaction.
+			 */
+			elog(DEBUG1, "%d+ Statements per Transaction: %d (%d%%)",
+				 MAX_STATEMENTS_PER_TRAN, statements_per_transaction[MAX_STATEMENTS_PER_TRAN], statements_per_transaction[MAX_STATEMENTS_PER_TRAN] * 100 / total_transactions);
+		}
+		if (nodes_per_transaction)
+		{
+			int i;
+
+			for (i = 0; i < NumDataNodes; i++)
+				elog(DEBUG1, "%d Nodes per Transaction: %d (%d%%)",
+					 i + 1, nodes_per_transaction[i], nodes_per_transaction[i] * 100 / total_transactions);
+		}
+	}
+}
+
+
+/*
+ * Create a structure to store parameters needed to combine responses from
+ * multiple connections as well as state information.  Every field the
+ * combiner machinery relies on is reset to its idle value; the caller
+ * supplies the expected connection count and the combine policy.
+ */
+void
+InitResponseCombiner(ResponseCombiner *combiner, int node_count,
+					 CombineType combine_type)
+{
+	combiner->node_count = node_count;
+	combiner->connections = NULL;
+	combiner->conn_count = 0;
+	combiner->combine_type = combine_type;
+	/* counters of per-connection protocol messages seen so far */
+	combiner->command_complete_count = 0;
+	combiner->request_type = REQUEST_TYPE_NOT_DEFINED;
+	combiner->description_count = 0;
+	combiner->copy_in_count = 0;
+	combiner->copy_out_count = 0;
+	combiner->copy_file = NULL;
+	/* deferred error state; reported later via pgxc_node_report_error */
+	combiner->errorMessage = NULL;
+	combiner->errorDetail = NULL;
+	combiner->errorHint = NULL;
+	combiner->tuple_desc = NULL;
+	combiner->probing_primary = false;
+	combiner->returning_node = InvalidOid;
+	combiner->currentRow = NULL;
+	combiner->rowBuffer = NIL;
+	/* merge-sort / tuplestore machinery starts disabled */
+	combiner->tapenodes = NULL;
+	combiner->merge_sort = false;
+	combiner->extended_query = false;
+	combiner->tapemarks = NULL;
+	combiner->tuplesortstate = NULL;
+	combiner->cursor = NULL;
+	combiner->update_cursor = NULL;
+	combiner->cursor_count = 0;
+	combiner->cursor_connections = NULL;
+	combiner->remoteCopyType = REMOTE_COPY_NONE;
+}
+
+
+/*
+ * Parse out row count from the command status response and convert it to
+ * integer.  Command tags look like "INSERT 0 5" or "UPDATE 3": a non-digit
+ * resets the accumulator, so only the trailing number is kept.  Returns
+ * the number of digits in that trailing number (0 when there is none).
+ */
+static int
+parse_row_count(const char *message, size_t len, uint64 *rowcount)
+{
+	int			digits = 0;
+	size_t		pos;
+
+	*rowcount = 0;
+
+	/*
+	 * Guard against an empty message: with len == 0 the unsigned
+	 * expression len - 1 below would wrap around and the loop would
+	 * read far past the buffer.
+	 */
+	if (len == 0)
+		return 0;
+
+	/* skip \0 string terminator */
+	for (pos = 0; pos < len - 1; pos++)
+	{
+		if (message[pos] >= '0' && message[pos] <= '9')
+		{
+			*rowcount = *rowcount * 10 + message[pos] - '0';
+			digits++;
+		}
+		else
+		{
+			/* non-digit: discard any number parsed so far */
+			*rowcount = 0;
+			digits = 0;
+		}
+	}
+	return digits;
+}
+
+/*
+ * Convert RowDescription message to a TupleDesc
+ *
+ * The payload follows the frontend/backend protocol 'T' message layout,
+ * extended with a type name per attribute: a 16-bit attribute count,
+ * then per attribute its name, type name, table OID, column number,
+ * type OID, type length, type modifier and format flag.  Only the name,
+ * type name and typmod are consumed here.
+ *
+ * NOTE(review): len is not used for bounds checking — the message is
+ * trusted to be well formed; confirm callers validate the length.
+ */
+static TupleDesc
+create_tuple_desc(char *msg_body, size_t len)
+{
+	TupleDesc 	result;
+	int 			i, nattr;
+	uint16		n16;
+
+	/* get number of attributes (16-bit network byte order) */
+	memcpy(&n16, msg_body, 2);
+	nattr = ntohs(n16);
+	msg_body += 2;
+
+	result = CreateTemplateTupleDesc(nattr, false);
+
+	/* decode attributes */
+	for (i = 1; i <= nattr; i++)
+	{
+		AttrNumber	attnum;
+		char		*attname;
+		char		*typname;
+		Oid 		oidtypeid;
+		int32 		typemode, typmod;
+
+		attnum = (AttrNumber) i;
+
+		/* attribute name (null-terminated) */
+		attname = msg_body;
+		msg_body += strlen(attname) + 1;
+
+		/* type name (null-terminated) */
+		typname = msg_body;
+		msg_body += strlen(typname) + 1;
+
+		/* table OID, ignored */
+		msg_body += 4;
+
+		/* column no, ignored */
+		msg_body += 2;
+
+		/* data type OID, ignored */
+		msg_body += 4;
+
+		/* type len, ignored */
+		msg_body += 2;
+
+		/* type mod (32-bit network byte order) */
+		memcpy(&typemode, msg_body, 4);
+		typmod = ntohl(typemode);
+		msg_body += 4;
+
+		/* PGXCTODO text/binary flag? */
+		msg_body += 2;
+
+		/*
+		 * Resolve the type by its name rather than the transmitted OID —
+		 * presumably because type OIDs can differ between nodes; confirm.
+		 */
+		parseTypeString(typname, &oidtypeid, NULL, false);
+
+		TupleDescInitEntry(result, attnum, attname, oidtypeid, typmod, 0);
+	}
+	return result;
+}
+
+/*
+ * Handle CopyOutCommandComplete ('c') message from a Datanode connection
+ */
+static void
+HandleCopyOutComplete(ResponseCombiner *combiner)
+{
+	/* Once an error is flagged, further messages are ignored */
+	if (combiner->request_type == REQUEST_TYPE_ERROR)
+		return;
+
+	/* The first response fixes the request type */
+	if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED)
+		combiner->request_type = REQUEST_TYPE_COPY_OUT;
+
+	if (combiner->request_type != REQUEST_TYPE_COPY_OUT)
+		/* Inconsistent responses */
+		ereport(ERROR,
+				(errcode(ERRCODE_DATA_CORRUPTED),
+				 errmsg("Unexpected response from the Datanodes for 'c' message, current request type %d", combiner->request_type)));
+
+	/* Just do nothing, close message is managed by the Coordinator */
+	combiner->copy_out_count++;
+}
+
+/*
+ * Handle CommandComplete ('C') message from a Datanode connection
+ *
+ * Folds the per-node row count carried in the command tag into
+ * estate->es_processed according to the configured combine type, and
+ * detects an unexpected ROLLBACK tag when response checking is enabled.
+ */
+static void
+HandleCommandComplete(ResponseCombiner *combiner, char *msg_body, size_t len, PGXCNodeHandle *conn)
+{
+	int 			digits = 0;
+	EState		   *estate = combiner->ss.ps.state;
+
+	/*
+	 * If we did not receive description we are having rowcount or OK response
+	 */
+	if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED)
+		combiner->request_type = REQUEST_TYPE_COMMAND;
+	/* Extract rowcount */
+	if (combiner->combine_type != COMBINE_TYPE_NONE && estate)
+	{
+		uint64	rowcount;
+		digits = parse_row_count(msg_body, len, &rowcount);
+		if (digits > 0)
+		{
+			/* Replicated write, make sure they are the same */
+			if (combiner->combine_type == COMBINE_TYPE_SAME)
+			{
+				if (combiner->command_complete_count)
+				{
+					/*
+					 * Replicated command may succeed on one node and fail on
+					 * another. The example is if distributed table referenced
+					 * by a foreign key constraint defined on a partitioned
+					 * table. If command deletes rows from the replicated table
+					 * they may be referenced on one Datanode but not on other.
+					 * So, replicated command on each Datanode either affects
+					 * proper number of rows, or returns error. Here if
+					 * combiner got an error already, we allow to report it,
+					 * not the scaring data corruption message.
+					 */
+					if (combiner->errorMessage == NULL && rowcount != estate->es_processed)
+						/* There is a consistency issue in the database with the replicated table */
+						ereport(ERROR,
+								(errcode(ERRCODE_DATA_CORRUPTED),
+								 errmsg("Write to replicated table returned different results from the Datanodes")));
+				}
+				else
+					/* first result */
+					estate->es_processed = rowcount;
+			}
+			else
+				estate->es_processed += rowcount;
+		}
+		else
+			combiner->combine_type = COMBINE_TYPE_NONE;
+	}
+
+	/* If response checking is enable only then do further processing */
+	if (conn->ck_resp_rollback)
+	{
+		if (strcmp(msg_body, "ROLLBACK") == 0)
+		{
+			/*
+			 * Subsequent clean up routine will be checking this flag
+			 * to determine nodes where to send ROLLBACK PREPARED.
+			 * On current node PREPARE has failed and the two-phase record
+			 * does not exist, so clean this flag as if PREPARE was not sent
+			 * to that node and avoid erroneous command.
+			 */
+			conn->ck_resp_rollback = false;
+			/*
+			 * Set the error, if none, to force throwing.
+			 * If there is error already, it will be thrown anyway, do not add
+			 * this potentially confusing message
+			 */
+			if (combiner->errorMessage == NULL)
+			{
+				MemoryContext oldcontext = MemoryContextSwitchTo(ErrorContext);
+				combiner->errorMessage =
+								pstrdup("unexpected ROLLBACK from remote node");
+				MemoryContextSwitchTo(oldcontext);
+				/*
+				 * ERRMSG_PRODUCER_ERROR
+				 * Messages with this code are replaced by others, if they are
+				 * received, so if node will send relevant error message that
+				 * one will be replaced.
+				 */
+				combiner->errorCode[0] = 'X';
+				combiner->errorCode[1] = 'X';
+				combiner->errorCode[2] = '0';
+				combiner->errorCode[3] = '1';
+				combiner->errorCode[4] = '0';
+			}
+		}
+	}
+	combiner->command_complete_count++;
+}
+
+/*
+ * Handle RowDescription ('T') message from a Datanode connection
+ */
+static bool
+HandleRowDescription(ResponseCombiner *combiner, char *msg_body, size_t len)
+{
+ if (combiner->request_type == REQUEST_TYPE_ERROR)
+ return false;
+ if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED)
+ combiner->request_type = REQUEST_TYPE_QUERY;
+ if (combiner->request_type != REQUEST_TYPE_QUERY)
+ {
+ /* Inconsistent responses */
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("Unexpected response from the Datanodes for 'T' message, current request type %d", combiner->request_type)));
+ }
+ /* Increment counter and check if it was first */
+ if (combiner->description_count++ == 0)
+ {
+ combiner->tuple_desc = create_tuple_desc(msg_body, len);
+ return true;
+ }
+ return false;
+}
+
+
+/*
+ * Handle CopyInResponse ('G') message from a Datanode connection
+ */
+static void
+HandleCopyIn(ResponseCombiner *combiner)
+{
+ if (combiner->request_type == REQUEST_TYPE_ERROR)
+ return;
+ if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED)
+ combiner->request_type = REQUEST_TYPE_COPY_IN;
+ if (combiner->request_type != REQUEST_TYPE_COPY_IN)
+ {
+ /* Inconsistent responses */
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("Unexpected response from the Datanodes for 'G' message, current request type %d", combiner->request_type)));
+ }
+ /*
+ * The normal PG code will output an G message when it runs in the
+ * Coordinator, so do not proxy message here, just count it.
+ */
+ combiner->copy_in_count++;
+}
+
+/*
+ * Handle CopyOutResponse ('H') message from a Datanode connection
+ */
+static void
+HandleCopyOut(ResponseCombiner *combiner)
+{
+ if (combiner->request_type == REQUEST_TYPE_ERROR)
+ return;
+ if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED)
+ combiner->request_type = REQUEST_TYPE_COPY_OUT;
+ if (combiner->request_type != REQUEST_TYPE_COPY_OUT)
+ {
+ /* Inconsistent responses */
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("Unexpected response from the Datanodes for 'H' message, current request type %d", combiner->request_type)));
+ }
+ /*
+ * The normal PG code will output an H message when it runs in the
+ * Coordinator, so do not proxy message here, just count it.
+ */
+ combiner->copy_out_count++;
+}
+
+/*
+ * Handle CopyOutDataRow ('d') message from a Datanode connection
+ */
+static void
+HandleCopyDataRow(ResponseCombiner *combiner, char *msg_body, size_t len)
+{
+ if (combiner->request_type == REQUEST_TYPE_ERROR)
+ return;
+ if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED)
+ combiner->request_type = REQUEST_TYPE_COPY_OUT;
+
+ /* Inconsistent responses */
+ if (combiner->request_type != REQUEST_TYPE_COPY_OUT)
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("Unexpected response from the Datanodes for 'd' message, current request type %d", combiner->request_type)));
+
+ /* count the row */
+ combiner->processed++;
+
+ /* Output remote COPY operation to correct location */
+ switch (combiner->remoteCopyType)
+ {
+ case REMOTE_COPY_FILE:
+ /* Write data directly to file */
+ fwrite(msg_body, 1, len, combiner->copy_file);
+ break;
+ case REMOTE_COPY_STDOUT:
+ /* Send back data to client */
+ pq_putmessage('d', msg_body, len);
+ break;
+ case REMOTE_COPY_TUPLESTORE:
+ /*
+ * Do not store trailing \n character.
+ * When tuplestore data are loaded to a table it automatically
+ * inserts line ends.
+ */
+ tuplestore_putmessage(combiner->tuplestorestate, len-1, msg_body);
+ break;
+ case REMOTE_COPY_NONE:
+ default:
+ Assert(0); /* Should not happen */
+ }
+}
+
+/*
+ * Handle DataRow ('D') message from a Datanode connection
+ * The function returns true if data row is accepted and successfully stored
+ * within the combiner.
+ */
+static bool
+HandleDataRow(ResponseCombiner *combiner, char *msg_body, size_t len, Oid node)
+{
+ /* We expect previous message is consumed */
+ Assert(combiner->currentRow == NULL);
+
+ if (combiner->request_type == REQUEST_TYPE_ERROR)
+ return false;
+
+ if (combiner->request_type != REQUEST_TYPE_QUERY)
+ {
+ /* Inconsistent responses */
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("Unexpected response from the data nodes for 'D' message, current request type %d", combiner->request_type)));
+ }
+
+ /*
+ * If we got an error already ignore incoming data rows from other nodes
+ * Still we want to continue reading until get CommandComplete
+ */
+ if (combiner->errorMessage)
+ return false;
+
+ /*
+ * Replicated INSERT/UPDATE/DELETE with RETURNING: receive only tuples
+ * from one node, skip others as duplicates
+ */
+ if (combiner->combine_type == COMBINE_TYPE_SAME)
+ {
+ /* Do not return rows when probing primary, instead return when doing
+ * first normal node. Just save some CPU and traffic in case if
+ * probing fails.
+ */
+ if (combiner->probing_primary)
+ return false;
+ if (OidIsValid(combiner->returning_node))
+ {
+ if (combiner->returning_node != node)
+ return false;
+ }
+ else
+ combiner->returning_node = node;
+ }
+
+ /*
+ * We are copying message because it points into connection buffer, and
+ * will be overwritten on next socket read
+ */
+ combiner->currentRow = (RemoteDataRow) palloc(sizeof(RemoteDataRowData) + len);
+ memcpy(combiner->currentRow->msg, msg_body, len);
+ combiner->currentRow->msglen = len;
+ combiner->currentRow->msgnode = node;
+
+ return true;
+}
+
+/*
+ * Handle ErrorResponse ('E') message from a Datanode connection
+ */
+static void
+HandleError(ResponseCombiner *combiner, char *msg_body, size_t len, PGXCNodeHandle *conn)
+{
+ /* parse error message */
+ char *code = NULL;
+ char *message = NULL;
+ char *detail = NULL;
+ char *hint = NULL;
+ int offset = 0;
+
+ /*
+ * Scan until point to terminating \0
+ */
+ while (offset + 1 < len)
+ {
+ /* pointer to the field message */
+ char *str = msg_body + offset + 1;
+
+ switch (msg_body[offset])
+ {
+ case 'C': /* code */
+ code = str;
+ break;
+ case 'M': /* message */
+ message = str;
+ break;
+ case 'D': /* details */
+ detail = str;
+ break;
+
+ case 'H': /* hint */
+ hint = str;
+ break;
+
+ /* Fields not yet in use */
+ case 'S': /* severity */
+ case 'R': /* routine */
+ case 'P': /* position string */
+ case 'p': /* position int */
+ case 'q': /* int query */
+ case 'W': /* where */
+ case 'F': /* file */
+ case 'L': /* line */
+ default:
+ break;
+ }
+
+ /* code, message and \0 */
+ offset += strlen(str) + 2;
+ }
+
+ /*
+ * We may have special handling for some errors, default handling is to
+ * throw out error with the same message. We can not ereport immediately
+ * because we should read from this and other connections until
+ * ReadyForQuery is received, so we just store the error message.
+ * If multiple connections return errors only first one is reported.
+ *
+ * The producer error may be hiding primary error, so if previously received
+ * error is a producer error allow it to be overwritten.
+ */
+ if (combiner->errorMessage == NULL ||
+ MAKE_SQLSTATE(combiner->errorCode[0], combiner->errorCode[1],
+ combiner->errorCode[2], combiner->errorCode[3],
+ combiner->errorCode[4]) == ERRCODE_PRODUCER_ERROR)
+ {
+ MemoryContext oldcontext = MemoryContextSwitchTo(ErrorContext);
+ combiner->errorMessage = pstrdup(message);
+ /* Error Code is exactly 5 significant bytes */
+ if (code)
+ memcpy(combiner->errorCode, code, 5);
+ if (detail)
+ combiner->errorDetail = pstrdup(detail);
+ if (hint)
+ combiner->errorHint = pstrdup(hint);
+ MemoryContextSwitchTo(oldcontext);
+ }
+
+ /*
+ * If the PREPARE TRANSACTION command fails for whatever reason, we don't
+ * want to send down ROLLBACK PREPARED to this node. Otherwise, it may end
+ * up rolling back an unrelated prepared transaction with the same GID as
+ * used by this transaction
+ */
+ if (conn->ck_resp_rollback)
+ conn->ck_resp_rollback = false;
+
+ /*
+ * If Datanode have sent ErrorResponse it will never send CommandComplete.
+ * Increment the counter to prevent endless waiting for it.
+ */
+ combiner->command_complete_count++;
+}
+
+/*
+ * HandleCmdComplete -
+ * combine deparsed sql statements execution results
+ *
+ * Input parameters:
+ * commandType is dml command type
+ * combineTag is used to combine the completion result
+ * msg_body is execution result needed to combine
+ * len is msg_body size
+ */
+void
+HandleCmdComplete(CmdType commandType, CombineTag *combine,
+ const char *msg_body, size_t len)
+{
+ int digits = 0;
+ uint64 originrowcount = 0;
+ uint64 rowcount = 0;
+ uint64 total = 0;
+
+ if (msg_body == NULL)
+ return;
+
+ /* if there's nothing in combine, just copy the msg_body */
+ if (strlen(combine->data) == 0)
+ {
+ strcpy(combine->data, msg_body);
+ combine->cmdType = commandType;
+ return;
+ }
+ else
+ {
+ /* commandType is conflict */
+ if (combine->cmdType != commandType)
+ return;
+
+ /* get the processed row number from msg_body */
+ digits = parse_row_count(msg_body, len + 1, &rowcount);
+ elog(DEBUG1, "digits is %d\n", digits);
+ Assert(digits >= 0);
+
+ /* no need to combine */
+ if (digits == 0)
+ return;
+
+ /* combine the processed row number */
+ parse_row_count(combine->data, strlen(combine->data) + 1, &originrowcount);
+ elog(DEBUG1, "originrowcount is %lu, rowcount is %lu\n", originrowcount, rowcount);
+ total = originrowcount + rowcount;
+
+ }
+
+ /* output command completion tag */
+ switch (commandType)
+ {
+ case CMD_SELECT:
+ strcpy(combine->data, "SELECT");
+ break;
+ case CMD_INSERT:
+ snprintf(combine->data, COMPLETION_TAG_BUFSIZE,
+ "INSERT %u %lu", 0, total);
+ break;
+ case CMD_UPDATE:
+ snprintf(combine->data, COMPLETION_TAG_BUFSIZE,
+ "UPDATE %lu", total);
+ break;
+ case CMD_DELETE:
+ snprintf(combine->data, COMPLETION_TAG_BUFSIZE,
+ "DELETE %lu", total);
+ break;
+ default:
+ strcpy(combine->data, "");
+ break;
+ }
+
+}
+
+/*
+ * HandleDatanodeCommandId ('M') message from a Datanode connection
+ */
+static void
+HandleDatanodeCommandId(ResponseCombiner *combiner, char *msg_body, size_t len)
+{
+ uint32 n32;
+ CommandId cid;
+
+ Assert(msg_body != NULL);
+ Assert(len >= 2);
+
+ /* Get the command Id */
+ memcpy(&n32, &msg_body[0], 4);
+ cid = ntohl(n32);
+
+ /* If received command Id is higher than current one, set it to a new value */
+ if (cid > GetReceivedCommandId())
+ SetReceivedCommandId(cid);
+}
+
+/*
+ * Record waited-for XIDs received from the remote nodes into the transaction
+ * state
+ */
+static void
+HandleWaitXids(char *msg_body, size_t len)
+{
+ int xid_count;
+ uint32 n32;
+ int cur;
+ int i;
+
+ /* Get the xid count */
+ xid_count = len / sizeof (TransactionId);
+
+ cur = 0;
+ for (i = 0; i < xid_count; i++)
+ {
+ Assert(cur < len);
+ memcpy(&n32, &msg_body[cur], sizeof (TransactionId));
+ cur = cur + sizeof (TransactionId);
+ TransactionRecordXidWait(ntohl(n32));
+ }
+}
+
+static void
+HandleGlobalTransactionId(char *msg_body, size_t len)
+{
+ GlobalTransactionId xid;
+
+ Assert(len == sizeof (GlobalTransactionId));
+ memcpy(&xid, &msg_body[0], sizeof (GlobalTransactionId));
+
+ SetTopTransactionId(xid);
+}
+
+/*
+ * Examine the specified combiner state and determine if command was completed
+ * successfully
+ */
+static bool
+validate_combiner(ResponseCombiner *combiner)
+{
+ /* There was error message while combining */
+ if (combiner->errorMessage)
+ return false;
+ /* Check if state is defined */
+ if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED)
+ return false;
+
+ /* Check all nodes completed */
+ if ((combiner->request_type == REQUEST_TYPE_COMMAND
+ || combiner->request_type == REQUEST_TYPE_QUERY)
+ && combiner->command_complete_count != combiner->node_count)
+ return false;
+
+ /* Check count of description responses */
+ if (combiner->request_type == REQUEST_TYPE_QUERY
+ && combiner->description_count != combiner->node_count)
+ return false;
+
+ /* Check count of copy-in responses */
+ if (combiner->request_type == REQUEST_TYPE_COPY_IN
+ && combiner->copy_in_count != combiner->node_count)
+ return false;
+
+ /* Check count of copy-out responses */
+ if (combiner->request_type == REQUEST_TYPE_COPY_OUT
+ && combiner->copy_out_count != combiner->node_count)
+ return false;
+
+ /* Add other checks here as needed */
+
+ /* All is good if we are here */
+ return true;
+}
+
+/*
+ * Close combiner and free allocated memory, if it is not needed
+ */
+void
+CloseCombiner(ResponseCombiner *combiner)
+{
+ if (combiner->connections)
+ pfree(combiner->connections);
+ if (combiner->tuple_desc)
+ FreeTupleDesc(combiner->tuple_desc);
+ if (combiner->errorMessage)
+ pfree(combiner->errorMessage);
+ if (combiner->errorDetail)
+ pfree(combiner->errorDetail);
+ if (combiner->errorHint)
+ pfree(combiner->errorHint);
+ if (combiner->cursor_connections)
+ pfree(combiner->cursor_connections);
+ if (combiner->tapenodes)
+ pfree(combiner->tapenodes);
+ if (combiner->tapemarks)
+ pfree(combiner->tapemarks);
+}
+
+/*
+ * Validate combiner and release storage freeing allocated memory
+ */
+static bool
+ValidateAndCloseCombiner(ResponseCombiner *combiner)
+{
+ bool valid = validate_combiner(combiner);
+
+ CloseCombiner(combiner);
+
+ return valid;
+}
+
+/*
+ * It is possible if multiple steps share the same Datanode connection, when
+ * executor is running multi-step query or client is running multiple queries
+ * using Extended Query Protocol. After returning next tuple ExecRemoteQuery
+ * function passes execution control to the executor and then it can be given
+ * to the same RemoteQuery or to different one. It is possible that before
+ * returning a tuple the function do not read all Datanode responses. In this
+ * case pending responses should be read in context of original RemoteQueryState
+ * till ReadyForQuery message and data rows should be stored (buffered) to be
+ * available when fetch from that RemoteQueryState is requested again.
+ * BufferConnection function does the job.
+ * If a RemoteQuery is going to use connection it should check connection state.
+ * DN_CONNECTION_STATE_QUERY indicates query has data to read and combiner
+ * points to the original RemoteQueryState. If combiner differs from "this" the
+ * connection should be buffered.
+ */
+void
+BufferConnection(PGXCNodeHandle *conn)
+{
+ ResponseCombiner *combiner = conn->combiner;
+ MemoryContext oldcontext;
+
+ if (combiner == NULL || conn->state != DN_CONNECTION_STATE_QUERY)
+ return;
+
+ elog(DEBUG2, "Buffer connection %u to step %s", conn->nodeoid, combiner->cursor);
+
+ /*
+ * When BufferConnection is invoked CurrentContext is related to other
+ * portal, which is trying to control the connection.
+ * TODO See if we can find better context to switch to
+ */
+ oldcontext = MemoryContextSwitchTo(combiner->ss.ps.ps_ResultTupleSlot->tts_mcxt);
+
+ /* Verify the connection is in use by the combiner */
+ combiner->current_conn = 0;
+ while (combiner->current_conn < combiner->conn_count)
+ {
+ if (combiner->connections[combiner->current_conn] == conn)
+ break;
+ combiner->current_conn++;
+ }
+ Assert(combiner->current_conn < combiner->conn_count);
+
+ if (combiner->tapemarks == NULL)
+ combiner->tapemarks = (ListCell**) palloc0(combiner->conn_count * sizeof(ListCell*));
+
+ /*
+ * If current bookmark for the current tape is not set it means either
+ * first row in the buffer is from the current tape or no rows from
+ * the tape in the buffer, so if first row is not from current
+ * connection bookmark the last cell in the list.
+ */
+ if (combiner->tapemarks[combiner->current_conn] == NULL &&
+ list_length(combiner->rowBuffer) > 0)
+ {
+ RemoteDataRow dataRow = (RemoteDataRow) linitial(combiner->rowBuffer);
+ if (dataRow->msgnode != conn->nodeoid)
+ combiner->tapemarks[combiner->current_conn] = list_tail(combiner->rowBuffer);
+ }
+
+ /*
+ * Buffer data rows until data node return number of rows specified by the
+ * fetch_size parameter of last Execute message (PortalSuspended message)
+ * or end of result set is reached (CommandComplete message)
+ */
+ while (true)
+ {
+ int res;
+
+ /* Move to buffer currentRow (received from the data node) */
+ if (combiner->currentRow)
+ {
+ combiner->rowBuffer = lappend(combiner->rowBuffer,
+ combiner->currentRow);
+ combiner->currentRow = NULL;
+ }
+
+ res = handle_response(conn, combiner);
+ /*
+ * If response message is a DataRow it will be handled on the next
+ * iteration.
+ * PortalSuspended will cause connection state change and break the loop
+ * The same is for CommandComplete, but we need additional handling -
+ * remove connection from the list of active connections.
+ * We may need to add handling error response
+ */
+
+ /* Most often result check first */
+ if (res == RESPONSE_DATAROW)
+ {
+ /*
+ * The row is in the combiner->currentRow, on next iteration it will
+ * be moved to the buffer
+ */
+ continue;
+ }
+
+ /* incomplete message, read more */
+ if (res == RESPONSE_EOF)
+ {
+ if (pgxc_node_receive(1, &conn, NULL))
+ {
+ PGXCNodeSetConnectionState(conn,
+ DN_CONNECTION_STATE_ERROR_FATAL);
+ add_error_message(conn, "Failed to fetch from data node");
+ }
+ }
+
+ /*
+ * End of result set is reached, so either set the pointer to the
+ * connection to NULL (combiner with sort) or remove it from the list
+ * (combiner without sort)
+ */
+ else if (res == RESPONSE_COMPLETE)
+ {
+ /*
+ * If combiner is doing merge sort we should set reference to the
+ * current connection to NULL in the array, indicating the end
+ * of the tape is reached. FetchTuple will try to access the buffer
+ * first anyway.
+ * Since we remove that reference we can not determine what node
+ * number was this connection, but we need this info to find proper
+ * tuple in the buffer if we are doing merge sort. So store node
+ * number in special array.
+ * NB: We can not test if combiner->tuplesortstate is set here:
+ * connection may require buffering inside tuplesort_begin_merge
+ * - while pre-read rows from the tapes, one of the tapes may be
+ * the local connection with RemoteSubplan in the tree. The
+ * combiner->tuplesortstate is set only after tuplesort_begin_merge
+ * returns.
+ */
+ if (combiner->merge_sort)
+ {
+ combiner->connections[combiner->current_conn] = NULL;
+ if (combiner->tapenodes == NULL)
+ combiner->tapenodes = (Oid *)
+ palloc0(combiner->conn_count * sizeof(Oid));
+ combiner->tapenodes[combiner->current_conn] = conn->nodeoid;
+ }
+ else
+ {
+ /* Remove current connection, move last in-place, adjust current_conn */
+ if (combiner->current_conn < --combiner->conn_count)
+ combiner->connections[combiner->current_conn] = combiner->connections[combiner->conn_count];
+ else
+ combiner->current_conn = 0;
+ }
+ /*
+ * If combiner runs Simple Query Protocol we need to read in
+ * ReadyForQuery. In case of Extended Query Protocol it is not
+ * sent and we should quit.
+ */
+ if (combiner->extended_query)
+ break;
+ }
+ else if (res == RESPONSE_ERROR)
+ {
+ if (combiner->extended_query)
+ {
+ /*
+ * Need to sync connection to enable receiving commands
+ * by the datanode
+ */
+ if (pgxc_node_send_sync(conn) != 0)
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to sync msg to node %u", conn->nodeoid)));
+ }
+ }
+ }
+ else if (res == RESPONSE_SUSPENDED || res == RESPONSE_READY)
+ {
+ /* Now it is OK to quit */
+ break;
+ }
+ }
+ Assert(conn->state != DN_CONNECTION_STATE_QUERY);
+ MemoryContextSwitchTo(oldcontext);
+ conn->combiner = NULL;
+}
+
+/*
+ * copy the datarow from combiner to the given slot, in the slot's memory
+ * context
+ */
+static void
+CopyDataRowTupleToSlot(ResponseCombiner *combiner, TupleTableSlot *slot)
+{
+ RemoteDataRow datarow;
+ MemoryContext oldcontext;
+ oldcontext = MemoryContextSwitchTo(slot->tts_mcxt);
+ datarow = (RemoteDataRow) palloc(sizeof(RemoteDataRowData) + combiner->currentRow->msglen);
+ datarow->msgnode = combiner->currentRow->msgnode;
+ datarow->msglen = combiner->currentRow->msglen;
+ memcpy(datarow->msg, combiner->currentRow->msg, datarow->msglen);
+ ExecStoreDataRowTuple(datarow, slot, true);
+ pfree(combiner->currentRow);
+ combiner->currentRow = NULL;
+ MemoryContextSwitchTo(oldcontext);
+}
+
+
+/*
+ * FetchTuple
+ *
+ Get next tuple from one of the datanode connections.
+ * The connections should be in combiner->connections, if "local" dummy
+ * connection presents it should be the last active connection in the array.
+ * If combiner is set up to perform merge sort function returns tuple from
+ * connection defined by combiner->current_conn, or NULL slot if no more tuple
+ * are available from the connection. Otherwise it returns tuple from any
+ * connection or NULL slot if no more available connections.
+ * Function looks into combiner->rowBuffer before accessing connection
+ * and return a tuple from there if found.
+ * Function may wait while more data arrive from the data nodes. If there
+ * is a locally executed subplan function advance it and buffer resulting rows
+ * instead of waiting.
+ */
+TupleTableSlot *
+FetchTuple(ResponseCombiner *combiner)
+{
+ PGXCNodeHandle *conn;
+ TupleTableSlot *slot;
+ Oid nodeOid = -1;
+
+ /*
+ * Case if we run local subplan.
+ * We do not have remote connections, so just get local tuple and return it
+ */
+ if (outerPlanState(combiner))
+ {
+ RemoteSubplanState *planstate = (RemoteSubplanState *) combiner;
+ RemoteSubplan *plan = (RemoteSubplan *) combiner->ss.ps.plan;
+ /* Advance subplan in a loop until we have something to return */
+ for (;;)
+ {
+ Datum value = (Datum) 0;
+ bool isnull = false;
+ int numnodes;
+ int i;
+
+ slot = ExecProcNode(outerPlanState(combiner));
+ /* If locator is not defined deliver all the results */
+ if (planstate->locator == NULL)
+ return slot;
+
+ /*
+ * If NULL tuple is returned we done with the subplan, finish it up and
+ * return NULL
+ */
+ if (TupIsNull(slot))
+ return NULL;
+
+ /* Get partitioning value if defined */
+ if (plan->distributionKey != InvalidAttrNumber)
+ value = slot_getattr(slot, plan->distributionKey, &isnull);
+
+ /* Determine target nodes */
+ numnodes = GET_NODES(planstate->locator, value, isnull, NULL);
+ for (i = 0; i < numnodes; i++)
+ {
+ /* Deliver the node */
+ if (planstate->dest_nodes[i] == PGXCNodeId-1)
+ return slot;
+ }
+ }
+ }
+
+ /*
+ * Get current connection
+ */
+ if (combiner->conn_count > combiner->current_conn)
+ conn = combiner->connections[combiner->current_conn];
+ else
+ conn = NULL;
+
+ /*
+ * If doing merge sort determine the node number.
+ * It may be needed to get buffered row.
+ */
+ if (combiner->merge_sort)
+ {
+ Assert(conn || combiner->tapenodes);
+ nodeOid = conn ? conn->nodeoid :
+ combiner->tapenodes[combiner->current_conn];
+ Assert(OidIsValid(nodeOid));
+ }
+
+ /*
+ * First look into the row buffer.
+ * When we are performing merge sort we need to get from the buffer record
+ * from the connection marked as "current". Otherwise get first.
+ */
+ if (list_length(combiner->rowBuffer) > 0)
+ {
+ RemoteDataRow dataRow;
+
+ Assert(combiner->currentRow == NULL);
+
+ if (combiner->merge_sort)
+ {
+ ListCell *lc;
+ ListCell *prev;
+
+ elog(DEBUG1, "Getting buffered tuple from node %x", nodeOid);
+
+ prev = combiner->tapemarks[combiner->current_conn];
+ if (prev)
+ {
+ /*
+ * Start looking through the list from the bookmark.
+ * Probably the first cell we check contains row from the needed
+ * node. Otherwise continue scanning until we encounter one,
+ * advancing prev pointer as well.
+ */
+ while((lc = lnext(prev)) != NULL)
+ {
+ dataRow = (RemoteDataRow) lfirst(lc);
+ if (dataRow->msgnode == nodeOid)
+ {
+ combiner->currentRow = dataRow;
+ break;
+ }
+ prev = lc;
+ }
+ }
+ else
+ {
+ /*
+ * Either needed row is the first in the buffer or no such row
+ */
+ lc = list_head(combiner->rowBuffer);
+ dataRow = (RemoteDataRow) lfirst(lc);
+ if (dataRow->msgnode == nodeOid)
+ combiner->currentRow = dataRow;
+ else
+ lc = NULL;
+ }
+ if (lc)
+ {
+ /*
+ * Delete cell from the buffer. Before we delete we must check
+ * the bookmarks, if the cell is a bookmark for any tape.
+ * If it is the case we are deleting last row of the current
+ * block from the current tape. That tape should have bookmark
+ * like current, and current bookmark will be advanced when we
+ * read the tape once again.
+ */
+ int i;
+ for (i = 0; i < combiner->conn_count; i++)
+ {
+ if (combiner->tapemarks[i] == lc)
+ combiner->tapemarks[i] = prev;
+ }
+ elog(DEBUG1, "Found buffered tuple from node %x", nodeOid);
+ combiner->rowBuffer = list_delete_cell(combiner->rowBuffer,
+ lc, prev);
+ }
+ elog(DEBUG1, "Update tapemark");
+ combiner->tapemarks[combiner->current_conn] = prev;
+ }
+ else
+ {
+ dataRow = (RemoteDataRow) linitial(combiner->rowBuffer);
+ combiner->currentRow = dataRow;
+ combiner->rowBuffer = list_delete_first(combiner->rowBuffer);
+ }
+ }
+
+ /* If we have node message in the currentRow slot, and it is from a proper
+ * node, consume it. */
+ if (combiner->currentRow)
+ {
+ Assert(!combiner->merge_sort ||
+ combiner->currentRow->msgnode == nodeOid);
+ slot = combiner->ss.ps.ps_ResultTupleSlot;
+ CopyDataRowTupleToSlot(combiner, slot);
+ return slot;
+ }
+
+ while (conn)
+ {
+ int res;
+
+ /* Going to use a connection, buffer it if needed */
+ CHECK_OWNERSHIP(conn, combiner);
+
+ /*
+ * If current connection is idle it means portal on the data node is
+ * suspended. Request more and try to get it
+ */
+ if (combiner->extended_query &&
+ conn->state == DN_CONNECTION_STATE_IDLE)
+ {
+ /*
+ * We do not allow to suspend if querying primary node, so that
+ * only may mean the current node is secondary and subplan was not
+ * executed there yet. Return and go on with second phase.
+ */
+ if (combiner->probing_primary)
+ {
+ return NULL;
+ }
+
+ if (pgxc_node_send_execute(conn, combiner->cursor, PGXLRemoteFetchSize) != 0)
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to send execute cursor '%s' to node %u", combiner->cursor, conn->nodeoid)));
+ }
+
+ if (pgxc_node_send_flush(conn) != 0)
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed flush cursor '%s' node %u", combiner->cursor, conn->nodeoid)));
+ }
+
+ if (pgxc_node_receive(1, &conn, NULL))
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed receive data from node %u cursor '%s'", conn->nodeoid, combiner->cursor)));
+ }
+ }
+
+ /* read messages */
+ res = handle_response(conn, combiner);
+ if (res == RESPONSE_DATAROW)
+ {
+ slot = combiner->ss.ps.ps_ResultTupleSlot;
+ CopyDataRowTupleToSlot(combiner, slot);
+ return slot;
+ }
+ else if (res == RESPONSE_EOF)
+ {
+ /* incomplete message, read more */
+ if (pgxc_node_receive(1, &conn, NULL))
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to receive more data from data node %u", conn->nodeoid)));
+ continue;
+ }
+ else if (res == RESPONSE_SUSPENDED)
+ {
+ /*
+ * If we are doing merge sort or probing primary node we should
+ * remain on the same node, so query next portion immediately.
+ * Otherwise leave node suspended and fetch lazily.
+ */
+ if (combiner->merge_sort || combiner->probing_primary)
+ {
+ if (pgxc_node_send_execute(conn, combiner->cursor, PGXLRemoteFetchSize) != 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to send execute cursor '%s' to node %u", combiner->cursor, conn->nodeoid)));
+ if (pgxc_node_send_flush(conn) != 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed flush cursor '%s' node %u", combiner->cursor, conn->nodeoid)));
+ if (pgxc_node_receive(1, &conn, NULL))
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed receive node from node %u cursor '%s'", conn->nodeoid, combiner->cursor)));
+ continue;
+ }
+
+ /*
+ * Tell the node to fetch data in background, next loop when we
+ * pgxc_node_receive, data is already there, so we can run faster
+ * */
+ if (pgxc_node_send_execute(conn, combiner->cursor, PGXLRemoteFetchSize) != 0)
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to send execute cursor '%s' to node %u", combiner->cursor, conn->nodeoid)));
+ }
+
+ if (pgxc_node_send_flush(conn) != 0)
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed flush cursor '%s' node %u", combiner->cursor, conn->nodeoid)));
+ }
+
+ if (++combiner->current_conn >= combiner->conn_count)
+ combiner->current_conn = 0;
+ conn = combiner->connections[combiner->current_conn];
+ }
+ else if (res == RESPONSE_COMPLETE)
+ {
+ /*
+ * In case of Simple Query Protocol we should receive ReadyForQuery
+ * before removing connection from the list. In case of Extended
+ * Query Protocol we may remove connection right away.
+ */
+ if (combiner->extended_query)
+ {
+ /* If we are doing merge sort clean current connection and return
+ * NULL, otherwise remove current connection, move last in-place,
+ * adjust current_conn and continue if it is not last connection */
+ if (combiner->merge_sort)
+ {
+ combiner->connections[combiner->current_conn] = NULL;
+ return NULL;
+ }
+ REMOVE_CURR_CONN(combiner);
+ if (combiner->conn_count > 0)
+ conn = combiner->connections[combiner->current_conn];
+ else
+ return NULL;
+ }
+ }
+ else if (res == RESPONSE_ERROR)
+ {
+ /*
+ * If doing Extended Query Protocol we need to sync connection,
+ * otherwise subsequent commands will be ignored.
+ */
+ if (combiner->extended_query)
+ {
+ if (pgxc_node_send_sync(conn) != 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to sync msg to node %u", conn->nodeoid)));
+ }
+ /*
+ * Do not wait for response from primary, it needs to wait
+ * for other nodes to respond. Instead go ahead and send query to
+ * other nodes. It will fail there, but we can continue with
+ * normal cleanup.
+ */
+ if (combiner->probing_primary)
+ {
+ REMOVE_CURR_CONN(combiner);
+ return NULL;
+ }
+ }
+ else if (res == RESPONSE_READY)
+ {
+ /* If we are doing merge sort clean current connection and return
+ * NULL, otherwise remove current connection, move last in-place,
+ * adjust current_conn and continue if it is not last connection */
+ if (combiner->merge_sort)
+ {
+ combiner->connections[combiner->current_conn] = NULL;
+ return NULL;
+ }
+ REMOVE_CURR_CONN(combiner);
+ if (combiner->conn_count > 0)
+ conn = combiner->connections[combiner->current_conn];
+ else
+ return NULL;
+ }
+ else if (res == RESPONSE_TUPDESC)
+ {
+ ExecSetSlotDescriptor(combiner->ss.ps.ps_ResultTupleSlot,
+ combiner->tuple_desc);
+ /* Now slot is responsible for freeng the descriptor */
+ combiner->tuple_desc = NULL;
+ }
+ else if (res == RESPONSE_ASSIGN_GXID)
+ {
+ /* Do nothing. It must have been handled in handle_response() */
+ }
+ else if (res == RESPONSE_WAITXIDS)
+ {
+ /* Do nothing. It must have been handled in handle_response() */
+ }
+ else
+ {
+ // Can not get here?
+ Assert(false);
+ }
+ }
+
+ return NULL;
+}
+
+
+/*
+ * Handle responses from the Datanode connections
+ *
+ * Reads from every given connection until each has either reported
+ * completion (RESPONSE_READY / RESPONSE_COPY), entered a fatal error state,
+ * or produced an inconsistent response.  Per-message results are folded
+ * into the supplied combiner by handle_response().
+ *
+ * conn_count  - number of entries in connections[]
+ * connections - handles to read from; the caller's array is not modified,
+ *               a local copy is used for tracking outstanding connections
+ * timeout     - passed through to pgxc_node_receive(); NULL blocks
+ * combiner    - accumulates command results and error messages
+ *
+ * Returns 0 on success, EOF if the socket-level receive failed.
+ */
+static int
+pgxc_node_receive_responses(const int conn_count, PGXCNodeHandle ** connections,
+ struct timeval * timeout, ResponseCombiner *combiner)
+{
+ int count = conn_count;
+ PGXCNodeHandle *to_receive[conn_count];
+
+ /* make a copy of the pointers to the connections */
+ memcpy(to_receive, connections, conn_count * sizeof(PGXCNodeHandle *));
+
+ /*
+ * Read results.
+ * Note we try and read from Datanode connections even if there is an error on one,
+ * so as to avoid reading incorrect results on the next statement.
+ * Other safeguards exist to avoid this, however.
+ */
+ while (count > 0)
+ {
+ int i = 0;
+
+ if (pgxc_node_receive(count, to_receive, timeout))
+ return EOF;
+ while (i < count)
+ {
+ int result = handle_response(to_receive[i], combiner);
+ elog(DEBUG5, "Received response %d on connection to node %s",
+ result, to_receive[i]->nodename);
+ switch (result)
+ {
+ case RESPONSE_EOF: /* have something to read, keep receiving */
+ i++;
+ break;
+ case RESPONSE_COMPLETE:
+ if (to_receive[i]->state != DN_CONNECTION_STATE_ERROR_FATAL)
+ /* Continue read until ReadyForQuery */
+ break;
+ /* fallthru */
+ case RESPONSE_READY:
+ /* fallthru */
+ case RESPONSE_COPY:
+ /* Handling is done, do not track this connection */
+ count--;
+ /* Move last connection in place */
+ if (i < count)
+ to_receive[i] = to_receive[count];
+ break;
+ case RESPONSE_ERROR:
+ /* no handling needed, just wait for ReadyForQuery */
+ break;
+
+ case RESPONSE_WAITXIDS:
+ case RESPONSE_ASSIGN_GXID:
+ case RESPONSE_TUPDESC:
+ /* handled inside handle_response(), keep reading */
+ break;
+
+ case RESPONSE_DATAROW:
+ /* discard the row; this path only drains responses */
+ combiner->currentRow = NULL;
+ break;
+
+ default:
+ /* Inconsistent responses */
+ add_error_message(to_receive[i], "Unexpected response from the Datanodes");
+ elog(DEBUG1, "Unexpected response from the Datanodes, result = %d, request type %d", result, combiner->request_type);
+ /* Stop tracking and move last connection in place */
+ count--;
+ if (i < count)
+ to_receive[i] = to_receive[count];
+ }
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Read next message from the connection and update the combiner
+ * and connection state accordingly
+ * If we are in an error state we just consume the messages, and do not proxy
+ * Long term, we should look into cancelling executing statements
+ * and closing the connections.
+ * It returns if states need to be handled
+ * Return values:
+ * RESPONSE_EOF - need to receive more data for the connection
+ * RESPONSE_READY - got ReadyForQuery
+ * RESPONSE_COMPLETE - done with the connection, but not yet ready for query.
+ * Also this result is output in case of error
+ * RESPONSE_SUSPENDED - got PortalSuspended
+ * RESPONSE_TUPDESC - got tuple description
+ * RESPONSE_DATAROW - got data row
+ * RESPONSE_COPY - got copy response
+ * RESPONSE_BARRIER_OK - barrier command completed successfully
+ * RESPONSE_ERROR - got ErrorResponse (caller may need to send SYNC)
+ * RESPONSE_WAITXIDS / RESPONSE_ASSIGN_GXID - XID bookkeeping messages,
+ * already consumed here
+ */
+int
+handle_response(PGXCNodeHandle *conn, ResponseCombiner *combiner)
+{
+ char *msg;
+ int msg_len;
+ char msg_type;
+
+ for (;;)
+ {
+ /*
+ * If we are in the process of shutting down, we
+ * may be rolling back, and the buffer may contain other messages.
+ * We want to avoid a procarray exception
+ * as well as an error stack overflow.
+ */
+ if (proc_exit_inprogress)
+ PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_ERROR_FATAL);
+
+ /*
+ * Don't read from from the connection if there is a fatal error.
+ * We still return RESPONSE_COMPLETE, not RESPONSE_ERROR, since
+ * Handling of RESPONSE_ERROR assumes sending SYNC message, but
+ * State DN_CONNECTION_STATE_ERROR_FATAL indicates connection is
+ * not usable.
+ */
+ if (conn->state == DN_CONNECTION_STATE_ERROR_FATAL)
+ return RESPONSE_COMPLETE;
+
+ /* No data available, exit */
+ if (!HAS_MESSAGE_BUFFERED(conn))
+ return RESPONSE_EOF;
+
+ Assert(conn->combiner == combiner || conn->combiner == NULL);
+
+ /* TODO handle other possible responses */
+ msg_type = get_message(conn, &msg_len, &msg);
+ elog(DEBUG5, "handle_response - received message %c, node %s, "
+ "current_state %d", msg_type, conn->nodename, conn->state);
+ switch (msg_type)
+ {
+ case '\0': /* Not enough data in the buffer */
+ return RESPONSE_EOF;
+ case 'c': /* CopyToCommandComplete */
+ HandleCopyOutComplete(combiner);
+ break;
+ case 'C': /* CommandComplete */
+ HandleCommandComplete(combiner, msg, msg_len, conn);
+ conn->combiner = NULL;
+ /*
+ * In case of simple query protocol, wait for the ReadyForQuery
+ * before marking connection as Idle
+ */
+ if (combiner->extended_query &&
+ conn->state == DN_CONNECTION_STATE_QUERY)
+ PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_IDLE);
+ return RESPONSE_COMPLETE;
+ case 'T': /* RowDescription */
+#ifdef DN_CONNECTION_DEBUG
+ Assert(!conn->have_row_desc);
+ conn->have_row_desc = true;
+#endif
+ if (HandleRowDescription(combiner, msg, msg_len))
+ return RESPONSE_TUPDESC;
+ break;
+ case 'D': /* DataRow */
+#ifdef DN_CONNECTION_DEBUG
+ Assert(conn->have_row_desc);
+#endif
+ /* Do not return if data row has not been actually handled */
+ if (HandleDataRow(combiner, msg, msg_len, conn->nodeoid))
+ return RESPONSE_DATAROW;
+ break;
+ case 's': /* PortalSuspended */
+ /* No activity is expected on the connection until next query */
+ PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_IDLE);
+ return RESPONSE_SUSPENDED;
+ case '1': /* ParseComplete */
+ case '2': /* BindComplete */
+ case '3': /* CloseComplete */
+ case 'n': /* NoData */
+ /* simple notifications, continue reading */
+ break;
+ case 'G': /* CopyInResponse */
+ PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_COPY_IN);
+ HandleCopyIn(combiner);
+ /* Done, return to caller to let it know the data can be passed in */
+ return RESPONSE_COPY;
+ case 'H': /* CopyOutResponse */
+ PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_COPY_OUT);
+ HandleCopyOut(combiner);
+ return RESPONSE_COPY;
+ case 'd': /* CopyOutDataRow */
+ PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_COPY_OUT);
+ HandleCopyDataRow(combiner, msg, msg_len);
+ break;
+ case 'E': /* ErrorResponse */
+ HandleError(combiner, msg, msg_len, conn);
+ add_error_message(conn, combiner->errorMessage);
+ /*
+ * In case the remote node was running an extended query
+ * protocol and reported an error, it will keep ignoring all
+ * subsequent commands until it sees a SYNC message. So make
+ * sure that we send down SYNC even before sending a ROLLBACK
+ * command
+ */
+ if (conn->in_extended_query)
+ conn->needSync = true;
+ return RESPONSE_ERROR;
+ case 'A': /* NotificationResponse */
+ case 'N': /* NoticeResponse */
+ case 'S': /* SetCommandComplete */
+ /*
+ * Ignore these to prevent multiple messages, one from each
+ * node. Coordinator will send one for DDL anyway
+ */
+ break;
+ case 'Z': /* ReadyForQuery */
+ {
+ /*
+ * Return result depends on previous connection state.
+ * If it was PORTAL_SUSPENDED Coordinator want to send down
+ * another EXECUTE to fetch more rows, otherwise it is done
+ * with the connection
+ */
+ conn->transaction_status = msg[0];
+ PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_IDLE);
+ conn->combiner = NULL;
+#ifdef DN_CONNECTION_DEBUG
+ conn->have_row_desc = false;
+#endif
+ return RESPONSE_READY;
+ }
+ case 'M': /* Command Id */
+ HandleDatanodeCommandId(combiner, msg, msg_len);
+ break;
+ case 'b': /* Barrier response (XL-specific message type) */
+ PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_IDLE);
+ return RESPONSE_BARRIER_OK;
+ case 'I': /* EmptyQuery */
+ return RESPONSE_COMPLETE;
+ case 'W': /* WaitXids (XL-specific message type) */
+ HandleWaitXids(msg, msg_len);
+ return RESPONSE_WAITXIDS;
+ case 'x': /* Assigned GXID (XL-specific message type) */
+ HandleGlobalTransactionId(msg, msg_len);
+ return RESPONSE_ASSIGN_GXID;
+ default:
+ /* sync lost? */
+ elog(WARNING, "Received unsupported message type: %c", msg_type);
+ PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_ERROR_FATAL);
+ /* stop reading */
+ return RESPONSE_COMPLETE;
+ }
+ }
+ /* never happen, but keep compiler quiet */
+ return RESPONSE_EOF;
+}
+
+/*
+ * Has the data node sent Ready For Query
+ *
+ * Drains buffered messages on the connection, discarding everything until a
+ * ReadyForQuery ('Z') message is found.  Returns true if ReadyForQuery was
+ * seen (or the connection is in a fatal error state, in which case no more
+ * input is expected), false if the buffered data ran out first.
+ * Side effects on success: records the reported transaction status and
+ * marks the connection idle.
+ */
+
+bool
+is_data_node_ready(PGXCNodeHandle * conn)
+{
+ char *msg;
+ int msg_len;
+ char msg_type;
+
+ for (;;)
+ {
+ /*
+ * If we are in the process of shutting down, we
+ * may be rolling back, and the buffer may contain other messages.
+ * We want to avoid a procarray exception
+ * as well as an error stack overflow.
+ */
+ if (proc_exit_inprogress)
+ PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_ERROR_FATAL);
+
+ /* don't read from from the connection if there is a fatal error */
+ if (conn->state == DN_CONNECTION_STATE_ERROR_FATAL)
+ return true;
+
+ /* No data available, exit */
+ if (!HAS_MESSAGE_BUFFERED(conn))
+ return false;
+
+ /* Any message other than 'Z' is silently discarded here */
+ msg_type = get_message(conn, &msg_len, &msg);
+ if (msg_type == 'Z')
+ {
+ /*
+ * Return result depends on previous connection state.
+ * If it was PORTAL_SUSPENDED Coordinator want to send down
+ * another EXECUTE to fetch more rows, otherwise it is done
+ * with the connection
+ */
+ conn->transaction_status = msg[0];
+ PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_IDLE);
+ conn->combiner = NULL;
+ return true;
+ }
+ }
+ /* never happen, but keep compiler quiet */
+ return false;
+}
+
+
+/*
+ * Send BEGIN command to the Datanodes or Coordinators and receive responses.
+ * Also send the GXID for the transaction.
+ *
+ * conn_count / connections - remote handles to start the transaction on
+ * gxid            - global transaction id to propagate (if valid)
+ * need_tran_block - caller's hint whether an explicit transaction block is
+ *                   required; may be overridden below based on node role
+ * readOnly        - if false, handles are flagged as writers
+ * node_type       - PGXC_NODE_DATANODE or PGXC_NODE_COORDINATOR
+ *                   (currently unused in the body; kept for the interface)
+ *
+ * Returns 0 on success, EOF on any send/receive/validation failure.
+ */
+static int
+pgxc_node_begin(int conn_count, PGXCNodeHandle **connections,
+ GlobalTransactionId gxid, bool need_tran_block,
+ bool readOnly, char node_type)
+{
+ int i;
+ struct timeval *timeout = NULL;
+ ResponseCombiner combiner;
+ TimestampTz timestamp = GetCurrentGTMStartTimestamp();
+ PGXCNodeHandle *new_connections[conn_count];
+ int new_count = 0;
+ char *init_str;
+ /* buffer for a decimal 32-bit value plus NUL; assumes lxid fits 32 bits */
+ char lxid[13];
+
+ /*
+ * If no remote connections, we don't have anything to do
+ */
+ if (conn_count == 0)
+ return 0;
+
+ for (i = 0; i < conn_count; i++)
+ {
+ if (!readOnly && !IsConnFromDatanode())
+ connections[i]->read_only = false;
+ /*
+ * PGXC TODO - A connection should not be in DN_CONNECTION_STATE_QUERY
+ * state when we are about to send a BEGIN TRANSACTION command to the
+ * node. We should consider changing the following to an assert and fix
+ * any bugs reported
+ */
+ if (connections[i]->state == DN_CONNECTION_STATE_QUERY)
+ BufferConnection(connections[i]);
+
+ /* Send GXID and check for errors */
+ if (GlobalTransactionIdIsValid(gxid) && pgxc_node_send_gxid(connections[i], gxid))
+ return EOF;
+
+ /* Send timestamp and check for errors */
+ if (GlobalTimestampIsValid(timestamp) && pgxc_node_send_timestamp(connections[i], timestamp))
+ return EOF;
+
+ /* Node role overrides the caller's need_tran_block hint */
+ if (IS_PGXC_DATANODE && GlobalTransactionIdIsValid(gxid))
+ need_tran_block = true;
+ else if (IS_PGXC_REMOTE_COORDINATOR)
+ need_tran_block = false;
+
+ elog(DEBUG5, "need_tran_block %d, connections[%d]->transaction_status %c",
+ need_tran_block, i, connections[i]->transaction_status);
+ /* Send BEGIN if not already in transaction */
+ if (need_tran_block && connections[i]->transaction_status == 'I')
+ {
+ /* Send the BEGIN TRANSACTION command and check for errors */
+ if (pgxc_node_send_query(connections[i], "BEGIN"))
+ return EOF;
+
+ new_connections[new_count++] = connections[i];
+ }
+ }
+
+ /*
+ * If we did not send a BEGIN command to any node, we are done. Otherwise,
+ * we need to check for any errors and report them
+ */
+ if (new_count == 0)
+ return 0;
+
+ InitResponseCombiner(&combiner, new_count, COMBINE_TYPE_NONE);
+ /*
+ * Make sure there are zeroes in unused fields
+ *
+ * NOTE(review): this zeroes only the leading sizeof(ScanState) bytes of
+ * the combiner AFTER InitResponseCombiner(); presumably the embedded
+ * ScanState is the first member and InitResponseCombiner does not touch
+ * it -- confirm against the ResponseCombiner definition.
+ */
+ memset(&combiner, 0, sizeof(ScanState));
+
+ /* Receive responses */
+ if (pgxc_node_receive_responses(new_count, new_connections, timeout, &combiner))
+ return EOF;
+
+ /* Verify status */
+ if (!ValidateAndCloseCombiner(&combiner))
+ return EOF;
+
+ /* Send virtualXID to the remote nodes using SET command */
+ sprintf(lxid, "%d", MyProc->lxid);
+ PGXCNodeSetParam(true, "coordinator_lxid", lxid, 0);
+
+ /* after transactions are started send down local set commands */
+ init_str = PGXCNodeGetTransactionParamStr();
+ if (init_str)
+ {
+ for (i = 0; i < new_count; i++)
+ {
+ pgxc_node_set_query(new_connections[i], init_str);
+ }
+ }
+
+ /* No problem, let's get going */
+ return 0;
+}
+
+
+/*
+ * Execute DISCARD ALL command on all allocated nodes to remove all session
+ * specific stuff before releasing them to pool for reuse by other sessions.
+ *
+ * Sends a fixed RESET command string to every coordinator and datanode
+ * handle of the current session.  Connections not in IDLE state, or for
+ * which the send fails, are marked ERROR_FATAL and skipped -- cleanup is
+ * best-effort by design, since the handles are about to be released anyway.
+ */
+static void
+pgxc_node_remote_cleanup_all(void)
+{
+ PGXCNodeAllHandles *handles = get_current_handles();
+ PGXCNodeHandle *new_connections[handles->co_conn_count + handles->dn_conn_count];
+ int new_conn_count = 0;
+ int i;
+ char *resetcmd = "RESET ALL;"
+ "RESET SESSION AUTHORIZATION;"
+ "RESET transaction_isolation;"
+ "RESET global_session";
+
+ elog(DEBUG5, "pgxc_node_remote_cleanup_all - handles->co_conn_count %d,"
+ "handles->dn_conn_count %d", handles->co_conn_count,
+ handles->dn_conn_count);
+ /*
+ * We must handle reader and writer connections both since even a read-only
+ * needs to be cleaned up.
+ */
+ if (handles->co_conn_count + handles->dn_conn_count == 0)
+ return;
+
+ /*
+ * Send down snapshot followed by DISCARD ALL command.
+ */
+ for (i = 0; i < handles->co_conn_count; i++)
+ {
+ PGXCNodeHandle *handle = handles->coord_handles[i];
+
+ /* At this point connection should be in IDLE state */
+ if (handle->state != DN_CONNECTION_STATE_IDLE)
+ {
+ PGXCNodeSetConnectionState(handle, DN_CONNECTION_STATE_ERROR_FATAL);
+ continue;
+ }
+
+ /*
+ * We must go ahead and release connections anyway, so do not throw
+ * an error if we have a problem here.
+ */
+ if (pgxc_node_send_query(handle, resetcmd))
+ {
+ ereport(WARNING,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to clean up data nodes")));
+ PGXCNodeSetConnectionState(handle, DN_CONNECTION_STATE_ERROR_FATAL);
+ continue;
+ }
+ new_connections[new_conn_count++] = handle;
+ handle->combiner = NULL;
+ }
+ /* Same best-effort cleanup, now for the datanode handles */
+ for (i = 0; i < handles->dn_conn_count; i++)
+ {
+ PGXCNodeHandle *handle = handles->datanode_handles[i];
+
+ /* At this point connection should be in IDLE state */
+ if (handle->state != DN_CONNECTION_STATE_IDLE)
+ {
+ PGXCNodeSetConnectionState(handle, DN_CONNECTION_STATE_ERROR_FATAL);
+ continue;
+ }
+
+ /*
+ * We must go ahead and release connections anyway, so do not throw
+ * an error if we have a problem here.
+ */
+ if (pgxc_node_send_query(handle, resetcmd))
+ {
+ ereport(WARNING,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to clean up data nodes")));
+ PGXCNodeSetConnectionState(handle, DN_CONNECTION_STATE_ERROR_FATAL);
+ continue;
+ }
+ new_connections[new_conn_count++] = handle;
+ handle->combiner = NULL;
+ }
+
+ if (new_conn_count)
+ {
+ ResponseCombiner combiner;
+ InitResponseCombiner(&combiner, new_conn_count, COMBINE_TYPE_NONE);
+ /* Receive responses */
+ pgxc_node_receive_responses(new_conn_count, new_connections, NULL, &combiner);
+ CloseCombiner(&combiner);
+ }
+ pfree_pgxc_all_handles(handles);
+}
+
+/*
+ * Count how many coordinators and datanodes are involved in this transaction
+ * so that we can save that information in the GID
+ *
+ * Only write (non-read-only) connections currently inside a transaction
+ * block (transaction_status == 'T') are counted.
+ *
+ * dnCount / coordCount     - out: number of writer datanodes / coordinators
+ * dnNodeIds / coordNodeIds - out: node ids of those writers; the caller
+ *                            must size these arrays for the worst case
+ */
+static void
+pgxc_node_remote_count(int *dnCount, int dnNodeIds[],
+ int *coordCount, int coordNodeIds[])
+{
+ int i;
+ PGXCNodeAllHandles *handles = get_current_handles();
+
+ *dnCount = *coordCount = 0;
+ for (i = 0; i < handles->dn_conn_count; i++)
+ {
+ PGXCNodeHandle *conn = handles->datanode_handles[i];
+ /*
+ * Skip empty slots
+ */
+ if (conn->sock == NO_SOCKET)
+ continue;
+ else if (conn->transaction_status == 'T')
+ {
+ /* only writers matter for the GID bookkeeping */
+ if (!conn->read_only)
+ {
+ dnNodeIds[*dnCount] = conn->nodeid;
+ *dnCount = *dnCount + 1;
+ }
+ }
+ }
+
+ for (i = 0; i < handles->co_conn_count; i++)
+ {
+ PGXCNodeHandle *conn = handles->coord_handles[i];
+ /*
+ * Skip empty slots
+ */
+ if (conn->sock == NO_SOCKET)
+ continue;
+ else if (conn->transaction_status == 'T')
+ {
+ /* only writers matter for the GID bookkeeping */
+ if (!conn->read_only)
+ {
+ coordNodeIds[*coordCount] = conn->nodeid;
+ *coordCount = *coordCount + 1;
+ }
+ }
+ }
+}
+
+/*
+ * Prepare nodes which ran write operations during the transaction.
+ * Read only remote transactions are committed and connections are released
+ * back to the pool.
+ * Function returns the list of nodes where transaction is prepared, including
+ * local node, if requested, in format expected by the GTM server.
+ * If something went wrong the function tries to abort prepared transactions on
+ * the nodes where it succeeded and throws error. A warning is emitted if abort
+ * prepared fails.
+ * After completion remote connection handles are released.
+ *
+ * prepareGID - global transaction identifier for PREPARE TRANSACTION
+ * localNode  - if true, the local node name is put first in the result list
+ *
+ * Returns a palloc'd comma-separated node name list, or does not return
+ * (elog(ERROR)) via the prepare_err path.
+ */
+static char *
+pgxc_node_remote_prepare(char *prepareGID, bool localNode)
+{
+ bool isOK = true;
+ StringInfoData nodestr;
+ char *prepare_cmd = (char *) palloc (64 + strlen(prepareGID));
+ char *abort_cmd;
+ GlobalTransactionId auxXid;
+ char *commit_cmd = "COMMIT TRANSACTION";
+ int i;
+ ResponseCombiner combiner;
+ PGXCNodeHandle *connections[MaxDataNodes + MaxCoords];
+ int conn_count = 0;
+ PGXCNodeAllHandles *handles = get_current_handles();
+
+ initStringInfo(&nodestr);
+ if (localNode)
+ appendStringInfoString(&nodestr, PGXCNodeName);
+
+ sprintf(prepare_cmd, "PREPARE TRANSACTION '%s'", prepareGID);
+
+ for (i = 0; i < handles->dn_conn_count; i++)
+ {
+ PGXCNodeHandle *conn = handles->datanode_handles[i];
+
+ /*
+ * If something went wrong already we have nothing to do here. The error
+ * will be reported at the end of the function, and we will rollback
+ * remotes as part of the error handling.
+ * Just skip to clean up section and check if we have already prepared
+ * somewhere, we should abort that prepared transaction.
+ */
+ if (!isOK)
+ goto prepare_err;
+
+ /*
+ * Skip empty slots
+ */
+ if (conn->sock == NO_SOCKET)
+ continue;
+ else if (conn->transaction_status == 'T')
+ {
+ /* Read in any pending input */
+ if (conn->state != DN_CONNECTION_STATE_IDLE)
+ BufferConnection(conn);
+
+ if (conn->read_only)
+ {
+ /* Read-only participant: plain COMMIT, no 2PC needed */
+ if (pgxc_node_send_query(conn, commit_cmd))
+ {
+ /*
+ * not a big deal, it was read only, the connection will be
+ * abandoned later.
+ */
+ ereport(LOG,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to send COMMIT command to "
+ "the node %u", conn->nodeoid)));
+ }
+ else
+ {
+ /* Read responses from these */
+ connections[conn_count++] = conn;
+ }
+ }
+ else
+ {
+ /* Send down prepare command */
+ if (pgxc_node_send_query(conn, prepare_cmd))
+ {
+ /*
+ * That is the trouble, we really want to prepare it.
+ * Just emit warning so far and go to clean up.
+ */
+ isOK = false;
+ ereport(WARNING,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to send PREPARE TRANSACTION command to "
+ "the node %u", conn->nodeoid)));
+ }
+ else
+ {
+ char *nodename = get_pgxc_nodename(conn->nodeoid);
+ if (nodestr.len > 0)
+ appendStringInfoChar(&nodestr, ',');
+ appendStringInfoString(&nodestr, nodename);
+ /* Read responses from these */
+ connections[conn_count++] = conn;
+ /*
+ * If it fails on remote node it would just return ROLLBACK.
+ * Set the flag for the message handler so the response is
+ * verified.
+ */
+ conn->ck_resp_rollback = true;
+ }
+ }
+ }
+ else if (conn->transaction_status == 'E')
+ {
+ /*
+ * Probably can not happen, if there was a error the engine would
+ * abort anyway, even in case of explicit PREPARE.
+ * Anyway, just in case...
+ */
+ isOK = false;
+ ereport(WARNING,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("remote node %u is in error state", conn->nodeoid)));
+ }
+ }
+
+ /* Same logic as above, applied to the coordinator handles */
+ for (i = 0; i < handles->co_conn_count; i++)
+ {
+ PGXCNodeHandle *conn = handles->coord_handles[i];
+
+ /*
+ * If something went wrong already we have nothing to do here. The error
+ * will be reported at the end of the function, and we will rollback
+ * remotes as part of the error handling.
+ * Just skip to clean up section and check if we have already prepared
+ * somewhere, we should abort that prepared transaction.
+ */
+ if (!isOK)
+ goto prepare_err;
+
+ /*
+ * Skip empty slots
+ */
+ if (conn->sock == NO_SOCKET)
+ continue;
+ else if (conn->transaction_status == 'T')
+ {
+ if (conn->read_only)
+ {
+ /* Read-only participant: plain COMMIT, no 2PC needed */
+ if (pgxc_node_send_query(conn, commit_cmd))
+ {
+ /*
+ * not a big deal, it was read only, the connection will be
+ * abandoned later.
+ */
+ ereport(LOG,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to send COMMIT command to "
+ "the node %u", conn->nodeoid)));
+ }
+ else
+ {
+ /* Read responses from these */
+ connections[conn_count++] = conn;
+ }
+ }
+ else
+ {
+ /* Send down prepare command */
+ if (pgxc_node_send_query(conn, prepare_cmd))
+ {
+ /*
+ * That is the trouble, we really want to prepare it.
+ * Just emit warning so far and go to clean up.
+ */
+ isOK = false;
+ ereport(WARNING,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to send PREPARE TRANSACTION command to "
+ "the node %u", conn->nodeoid)));
+ }
+ else
+ {
+ char *nodename = get_pgxc_nodename(conn->nodeoid);
+ if (nodestr.len > 0)
+ appendStringInfoChar(&nodestr, ',');
+ appendStringInfoString(&nodestr, nodename);
+ /* Read responses from these */
+ connections[conn_count++] = conn;
+ /*
+ * If it fails on remote node it would just return ROLLBACK.
+ * Set the flag for the message handler so the response is
+ * verified.
+ */
+ conn->ck_resp_rollback = true;
+ }
+ }
+ }
+ else if (conn->transaction_status == 'E')
+ {
+ /*
+ * Probably can not happen, if there was a error the engine would
+ * abort anyway, even in case of explicit PREPARE.
+ * Anyway, just in case...
+ */
+ isOK = false;
+ ereport(WARNING,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("remote node %u is in error state", conn->nodeoid)));
+ }
+ }
+
+ SetSendCommandId(false);
+
+ if (!isOK)
+ goto prepare_err;
+
+ /*
+ * exit if nothing has been prepared
+ *
+ * NOTE(review): when conn_count == 0 the handles list is not released on
+ * this path (pfree_pgxc_all_handles is only reached inside the branch
+ * below) -- confirm whether that is intentional.
+ */
+ if (conn_count > 0)
+ {
+ int result;
+ /*
+ * Receive and check for any errors. In case of errors, we don't bail out
+ * just yet. We first go through the list of connections and look for
+ * errors on each connection. This is important to ensure that we run
+ * an appropriate ROLLBACK command later on (prepared transactions must be
+ * rolled back with ROLLBACK PREPARED commands).
+ *
+ * PGXCTODO - There doesn't seem to be a solid mechanism to track errors on
+ * individual connections. The transaction_status field doesn't get set
+ * every time there is an error on the connection. The combiner mechanism is
+ * good for parallel processing, but I think we should have a leak-proof
+ * mechanism to track connection status
+ */
+ InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE);
+ /* Receive responses */
+ result = pgxc_node_receive_responses(conn_count, connections, NULL, &combiner);
+ if (result || !validate_combiner(&combiner))
+ goto prepare_err;
+ else
+ CloseCombiner(&combiner);
+
+ /* Before exit clean the flag, to avoid unnecessary checks */
+ for (i = 0; i < conn_count; i++)
+ connections[i]->ck_resp_rollback = false;
+
+ pfree_pgxc_all_handles(handles);
+ if (!temp_object_included && !PersistentConnections)
+ {
+ /* Clean up remote sessions */
+ pgxc_node_remote_cleanup_all();
+ release_handles();
+ }
+ }
+
+ pfree(prepare_cmd);
+ return nodestr.data;
+
+prepare_err:
+ /* Roll back the prepared transaction on every node where it succeeded */
+ abort_cmd = (char *) palloc (64 + strlen(prepareGID));
+ sprintf(abort_cmd, "ROLLBACK PREPARED '%s'", prepareGID);
+
+ auxXid = GetAuxilliaryTransactionId();
+ conn_count = 0;
+ for (i = 0; i < handles->dn_conn_count; i++)
+ {
+ PGXCNodeHandle *conn = handles->datanode_handles[i];
+
+ /*
+ * PREPARE succeeded on that node, roll it back there
+ */
+ if (conn->ck_resp_rollback)
+ {
+ conn->ck_resp_rollback = false;
+
+ if (conn->state != DN_CONNECTION_STATE_IDLE)
+ {
+ ereport(WARNING,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Error while PREPARING transaction %s on "
+ "node %s. Administrative action may be required "
+ "to abort this transaction on the node",
+ prepareGID, conn->nodename)));
+ continue;
+ }
+
+ /* sanity checks */
+ Assert(conn->sock != NO_SOCKET);
+ /* Send down abort prepared command */
+ if (pgxc_node_send_gxid(conn, auxXid))
+ {
+ /*
+ * Prepared transaction is left on the node, but we can not
+ * do anything with that except warn the user.
+ */
+ ereport(WARNING,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to send xid to "
+ "the node %u", conn->nodeoid)));
+ }
+ if (pgxc_node_send_query(conn, abort_cmd))
+ {
+ /*
+ * Prepared transaction is left on the node, but we can not
+ * do anything with that except warn the user.
+ */
+ ereport(WARNING,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to send ABORT PREPARED command to "
+ "the node %u", conn->nodeoid)));
+ }
+ else
+ {
+ /* Read responses from these */
+ connections[conn_count++] = conn;
+ }
+ }
+ }
+ /* Same rollback handling for the coordinator handles */
+ for (i = 0; i < handles->co_conn_count; i++)
+ {
+ PGXCNodeHandle *conn = handles->coord_handles[i];
+
+ if (conn->ck_resp_rollback)
+ {
+ conn->ck_resp_rollback = false;
+
+ if (conn->state != DN_CONNECTION_STATE_IDLE)
+ {
+ ereport(WARNING,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Error while PREPARING transaction %s on "
+ "node %s. Administrative action may be required "
+ "to abort this transaction on the node",
+ prepareGID, conn->nodename)));
+ continue;
+ }
+
+ /* sanity checks */
+ Assert(conn->sock != NO_SOCKET);
+ /* Send down abort prepared command */
+ if (pgxc_node_send_gxid(conn, auxXid))
+ {
+ /*
+ * Prepared transaction is left on the node, but we can not
+ * do anything with that except warn the user.
+ */
+ ereport(WARNING,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to send xid to "
+ "the node %u", conn->nodeoid)));
+ }
+ if (pgxc_node_send_query(conn, abort_cmd))
+ {
+ /*
+ * Prepared transaction is left on the node, but we can not
+ * do anything with that except warn the user.
+ */
+ ereport(WARNING,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to send ABORT PREPARED command to "
+ "the node %u", conn->nodeoid)));
+ }
+ else
+ {
+ /* Read responses from these */
+ connections[conn_count++] = conn;
+ }
+ }
+ }
+ if (conn_count > 0)
+ {
+ /* Just read out responses, throw error from the first combiner */
+ ResponseCombiner combiner2;
+ InitResponseCombiner(&combiner2, conn_count, COMBINE_TYPE_NONE);
+ /* Receive responses */
+ pgxc_node_receive_responses(conn_count, connections, NULL, &combiner2);
+ CloseCombiner(&combiner2);
+ }
+
+ if (!temp_object_included && !PersistentConnections)
+ {
+ /* Clean up remote sessions */
+ pgxc_node_remote_cleanup_all();
+ release_handles();
+ }
+
+ pfree_pgxc_all_handles(handles);
+ pfree(abort_cmd);
+
+ /*
+ * If the flag is set we are here because combiner carries error message
+ */
+ if (isOK)
+ pgxc_node_report_error(&combiner);
+ else
+ elog(ERROR, "failed to PREPARE transaction on one or more nodes");
+ return NULL;
+}
+
+
+/*
+ * Commit transactions on remote nodes.
+ * If barrier lock is set wait while it is released.
+ * Release remote connection after completion.
+ *
+ * COMMIT is sent to every datanode and coordinator handle that is inside a
+ * transaction (transaction_status != 'I') while BarrierLock is held in
+ * SHARE mode, so that a concurrent barrier cannot cut the commit fan-out
+ * in half.  Raises ERROR if sending or validating any commit fails.
+ */
+static void
+pgxc_node_remote_commit(void)
+{
+ int result = 0;
+ char *commitCmd = "COMMIT TRANSACTION";
+ int i;
+ ResponseCombiner combiner;
+ PGXCNodeHandle *connections[MaxDataNodes + MaxCoords];
+ int conn_count = 0;
+ PGXCNodeAllHandles *handles = get_current_handles();
+
+ SetSendCommandId(false);
+
+ /*
+ * Barrier:
+ *
+ * We should acquire the BarrierLock in SHARE mode here to ensure that
+ * there are no in-progress barrier at this point. This mechanism would
+ * work as long as LWLock mechanism does not starve a EXCLUSIVE lock
+ * requester
+ */
+ LWLockAcquire(BarrierLock, LW_SHARED);
+
+ for (i = 0; i < handles->dn_conn_count; i++)
+ {
+ PGXCNodeHandle *conn = handles->datanode_handles[i];
+
+ /* Skip empty slots */
+ if (conn->sock == NO_SOCKET)
+ continue;
+
+ /*
+ * We do not need to commit remote node if it is not in transaction.
+ * If transaction is in error state the commit command will cause
+ * rollback, that is OK
+ */
+ if (conn->transaction_status != 'I')
+ {
+ /* Read in any pending input */
+ if (conn->state != DN_CONNECTION_STATE_IDLE)
+ BufferConnection(conn);
+
+ if (pgxc_node_send_query(conn, commitCmd))
+ {
+ /*
+ * Do not bother with clean up, just bomb out. The error handler
+ * will invoke RollbackTransaction which will do the work.
+ */
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to send COMMIT command to the node %u",
+ conn->nodeoid)));
+ }
+ else
+ {
+ /* Read responses from these */
+ connections[conn_count++] = conn;
+ }
+ }
+ }
+
+ /* Same commit fan-out for the coordinator handles */
+ for (i = 0; i < handles->co_conn_count; i++)
+ {
+ PGXCNodeHandle *conn = handles->coord_handles[i];
+
+ /* Skip empty slots */
+ if (conn->sock == NO_SOCKET)
+ continue;
+
+ /*
+ * We do not need to commit remote node if it is not in transaction.
+ * If transaction is in error state the commit command will cause
+ * rollback, that is OK
+ */
+ if (conn->transaction_status != 'I')
+ {
+ if (pgxc_node_send_query(conn, commitCmd))
+ {
+ /*
+ * Do not bother with clean up, just bomb out. The error handler
+ * will invoke RollbackTransaction which will do the work.
+ */
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to send COMMIT command to the node %u",
+ conn->nodeoid)));
+ }
+ else
+ {
+ /* Read responses from these */
+ connections[conn_count++] = conn;
+ }
+ }
+ }
+
+ /*
+ * Release the BarrierLock.
+ */
+ LWLockRelease(BarrierLock);
+
+ if (conn_count)
+ {
+ InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE);
+ /* Receive responses */
+ result = pgxc_node_receive_responses(conn_count, connections, NULL, &combiner);
+ if (result || !validate_combiner(&combiner))
+ result = EOF;
+ else
+ CloseCombiner(&combiner);
+ }
+
+ stat_transaction(conn_count);
+
+ /* result can only be nonzero if the conn_count branch initialized combiner */
+ if (result)
+ {
+ if (combiner.errorMessage)
+ pgxc_node_report_error(&combiner);
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to COMMIT the transaction on one or more nodes")));
+ }
+
+ if (!temp_object_included && !PersistentConnections)
+ {
+ /* Clean up remote sessions */
+ pgxc_node_remote_cleanup_all();
+ release_handles();
+ }
+
+ pfree_pgxc_all_handles(handles);
+}
+
+
+/*
+ * Rollback transactions on remote nodes.
+ * Release remote connection after completion.
+ *
+ * Sends ROLLBACK to every datanode and coordinator handle that is inside a
+ * transaction, preceding it with a SYNC message where the remote session
+ * failed under the extended query protocol (needSync) and would otherwise
+ * ignore further commands.  Failures are logged (LOG/WARNING level), not
+ * raised as ERROR, since this runs on the abort path.
+ */
+static void
+pgxc_node_remote_abort(void)
+{
+ int result = 0;
+ char *rollbackCmd = "ROLLBACK TRANSACTION";
+ int i;
+ ResponseCombiner combiner;
+ PGXCNodeHandle *connections[MaxDataNodes + MaxCoords];
+ int conn_count = 0;
+ PGXCNodeAllHandles *handles = get_current_handles();
+ struct timeval timeout;
+
+ SetSendCommandId(false);
+
+ elog(DEBUG5, "pgxc_node_remote_abort - dn_conn_count %d, co_conn_count %d",
+ handles->dn_conn_count, handles->co_conn_count);
+
+ /* bounded wait so a dead node cannot hang the abort path forever */
+ timeout.tv_sec = 60;
+ timeout.tv_usec = 0;
+
+ for (i = 0; i < handles->dn_conn_count; i++)
+ {
+ PGXCNodeHandle *conn = handles->datanode_handles[i];
+
+ /* Skip empty slots */
+ if (conn->sock == NO_SOCKET)
+ continue;
+
+ elog(DEBUG5, "node %s, conn->transaction_status %c",
+ conn->nodename,
+ conn->transaction_status);
+
+ if (conn->transaction_status != 'I')
+ {
+ /* Read in any pending input */
+ if (conn->state != DN_CONNECTION_STATE_IDLE)
+ BufferConnection(conn);
+
+ /*
+ * If the remote session was running extended query protocol when
+ * it failed, it will expect a SYNC message before it accepts any
+ * other command
+ */
+ if (conn->needSync)
+ {
+ pgxc_node_send_sync(conn);
+ pgxc_node_receive(1, &conn, &timeout);
+ }
+ /*
+ * Do not matter, is there committed or failed transaction,
+ * just send down rollback to finish it.
+ */
+ if (pgxc_node_send_rollback(conn, rollbackCmd))
+ {
+ add_error_message(conn,
+ "failed to send ROLLBACK TRANSACTION command");
+ }
+ else
+ {
+ /* Read responses from these */
+ connections[conn_count++] = conn;
+ }
+ }
+ }
+
+ /* Same rollback fan-out for the coordinator handles */
+ for (i = 0; i < handles->co_conn_count; i++)
+ {
+ PGXCNodeHandle *conn = handles->coord_handles[i];
+
+ /* Skip empty slots */
+ if (conn->sock == NO_SOCKET)
+ continue;
+
+ if (conn->transaction_status != 'I')
+ {
+ /* Send SYNC if the remote session is expecting one */
+ if (conn->needSync)
+ {
+ pgxc_node_send_sync(conn);
+ pgxc_node_receive(1, &conn, &timeout);
+ }
+ /*
+ * Do not matter, is there committed or failed transaction,
+ * just send down rollback to finish it.
+ */
+ if (pgxc_node_send_rollback(conn, rollbackCmd))
+ {
+ add_error_message(conn,
+ "failed to send ROLLBACK TRANSACTION command");
+ }
+ else
+ {
+ /* Read responses from these */
+ connections[conn_count++] = conn;
+ }
+ }
+ }
+
+ if (conn_count)
+ {
+ InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE);
+ /* Receive responses */
+ result = pgxc_node_receive_responses(conn_count, connections, &timeout, &combiner);
+ if (result || !validate_combiner(&combiner))
+ result = EOF;
+ else
+ CloseCombiner(&combiner);
+ }
+
+ stat_transaction(conn_count);
+
+ /* result can only be nonzero if the conn_count branch initialized combiner */
+ if (result)
+ {
+ if (combiner.errorMessage)
+ pgxc_node_report_error(&combiner);
+ else
+ ereport(LOG,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to ROLLBACK the transaction on one or more nodes")));
+ }
+
+ pfree_pgxc_all_handles(handles);
+}
+
+/*
+ * Begin COPY command
+ * The copy_connections array must have room for NumDataNodes items
+ *
+ * Acquires the needed datanode connections, begins a transaction on them if
+ * required, builds the row locator, and sends the COPY query down every
+ * connection.  On failure the error is recorded on the offending connection
+ * and the function returns with rcstate->locator set to NULL; callers must
+ * check the locator to detect failure.
+ */
+void
+DataNodeCopyBegin(RemoteCopyData *rcstate)
+{
+	int			i;
+	List	   *nodelist = rcstate->rel_loc->rl_nodeList;
+	PGXCNodeHandle **connections;
+	bool		need_tran_block;
+	GlobalTransactionId gxid;
+	ResponseCombiner combiner;
+	Snapshot	snapshot = GetActiveSnapshot();
+	int			conn_count = list_length(nodelist);
+
+	/* Get needed datanode connections */
+	if (!rcstate->is_from && IsLocatorReplicated(rcstate->rel_loc->locatorType))
+	{
+		/*
+		 * COPY TO from a replicated table: any single node has the full
+		 * data, so connections is a single handle to read from.
+		 */
+		connections = (PGXCNodeHandle **) palloc(sizeof(PGXCNodeHandle *));
+		connections[0] = get_any_handle(nodelist);
+		conn_count = 1;
+	}
+	else
+	{
+		PGXCNodeAllHandles *pgxc_handles;
+		pgxc_handles = get_handles(nodelist, NULL, false, true);
+		connections = pgxc_handles->datanode_handles;
+		Assert(pgxc_handles->dn_conn_count == conn_count);
+		/* only the wrapper is freed; the handles array stays alive */
+		pfree(pgxc_handles);
+	}
+
+	/*
+	 * If more than one node is involved or if we are already in a
+	 * transaction block, we must run the remote statements in a transaction
+	 * block.
+	 */
+	need_tran_block = (conn_count > 1) || (TransactionBlockStatusCode() == 'T');
+
+	elog(DEBUG1, "conn_count = %d, need_tran_block = %s", conn_count,
+			need_tran_block ? "true" : "false");
+
+	/* Gather statistics */
+	stat_statement();
+	stat_transaction(conn_count);
+
+	gxid = GetCurrentTransactionId();
+
+	/* Start transaction on connections where it is not started */
+	if (pgxc_node_begin(conn_count, connections, gxid, need_tran_block, false, PGXC_NODE_DATANODE))
+	{
+		ereport(ERROR,
+				(errcode(ERRCODE_INTERNAL_ERROR),
+				 errmsg("Could not begin transaction on data nodes.")));
+	}
+
+	/*
+	 * COPY TO do not use locator, it just takes connections from it, and
+	 * we do not look up distribution data type in this case.
+	 * So always use LOCATOR_TYPE_RROBIN to avoid errors because of not
+	 * defined partType if real locator type is HASH or MODULO.
+	 * Create locator before sending down query, because createLocator may
+	 * fail and we leave with dirty connections.
+	 * If we get an error now datanode connection will be clean and error
+	 * handler will issue transaction abort.
+	 */
+	rcstate->locator = createLocator(
+			rcstate->is_from ? rcstate->rel_loc->locatorType
+							 : LOCATOR_TYPE_RROBIN,
+			rcstate->is_from ? RELATION_ACCESS_INSERT : RELATION_ACCESS_READ,
+			rcstate->dist_type,
+			LOCATOR_LIST_POINTER,
+			conn_count,
+			(void *) connections,
+			NULL,
+			false);
+
+	/* Send query to nodes */
+	for (i = 0; i < conn_count; i++)
+	{
+		CHECK_OWNERSHIP(connections[i], NULL);
+
+		if (snapshot && pgxc_node_send_snapshot(connections[i], snapshot))
+		{
+			/* record failure on the handle and report via NULL locator */
+			add_error_message(connections[i], "Can not send request");
+			pfree(connections);
+			freeLocator(rcstate->locator);
+			rcstate->locator = NULL;
+			return;
+		}
+		if (pgxc_node_send_query(connections[i], rcstate->query_buf.data) != 0)
+		{
+			add_error_message(connections[i], "Can not send request");
+			pfree(connections);
+			freeLocator(rcstate->locator);
+			rcstate->locator = NULL;
+			return;
+		}
+	}
+
+	/*
+	 * We are expecting CopyIn response, but do not want to send it to client,
+	 * caller should take care about this, because here we do not know if
+	 * client runs console or file copy
+	 */
+	InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE);
+	/*
+	 * Make sure there are zeroes in unused fields
+	 *
+	 * NOTE(review): this memset runs AFTER InitResponseCombiner and zeroes
+	 * only the leading ScanState portion of the combiner — confirm
+	 * InitResponseCombiner sets no fields inside ScanState.
+	 */
+	memset(&combiner, 0, sizeof(ScanState));
+
+	/* Receive responses */
+	if (pgxc_node_receive_responses(conn_count, connections, NULL, &combiner)
+			|| !ValidateAndCloseCombiner(&combiner))
+	{
+		/* NOTE(review): 'connections' is not pfree'd on this path — leak? */
+		DataNodeCopyFinish(conn_count, connections);
+		freeLocator(rcstate->locator);
+		rcstate->locator = NULL;
+		return;
+	}
+	pfree(connections);
+}
+
+
+/*
+ * Send a data row to the specified nodes
+ *
+ * Appends one CopyData ('d') protocol message containing data_row (len bytes,
+ * plus a trailing '\n' in text/CSV mode) to the output buffer of each handle
+ * in copy_connections.  If a buffer would exceed COPY_BUFFER_SIZE it is
+ * flushed first, after draining and validating any pending input from the
+ * datanode so that remote errors are noticed promptly.
+ *
+ * Returns 0 on success, EOF on any error (the error text is recorded on the
+ * offending handle via add_error_message).
+ */
+int
+DataNodeCopyIn(char *data_row, int len,
+			   int conn_count, PGXCNodeHandle** copy_connections,
+			   bool binary)
+{
+	/* size + data row + \n in CSV mode */
+	int msgLen = 4 + len + (binary ? 0 : 1);
+	int nLen = htonl(msgLen);
+	int i;
+
+	for(i = 0; i < conn_count; i++)
+	{
+		PGXCNodeHandle *handle = copy_connections[i];
+		if (handle->state == DN_CONNECTION_STATE_COPY_IN)
+		{
+			/* precalculate to speed up access */
+			int bytes_needed = handle->outEnd + 1 + msgLen;
+
+			/* flush buffer if it is almost full */
+			if (bytes_needed > COPY_BUFFER_SIZE)
+			{
+				int to_send = handle->outEnd;
+
+				/* First look if data node has sent a error message */
+				int read_status = pgxc_node_read_data(handle, true);
+				if (read_status == EOF || read_status < 0)
+				{
+					add_error_message(handle, "failed to read data from data node");
+					return EOF;
+				}
+
+				if (handle->inStart < handle->inEnd)
+				{
+					ResponseCombiner combiner;
+					InitResponseCombiner(&combiner, 1, COMBINE_TYPE_NONE);
+					/*
+					 * Make sure there are zeroes in unused fields
+					 *
+					 * NOTE(review): memset after InitResponseCombiner zeroes
+					 * the leading ScanState portion only — confirm intended.
+					 */
+					memset(&combiner, 0, sizeof(ScanState));
+
+					/*
+					 * Validate the combiner but only if we see a proper
+					 * response for our COPY message. The problem is that
+					 * sometimes we might receive async messages such as
+					 * 'M' which is used to send back command ID generated and
+					 * consumed by the datanode. While the message gets handled
+					 * in handle_response(), we don't want to declare receipt
+					 * of an invalid message below.
+					 *
+					 * If there is an actual error of some sort then the
+					 * connection state will be set appropriately and we
+					 * shall catch that subsequently.
+					 */
+					if (handle_response(handle, &combiner) == RESPONSE_COPY &&
+						!ValidateAndCloseCombiner(&combiner))
+						return EOF;
+				}
+
+				if (DN_CONNECTION_STATE_ERROR(handle))
+					return EOF;
+
+				/*
+				 * Try to send down buffered data if we have
+				 */
+				if (to_send && send_some(handle, to_send) < 0)
+				{
+					add_error_message(handle, "failed to send data to data node");
+					return EOF;
+				}
+			}
+
+			if (ensure_out_buffer_capacity(bytes_needed, handle) != 0)
+			{
+				ereport(ERROR,
+						(errcode(ERRCODE_OUT_OF_MEMORY),
+						 errmsg("out of memory")));
+			}
+
+			/* CopyData message: 'd', int32 length (self-inclusive), payload */
+			handle->outBuffer[handle->outEnd++] = 'd';
+			memcpy(handle->outBuffer + handle->outEnd, &nLen, 4);
+			handle->outEnd += 4;
+			memcpy(handle->outBuffer + handle->outEnd, data_row, len);
+			handle->outEnd += len;
+			if (!binary)
+				handle->outBuffer[handle->outEnd++] = '\n';
+
+			handle->in_extended_query = false;
+		}
+		else
+		{
+			add_error_message(handle, "Invalid data node connection");
+			return EOF;
+		}
+	}
+	return 0;
+}
+
+/*
+ * Receive COPY OUT data from the given connections and write it either to
+ * copy_file (if non-NULL) or to the client's stdout.  Returns the combined
+ * number of rows processed.  Raises ERROR on an unexpected response.
+ */
+uint64
+DataNodeCopyOut(PGXCNodeHandle** copy_connections,
+				int conn_count, FILE* copy_file)
+{
+	ResponseCombiner combiner;
+	uint64		processed;
+	bool		error;
+
+	InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_SUM);
+	/*
+	 * Make sure there are zeroes in unused fields
+	 *
+	 * NOTE(review): zeroes only the leading ScanState part, after Init —
+	 * confirm InitResponseCombiner sets no ScanState fields.
+	 */
+	memset(&combiner, 0, sizeof(ScanState));
+	combiner.processed = 0;
+	/* If there is an existing file where to copy data, pass it to combiner */
+	if (copy_file)
+	{
+		combiner.copy_file = copy_file;
+		combiner.remoteCopyType = REMOTE_COPY_FILE;
+	}
+	else
+	{
+		combiner.copy_file = NULL;
+		combiner.remoteCopyType = REMOTE_COPY_STDOUT;
+	}
+	error = (pgxc_node_receive_responses(conn_count, copy_connections, NULL, &combiner) != 0);
+
+	processed = combiner.processed;
+
+	if (!ValidateAndCloseCombiner(&combiner) || error)
+	{
+		ereport(ERROR,
+				(errcode(ERRCODE_DATA_CORRUPTED),
+				 errmsg("Unexpected response from the data nodes when combining, request type %d", combiner.request_type)));
+	}
+
+	return processed;
+}
+
+
+/*
+ * Receive COPY OUT data from the given connections and append the rows into
+ * the supplied tuplestore.  Returns the combined number of rows processed.
+ * Raises ERROR on an unexpected response.
+ */
+uint64
+DataNodeCopyStore(PGXCNodeHandle** copy_connections,
+				  int conn_count, Tuplestorestate* store)
+{
+	ResponseCombiner combiner;
+	uint64		processed;
+	bool		error;
+
+	InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_SUM);
+	/*
+	 * Make sure there are zeroes in unused fields
+	 *
+	 * NOTE(review): zeroes only the leading ScanState part, after Init —
+	 * confirm InitResponseCombiner sets no ScanState fields.
+	 */
+	memset(&combiner, 0, sizeof(ScanState));
+	combiner.processed = 0;
+	combiner.remoteCopyType = REMOTE_COPY_TUPLESTORE;
+	combiner.tuplestorestate = store;
+
+	error = (pgxc_node_receive_responses(conn_count, copy_connections, NULL, &combiner) != 0);
+
+	processed = combiner.processed;
+
+	if (!ValidateAndCloseCombiner(&combiner) || error)
+	{
+		ereport(ERROR,
+				(errcode(ERRCODE_DATA_CORRUPTED),
+				 errmsg("Unexpected response from the data nodes when combining, request type %d", combiner.request_type)));
+	}
+
+	return processed;
+}
+
+
+/*
+ * Finish copy process on all connections
+ *
+ * Sends CopyDone to every connection still in COPY IN/OUT state, then
+ * collects responses from all of them.  Raises ERROR if any connection
+ * failed to terminate COPY cleanly.
+ */
+void
+DataNodeCopyFinish(int conn_count, PGXCNodeHandle** connections)
+{
+	int			i;
+	ResponseCombiner combiner;
+	bool		error = false;
+	for (i = 0; i < conn_count; i++)
+	{
+		PGXCNodeHandle *handle = connections[i];
+
+		/*
+		 * NOTE(review): 'error' is reset every iteration, so only the last
+		 * connection's result survives this loop; a handle that is not in a
+		 * COPY state leaves error == true.  Confirm this is intended.
+		 */
+		error = true;
+		if (handle->state == DN_CONNECTION_STATE_COPY_IN || handle->state == DN_CONNECTION_STATE_COPY_OUT)
+			error = DataNodeCopyEnd(handle, false);
+	}
+
+	InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE);
+	/*
+	 * Make sure there are zeroes in unused fields
+	 */
+	memset(&combiner, 0, sizeof(ScanState));
+	error = (pgxc_node_receive_responses(conn_count, connections, NULL, &combiner) != 0) || error;
+
+	if (!validate_combiner(&combiner) || error)
+	{
+		if (combiner.errorMessage)
+			pgxc_node_report_error(&combiner);
+		else
+			ereport(ERROR,
+					(errcode(ERRCODE_INTERNAL_ERROR),
+					 errmsg("Error while running COPY")));
+	}
+	else
+		CloseCombiner(&combiner);
+}
+
+/*
+ * End copy process on a connection
+ *
+ * Queues a CopyDone ('c') — or CopyFail ('f') when is_error is set — message
+ * and flushes it immediately.  Returns false on success, true on failure
+ * (NULL handle, buffer allocation failure, or flush error).
+ */
+bool
+DataNodeCopyEnd(PGXCNodeHandle *handle, bool is_error)
+{
+	/* message length field: 4 bytes, counting itself, no payload */
+	int nLen = htonl(4);
+
+	if (handle == NULL)
+		return true;
+
+	/* msgType + msgLen */
+	if (ensure_out_buffer_capacity(handle->outEnd + 1 + 4, handle) != 0)
+		return true;
+
+	if (is_error)
+		handle->outBuffer[handle->outEnd++] = 'f';
+	else
+		handle->outBuffer[handle->outEnd++] = 'c';
+
+	memcpy(handle->outBuffer + handle->outEnd, &nLen, 4);
+	handle->outEnd += 4;
+
+	handle->in_extended_query = false;
+	/* We need response right away, so send immediately */
+	if (pgxc_node_flush(handle) < 0)
+		return true;
+
+	return false;
+}
+
+
+/*
+ * Get Node connections depending on the connection type:
+ * Datanodes Only, Coordinators only or both types
+ *
+ * Determines the target node lists (possibly evaluating a distribution-key
+ * expression at execution time), acquires the corresponding handles from the
+ * pool, and returns them with dn_conn_count/co_conn_count filled in.
+ * Raises ERROR if connections cannot be obtained.
+ */
+static PGXCNodeAllHandles *
+get_exec_connections(RemoteQueryState *planstate,
+					 ExecNodes *exec_nodes,
+					 RemoteQueryExecType exec_type,
+					 bool is_global_session)
+{
+	List	   *nodelist = NIL;
+	List	   *primarynode = NIL;
+	List	   *coordlist = NIL;
+	PGXCNodeHandle *primaryconnection;
+	int			co_conn_count, dn_conn_count;
+	bool		is_query_coord_only = false;
+	PGXCNodeAllHandles *pgxc_handles = NULL;
+
+	/*
+	 * If query is launched only on Coordinators, we have to inform get_handles
+	 * not to ask for Datanode connections even if list of Datanodes is NIL.
+	 */
+	if (exec_type == EXEC_ON_COORDS)
+		is_query_coord_only = true;
+
+	if (exec_type == EXEC_ON_CURRENT)
+		return get_current_handles();
+
+	if (exec_nodes)
+	{
+		if (exec_nodes->en_expr)
+		{
+			/* execution time determining of target Datanodes */
+			bool		isnull;
+			ExprState  *estate = ExecInitExpr(exec_nodes->en_expr,
+											  (PlanState *) planstate);
+			/*
+			 * NOTE(review): the three lines below contain leftover merge
+			 * conflict markers ('-' / '++') — this hunk appears garbled and
+			 * must be resolved against the merge base before it can compile.
+			 */
+			Datum		partvalue = ExecEvalExpr(estate,
+												 planstate->combiner.ss.ps.ps_ExprContext,
-		if (IS_PGXC_LOCAL_COORDINATOR && MyXactAccessedTempRel)
++												 &isnull);
+			RelationLocInfo *rel_loc_info = GetRelationLocInfo(exec_nodes->en_relid);
+			/* PGXCTODO what is the type of partvalue here */
+			ExecNodes  *nodes = GetRelationNodes(rel_loc_info,
+												 partvalue,
+												 isnull,
+												 exec_nodes->accesstype);
+			/*
+			 * en_expr is set by pgxc_set_en_expr only for distributed
+			 * relations while planning DMLs, hence a select for update
+			 * on a replicated table here is an assertion
+			 */
+			Assert(!(exec_nodes->accesstype == RELATION_ACCESS_READ_FOR_UPDATE &&
+						IsRelationReplicated(rel_loc_info)));
+
+			if (nodes)
+			{
+				nodelist = nodes->nodeList;
+				primarynode = nodes->primarynodelist;
+				pfree(nodes);
+			}
+			FreeRelationLocInfo(rel_loc_info);
+		}
+		else if (OidIsValid(exec_nodes->en_relid))
+		{
+			RelationLocInfo *rel_loc_info = GetRelationLocInfo(exec_nodes->en_relid);
+			ExecNodes  *nodes = GetRelationNodes(rel_loc_info, 0, true, exec_nodes->accesstype);
+
+			/*
+			 * en_relid is set only for DMLs, hence a select for update on a
+			 * replicated table here is an assertion
+			 */
+			Assert(!(exec_nodes->accesstype == RELATION_ACCESS_READ_FOR_UPDATE &&
+						IsRelationReplicated(rel_loc_info)));
+
+			/* Use the obtained list for given table */
+			if (nodes)
+				nodelist = nodes->nodeList;
+
+			/*
+			 * Special handling for ROUND ROBIN distributed tables. The target
+			 * node must be determined at the execution time
+			 */
+			if (rel_loc_info->locatorType == LOCATOR_TYPE_RROBIN && nodes)
+			{
+				nodelist = nodes->nodeList;
+				primarynode = nodes->primarynodelist;
+			}
+			else if (nodes)
+			{
+				if (exec_type == EXEC_ON_DATANODES || exec_type == EXEC_ON_ALL_NODES)
+				{
+					/* non-RR case: fall back to the planner-provided lists */
+					nodelist = exec_nodes->nodeList;
+					primarynode = exec_nodes->primarynodelist;
+				}
+			}
+
+			if (nodes)
+				pfree(nodes);
+			FreeRelationLocInfo(rel_loc_info);
+		}
+		else
+		{
+			if (exec_type == EXEC_ON_DATANODES || exec_type == EXEC_ON_ALL_NODES)
+				nodelist = exec_nodes->nodeList;
+			else if (exec_type == EXEC_ON_COORDS)
+				coordlist = exec_nodes->nodeList;
+
+			primarynode = exec_nodes->primarynodelist;
+		}
+	}
+
+	/* Set node list and DN number */
+	if (list_length(nodelist) == 0 &&
+		(exec_type == EXEC_ON_ALL_NODES ||
+		 exec_type == EXEC_ON_DATANODES))
+	{
+		/* Primary connection is included in this number of connections if it exists */
+		dn_conn_count = NumDataNodes;
+	}
+	else
+	{
+		if (exec_type == EXEC_ON_DATANODES || exec_type == EXEC_ON_ALL_NODES)
+		{
+			if (primarynode)
+				dn_conn_count = list_length(nodelist) + 1;
+			else
+				dn_conn_count = list_length(nodelist);
+		}
+		else
+			dn_conn_count = 0;
+	}
+
+	/* Set Coordinator list and Coordinator number */
+	if ((list_length(nodelist) == 0 && exec_type == EXEC_ON_ALL_NODES) ||
+		(list_length(coordlist) == 0 && exec_type == EXEC_ON_COORDS))
+	{
+		coordlist = GetAllCoordNodes();
+		co_conn_count = list_length(coordlist);
+	}
+	else
+	{
+		if (exec_type == EXEC_ON_COORDS)
+			co_conn_count = list_length(coordlist);
+		else
+			co_conn_count = 0;
+	}
+
+	/* Get other connections (non-primary) */
+	pgxc_handles = get_handles(nodelist, coordlist, is_query_coord_only, is_global_session);
+	if (!pgxc_handles)
+		ereport(ERROR,
+				(errcode(ERRCODE_INTERNAL_ERROR),
+				 errmsg("Could not obtain connection from pool")));
+
+	/* Get connection for primary node, if used */
+	if (primarynode)
+	{
+		/* Let's assume primary connection is always a Datanode connection for the moment */
+		PGXCNodeAllHandles *pgxc_conn_res;
+		pgxc_conn_res = get_handles(primarynode, NULL, false, is_global_session);
+
+		/* primary connection is unique */
+		primaryconnection = pgxc_conn_res->datanode_handles[0];
+
+		pfree(pgxc_conn_res);
+
+		if (!primaryconnection)
+			ereport(ERROR,
+					(errcode(ERRCODE_INTERNAL_ERROR),
+					 errmsg("Could not obtain connection from pool")));
+		pgxc_handles->primary_handle = primaryconnection;
+	}
+
+	/* Depending on the execution type, we still need to save the initial node counts */
+	pgxc_handles->dn_conn_count = dn_conn_count;
+	pgxc_handles->co_conn_count = co_conn_count;
+
+	return pgxc_handles;
+}
+
+
+/*
+ * Send the command ID, snapshot and query of the given RemoteQuery step down
+ * one connection, using the extended query protocol when a prepared
+ * statement, cursor or parameters are involved, and the simple protocol
+ * otherwise.  Returns true on success, false if any send failed.
+ */
+static bool
+pgxc_start_command_on_connection(PGXCNodeHandle *connection,
+								 RemoteQueryState *remotestate,
+								 Snapshot snapshot)
+{
+	CommandId	cid;
+	ResponseCombiner *combiner = (ResponseCombiner *) remotestate;
+	RemoteQuery *step = (RemoteQuery *) combiner->ss.ps.plan;
+	CHECK_OWNERSHIP(connection, combiner);
+
+	elog(DEBUG5, "pgxc_start_command_on_connection - node %s, state %d",
+			connection->nodename, connection->state);
+
+	/*
+	 * Scan descriptor would be valid and would contain a valid snapshot
+	 * in cases when we need to send out of order command id to data node
+	 * e.g. in case of a fetch
+	 */
+	cid = GetCurrentCommandId(false);
+
+	if (pgxc_node_send_cmd_id(connection, cid) < 0 )
+		return false;
+
+	if (snapshot && pgxc_node_send_snapshot(connection, snapshot))
+		return false;
+	if (step->statement || step->cursor || remotestate->rqs_num_params)
+	{
+		/* need to use Extended Query Protocol */
+		int			fetch = 0;
+		bool		prepared = false;
+		char		nodetype = PGXC_NODE_DATANODE;
+
+		/* if prepared statement is referenced see if it is already
+		 * exist */
+		if (step->statement)
+			prepared =
+				ActivateDatanodeStatementOnNode(step->statement,
+												PGXCNodeGetNodeId(connection->nodeoid,
+																  &nodetype));
+
+		/*
+		 * execute and fetch rows only if they will be consumed
+		 * immediately by the sorter
+		 */
+		if (step->cursor)
+			fetch = 1;
+
+		combiner->extended_query = true;
+
+		/* pass NULL as query text when the statement is already prepared */
+		if (pgxc_node_send_query_extended(connection,
+										  prepared ? NULL : step->sql_statement,
+										  step->statement,
+										  step->cursor,
+										  remotestate->rqs_num_params,
+										  remotestate->rqs_param_types,
+										  remotestate->paramval_len,
+										  remotestate->paramval_data,
+										  step->has_row_marks ? true : step->read_only,
+										  fetch) != 0)
+			return false;
+	}
+	else
+	{
+		combiner->extended_query = false;
+		if (pgxc_node_send_query(connection, step->sql_statement) != 0)
+			return false;
+	}
+	return true;
+}
+
+/*
+ * Execute utility statement on multiple Datanodes
+ * It does approximately the same as
+ *
+ * RemoteQueryState *state = ExecInitRemoteQuery(plan, estate, flags);
+ * Assert(TupIsNull(ExecRemoteQuery(state));
+ * ExecEndRemoteQuery(state)
+ *
+ * But does not need an Estate instance and does not do some unnecessary work,
+ * like allocating tuple slots.
+ *
+ * The statement is sent to the datanodes first, then to the coordinators,
+ * and responses are consumed from both sets until every node reports
+ * ReadyForQuery; any buffered remote error is reported at the end.
+ */
+void
+ExecRemoteUtility(RemoteQuery *node)
+{
+	RemoteQueryState *remotestate;
+	ResponseCombiner *combiner;
+	bool		force_autocommit = node->force_autocommit;
+	RemoteQueryExecType exec_type = node->exec_type;
+	GlobalTransactionId gxid = InvalidGlobalTransactionId;
+	Snapshot	snapshot = NULL;
+	PGXCNodeAllHandles *pgxc_connections;
+	int			co_conn_count;
+	int			dn_conn_count;
+	bool		need_tran_block;
+	ExecDirectType exec_direct_type = node->exec_direct_type;
+	int			i;
+	CommandId	cid = GetCurrentCommandId(true);
+
+	if (!force_autocommit)
+		RegisterTransactionLocalNode(true);
+
+	remotestate = makeNode(RemoteQueryState);
+	combiner = (ResponseCombiner *)remotestate;
+	InitResponseCombiner(combiner, 0, node->combine_type);
+
+	/*
+	 * Do not set global_session if it is a utility statement.
+	 * Avoids CREATE NODE error on cluster configuration.
+	 */
+	pgxc_connections = get_exec_connections(NULL, node->exec_nodes, exec_type,
+											exec_direct_type != EXEC_DIRECT_UTILITY);
+
+	dn_conn_count = pgxc_connections->dn_conn_count;
+	co_conn_count = pgxc_connections->co_conn_count;
+	/* exit right away if no nodes to run command on */
+	if (dn_conn_count == 0 && co_conn_count == 0)
+	{
+		pfree_pgxc_all_handles(pgxc_connections);
+		return;
+	}
+
+	if (force_autocommit)
+		need_tran_block = false;
+	else
+		need_tran_block = true;
+
+	/* Commands launched through EXECUTE DIRECT do not need start a transaction */
+	if (exec_direct_type == EXEC_DIRECT_UTILITY)
+	{
+		need_tran_block = false;
+
+		/* This check is not done when analyzing to limit dependencies */
+		if (IsTransactionBlock())
+			ereport(ERROR,
+					(errcode(ERRCODE_ACTIVE_SQL_TRANSACTION),
+					 errmsg("cannot run EXECUTE DIRECT with utility inside a transaction block")));
+	}
+
+	gxid = GetCurrentTransactionId();
+	if (ActiveSnapshotSet())
+		snapshot = GetActiveSnapshot();
+	if (!GlobalTransactionIdIsValid(gxid))
+		ereport(ERROR,
+				(errcode(ERRCODE_INTERNAL_ERROR),
+				 errmsg("Failed to get next transaction ID")));
+
+	/* Send the command to the datanodes */
+	{
+		if (pgxc_node_begin(dn_conn_count, pgxc_connections->datanode_handles,
+					gxid, need_tran_block, false, PGXC_NODE_DATANODE))
+			ereport(ERROR,
+					(errcode(ERRCODE_INTERNAL_ERROR),
+					 errmsg("Could not begin transaction on Datanodes")));
+		for (i = 0; i < dn_conn_count; i++)
+		{
+			PGXCNodeHandle *conn = pgxc_connections->datanode_handles[i];
+
+			if (conn->state == DN_CONNECTION_STATE_QUERY)
+				BufferConnection(conn);
+			if (snapshot && pgxc_node_send_snapshot(conn, snapshot))
+			{
+				ereport(ERROR,
+						(errcode(ERRCODE_INTERNAL_ERROR),
+						 errmsg("Failed to send snapshot to Datanodes")));
+			}
+			if (pgxc_node_send_cmd_id(conn, cid) < 0)
+			{
+				ereport(ERROR,
+						(errcode(ERRCODE_INTERNAL_ERROR),
+						 errmsg("Failed to send command ID to Datanodes")));
+			}
+
+			if (pgxc_node_send_query(conn, node->sql_statement) != 0)
+			{
+				ereport(ERROR,
+						(errcode(ERRCODE_INTERNAL_ERROR),
+						 errmsg("Failed to send command to Datanodes")));
+			}
+		}
+	}
+
+	/* Send the command to the coordinators, if any */
+	{
+		if (pgxc_node_begin(co_conn_count, pgxc_connections->coord_handles,
+					gxid, need_tran_block, false, PGXC_NODE_COORDINATOR))
+			ereport(ERROR,
+					(errcode(ERRCODE_INTERNAL_ERROR),
+					 errmsg("Could not begin transaction on coordinators")));
+		/* Now send it to Coordinators if necessary */
+		for (i = 0; i < co_conn_count; i++)
+		{
+			/* Fixed: these messages previously misreported the node type */
+			if (snapshot && pgxc_node_send_snapshot(pgxc_connections->coord_handles[i], snapshot))
+			{
+				ereport(ERROR,
+						(errcode(ERRCODE_INTERNAL_ERROR),
+						 errmsg("Failed to send snapshot to coordinators")));
+			}
+			if (pgxc_node_send_cmd_id(pgxc_connections->coord_handles[i], cid) < 0)
+			{
+				ereport(ERROR,
+						(errcode(ERRCODE_INTERNAL_ERROR),
+						 errmsg("Failed to send command ID to coordinators")));
+			}
+
+			if (pgxc_node_send_query(pgxc_connections->coord_handles[i], node->sql_statement) != 0)
+			{
+				ereport(ERROR,
+						(errcode(ERRCODE_INTERNAL_ERROR),
+						 errmsg("Failed to send command to coordinators")));
+			}
+		}
+	}
+
+	/*
+	 * Stop if all commands are completed or we got a data row and
+	 * initialized state node for subsequent invocations
+	 */
+	{
+		while (dn_conn_count > 0)
+		{
+			int i = 0;
+
+			if (pgxc_node_receive(dn_conn_count, pgxc_connections->datanode_handles, NULL))
+				break;
+			/*
+			 * Handle input from the Datanodes.
+			 * We do not expect Datanodes returning tuples when running utility
+			 * command.
+			 * If we got EOF, move to the next connection, will receive more
+			 * data on the next iteration.
+			 */
+			while (i < dn_conn_count)
+			{
+				PGXCNodeHandle *conn = pgxc_connections->datanode_handles[i];
+				int res = handle_response(conn, combiner);
+				if (res == RESPONSE_EOF)
+				{
+					i++;
+				}
+				else if (res == RESPONSE_COMPLETE)
+				{
+					/* Ignore, wait for ReadyForQuery */
+				}
+				else if (res == RESPONSE_ERROR)
+				{
+					/* Ignore, wait for ReadyForQuery */
+				}
+				else if (res == RESPONSE_READY)
+				{
+					/* done with this connection: swap the last one into its slot */
+					if (i < --dn_conn_count)
+						pgxc_connections->datanode_handles[i] =
+							pgxc_connections->datanode_handles[dn_conn_count];
+				}
+				else if (res == RESPONSE_TUPDESC)
+				{
+					ereport(ERROR,
+							(errcode(ERRCODE_INTERNAL_ERROR),
+							 errmsg("Unexpected response from Datanode")));
+				}
+				else if (res == RESPONSE_DATAROW)
+				{
+					ereport(ERROR,
+							(errcode(ERRCODE_INTERNAL_ERROR),
+							 errmsg("Unexpected response from Datanode")));
+				}
+			}
+		}
+	}
+
+	/* Make the same for Coordinators */
+	{
+		while (co_conn_count > 0)
+		{
+			int i = 0;
+
+			if (pgxc_node_receive(co_conn_count, pgxc_connections->coord_handles, NULL))
+				break;
+
+			while (i < co_conn_count)
+			{
+				int res = handle_response(pgxc_connections->coord_handles[i], combiner);
+				if (res == RESPONSE_EOF)
+				{
+					i++;
+				}
+				else if (res == RESPONSE_COMPLETE)
+				{
+					/* Ignore, wait for ReadyForQuery */
+				}
+				else if (res == RESPONSE_ERROR)
+				{
+					/* Ignore, wait for ReadyForQuery */
+				}
+				else if (res == RESPONSE_READY)
+				{
+					if (i < --co_conn_count)
+						pgxc_connections->coord_handles[i] =
+							pgxc_connections->coord_handles[co_conn_count];
+				}
+				else if (res == RESPONSE_TUPDESC)
+				{
+					ereport(ERROR,
+							(errcode(ERRCODE_INTERNAL_ERROR),
+							 errmsg("Unexpected response from coordinator")));
+				}
+				else if (res == RESPONSE_DATAROW)
+				{
+					ereport(ERROR,
+							(errcode(ERRCODE_INTERNAL_ERROR),
+							 errmsg("Unexpected response from coordinator")));
+				}
+			}
+		}
+	}
+
+	/*
+	 * We have processed all responses from nodes and if we have
+	 * error message pending we can report it. All connections should be in
+	 * consistent state now and so they can be released to the pool after ROLLBACK.
+	 */
+	pfree_pgxc_all_handles(pgxc_connections);
+	pgxc_node_report_error(combiner);
+}
+
+
+/*
+ * Called when the backend is ending.
+ *
+ * on_proc_exit callback: releases pooled connections, closes the GTM
+ * connection and dumps collected statistics.  'code' and 'arg' follow the
+ * standard proc-exit callback signature and are unused here.
+ */
+void
+PGXCNodeCleanAndRelease(int code, Datum arg)
+{
+
+	/* Disconnect from Pooler, if any connection is still held Pooler close it */
+	PoolManagerDisconnect();
+
+	/* Close connection with GTM */
+	CloseGTM();
+
+	/* Dump collected statistics to the log */
+	stat_log();
+}
+
+/*
+ * Close the named prepared statement on the given list of datanodes.
+ *
+ * Sends Close + Sync on each connection and consumes responses until every
+ * node reports ReadyForQuery.  A connection that fails to send or respond is
+ * marked DN_CONNECTION_STATE_ERROR_FATAL so it gets discarded rather than
+ * returned to the pool with an unclosed statement.
+ */
+void
+ExecCloseRemoteStatement(const char *stmt_name, List *nodelist)
+{
+	PGXCNodeAllHandles *all_handles;
+	PGXCNodeHandle **connections;
+	ResponseCombiner combiner;
+	int			conn_count;
+	int			i;
+
+	/* Exit if nodelist is empty */
+	if (list_length(nodelist) == 0)
+		return;
+
+	/* get needed Datanode connections */
+	all_handles = get_handles(nodelist, NIL, false, true);
+	conn_count = all_handles->dn_conn_count;
+	connections = all_handles->datanode_handles;
+
+	for (i = 0; i < conn_count; i++)
+	{
+		if (connections[i]->state == DN_CONNECTION_STATE_QUERY)
+			BufferConnection(connections[i]);
+		if (pgxc_node_send_close(connections[i], true, stmt_name) != 0)
+		{
+			/*
+			 * statements are not affected by statement end, so consider
+			 * unclosed statement on the Datanode as a fatal issue and
+			 * force connection is discarded
+			 */
+			PGXCNodeSetConnectionState(connections[i],
+					DN_CONNECTION_STATE_ERROR_FATAL);
+			/* Fixed typo: "statemrnt" -> "statement" */
+			ereport(WARNING,
+					(errcode(ERRCODE_INTERNAL_ERROR),
+					 errmsg("Failed to close Datanode statement")));
+		}
+		if (pgxc_node_send_sync(connections[i]) != 0)
+		{
+			PGXCNodeSetConnectionState(connections[i],
+					DN_CONNECTION_STATE_ERROR_FATAL);
+			ereport(WARNING,
+					(errcode(ERRCODE_INTERNAL_ERROR),
+					 errmsg("Failed to close Datanode statement")));
+		}
+		PGXCNodeSetConnectionState(connections[i], DN_CONNECTION_STATE_CLOSE);
+	}
+
+	InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE);
+	/*
+	 * Make sure there are zeroes in unused fields
+	 */
+	memset(&combiner, 0, sizeof(ScanState));
+
+	while (conn_count > 0)
+	{
+		if (pgxc_node_receive(conn_count, connections, NULL))
+		{
+			for (i = 0; i < conn_count; i++)
+				PGXCNodeSetConnectionState(connections[i],
+						DN_CONNECTION_STATE_ERROR_FATAL);
+
+			ereport(ERROR,
+					(errcode(ERRCODE_INTERNAL_ERROR),
+					 errmsg("Failed to close Datanode statement")));
+		}
+		i = 0;
+		while (i < conn_count)
+		{
+			int res = handle_response(connections[i], &combiner);
+			if (res == RESPONSE_EOF)
+			{
+				i++;
+			}
+			else if (res == RESPONSE_READY ||
+					connections[i]->state == DN_CONNECTION_STATE_ERROR_FATAL)
+			{
+				/* finished connection: swap the last one into its slot */
+				if (--conn_count > i)
+					connections[i] = connections[conn_count];
+			}
+		}
+	}
+
+	ValidateAndCloseCombiner(&combiner);
+	pfree_pgxc_all_handles(all_handles);
+}
+
+/*
+ * DataNodeCopyInBinaryForAll
+ *
+ * In a COPY TO, send to all Datanodes PG_HEADER for a COPY TO in binary mode.
+ *
+ * Queues one CopyData ('d') message containing msg_buf (len bytes) on every
+ * connection; all handles must be in COPY IN state.  Returns 0 on success,
+ * EOF if any connection is not in COPY IN state.
+ */
+int
+DataNodeCopyInBinaryForAll(char *msg_buf, int len, int conn_count,
+						   PGXCNodeHandle** connections)
+{
+	int			i;
+	int			msgLen = 4 + len;
+	int			nLen = htonl(msgLen);
+
+	for (i = 0; i < conn_count; i++)
+	{
+		PGXCNodeHandle *handle = connections[i];
+		if (handle->state == DN_CONNECTION_STATE_COPY_IN)
+		{
+			/* msgType + msgLen */
+			if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0)
+			{
+				ereport(ERROR,
+						(errcode(ERRCODE_OUT_OF_MEMORY),
+						 errmsg("out of memory")));
+			}
+
+			handle->outBuffer[handle->outEnd++] = 'd';
+			memcpy(handle->outBuffer + handle->outEnd, &nLen, 4);
+			handle->outEnd += 4;
+			memcpy(handle->outBuffer + handle->outEnd, msg_buf, len);
+			handle->outEnd += len;
+		}
+		else
+		{
+			add_error_message(handle, "Invalid Datanode connection");
+			return EOF;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Encode parameter values to format of DataRow message (the same format is
+ * used in Bind) to prepare for sending down to Datanodes.
+ * The data row is copied to RemoteQueryState.paramval_data.
+ *
+ * Parameters with no known type are transmitted as NULLs so that the count
+ * of parameters stays consistent on the remote side.  Also fills in
+ * rqs_num_params/rqs_param_types from the plan node or, failing that, from
+ * the supplied ParamListInfo.
+ */
+void
+SetDataRowForExtParams(ParamListInfo paraminfo, RemoteQueryState *rq_state)
+{
+	StringInfoData buf;
+	uint16		n16;
+	int			i;
+	int			real_num_params = 0;
+	RemoteQuery *node = (RemoteQuery*) rq_state->combiner.ss.ps.plan;
+
+	/* If there are no parameters, there is no data to BIND. */
+	if (!paraminfo)
+		return;
+
+	Assert(!rq_state->paramval_data);
+
+	/*
+	 * It is necessary to fetch parameters
+	 * before looking at the output value.
+	 */
+	for (i = 0; i < paraminfo->numParams; i++)
+	{
+		ParamExternData *param;
+
+		/* Fixed encoding corruption: '&param' had been mangled into the
+		 * pilcrow character sequence '¶m' by HTML-entity decoding. */
+		param = &paraminfo->params[i];
+
+		if (!OidIsValid(param->ptype) && paraminfo->paramFetch != NULL)
+			(*paraminfo->paramFetch) (paraminfo, i + 1);
+
+		/*
+		 * This is the last parameter found as useful, so we need
+		 * to include all the previous ones to keep silent the remote
+		 * nodes. All the parameters prior to the last usable having no
+		 * type available will be considered as NULL entries.
+		 */
+		if (OidIsValid(param->ptype))
+			real_num_params = i + 1;
+	}
+
+	/*
+	 * If there are no parameters available, simply leave.
+	 * This is possible in the case of a query called through SPI
+	 * and using no parameters.
+	 */
+	if (real_num_params == 0)
+	{
+		rq_state->paramval_data = NULL;
+		rq_state->paramval_len = 0;
+		return;
+	}
+
+	initStringInfo(&buf);
+
+	/* Number of parameter values */
+	n16 = htons(real_num_params);
+	appendBinaryStringInfo(&buf, (char *) &n16, 2);
+
+	/* Parameter values */
+	for (i = 0; i < real_num_params; i++)
+	{
+		/* Fixed encoding corruption here as well ('¶minfo'). */
+		ParamExternData *param = &paraminfo->params[i];
+		uint32		n32;
+
+		/*
+		 * Parameters with no types are considered as NULL and treated as integer
+		 * The same trick is used for dropped columns for remote DML generation.
+		 */
+		if (param->isnull || !OidIsValid(param->ptype))
+		{
+			/* -1 length marks a NULL value in the DataRow format */
+			n32 = htonl(-1);
+			appendBinaryStringInfo(&buf, (char *) &n32, 4);
+		}
+		else
+		{
+			Oid			typOutput;
+			bool		typIsVarlena;
+			Datum		pval;
+			char	   *pstring;
+			int			len;
+
+			/* Get info needed to output the value */
+			getTypeOutputInfo(param->ptype, &typOutput, &typIsVarlena);
+
+			/*
+			 * If we have a toasted datum, forcibly detoast it here to avoid
+			 * memory leakage inside the type's output routine.
+			 */
+			if (typIsVarlena)
+				pval = PointerGetDatum(PG_DETOAST_DATUM(param->value));
+			else
+				pval = param->value;
+
+			/* Convert Datum to string */
+			pstring = OidOutputFunctionCall(typOutput, pval);
+
+			/* copy data to the buffer */
+			len = strlen(pstring);
+			n32 = htonl(len);
+			appendBinaryStringInfo(&buf, (char *) &n32, 4);
+			appendBinaryStringInfo(&buf, pstring, len);
+		}
+	}
+
+
+	/*
+	 * If parameter types are not already set, infer them from
+	 * the paraminfo.
+	 */
+	if (node->rq_num_params > 0)
+	{
+		/*
+		 * Use the already known param types for BIND. Parameter types
+		 * can be already known when the same plan is executed multiple
+		 * times.
+		 */
+		if (node->rq_num_params != real_num_params)
+			elog(ERROR, "Number of user-supplied parameters do not match "
+					"the number of remote parameters");
+		rq_state->rqs_num_params = node->rq_num_params;
+		rq_state->rqs_param_types = node->rq_param_types;
+	}
+	else
+	{
+		rq_state->rqs_num_params = real_num_params;
+		rq_state->rqs_param_types = (Oid *) palloc(sizeof(Oid) * real_num_params);
+		for (i = 0; i < real_num_params; i++)
+			rq_state->rqs_param_types[i] = paraminfo->params[i].ptype;
+	}
+
+	/* Assign the newly allocated data row to paramval */
+	rq_state->paramval_data = buf.data;
+	rq_state->paramval_len = buf.len;
+}
+
+/*
+ * Clear per transaction remote information
+ *
+ * End-of-transaction hook: resets any session parameters queued for the
+ * remote nodes during this transaction.
+ */
+void
+AtEOXact_Remote(void)
+{
+	PGXCNodeResetParams(true);
+}
+
+/*
+ * Invoked when local transaction is about to be committed.
+ * If nodestring is specified commit specified prepared transaction on remote
+ * nodes, otherwise commit remote nodes which are in transaction.
+ */
+void
+PreCommit_Remote(char *prepareGID, char *nodestring, bool preparedLocalNode)
+{
+	struct rusage start_r;
+	struct timeval start_t;
+
+	if (log_gtm_stats)
+		ResetUsageCommon(&start_r, &start_t);
+
+	/*
+	 * Make node connections persistent if we are committing a transaction
+	 * that touched temporary tables. We never drop that flag, so after some
+	 * transaction has created a temp table the session's remote connections
+	 * become persistent.
+	 * We do not need to set that flag if transaction that has created a temp
+	 * table finally aborts - remote connections are not holding temporary
+	 * objects in this case.
+	 *
+	 * NOTE(review): the three lines below contain leftover merge conflict
+	 * markers ('-' / '++') — resolve against the merge base; the '++' form
+	 * is the newer MyXactFlags API.
+	 */
-	if (MyXactAccessedTempRel)
++	if (IS_PGXC_LOCAL_COORDINATOR &&
++		(MyXactFlags & XACT_FLAGS_ACCESSEDTEMPREL))
+		temp_object_included = true;
+
+
+	/*
+	 * OK, everything went fine. At least one remote node is in PREPARED state
+	 * and the transaction is successfully prepared on all the involved nodes.
+	 * Now we are ready to commit the transaction. We need a new GXID to send
+	 * down the remote nodes to execute the forthcoming COMMIT PREPARED
+	 * command. So grab one from the GTM and track it. It will be closed along
+	 * with the main transaction at the end.
+	 */
+	if (nodestring)
+	{
+		Assert(preparedLocalNode);
+		pgxc_node_remote_finish(prepareGID, true, nodestring,
+								GetAuxilliaryTransactionId(),
+								GetTopGlobalTransactionId());
+
+	}
+	else
+		pgxc_node_remote_commit();
+
+	if (log_gtm_stats)
+		ShowUsageCommon("PreCommit_Remote", &start_r, &start_t);
+}
+
+/*
+ * Do abort processing for the transaction. We must abort the transaction on
+ * all the involved nodes. If a node has already prepared a transaction, we run
+ * ROLLBACK PREPARED command on the node. Otherwise, a simple ROLLBACK command
+ * is sufficient.
+ *
+ * We must guard against the case when a transaction is prepared successfully on
+ * all the nodes and some error occurs after we send a COMMIT PREPARED message
+ * to at least one node. Such a transaction must not be aborted to preserve
+ * global consistency. We handle this case by recording the nodes involved in
+ * the transaction at the GTM and keep the transaction open at the GTM so that
+ * it's reported as "in-progress" on all the nodes until resolved.
+ *
+ * Always returns true (callers apparently ignore failure here).
+ */
+bool
+PreAbort_Remote(void)
+{
+ /*
+ * We are about to abort current transaction, and there could be an
+ * unexpected error leaving the node connection in some state requiring
+ * clean up, like COPY or pending query results.
+ * If we are running copy we should send down CopyFail message and read
+ * all possible incoming messages, there could be copy rows (if running
+ * COPY TO) ErrorResponse, ReadyForQuery.
+ * If there are pending results (connection state is DN_CONNECTION_STATE_QUERY)
+ * we just need to read them in and discard, all necessary commands are
+ * already sent. The end of input could be CommandComplete or
+ * PortalSuspended, in either case subsequent ROLLBACK closes the portal.
+ */
+ PGXCNodeAllHandles *all_handles;
+ /* NOTE: C99 variable-length arrays sized by runtime node counts */
+ PGXCNodeHandle *clean_nodes[NumCoords + NumDataNodes];
+ int node_count = 0;
+ int cancel_dn_count = 0, cancel_co_count = 0;
+ int cancel_dn_list[NumDataNodes];
+ int cancel_co_list[NumCoords];
+ int i;
+ struct rusage start_r;
+ struct timeval start_t;
+
+ if (log_gtm_stats)
+ ResetUsageCommon(&start_r, &start_t);
+
+ all_handles = get_current_handles();
+ /*
+ * Find "dirty" coordinator connections.
+ * COPY is never running on a coordinator connections, we just check for
+ * pending data.
+ */
+ for (i = 0; i < all_handles->co_conn_count; i++)
+ {
+ PGXCNodeHandle *handle = all_handles->coord_handles[i];
+
+ if (handle->state == DN_CONNECTION_STATE_QUERY)
+ {
+ /*
+ * Forget previous combiner if any since input will be handled by
+ * different one.
+ */
+ handle->combiner = NULL;
+ clean_nodes[node_count++] = handle;
+ cancel_co_list[cancel_co_count++] = i;
+ }
+ }
+
+ /*
+ * The same for data nodes, but cancel COPY if it is running.
+ */
+ for (i = 0; i < all_handles->dn_conn_count; i++)
+ {
+ PGXCNodeHandle *handle = all_handles->datanode_handles[i];
+
+ if (handle->state == DN_CONNECTION_STATE_QUERY)
+ {
+ /*
+ * Forget previous combiner if any since input will be handled by
+ * different one.
+ */
+ handle->combiner = NULL;
+ clean_nodes[node_count++] = handle;
+ cancel_dn_list[cancel_dn_count++] = i;
+ }
+ else if (handle->state == DN_CONNECTION_STATE_COPY_IN ||
+ handle->state == DN_CONNECTION_STATE_COPY_OUT)
+ {
+ /* Send CopyFail / end-of-copy before draining the connection */
+ DataNodeCopyEnd(handle, true);
+ /*
+ * Forget previous combiner if any since input will be handled by
+ * different one.
+ */
+ handle->combiner = NULL;
+ clean_nodes[node_count++] = handle;
+ cancel_dn_list[cancel_dn_count++] = i;
+ }
+ }
+
+ /*
+ * Cancel running queries on the datanodes and the coordinators.
+ */
+ PoolManagerCancelQuery(cancel_dn_count, cancel_dn_list, cancel_co_count,
+ cancel_co_list);
+
+ /*
+ * Now read and discard any data from the connections found "dirty"
+ */
+ if (node_count > 0)
+ {
+ ResponseCombiner combiner;
+
+ InitResponseCombiner(&combiner, node_count, COMBINE_TYPE_NONE);
+ /*
+ * Make sure there are zeroes in unused fields
+ *
+ * NOTE(review): this memset runs AFTER InitResponseCombiner and zeroes
+ * the leading sizeof(ScanState) bytes of the struct — presumably
+ * InitResponseCombiner does not touch the ScanState prefix, otherwise
+ * its work would be wiped here. Confirm against InitResponseCombiner.
+ */
+ memset(&combiner, 0, sizeof(ScanState));
+ combiner.connections = clean_nodes;
+ combiner.conn_count = node_count;
+ combiner.request_type = REQUEST_TYPE_ERROR;
+
+ pgxc_connections_cleanup(&combiner);
+
+ /* prevent pfree'ing local variable */
+ combiner.connections = NULL;
+
+ CloseCombiner(&combiner);
+ }
+
+ pgxc_node_remote_abort();
+
+ /*
+ * Drop the connections to ensure aborts are handled properly.
+ *
+ * XXX We should really be consulting PersistentConnections parameter and
+ * keep the connections if its set. But as a short term measure, to address
+ * certain issues for aborted transactions, we drop the connections.
+ * Revisit and fix the issue
+ */
+ elog(DEBUG5, "temp_object_included %d", temp_object_included);
+ if (!temp_object_included)
+ {
+ /* Clean up remote sessions */
+ pgxc_node_remote_cleanup_all();
+ release_handles();
+ }
+
+ pfree_pgxc_all_handles(all_handles);
+
+ if (log_gtm_stats)
+ ShowUsageCommon("PreAbort_Remote", &start_r, &start_t);
+
+ return true;
+}
+
+
+/*
+ * Invoked when local transaction is about to be prepared.
+ * If invoked on a Datanode just commit transaction on remote connections,
+ * since secondary sessions are read only and never need to be prepared.
+ * Otherwise run PREPARE on remote connections, where writable commands were
+ * sent (connections marked as not read-only).
+ * If that is explicit PREPARE (issued by client) notify GTM.
+ * In case of implicit PREPARE not involving local node (ex. caused by
+ * INSERT, UPDATE or DELETE) commit prepared transaction immediately.
+ * Return list of node names where transaction was actually prepared, include
+ * the name of the local node if localNode is true.
+ *
+ * Returns NULL on a Datanode, or when the implicit prepare was already
+ * committed here (nodestring is pfree'd and NULLed in that path).
+ */
+char *
+PrePrepare_Remote(char *prepareGID, bool localNode, bool implicit)
+{
+ /* Always include local node if running explicit prepare */
+ char *nodestring;
+ struct rusage start_r;
+ struct timeval start_t;
+
+ if (log_gtm_stats)
+ ResetUsageCommon(&start_r, &start_t);
+
+ /*
+ * Primary session is doing 2PC, just commit secondary processes and exit
+ */
+ if (IS_PGXC_DATANODE)
+ {
+ pgxc_node_remote_commit();
+ return NULL;
+ }
+
+ nodestring = pgxc_node_remote_prepare(prepareGID,
+ !implicit || localNode);
+
+ if (!implicit && IS_PGXC_LOCAL_COORDINATOR)
+ /* Save the node list and gid on GTM. */
+ StartPreparedTranGTM(GetTopGlobalTransactionId(), prepareGID,
+ nodestring);
+
+ /*
+ * If no need to commit on local node go ahead and commit prepared
+ * transaction right away.
+ */
+ if (implicit && !localNode && nodestring)
+ {
+ pgxc_node_remote_finish(prepareGID, true, nodestring,
+ GetAuxilliaryTransactionId(),
+ GetTopGlobalTransactionId());
+ pfree(nodestring);
+ nodestring = NULL;
+ }
+
+ if (log_gtm_stats)
+ ShowUsageCommon("PrePrepare_Remote", &start_r, &start_t);
+
+ return nodestring;
+}
+
+/*
+ * Invoked immediately after local node is prepared.
+ * Notify GTM about completed prepare.
+ *
+ * Only an explicit (client-issued) PREPARE is reported to GTM; implicit
+ * prepares are handled entirely in PrePrepare_Remote/PreCommit_Remote.
+ * prepareGID is accepted for interface symmetry but not used here.
+ */
+void
+PostPrepare_Remote(char *prepareGID, bool implicit)
+{
+ struct rusage start_r;
+ struct timeval start_t;
+
+ if (log_gtm_stats)
+ ResetUsageCommon(&start_r, &start_t);
+
+ if (!implicit)
+ PrepareTranGTM(GetTopGlobalTransactionId());
+
+ if (log_gtm_stats)
+ ShowUsageCommon("PostPrepare_Remote", &start_r, &start_t);
+}
+
+/*
+ * Returns true if 2PC is required for consistent commit: if there was write
+ * activity on two or more nodes within current transaction.
+ *
+ * localWrite - true if the local node has written; it counts as the first
+ *              writer, so one writing remote connection is then enough to
+ *              require 2PC.
+ */
+bool
+IsTwoPhaseCommitRequired(bool localWrite)
+{
+ PGXCNodeAllHandles *handles;
+ bool found = localWrite;
+ int i;
+
+ /* Never run 2PC on Datanode-to-Datanode connection */
+ if (IS_PGXC_DATANODE)
+ return false;
+
- combiner->ss.ps.qual = NIL;
++ if (MyXactFlags & XACT_FLAGS_ACCESSEDTEMPREL)
+ {
+ /* Temp objects cannot be prepared, so 2PC is deliberately skipped */
+ elog(DEBUG1, "Transaction accessed temporary objects - "
+ "2PC will not be used and that can lead to data inconsistencies "
+ "in case of failures");
+ return false;
+ }
+
+ /*
+ * If no XID assigned, no need to run 2PC since neither coordinator nor any
+ * remote nodes did write operation
+ */
+ if (!TransactionIdIsValid(GetTopTransactionIdIfAny()))
+ return false;
+
+ /* 'T' transaction status = in-progress transaction block on that node */
+ handles = get_current_handles();
+ for (i = 0; i < handles->dn_conn_count; i++)
+ {
+ PGXCNodeHandle *conn = handles->datanode_handles[i];
+ if (conn->sock != NO_SOCKET && !conn->read_only &&
+ conn->transaction_status == 'T')
+ {
+ if (found)
+ return true; /* second found */
+ else
+ found = true; /* first found */
+ }
+ }
+ for (i = 0; i < handles->co_conn_count; i++)
+ {
+ PGXCNodeHandle *conn = handles->coord_handles[i];
+ if (conn->sock != NO_SOCKET && !conn->read_only &&
+ conn->transaction_status == 'T')
+ {
+ if (found)
+ return true; /* second found */
+ else
+ found = true; /* first found */
+ }
+ }
+ return false;
+}
+
+/*
+ * Execute COMMIT/ABORT PREPARED issued by the remote client on remote nodes.
+ * Contacts GTM for the list of involved nodes and for work complete
+ * notification. Returns true if prepared transaction on local node needs to be
+ * finished too.
+ *
+ * prepareGID - global identifier of the prepared transaction
+ * commit     - true for COMMIT PREPARED, false for ROLLBACK PREPARED
+ */
+bool
+FinishRemotePreparedTransaction(char *prepareGID, bool commit)
+{
+ char *nodestring;
+ GlobalTransactionId gxid, prepare_gxid;
+ bool prepared_local = false;
+
+ /*
+ * Get the list of nodes involved in this transaction.
+ *
+ * This function returns the GXID of the prepared transaction. It also
+ * returns a fresh GXID which can be used for running COMMIT PREPARED
+ * commands on the remote nodes. Both these GXIDs can then be either
+ * committed or aborted together.
+ *
+ * XXX While I understand that we get the prepared and a new GXID with a
+ * single call, it doesn't look nicer and create confusion. We should
+ * probably split them into two parts. This is used only for explicit 2PC
+ * which should not be very common in XC
+ *
+ * In xc_maintenance_mode mode, we don't fail if the GTM does not have
+ * knowledge about the prepared transaction. That may happen for various
+ * reasons such that an earlier attempt cleaned up it from GTM or GTM was
+ * restarted in between. The xc_maintenance_mode is a kludge to come out of
+ * such situations. So it seems alright to not be too strict about the
+ * state
+ */
+ if ((GetGIDDataGTM(prepareGID, &gxid, &prepare_gxid, &nodestring) < 0) &&
+ !xc_maintenance_mode)
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("prepared transaction with identifier \"%s\" does not exist",
+ prepareGID)));
+
+ /*
+ * Please note that with xc_maintenance_mode = on, COMMIT/ROLLBACK PREPARED will not
+ * propagate to remote nodes. Only GTM status is cleaned up.
+ */
+ if (xc_maintenance_mode)
+ {
+ if (commit)
+ {
+ pgxc_node_remote_commit();
+ CommitPreparedTranGTM(prepare_gxid, gxid, 0, NULL);
+ }
+ else
+ {
+ pgxc_node_remote_abort();
+ /* Both the prepared GXID and the fresh one must be rolled back */
+ RollbackTranGTM(prepare_gxid);
+ RollbackTranGTM(gxid);
+ }
+ return false;
+ }
+
+ prepared_local = pgxc_node_remote_finish(prepareGID, commit, nodestring,
+ gxid, prepare_gxid);
+
+ if (commit)
+ {
+ /*
+ * XXX For explicit 2PC, there will be enough delay for any
+ * waited-committed transactions to send a final COMMIT message to the
+ * GTM.
+ */
+ CommitPreparedTranGTM(prepare_gxid, gxid, 0, NULL);
+ }
+ else
+ {
+ RollbackTranGTM(prepare_gxid);
+ RollbackTranGTM(gxid);
+ }
+
+ return prepared_local;
+}
+
+
+/*
+ * Complete previously prepared transactions on remote nodes.
+ * Release remote connection after completion.
+ *
+ * prepareGID   - global identifier of the prepared transaction
+ * commit       - true -> COMMIT PREPARED, false -> ROLLBACK PREPARED
+ * nodestring   - comma-separated node names; MODIFIED in place by strtok()
+ * gxid         - fresh GXID used to run the finish command remotely
+ * prepare_gxid - GXID of the prepared transaction (unused here; kept for
+ *                interface symmetry with callers)
+ *
+ * Returns true if the local node itself appears in nodestring, i.e. the
+ * caller still has to finish the prepared transaction locally.
+ */
+static bool
+pgxc_node_remote_finish(char *prepareGID, bool commit,
+ char *nodestring, GlobalTransactionId gxid,
+ GlobalTransactionId prepare_gxid)
+{
+ char *finish_cmd;
+ PGXCNodeHandle *connections[MaxCoords + MaxDataNodes];
+ int conn_count = 0;
+ ResponseCombiner combiner;
+ PGXCNodeAllHandles *pgxc_handles;
+ bool prepared_local = false;
+ char *nodename;
+ List *nodelist = NIL;
+ List *coordlist = NIL;
+ int i;
+ /*
+ * Now based on the nodestring, run COMMIT/ROLLBACK PREPARED command on the
+ * remote nodes and also finish the transaction locally if required
+ *
+ * NOTE(review): strtok() is not reentrant and destructively writes NULs
+ * into nodestring; callers must pass a writable, private copy.
+ */
+ nodename = strtok(nodestring, ",");
+ while (nodename != NULL)
+ {
+ int nodeIndex;
+ char nodetype;
+
+ /* Get node type and index */
+ nodetype = PGXC_NODE_NONE;
+ nodeIndex = PGXCNodeGetNodeIdFromName(nodename, &nodetype);
+ if (nodetype == PGXC_NODE_NONE)
+ ereport(ERROR,
+ (errcode(ERRCODE_UNDEFINED_OBJECT),
+ errmsg("PGXC Node %s: object not defined",
+ nodename)));
+
+ /* Check if the requested node is the self-node or not */
+ if (nodetype == PGXC_NODE_COORDINATOR)
+ {
+ if (nodeIndex == PGXCNodeId - 1)
+ prepared_local = true;
+ else
+ coordlist = lappend_int(coordlist, nodeIndex);
+ }
+ else
+ nodelist = lappend_int(nodelist, nodeIndex);
+
+ nodename = strtok(NULL, ",");
+ }
+
+ /* Nothing to do remotely - only the local node was involved */
+ if (nodelist == NIL && coordlist == NIL)
+ return prepared_local;
+
+ pgxc_handles = get_handles(nodelist, coordlist, false, true);
+
+ /* 64 bytes comfortably covers the command text around the GID */
+ finish_cmd = (char *) palloc(64 + strlen(prepareGID));
+
+ if (commit)
+ sprintf(finish_cmd, "COMMIT PREPARED '%s'", prepareGID);
+ else
+ sprintf(finish_cmd, "ROLLBACK PREPARED '%s'", prepareGID);
+
+ for (i = 0; i < pgxc_handles->dn_conn_count; i++)
+ {
+ PGXCNodeHandle *conn = pgxc_handles->datanode_handles[i];
+
+ if (pgxc_node_send_gxid(conn, gxid))
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to send GXID for %s PREPARED command",
+ commit ? "COMMIT" : "ROLLBACK")));
+ }
+
+ if (pgxc_node_send_query(conn, finish_cmd))
+ {
+ /*
+ * Do not bother with clean up, just bomb out. The error handler
+ * will invoke RollbackTransaction which will do the work.
+ */
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to send %s PREPARED command to the node %u",
+ commit ? "COMMIT" : "ROLLBACK", conn->nodeoid)));
+ }
+ else
+ {
+ /* Read responses from these */
+ connections[conn_count++] = conn;
+ }
+ }
+
+ for (i = 0; i < pgxc_handles->co_conn_count; i++)
+ {
+ PGXCNodeHandle *conn = pgxc_handles->coord_handles[i];
+
+ if (pgxc_node_send_gxid(conn, gxid))
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to send GXID for %s PREPARED command",
+ commit ? "COMMIT" : "ROLLBACK")));
+ }
+
+ if (pgxc_node_send_query(conn, finish_cmd))
+ {
+ /*
+ * Do not bother with clean up, just bomb out. The error handler
+ * will invoke RollbackTransaction which will do the work.
+ */
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to send %s PREPARED command to the node %u",
+ commit ? "COMMIT" : "ROLLBACK", conn->nodeoid)));
+ }
+ else
+ {
+ /* Read responses from these */
+ connections[conn_count++] = conn;
+ }
+ }
+
+ if (conn_count)
+ {
+ InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE);
+ /* Receive responses */
+ if (pgxc_node_receive_responses(conn_count, connections, NULL, &combiner) ||
+ !validate_combiner(&combiner))
+ {
+ if (combiner.errorMessage)
+ pgxc_node_report_error(&combiner);
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to COMMIT the transaction on one or more nodes")));
+ }
+ else
+ CloseCombiner(&combiner);
+ }
+
+ if (!temp_object_included && !PersistentConnections)
+ {
+ /* Clean up remote sessions */
+ pgxc_node_remote_cleanup_all();
+ release_handles();
+ }
+
+ pfree_pgxc_all_handles(pgxc_handles);
+ pfree(finish_cmd);
+
+ return prepared_local;
+}
+
+/*****************************************************************************
+ *
+ * Simplified versions of ExecInitRemoteQuery, ExecRemoteQuery and
+ * ExecEndRemoteQuery: in XCP they are only used to execute simple queries.
+ *
+ *****************************************************************************/
+/*
+ * Initialize executor state for a RemoteQuery plan node.
+ * Sets up the embedded ResponseCombiner, result tuple slot, externally
+ * supplied parameters, and (when needed) an expression context for
+ * evaluating the node-distribution expression. 'eflags' is currently unused.
+ */
+RemoteQueryState *
+ExecInitRemoteQuery(RemoteQuery *node, EState *estate, int eflags)
+{
+ RemoteQueryState *remotestate;
+ ResponseCombiner *combiner;
+
+ /* RemoteQueryState embeds ResponseCombiner as its first member */
+ remotestate = makeNode(RemoteQueryState);
+ combiner = (ResponseCombiner *) remotestate;
+ InitResponseCombiner(combiner, 0, node->combine_type);
+ combiner->ss.ps.plan = (Plan *) node;
+ combiner->ss.ps.state = estate;
+
- true, resultslot, NULL))
++ combiner->ss.ps.qual = NULL;
+
+ combiner->request_type = REQUEST_TYPE_QUERY;
+
+ ExecInitResultTupleSlot(estate, &combiner->ss.ps);
+ ExecAssignResultTypeFromTL((PlanState *) remotestate);
+
+ /*
+ * If there are parameters supplied, get them into a form to be sent to the
+ * Datanodes with bind message. We should not have had done this before.
+ */
+ SetDataRowForExtParams(estate->es_param_list_info, remotestate);
+
+ /* We need expression context to evaluate */
+ if (node->exec_nodes && node->exec_nodes->en_expr)
+ {
+ Expr *expr = node->exec_nodes->en_expr;
+
+ if (IsA(expr, Var) && ((Var *) expr)->vartype == TIDOID)
+ {
+ /* Special case if expression does not need to be evaluated */
+ }
+ else
+ {
+ /* prepare expression evaluation */
+ ExecAssignExprContext(estate, &combiner->ss.ps);
+ }
+ }
+
+ return remotestate;
+}
+
+
+/*
+ * Execute step of PGXC plan.
+ * The step specifies a command to be executed on specified nodes.
+ * On first invocation connections to the data nodes are initialized and
+ * command is executed. Further, as well as within subsequent invocations,
+ * responses are received until step is completed or there is a tuple to emit.
+ * If there is a tuple it is returned, otherwise returned NULL. The NULL result
+ * from the function indicates completed step.
+ * The function returns at most one tuple per invocation.
+ */
+TupleTableSlot *
+ExecRemoteQuery(RemoteQueryState *node)
+{
+ ResponseCombiner *combiner = (ResponseCombiner *) node;
+ RemoteQuery *step = (RemoteQuery *) combiner->ss.ps.plan;
+ TupleTableSlot *resultslot = combiner->ss.ps.ps_ResultTupleSlot;
+
+ /* One-time setup: acquire connections and send the command */
+ if (!node->query_Done)
+ {
+ GlobalTransactionId gxid = InvalidGlobalTransactionId;
+ Snapshot snapshot = GetActiveSnapshot();
+ PGXCNodeHandle **connections = NULL;
+ PGXCNodeHandle *primaryconnection = NULL;
+ int i;
+ int regular_conn_count = 0;
+ int total_conn_count = 0;
+ bool need_tran_block;
+ PGXCNodeAllHandles *pgxc_connections;
+
+ /*
+ * Get connections for Datanodes only, utilities and DDLs
+ * are launched in ExecRemoteUtility
+ */
+ pgxc_connections = get_exec_connections(node, step->exec_nodes,
+ step->exec_type,
+ true);
+
+ if (step->exec_type == EXEC_ON_DATANODES)
+ {
+ connections = pgxc_connections->datanode_handles;
+ total_conn_count = regular_conn_count = pgxc_connections->dn_conn_count;
+ }
+ else if (step->exec_type == EXEC_ON_COORDS)
+ {
+ connections = pgxc_connections->coord_handles;
+ total_conn_count = regular_conn_count = pgxc_connections->co_conn_count;
+ }
+
+ primaryconnection = pgxc_connections->primary_handle;
+
+ /*
+ * Primary connection is counted separately but is included in total_conn_count if used.
+ */
+ if (primaryconnection)
+ regular_conn_count--;
+
+ /*
+ * We save only regular connections, at the time we exit the function
+ * we finish with the primary connection and deal only with regular
+ * connections on subsequent invocations
+ */
+ combiner->node_count = regular_conn_count;
+
+ /*
+ * Start transaction on data nodes if we are in explicit transaction
+ * or going to use extended query protocol or write to multiple nodes
+ */
+ if (step->force_autocommit)
+ need_tran_block = false;
+ else
+ need_tran_block = step->cursor ||
+ (!step->read_only && total_conn_count > 1) ||
+ (TransactionBlockStatusCode() == 'T');
+
+ stat_statement();
+ stat_transaction(total_conn_count);
+
+ gxid = GetCurrentTransactionIdIfAny();
+ /* See if we have a primary node, execute on it first before the others */
+ if (primaryconnection)
+ {
+ if (pgxc_node_begin(1, &primaryconnection, gxid, need_tran_block,
+ step->read_only, PGXC_NODE_DATANODE))
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Could not begin transaction on data node.")));
+
+ /* If explicit transaction is needed gxid is already sent */
+ if (!pgxc_start_command_on_connection(primaryconnection, node, snapshot))
+ {
+ pgxc_node_remote_abort();
+ pfree_pgxc_all_handles(pgxc_connections);
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to send command to data nodes")));
+ }
+ Assert(combiner->combine_type == COMBINE_TYPE_SAME);
+
+ pgxc_node_receive(1, &primaryconnection, NULL);
+ /* Make sure the command is completed on the primary node */
+ while (true)
+ {
+ int res = handle_response(primaryconnection, combiner);
+ if (res == RESPONSE_READY)
+ break;
+ else if (res == RESPONSE_EOF)
+ pgxc_node_receive(1, &primaryconnection, NULL);
+ else if (res == RESPONSE_COMPLETE || res == RESPONSE_ERROR)
+ /* Get ReadyForQuery */
+ continue;
+ else if (res == RESPONSE_ASSIGN_GXID)
+ continue;
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Unexpected response from data node")));
+ }
+ if (combiner->errorMessage)
+ pgxc_node_report_error(combiner);
+ }
+
+ /* Begin transaction and send the command on each regular connection */
+ for (i = 0; i < regular_conn_count; i++)
+ {
+ if (pgxc_node_begin(1, &connections[i], gxid, need_tran_block,
+ step->read_only, PGXC_NODE_DATANODE))
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Could not begin transaction on data node.")));
+
+ /* If explicit transaction is needed gxid is already sent */
+ if (!pgxc_start_command_on_connection(connections[i], node, snapshot))
+ {
+ pgxc_node_remote_abort();
+ pfree_pgxc_all_handles(pgxc_connections);
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to send command to data nodes")));
+ }
+ connections[i]->combiner = combiner;
+ }
+
+ if (step->cursor)
+ {
+ /* Remember connections so the cursor can be fetched/closed later */
+ combiner->cursor = step->cursor;
+ combiner->cursor_count = regular_conn_count;
+ combiner->cursor_connections = (PGXCNodeHandle **) palloc(regular_conn_count * sizeof(PGXCNodeHandle *));
+ memcpy(combiner->cursor_connections, connections, regular_conn_count * sizeof(PGXCNodeHandle *));
+ }
+
+ combiner->connections = connections;
+ combiner->conn_count = regular_conn_count;
+ combiner->current_conn = 0;
+
+ if (combiner->cursor_count)
+ {
+ combiner->conn_count = combiner->cursor_count;
+ memcpy(connections, combiner->cursor_connections,
+ combiner->cursor_count * sizeof(PGXCNodeHandle *));
+ combiner->connections = connections;
+ }
+
+ node->query_Done = true;
+
+ if (step->sort)
+ {
+ SimpleSort *sort = step->sort;
+
+ /*
+ * First message is already in the buffer
+ * Further fetch will be under tuplesort control
+ * If query does not produce rows tuplesort will not
+ * be initialized
+ */
+ combiner->tuplesortstate = tuplesort_begin_merge(
+ resultslot->tts_tupleDescriptor,
+ sort->numCols,
+ sort->sortColIdx,
+ sort->sortOperators,
+ sort->sortCollations,
+ sort->nullsFirst,
+ combiner,
+ work_mem);
+ }
+ }
+
+ /* Emit next tuple: via tuplesort merge if sorting, else straight fetch */
+ if (combiner->tuplesortstate)
+ {
+ if (tuplesort_gettupleslot((Tuplesortstate *) combiner->tuplesortstate,
- combiner->ss.ps.qual = NIL;
++ true, true, resultslot, NULL))
+ return resultslot;
+ else
+ ExecClearTuple(resultslot);
+ }
+ else
+ {
+ TupleTableSlot *slot = FetchTuple(combiner);
+ if (!TupIsNull(slot))
+ return slot;
+ }
+
+ if (combiner->errorMessage)
+ pgxc_node_report_error(combiner);
+
+ return NULL;
+}
+
+
+/*
+ * Clean up and discard any data on the data node connections that might not
+ * handled yet, including pending on the remote connection.
+ *
+ * Frees the combiner's row buffer, drains every remaining connection, and
+ * releases tuplesort resources if a merge sort was in progress.
+ */
+static void
+pgxc_connections_cleanup(ResponseCombiner *combiner)
+{
+ /* clean up the buffer */
+ list_free_deep(combiner->rowBuffer);
+ combiner->rowBuffer = NIL;
+
+ /*
+ * Read in and discard remaining data from the connections, if any
+ */
+ combiner->current_conn = 0;
+ while (combiner->conn_count > 0)
+ {
+ int res;
+ PGXCNodeHandle *conn = combiner->connections[combiner->current_conn];
+
+ /*
+ * Possible if we are doing merge sort.
+ * We can do usual procedure and move connections around since we are
+ * cleaning up and do not care what connection at what position
+ */
+ if (conn == NULL)
+ {
+ REMOVE_CURR_CONN(combiner);
+ continue;
+ }
+
+ /* throw away current message that may be in the buffer */
+ if (combiner->currentRow)
+ {
+ pfree(combiner->currentRow);
+ combiner->currentRow = NULL;
+ }
+
+ /* no data is expected */
+ if (conn->state == DN_CONNECTION_STATE_IDLE ||
+ conn->state == DN_CONNECTION_STATE_ERROR_FATAL)
+ {
+ REMOVE_CURR_CONN(combiner);
+ continue;
+ }
+
+ /*
+ * Connection owner is different, so none of our data is pending at
+ * the connection, nothing to read in.
+ */
+ if (conn->combiner && conn->combiner != combiner)
+ {
+ REMOVE_CURR_CONN(combiner);
+ continue;
+ }
+
+ res = handle_response(conn, combiner);
+ if (res == RESPONSE_EOF)
+ {
+ /* Bounded wait so a dead node cannot hang the cleanup forever */
+ struct timeval timeout;
+ timeout.tv_sec = END_QUERY_TIMEOUT / 1000;
+ timeout.tv_usec = (END_QUERY_TIMEOUT % 1000) * 1000;
+
+ if (pgxc_node_receive(1, &conn, &timeout))
+ elog(LOG, "Failed to read response from data nodes when ending query");
+ }
+ }
+
+ /*
+ * Release tuplesort resources
+ */
+ if (combiner->tuplesortstate)
+ {
+ /*
+ * Free these before tuplesort_end, because these arrays may appear
+ * in the tuplesort's memory context, tuplesort_end deletes this
+ * context and may invalidate the memory.
+ * We still want to free them here, because these may be in different
+ * context.
+ */
+ if (combiner->tapenodes)
+ {
+ pfree(combiner->tapenodes);
+ combiner->tapenodes = NULL;
+ }
+ if (combiner->tapemarks)
+ {
+ pfree(combiner->tapemarks);
+ combiner->tapemarks = NULL;
+ }
+ /*
+ * tuplesort_end invalidates minimal tuple if it is in the slot because
+ * deletes the TupleSort memory context, causing seg fault later when
+ * releasing tuple table
+ */
+ ExecClearTuple(combiner->ss.ps.ps_ResultTupleSlot);
+ tuplesort_end((Tuplesortstate *) combiner->tuplesortstate);
+ combiner->tuplesortstate = NULL;
+ }
+}
+
+
+/*
+ * End the remote query
+ *
+ * Drains/cleans remote connections, releases bound-parameter buffers (the
+ * plan may be re-executed with different params), and frees the node state.
+ */
+void
+ExecEndRemoteQuery(RemoteQueryState *node)
+{
+ ResponseCombiner *combiner = (ResponseCombiner *) node;
+
+ /*
+ * Clean up remote connections
+ */
+ pgxc_connections_cleanup(combiner);
+
+ /*
+ * Clean up parameters if they were set, since plan may be reused
+ */
+ if (node->paramval_data)
+ {
+ pfree(node->paramval_data);
+ node->paramval_data = NULL;
+ node->paramval_len = 0;
+ }
+
+ CloseCombiner(combiner);
+ pfree(node);
+}
+
+
+/**********************************************
+ *
+ * Routines to support RemoteSubplan plan node
+ *
+ **********************************************/
+
+
+/*
+ * The routine walks recursively over the plan tree and changes cursor names of
+ * RemoteSubplan nodes to make them different from launched from the other
+ * datanodes. The routine changes cursor names in place, so caller should
+ * take writable copy of the plan tree.
+ *
+ * plan   - root of the (sub)tree to transform; may be a Plan node or a List
+ * unique - per-datanode discriminator stored into each RemoteSubplan
+ */
+void
+RemoteSubplanMakeUnique(Node *plan, int unique)
+{
+ if (plan == NULL)
+ return;
+
+ /* A List is handled by recursing into each member */
+ if (IsA(plan, List))
+ {
+ ListCell *lc;
+ foreach(lc, (List *) plan)
+ {
+ RemoteSubplanMakeUnique(lfirst(lc), unique);
+ }
+ return;
+ }
+
+ /*
+ * Transform SharedQueue name
+ */
+ if (IsA(plan, RemoteSubplan))
+ {
+ ((RemoteSubplan *)plan)->unique = unique;
+ }
+ /* Otherwise it is a Plan descendant */
+ RemoteSubplanMakeUnique((Node *) ((Plan *) plan)->lefttree, unique);
+ RemoteSubplanMakeUnique((Node *) ((Plan *) plan)->righttree, unique);
+ /* Transform special cases that hold child plans outside lefttree/righttree */
+ switch (nodeTag(plan))
+ {
+ case T_Append:
+ RemoteSubplanMakeUnique((Node *) ((Append *) plan)->appendplans,
+ unique);
+ break;
+ case T_MergeAppend:
+ RemoteSubplanMakeUnique((Node *) ((MergeAppend *) plan)->mergeplans,
+ unique);
+ break;
+ case T_BitmapAnd:
+ RemoteSubplanMakeUnique((Node *) ((BitmapAnd *) plan)->bitmapplans,
+ unique);
+ break;
+ case T_BitmapOr:
+ RemoteSubplanMakeUnique((Node *) ((BitmapOr *) plan)->bitmapplans,
+ unique);
+ break;
+ case T_SubqueryScan:
+ RemoteSubplanMakeUnique((Node *) ((SubqueryScan *) plan)->subplan,
+ unique);
+ break;
+ default:
+ break;
+ }
+}
+
+/*
+ * Walker context for determine_param_types(): rparams is the array of
+ * RemoteParam descriptors being filled in; defineParams is the set of
+ * PARAM_EXEC param ids whose types are still unknown.
+ */
+struct find_params_context
+{
+ RemoteParam *rparams;
+ Bitmapset *defineParams;
+};
+
+/*
+ * Expression-tree walker: when a PARAM_EXEC Param still listed in
+ * context->defineParams is found, copy its type into the matching
+ * RemoteParam entry and remove it from the pending set.
+ * Returns true (stopping the walk) once every pending param is resolved.
+ */
+static bool
+determine_param_types_walker(Node *node, struct find_params_context *context)
+{
+ if (node == NULL)
+ return false;
+
+ if (IsA(node, Param))
+ {
+ Param *param = (Param *) node;
+ int paramno = param->paramid;
+
+ if (param->paramkind == PARAM_EXEC &&
+ bms_is_member(paramno, context->defineParams))
+ {
+ /* Linear scan; assumes a matching entry always exists in rparams */
+ RemoteParam *cur = context->rparams;
+ while (cur->paramkind != PARAM_EXEC || cur->paramid != paramno)
+ cur++;
+ cur->paramtype = param->paramtype;
+ context->defineParams = bms_del_member(context->defineParams,
+ paramno);
+ return bms_is_empty(context->defineParams);
+ }
+ }
+ return expression_tree_walker(node, determine_param_types_walker,
+ (void *) context);
+
+}
+
+/*
+ * Scan expressions in the plan tree to find Param nodes and get data types
+ * from them
+ */
+static bool
+determine_param_types(Plan *plan, struct find_params_context *context)
+{
+ Bitmapset *intersect;
+
+ if (plan == NULL)
+ return false;
+
+ intersect = bms_intersect(plan->allParam, context->defineParams);
+ if (bms_is_empty(intersect))
+ {
+ /* the subplan does not depend on params we are interested in */
+ bms_free(intersect);
+ return false;
+ }
+ bms_free(intersect);
+
+ /* scan target list */
+ if (expression_tree_walker((Node *) plan->targetlist,
+ determine_param_types_walker,
+ (void *) context))
+ return true;
+ /* scan qual */
+ if (expression_tree_walker((Node *) plan->qual,
+ determine_param_types_walker,
+ (void *) context))
+ return true;
+
+ /* Check additional node-type-specific fields */
+ switch (nodeTag(plan))
+ {
+ case T_Result:
+ if (expression_tree_walker((Node *) ((Result *) plan)->resconstantqual,
+ determine_param_types_walker,
+ (void *) context))
+ return true;
+ break;
+
+ case T_SeqScan:
+ case T_SampleScan:
+ case T_CteScan:
+ break;
+
+ case T_IndexScan:
+ if (expression_tree_walker((Node *) ((IndexScan *) plan)->indexqual,
+ determine_param_types_walker,
+ (void *) context))
+ return true;
+ break;
+
+ case T_IndexOnlyScan:
+ if (expression_tree_walker((Node *) ((IndexOnlyScan *) plan)->indexqual,
+ determine_param_types_walker,
+ (void *) context))
+ return true;
+ break;
+
+ case T_BitmapIndexScan:
+ if (expression_tree_walker((Node *) ((BitmapIndexScan *) plan)->indexqual,
+ determine_param_types_walker,
+ (void *) context))
+ return true;
+ break;
+
+ case T_BitmapHeapScan:
+ if (expression_tree_walker((Node *) ((BitmapHeapScan *) plan)->bitmapqualorig,
+ determine_param_types_walker,
+ (void *) context))
+ return true;
+ break;
+
+ case T_TidScan:
+ if (expression_tree_walker((Node *) ((TidScan *) plan)->tidquals,
+ determine_param_types_walker,
+ (void *) context))
+ return true;
+ break;
+
+ case T_SubqueryScan:
+ if (determine_param_types(((SubqueryScan *) plan)->subplan, context))
+ return true;
+ break;
+
+ case T_FunctionScan:
+ if (expression_tree_walker((Node *) ((FunctionScan *) plan)->functions,
+ determine_param_types_walker,
+ (void *) context))
+ return true;
+ break;
+
+ case T_ValuesScan:
+ if (expression_tree_walker((Node *) ((ValuesScan *) plan)->values_lists,
+ determine_param_types_walker,
+ (void *) context))
+ return true;
+ break;
+
+ case T_ModifyTable:
+ {
+ ListCell *l;
+
+ foreach(l, ((ModifyTable *) plan)->plans)
+ {
+ if (determine_param_types((Plan *) lfirst(l), context))
+ return true;
+ }
+ }
+ break;
+
+ case T_RemoteSubplan:
+ break;
+
+ case T_Append:
+ {
+ ListCell *l;
+
+ foreach(l, ((Append *) plan)->appendplans)
+ {
+ if (determine_param_types((Plan *) lfirst(l), context))
+ return true;
+ }
+ }
+ break;
+
+ case T_MergeAppend:
+ {
+ ListCell *l;
+
+ foreach(l, ((MergeAppend *) plan)->mergeplans)
+ {
+ if (determine_param_types((Plan *) lfirst(l), context))
+ return true;
+ }
+ }
+ break;
+
+ case T_BitmapAnd:
+ {
+ ListCell *l;
+
+ foreach(l, ((BitmapAnd *) plan)->bitmapplans)
+ {
+ if (determine_param_types((Plan *) lfirst(l), context))
+ return true;
+ }
+ }
+ break;
+
+ case T_BitmapOr:
+ {
+ ListCell *l;
+
+ foreach(l, ((BitmapOr *) plan)->bitmapplans)
+ {
+ if (determine_param_types((Plan *) lfirst(l), context))
+ return true;
+ }
+ }
+ break;
+
+ case T_NestLoop:
+ if (expression_tree_walker((Node *) ((Join *) plan)->joinqual,
+ determine_param_types_walker,
+ (void *) context))
+ return true;
+ break;
+
+ case T_MergeJoin:
+ if (expression_tree_walker((Node *) ((Join *) plan)->joinqual,
+ determine_param_types_walker,
+ (void *) context))
+ return true;
+ if (expression_tree_walker((Node *) ((MergeJoin *) plan)->mergeclauses,
+ determine_param_types_walker,
+ (void *) context))
+ return true;
+ break;
+
+ case T_HashJoin:
+ if (expression_tree_walker((Node *) ((Join *) plan)->joinqual,
+ determine_param_types_walker,
+ (void *) context))
+ return true;
+ if (expression_tree_walker((Node *) ((HashJoin *) plan)->hashclauses,
+ determine_param_types_walker,
+ (void *) context))
+ return true;
+ break;
+
+ case T_Limit:
+ if (expression_tree_walker((Node *) ((Limit *) plan)->limitOffset,
+ determine_param_types_walker,
+ (void *) context))
+ return true;
+ if (expression_tree_walker((Node *) ((Limit *) plan)->limitCount,
+ determine_param_types_walker,
+ (void *) context))
+ return true;
+ break;
+
+ case T_RecursiveUnion:
+ break;
+
+ case T_LockRows:
+ break;
+
+ case T_WindowAgg:
+ if (expression_tree_walker((Node *) ((WindowAgg *) plan)->startOffset,
+ determine_param_types_walker,
+ (void *) context))
+ if (expression_tree_walker((Node *) ((WindowAgg *) plan)->endOffset,
+ determine_param_types_walker,
+ (void *) context))
+ break;
+
+ case T_Hash:
+ case T_Agg:
+ case T_Material:
+ case T_Sort:
+ case T_Unique:
+ case T_SetOp:
+ case T_Group:
+ break;
+
+ default:
+ elog(ERROR, "unrecognized node type: %d",
+ (int) nodeTag(plan));
+ }
+
+
+ /* recurse into subplans */
+ return determine_param_types(plan->lefttree, context) ||
+ determine_param_types(plan->righttree, context);
+}
+
+
+RemoteSubplanState *
+ExecInitRemoteSubplan(RemoteSubplan *node, EState *estate, int eflags)
+{
+ RemoteStmt rstmt;
+ RemoteSubplanState *remotestate;
+ ResponseCombiner *combiner;
+ CombineType combineType;
+ struct rusage start_r;
+ struct timeval start_t;
+
+ if (log_remotesubplan_stats)
+ ResetUsageCommon(&start_r, &start_t);
+
+ remotestate = makeNode(RemoteSubplanState);
+ combiner = (ResponseCombiner *) remotestate;
+ /*
+ * We do not need to combine row counts if we will receive intermediate
+ * results or if we won't return row count.
+ */
+ if (IS_PGXC_DATANODE || estate->es_plannedstmt->commandType == CMD_SELECT)
+ {
+ combineType = COMBINE_TYPE_NONE;
+ remotestate->execOnAll = node->execOnAll;
+ }
+ else
+ {
+ if (node->execOnAll)
+ combineType = COMBINE_TYPE_SUM;
+ else
+ combineType = COMBINE_TYPE_SAME;
+ /*
+ * If we are updating replicated table we should run plan on all nodes.
+ * We are choosing single node only to read
+ */
+ remotestate->execOnAll = true;
+ }
+ remotestate->execNodes = list_copy(node->nodeList);
+ InitResponseCombiner(combiner, 0, combineType);
+ combiner->ss.ps.plan = (Plan *) node;
+ combiner->ss.ps.state = estate;
+
- true, resultslot, NULL))
++ combiner->ss.ps.qual = NULL;
+
+ combiner->request_type = REQUEST_TYPE_QUERY;
+
+ ExecInitResultTupleSlot(estate, &combiner->ss.ps);
+ ExecAssignResultTypeFromTL((PlanState *) remotestate);
+
+ /*
+ * We optimize execution if we going to send down query to next level
+ */
+ remotestate->local_exec = false;
+ if (IS_PGXC_DATANODE)
+ {
+ if (remotestate->execNodes == NIL)
+ {
+ /*
+ * Special case, if subplan is not distributed, like Result, or
+ * query against catalog tables only.
+ * We are only interested in filtering out the subplan results and
+ * get only those we are interested in.
+ * XXX we may want to prevent multiple executions in this case
+ * either, to achieve this we will set single execNode on planning
+ * time and this case would never happen, this code branch could
+ * be removed.
+ */
+ remotestate->local_exec = true;
+ }
+ else if (!remotestate->execOnAll)
+ {
+ /*
+ * XXX We should change planner and remove this flag.
+ * We want only one node is producing the replicated result set,
+ * and planner should choose that node - it is too hard to determine
+ * right node at execution time, because it should be guaranteed
+ * that all consumers make the same decision.
+ * For now always execute replicated plan on local node to save
+ * resources.
+ */
+
+ /*
+ * Make sure local node is in execution list
+ */
+ if (list_member_int(remotestate->execNodes, PGXCNodeId-1))
+ {
+ list_free(remotestate->execNodes);
+ remotestate->execNodes = NIL;
+ remotestate->local_exec = true;
+ }
+ else
+ {
+ /*
+ * To support, we need to connect to some producer, so
+ * each producer should be prepared to serve rows for random
+ * number of consumers. It is hard, because new consumer may
+ * connect after producing is started, on the other hand,
+ * absence of expected consumer is a problem too.
+ */
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("Getting replicated results from remote node is not supported")));
+ }
+ }
+ }
+
+ /*
+ * If we are going to execute subplan locally or doing explain initialize
+ * the subplan. Otherwise have remote node doing that.
+ */
+ if (remotestate->local_exec || (eflags & EXEC_FLAG_EXPLAIN_ONLY))
+ {
+ outerPlanState(remotestate) = ExecInitNode(outerPlan(node), estate,
+ eflags);
+ if (node->distributionNodes)
+ {
+ Oid distributionType = InvalidOid;
+ TupleDesc typeInfo;
+
+ typeInfo = combiner->ss.ps.ps_ResultTupleSlot->tts_tupleDescriptor;
+ if (node->distributionKey != InvalidAttrNumber)
+ {
+ Form_pg_attribute attr;
+ attr = typeInfo->attrs[node->distributionKey - 1];
+ distributionType = attr->atttypid;
+ }
+ /* Set up locator */
+ remotestate->locator = createLocator(node->distributionType,
+ RELATION_ACCESS_INSERT,
+ distributionType,
+ LOCATOR_LIST_LIST,
+ 0,
+ (void *) node->distributionNodes,
+ (void **) &remotestate->dest_nodes,
+ false);
+ }
+ else
+ remotestate->locator = NULL;
+ }
+
+ /*
+ * Encode subplan if it will be sent to remote nodes
+ */
+ if (remotestate->execNodes && !(eflags & EXEC_FLAG_EXPLAIN_ONLY))
+ {
+ ParamListInfo ext_params;
+ /* Encode plan if we are going to execute it on other nodes */
+ rstmt.type = T_RemoteStmt;
+ if (node->distributionType == LOCATOR_TYPE_NONE && IS_PGXC_DATANODE)
+ {
+ /*
+ * There are cases when planner can not determine distribution of a
+ * subplan, in particular it does not determine distribution of
+ * subquery nodes. Such subplans executed from current location
+ * (node) and combine all results, like from coordinator nodes.
+ * However, if there are multiple locations where distributed
+ * executor is running this node, and there are more of
+ * RemoteSubplan plan nodes in the subtree there will be a problem -
+ * Instances of the inner RemoteSubplan nodes will be using the same
+ * SharedQueue, causing error. To avoid this problem we should
+ * traverse the subtree and change SharedQueue name to make it
+ * unique.
+ */
+ RemoteSubplanMakeUnique((Node *) outerPlan(node), PGXCNodeId);
+ }
+ rstmt.planTree = outerPlan(node);
+ /*
+ * If datanode launch further execution of a command it should tell
+ * it is a SELECT, otherwise secondary data nodes won't return tuples
+ * expecting there will be nothing to return.
+ */
+ if (IsA(outerPlan(node), ModifyTable))
+ {
+ rstmt.commandType = estate->es_plannedstmt->commandType;
+ rstmt.hasReturning = estate->es_plannedstmt->hasReturning;
+ rstmt.resultRelations = estate->es_plannedstmt->resultRelations;
+ }
+ else
+ {
+ rstmt.commandType = CMD_SELECT;
+ rstmt.hasReturning = false;
+ rstmt.resultRelations = NIL;
+ }
+ rstmt.rtable = estate->es_range_table;
+ rstmt.subplans = estate->es_plannedstmt->subplans;
+ rstmt.nParamExec = estate->es_plannedstmt->nParamExec;
+ ext_params = estate->es_param_list_info;
+ rstmt.nParamRemote = (ext_params ? ext_params->numParams : 0) +
+ bms_num_members(node->scan.plan.allParam);
+ if (rstmt.nParamRemote > 0)
+ {
+ Bitmapset *tmpset;
+ int i;
+ int paramno;
+
+ /* Allocate enough space */
+ rstmt.remoteparams = (RemoteParam *) palloc(rstmt.nParamRemote *
+ sizeof(RemoteParam));
+ paramno = 0;
+ if (ext_params)
+ {
+ for (i = 0; i < ext_params->numParams; i++)
+ {
+ ParamExternData *param = &ext_params->params[i];
+ /*
+ * If parameter type is not yet defined but can be defined
+ * do that
+ */
+ if (!OidIsValid(param->ptype) && ext_params->paramFetch)
+ (*ext_params->paramFetch) (ext_params, i + 1);
+
+ /*
+ * If the parameter type is still not defined, assume that
+ * it is unused. But we put a default INT4OID type for such
+ * unused parameters to keep the parameter pushdown code
+ * happy.
+ *
+ * These unused parameters are never accessed during
+ * execution and we will just a null value for these
+ * "dummy" parameters. But including them here ensures that
+ * we send down the parameters in the correct order and at
+ * the position that the datanode needs
+ */
+ if (OidIsValid(param->ptype))
+ {
+ rstmt.remoteparams[paramno].paramused = 1;
+ rstmt.remoteparams[paramno].paramtype = param->ptype;
+ }
+ else
+ {
+ rstmt.remoteparams[paramno].paramused = 0;
+ rstmt.remoteparams[paramno].paramtype = INT4OID;
+ }
+
+ rstmt.remoteparams[paramno].paramkind = PARAM_EXTERN;
+ rstmt.remoteparams[paramno].paramid = i + 1;
+ paramno++;
+ }
+ /* store actual number of parameters */
+ rstmt.nParamRemote = paramno;
+ }
+
+ if (!bms_is_empty(node->scan.plan.allParam))
+ {
+ Bitmapset *defineParams = NULL;
+ tmpset = bms_copy(node->scan.plan.allParam);
+ while ((i = bms_first_member(tmpset)) >= 0)
+ {
+ ParamExecData *prmdata;
+
+ prmdata = &(estate->es_param_exec_vals[i]);
+ rstmt.remoteparams[paramno].paramkind = PARAM_EXEC;
+ rstmt.remoteparams[paramno].paramid = i;
+ rstmt.remoteparams[paramno].paramtype = prmdata->ptype;
+ rstmt.remoteparams[paramno].paramused = 1;
+ /* Will scan plan tree to find out data type of the param */
+ if (prmdata->ptype == InvalidOid)
+ defineParams = bms_add_member(defineParams, i);
+ paramno++;
+ }
+ /* store actual number of parameters */
+ rstmt.nParamRemote = paramno;
+ bms_free(tmpset);
+ if (!bms_is_empty(defineParams))
+ {
+ struct find_params_context context;
+ bool all_found;
+
+ context.rparams = rstmt.remoteparams;
+ context.defineParams = defineParams;
+
+ all_found = determine_param_types(node->scan.plan.lefttree,
+ &context);
+ /*
+ * Remove not defined params from the list of remote params.
+ * If they are not referenced no need to send them down
+ */
+ if (!all_found)
+ {
+ for (i = 0; i < rstmt.nParamRemote; i++)
+ {
+ if (rstmt.remoteparams[i].paramkind == PARAM_EXEC &&
+ bms_is_member(rstmt.remoteparams[i].paramid,
+ context.defineParams))
+ {
+ /* Copy last parameter inplace */
+ rstmt.nParamRemote--;
+ if (i < rstmt.nParamRemote)
+ rstmt.remoteparams[i] =
+ rstmt.remoteparams[rstmt.nParamRemote];
+ /* keep current in the same position */
+ i--;
+ }
+ }
+ }
+ bms_free(context.defineParams);
+ }
+ }
+ remotestate->nParamRemote = rstmt.nParamRemote;
+ remotestate->remoteparams = rstmt.remoteparams;
+ }
+ else
+ rstmt.remoteparams = NULL;
+ rstmt.rowMarks = estate->es_plannedstmt->rowMarks;
+ rstmt.distributionKey = node->distributionKey;
+ rstmt.distributionType = node->distributionType;
+ rstmt.distributionNodes = node->distributionNodes;
+ rstmt.distributionRestrict = node->distributionRestrict;
+
+ set_portable_output(true);
+ remotestate->subplanstr = nodeToString(&rstmt);
+ set_portable_output(false);
+
+ /*
+ * Connect to remote nodes and send down subplan
+ */
+ if (!(eflags & EXEC_FLAG_SUBPLAN))
+ ExecFinishInitRemoteSubplan(remotestate);
+ }
+ remotestate->bound = false;
+ /*
+ * It does not makes sense to merge sort if there is only one tuple source.
+ * By the contract it is already sorted
+ */
+ if (node->sort && remotestate->execOnAll &&
+ list_length(remotestate->execNodes) > 1)
+ combiner->merge_sort = true;
+
+ if (log_remotesubplan_stats)
+ ShowUsageCommon("ExecInitRemoteSubplan", &start_r, &start_t);
+
+ return remotestate;
+}
+
+
/*
 * ExecFinishInitRemoteSubplan
 *		Second phase of RemoteSubplan initialization: acquire Datanode
 *		connections and send down the serialized subplan so it is stored
 *		there as a prepared statement under the plan's cursor name.
 *
 * Does nothing if connections are already set up, if there is no encoded
 * subplan (local-only or EXPLAIN-only execution), or if this node is not
 * in the plan's distributionRestrict list and hence expects no results.
 *
 * NOTE(review): errors during the send loop reset conn_count and free the
 * connection array before reporting, presumably to keep the combiner from
 * touching half-initialized connections during cleanup — confirm against
 * ValidateAndCloseCombiner.
 */
void
ExecFinishInitRemoteSubplan(RemoteSubplanState *node)
{
	ResponseCombiner *combiner = (ResponseCombiner *) node;
	RemoteSubplan *plan = (RemoteSubplan *) combiner->ss.ps.plan;
	EState	   *estate = combiner->ss.ps.state;
	Oid		   *paramtypes = NULL;
	GlobalTransactionId gxid = InvalidGlobalTransactionId;
	Snapshot	snapshot;
	TimestampTz timestamp;
	int			i;
	bool		is_read_only;
	char		cursor[NAMEDATALEN];

	/*
	 * Name is required to store plan as a statement
	 */
	Assert(plan->cursor);

	/* Append the uniquifying suffix if the plan was made unique */
	if (plan->unique)
		snprintf(cursor, NAMEDATALEN, "%s_%d", plan->cursor, plan->unique);
	else
		strncpy(cursor, plan->cursor, NAMEDATALEN);

	/* If it is already fully initialized nothing to do */
	if (combiner->connections)
		return;

	/* local only or explain only execution */
	if (node->subplanstr == NULL)
		return;

	/*
	 * Check if any results are planned to be received here.
	 * Otherwise it does not make sense to send out the subplan.
	 */
	if (IS_PGXC_DATANODE && plan->distributionRestrict &&
			!list_member_int(plan->distributionRestrict, PGXCNodeId - 1))
		return;

	/*
	 * Acquire connections and send down subplan where it will be stored
	 * as a prepared statement.
	 * That does not require transaction id or snapshot, so does not send them
	 * here, postpone till bind.
	 */
	if (node->execOnAll)
	{
		/* Distributed plan: need a connection to every execution node */
		PGXCNodeAllHandles *pgxc_connections;

		pgxc_connections = get_handles(node->execNodes, NIL, false, true);
		combiner->conn_count = pgxc_connections->dn_conn_count;
		combiner->connections = pgxc_connections->datanode_handles;
		combiner->current_conn = 0;
		pfree(pgxc_connections);
	}
	else
	{
		/* Replicated plan: any single node will do */
		combiner->connections = (PGXCNodeHandle **) palloc(sizeof(PGXCNodeHandle *));
		combiner->connections[0] = get_any_handle(node->execNodes);
		combiner->conn_count = 1;
		combiner->current_conn = 0;
	}

	gxid = GetCurrentTransactionIdIfAny();

	/* extract parameter data types */
	if (node->nParamRemote > 0)
	{
		paramtypes = (Oid *) palloc(node->nParamRemote * sizeof(Oid));
		for (i = 0; i < node->nParamRemote; i++)
			paramtypes[i] = node->remoteparams[i].paramtype;
	}

	/* send down subplan */
	snapshot = GetActiveSnapshot();
	timestamp = GetCurrentGTMStartTimestamp();

	/*
	 * Datanode should not send down statements that may modify
	 * the database. Postgres assumes that all sessions under the same
	 * postmaster have different xids. That may cause a locking problem.
	 * Shared locks acquired for reading still work fine.
	 */
	is_read_only = IS_PGXC_DATANODE ||
			!IsA(outerPlan(plan), ModifyTable);

	/*
	 * For each connection: begin the transaction, send the GTM timestamp,
	 * snapshot and command id, then the plan itself, and flush.
	 */
	for (i = 0; i < combiner->conn_count; i++)
	{
		PGXCNodeHandle *connection = combiner->connections[i];

		if (pgxc_node_begin(1, &connection, gxid, true,
							is_read_only, PGXC_NODE_DATANODE))
			ereport(ERROR,
					(errcode(ERRCODE_INTERNAL_ERROR),
					 errmsg("Could not begin transaction on data node.")));

		if (pgxc_node_send_timestamp(connection, timestamp))
		{
			combiner->conn_count = 0;
			pfree(combiner->connections);
			ereport(ERROR,
					(errcode(ERRCODE_INTERNAL_ERROR),
					 errmsg("Failed to send command to data nodes")));
		}
		if (snapshot && pgxc_node_send_snapshot(connection, snapshot))
		{
			combiner->conn_count = 0;
			pfree(combiner->connections);
			ereport(ERROR,
					(errcode(ERRCODE_INTERNAL_ERROR),
					 errmsg("Failed to send snapshot to data nodes")));
		}
		if (pgxc_node_send_cmd_id(connection, estate->es_snapshot->curcid) < 0 )
		{
			combiner->conn_count = 0;
			pfree(combiner->connections);
			ereport(ERROR,
					(errcode(ERRCODE_INTERNAL_ERROR),
					 errmsg("Failed to send command ID to data nodes")));
		}
		pgxc_node_send_plan(connection, cursor, "Remote Subplan",
							node->subplanstr, node->nParamRemote, paramtypes);
		if (pgxc_node_flush(connection))
		{
			combiner->conn_count = 0;
			pfree(combiner->connections);
			ereport(ERROR,
					(errcode(ERRCODE_INTERNAL_ERROR),
					 errmsg("Failed to send subplan to data nodes")));
		}
	}
}
+
+
+static void
+append_param_data(StringInfo buf, Oid ptype, int pused, Datum value, bool isnull)
+{
+ uint32 n32;
+
+ /* Assume unused parameters to have null values */
+ if (!pused)
+ ptype = INT4OID;
+
+ if (isnull)
+ {
+ n32 = htonl(-1);
+ appendBinaryStringInfo(buf, (char *) &n32, 4);
+ }
+ else
+ {
+ Oid typOutput;
+ bool typIsVarlena;
+ Datum pval;
+ char *pstring;
+ int len;
+
+ /* Get info needed to output the value */
+ getTypeOutputInfo(ptype, &typOutput, &typIsVarlena);
+
+ /*
+ * If we have a toasted datum, forcibly detoast it here to avoid
+ * memory leakage inside the type's output routine.
+ */
+ if (typIsVarlena)
+ pval = PointerGetDatum(PG_DETOAST_DATUM(value));
+ else
+ pval = value;
+
+ /* Convert Datum to string */
+ pstring = OidOutputFunctionCall(typOutput, pval);
+
+ /* copy data to the buffer */
+ len = strlen(pstring);
+ n32 = htonl(len);
+ appendBinaryStringInfo(buf, (char *) &n32, 4);
+ appendBinaryStringInfo(buf, pstring, len);
+ }
+}
+
+
/*
 * encode_parameters
 *		Pack 'nparams' parameter values into a wire-format buffer: a 16-bit
 *		network-order parameter count followed by one length-prefixed text
 *		value per parameter (see append_param_data).
 *
 * The encoded buffer is palloc'd in planstate's per-tuple expression
 * context (which is reset first) and returned via *result; the return
 * value is the buffer length in bytes.  PARAM_EXEC parameters backed by an
 * unevaluated init-plan are evaluated here via ExecSetParamPlan.
 */
static int encode_parameters(int nparams, RemoteParam *remoteparams,
							 PlanState *planstate, char** result)
{
	EState	   *estate = planstate->state;
	StringInfoData buf;
	uint16		n16;
	int			i;
	ExprContext *econtext;
	MemoryContext oldcontext;

	/* Make sure there is an expression context to evaluate parameters in */
	if (planstate->ps_ExprContext == NULL)
		ExecAssignExprContext(estate, planstate);

	econtext = planstate->ps_ExprContext;
	oldcontext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory);
	MemoryContextReset(econtext->ecxt_per_tuple_memory);

	initStringInfo(&buf);

	/* Number of parameter values */
	n16 = htons(nparams);
	appendBinaryStringInfo(&buf, (char *) &n16, 2);

	/* Parameter values */
	for (i = 0; i < nparams; i++)
	{
		RemoteParam *rparam = &remoteparams[i];
		int			ptype = rparam->paramtype;
		int			pused = rparam->paramused;

		if (rparam->paramkind == PARAM_EXTERN)
		{
			/* Externally supplied parameter: paramid is 1-based */
			ParamExternData *param;
			param = &(estate->es_param_list_info->params[rparam->paramid - 1]);
			append_param_data(&buf, ptype, pused, param->value, param->isnull);
		}
		else
		{
			/* Executor-internal parameter: paramid indexes es_param_exec_vals */
			ParamExecData *param;
			param = &(estate->es_param_exec_vals[rparam->paramid]);
			if (param->execPlan)
			{
				/* Parameter not evaluated yet, so go do it */
				ExecSetParamPlan((SubPlanState *) param->execPlan,
								 planstate->ps_ExprContext);
				/* ExecSetParamPlan should have processed this param... */
				Assert(param->execPlan == NULL);
			}
			/* A parameter never marked 'done' is sent as NULL */
			if (!param->done)
				param->isnull = true;
			append_param_data(&buf, ptype, pused, param->value, param->isnull);
		}
	}

	/* Take data from the buffer */
	*result = palloc(buf.len);
	memcpy(*result, buf.data, buf.len);
	MemoryContextSwitchTo(oldcontext);
	return buf.len;
}
+
+
+TupleTableSlot *
+ExecRemoteSubplan(RemoteSubplanState *node)
+{
+ ResponseCombiner *combiner = (ResponseCombiner *) node;
+ RemoteSubplan *plan = (RemoteSubplan *) combiner->ss.ps.plan;
+ EState *estate = combiner->ss.ps.state;
+ TupleTableSlot *resultslot = combiner->ss.ps.ps_ResultTupleSlot;
+ struct rusage start_r;
+ struct timeval start_t;
+
+ /*
+ * We allow combiner->conn_count == 0 after node initialization
+ * if we figured out that current node won't receive any result
+ * because of distributionRestrict is set by planner.
+ * But we should distinguish this case from others, when conn_count is 0.
+ * That is possible if local execution is chosen or data are buffered
+ * at the coordinator or data are exhausted and node was reset.
+ * in last two cases connections are saved to cursor_connections and we
+ * can check their presence.
+ */
+ if (!node->local_exec && combiner->conn_count == 0 &&
+ combiner->cursor_count == 0)
+ return NULL;
+
+ if (log_remotesubplan_stats)
+ ResetUsageCommon(&start_r, &start_t);
+
+primary_mode_phase_two:
+ if (!node->bound)
+ {
+ int fetch = 0;
+ int paramlen = 0;
+ char *paramdata = NULL;
+ /*
+ * Conditions when we want to execute query on the primary node first:
+ * Coordinator running replicated ModifyTable on multiple nodes
+ */
+ bool primary_mode = combiner->probing_primary ||
+ (IS_PGXC_COORDINATOR &&
+ combiner->combine_type == COMBINE_TYPE_SAME &&
+ OidIsValid(primary_data_node) &&
+ combiner->conn_count > 1);
+ char cursor[NAMEDATALEN];
+
+ if (plan->cursor)
+ {
+ fetch = PGXLRemoteFetchSize;
+ if (plan->unique)
+ snprintf(cursor, NAMEDATALEN, "%s_%d", plan->cursor, plan->unique);
+ else
+ strncpy(cursor, plan->cursor, NAMEDATALEN);
+ }
+ else
+ cursor[0] = '\0';
+
+ /*
+ * Send down all available parameters, if any is used by the plan
+ */
+ if (estate->es_param_list_info ||
+ !bms_is_empty(plan->scan.plan.allParam))
+ paramlen = encode_parameters(node->nParamRemote,
+ node->remoteparams,
+ &combiner->ss.ps,
+ ¶mdata);
+
+ /*
+ * The subplan being rescanned, need to restore connections and
+ * re-bind the portal
+ */
+ if (combiner->cursor)
+ {
+ int i;
+
+ /*
+ * On second phase of primary mode connections are properly set,
+ * so do not copy.
+ */
+ if (!combiner->probing_primary)
+ {
+ combiner->conn_count = combiner->cursor_count;
+ memcpy(combiner->connections, combiner->cursor_connections,
+ combiner->cursor_count * sizeof(PGXCNodeHandle *));
+ }
+
+ for (i = 0; i < combiner->conn_count; i++)
+ {
+ PGXCNodeHandle *conn = combiner->connections[i];
+
+ CHECK_OWNERSHIP(conn, combiner);
+
+ /* close previous cursor only on phase 1 */
+ if (!primary_mode || !combiner->probing_primary)
+ pgxc_node_send_close(conn, false, combiner->cursor);
+
+ /*
+ * If we now should probe primary, skip execution on non-primary
+ * nodes
+ */
+ if (primary_mode && !combiner->probing_primary &&
+ conn->nodeoid != primary_data_node)
+ continue;
+
+ /* rebind */
+ pgxc_node_send_bind(conn, combiner->cursor, combiner->cursor,
+ paramlen, paramdata);
+ /* execute */
+ pgxc_node_send_execute(conn, combiner->cursor, fetch);
+ /* submit */
+ if (pgxc_node_send_flush(conn))
+ {
+ combiner->conn_count = 0;
+ pfree(combiner->connections);
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to send command to data nodes")));
+ }
+
+ /*
+ * There could be only one primary node, but can not leave the
+ * loop now, because we need to close cursors.
+ */
+ if (primary_mode && !combiner->probing_primary)
+ {
+ combiner->current_conn = i;
+ }
+ }
+ }
+ else if (node->execNodes)
+ {
+ CommandId cid;
+ int i;
+
+ /*
+ * There are prepared statement, connections should be already here
+ */
+ Assert(combiner->conn_count > 0);
+
+ combiner->extended_query = true;
+ cid = estate->es_snapshot->curcid;
+
+ for (i = 0; i < combiner->conn_count; i++)
+ {
+ PGXCNodeHandle *conn = combiner->connections[i];
+
+ CHECK_OWNERSHIP(conn, combiner);
+
+ /*
+ * If we now should probe primary, skip execution on non-primary
+ * nodes
+ */
+ if (primary_mode && !combiner->probing_primary &&
+ conn->nodeoid != primary_data_node)
+ continue;
+
+ /*
+ * Update Command Id. Other command may be executed after we
+ * prepare and advanced Command Id. We should use one that
+ * was active at the moment when command started.
+ */
+ if (pgxc_node_send_cmd_id(conn, cid))
+ {
+ combiner->conn_count = 0;
+ pfree(combiner->connections);
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to send command ID to data nodes")));
+ }
+
+ /*
+ * Resend the snapshot as well since the connection may have
+ * been buffered and use by other commands, with different
+ * snapshot. Set the snapshot back to what it was
+ */
+ if (pgxc_node_send_snapshot(conn, estate->es_snapshot))
+ {
+ combiner->conn_count = 0;
+ pfree(combiner->connections);
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to send snapshot to data nodes")));
+ }
+
+ /* bind */
+ pgxc_node_send_bind(conn, cursor, cursor, paramlen, paramdata);
+ /* execute */
+ pgxc_node_send_execute(conn, cursor, fetch);
+ /* submit */
+ if (pgxc_node_send_flush(conn))
+ {
+ combiner->conn_count = 0;
+ pfree(combiner->connections);
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to send command to data nodes")));
+ }
+
+ /*
+ * There could be only one primary node, so if we executed
+ * subquery on the phase one of primary mode we can leave the
+ * loop now.
+ */
+ if (primary_mode && !combiner->probing_primary)
+ {
+ combiner->current_conn = i;
+ break;
+ }
+ }
+
+ /*
+ * On second phase of primary mode connections are backed up
+ * already, so do not copy.
+ */
+ if (primary_mode)
+ {
+ if (combiner->probing_primary)
+ {
+ combiner->cursor = pstrdup(cursor);
+ }
+ else
+ {
+ combiner->cursor = pstrdup(cursor);
+ combiner->cursor_count = combiner->conn_count;
+ combiner->cursor_connections = (PGXCNodeHandle **) palloc(
+ combiner->conn_count * sizeof(PGXCNodeHandle *));
+ memcpy(combiner->cursor_connections, combiner->connections,
+ combiner->conn_count * sizeof(PGXCNodeHandle *));
+ }
+ }
+ else
+ {
+ combiner->cursor = pstrdup(cursor);
+ combiner->cursor_count = combiner->conn_count;
+ combiner->cursor_connections = (PGXCNodeHandle **) palloc(
+ combiner->conn_count * sizeof(PGXCNodeHandle *));
+ memcpy(combiner->cursor_connections, combiner->connections,
+ combiner->conn_count * sizeof(PGXCNodeHandle *));
+ }
+ }
+
+ if (combiner->merge_sort)
+ {
+ /*
+ * Requests are already made and sorter can fetch tuples to populate
+ * sort buffer.
+ */
+ combiner->tuplesortstate = tuplesort_begin_merge(
+ resultslot->tts_tupleDescriptor,
+ plan->sort->numCols,
+ plan->sort->sortColIdx,
+ plan->sort->sortOperators,
+ plan->sort->sortCollations,
+ plan->sort->nullsFirst,
+ combiner,
+ work_mem);
+ }
+ if (primary_mode)
+ {
+ if (combiner->probing_primary)
+ {
+ combiner->probing_primary = false;
+ node->bound = true;
+ }
+ else
+ combiner->probing_primary = true;
+ }
+ else
+ node->bound = true;
+ }
+
+ if (combiner->tuplesortstate)
+ {
+ if (tuplesort_gettupleslot((Tuplesortstate *) combiner->tuplesortstate,
++ true, true, resultslot, NULL))
+ {
+ if (log_remotesubplan_stats)
+ ShowUsageCommon("ExecRemoteSubplan", &start_r, &start_t);
+ return resultslot;
+ }
+ }
+ else
+ {
+ TupleTableSlot *slot = FetchTuple(combiner);
+ if (!TupIsNull(slot))
+ {
+ if (log_remotesubplan_stats)
+ ShowUsageCommon("ExecRemoteSubplan", &start_r, &start_t);
+ return slot;
+ }
+ else if (combiner->probing_primary)
+ /* phase1 is successfully completed, run on other nodes */
+ goto primary_mode_phase_two;
+ }
+ if (combiner->errorMessage)
+ pgxc_node_report_error(combiner);
+
+ if (log_remotesubplan_stats)
+ ShowUsageCommon("ExecRemoteSubplan", &start_r, &start_t);
+
+ return NULL;
+}
+
+
+void
+ExecReScanRemoteSubplan(RemoteSubplanState *node)
+{
+ ResponseCombiner *combiner = (ResponseCombiner *)node;
+
+ /*
+ * If we haven't queried remote nodes yet, just return. If outerplan'
+ * chgParam is not NULL then it will be re-scanned by ExecProcNode,
+ * else - no reason to re-scan it at all.
+ */
+ if (!node->bound)
+ return;
+
+ /*
+ * If we execute locally rescan local copy of the plan
+ */
+ if (outerPlanState(node))
+ ExecReScan(outerPlanState(node));
+
+ /*
+ * Consume any possible pending input
+ */
+ pgxc_connections_cleanup(combiner);
+
+ /* misc cleanup */
+ combiner->command_complete_count = 0;
+ combiner->description_count = 0;
+
+ /*
+ * Force query is re-bound with new parameters
+ */
+ node->bound = false;
+}
+
+
/*
 * ExecEndRemoteSubplan
 *		Shut down a RemoteSubplan node: end any local subplan, drain pending
 *		input, update coordinator statistics, close remote portals and
 *		prepared statements, and release the combiner.
 *
 * The teardown protocol is order-sensitive: Close messages are sent for
 * open portals first (cursor_connections), then Close+Sync for the prepared
 * statements, and finally responses are consumed until every connection
 * reports ReadyForQuery.
 */
void
ExecEndRemoteSubplan(RemoteSubplanState *node)
{
	ResponseCombiner *combiner = (ResponseCombiner *)node;
	RemoteSubplan *plan = (RemoteSubplan *) combiner->ss.ps.plan;
	int			i;
	struct rusage start_r;
	struct timeval start_t;

	if (log_remotesubplan_stats)
		ResetUsageCommon(&start_r, &start_t);

	/* End the locally initialized copy of the subplan, if any */
	if (outerPlanState(node))
		ExecEndNode(outerPlanState(node));
	if (node->locator)
		freeLocator(node->locator);

	/*
	 * Consume any possible pending input
	 */
	if (node->bound)
		pgxc_connections_cleanup(combiner);

	/*
	 * Update coordinator statistics
	 */
	if (IS_PGXC_COORDINATOR)
	{
		EState	   *estate = combiner->ss.ps.state;

		if (estate->es_num_result_relations > 0 && estate->es_processed > 0)
		{
			switch (estate->es_plannedstmt->commandType)
			{
				case CMD_INSERT:
					/* One statement can insert into only one relation */
					pgstat_count_remote_insert(
							estate->es_result_relations[0].ri_RelationDesc,
							estate->es_processed);
					break;
				case CMD_UPDATE:
				case CMD_DELETE:
					{
						/*
						 * We can not determine here how many rows were updated
						 * or deleted in each table, so assume the same number
						 * of affected rows in each table.
						 * If the resulting number of rows is 0 because of
						 * rounding, increment each counter at least by 1.
						 */
						int			i;
						int			n;
						bool		update;

						update = (estate->es_plannedstmt->commandType == CMD_UPDATE);
						n = estate->es_processed / estate->es_num_result_relations;
						if (n == 0)
							n = 1;
						for (i = 0; i < estate->es_num_result_relations; i++)
						{
							Relation	r;

							r = estate->es_result_relations[i].ri_RelationDesc;
							if (update)
								pgstat_count_remote_update(r, n);
							else
								pgstat_count_remote_delete(r, n);
						}
					}
					break;
				default:
					/* nothing to count */
					break;
			}
		}
	}

	/*
	 * Close portals. While cursor_connections exist there are open portals.
	 */
	if (combiner->cursor)
	{
		/* Restore connections where there are active statements */
		combiner->conn_count = combiner->cursor_count;
		memcpy(combiner->connections, combiner->cursor_connections,
			   combiner->cursor_count * sizeof(PGXCNodeHandle *));
		for (i = 0; i < combiner->cursor_count; i++)
		{
			PGXCNodeHandle *conn;

			conn = combiner->cursor_connections[i];

			CHECK_OWNERSHIP(conn, combiner);

			/* 'false' closes a portal rather than a statement */
			if (pgxc_node_send_close(conn, false, combiner->cursor) != 0)
				ereport(ERROR,
						(errcode(ERRCODE_INTERNAL_ERROR),
						 errmsg("Failed to close data node cursor")));
		}
		/* The cursor stuff is not needed */
		combiner->cursor = NULL;
		combiner->cursor_count = 0;
		pfree(combiner->cursor_connections);
		combiner->cursor_connections = NULL;
	}

	/* Close statements, even if they never were bound */
	for (i = 0; i < combiner->conn_count; i++)
	{
		PGXCNodeHandle *conn;
		char		cursor[NAMEDATALEN];

		if (plan->cursor)
		{
			if (plan->unique)
				snprintf(cursor, NAMEDATALEN, "%s_%d", plan->cursor, plan->unique);
			else
				strncpy(cursor, plan->cursor, NAMEDATALEN);
		}
		else
			cursor[0] = '\0';

		conn = combiner->connections[i];

		CHECK_OWNERSHIP(conn, combiner);

		/* 'true' closes the prepared statement itself */
		if (pgxc_node_send_close(conn, true, cursor) != 0)
			ereport(ERROR,
					(errcode(ERRCODE_INTERNAL_ERROR),
					 errmsg("Failed to close data node statement")));
		/* Send SYNC and wait for ReadyForQuery */
		if (pgxc_node_send_sync(conn) != 0)
			ereport(ERROR,
					(errcode(ERRCODE_INTERNAL_ERROR),
					 errmsg("Failed to synchronize data node")));
		/*
		 * Formally connection is not in QUERY state, we set the state to read
		 * CloseDone and ReadyForQuery responses. Upon receiving ReadyForQuery
		 * state will be changed back to IDLE and conn->coordinator will be
		 * cleared.
		 */
		PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_CLOSE);
	}

	/* Consume responses until every connection reports ReadyForQuery */
	while (combiner->conn_count > 0)
	{
		if (pgxc_node_receive(combiner->conn_count,
							  combiner->connections, NULL))
			ereport(ERROR,
					(errcode(ERRCODE_INTERNAL_ERROR),
					 errmsg("Failed to close remote subplan")));
		i = 0;
		while (i < combiner->conn_count)
		{
			int			res = handle_response(combiner->connections[i], combiner);

			if (res == RESPONSE_EOF)
			{
				i++;
			}
			else if (res == RESPONSE_READY)
			{
				/* Done, connection is ready for query; compact the array */
				if (--combiner->conn_count > i)
					combiner->connections[i] =
							combiner->connections[combiner->conn_count];
			}
			else if (res == RESPONSE_DATAROW)
			{
				/*
				 * If we are finishing slowly running remote subplan while it
				 * is still working (because of Limit, for example) it may
				 * produce one or more tuples between connection cleanup and
				 * handling Close command. One tuple does not cause any problem,
				 * but if it will not be read the next tuple will trigger
				 * assertion failure. So if we got a tuple, just read and
				 * discard it here.
				 */
				pfree(combiner->currentRow);
				combiner->currentRow = NULL;
			}
			/* Ignore other possible responses */
		}
	}

	ValidateAndCloseCombiner(combiner);
	pfree(node);

	if (log_remotesubplan_stats)
		ShowUsageCommon("ExecEndRemoteSubplan", &start_r, &start_t);
}
+
+/*
+ * pgxc_node_report_error
+ * Throw error from Datanode if any.
+ *
+ * Re-raises on this node the error collected from a remote node by the
+ * combiner.  The five-character SQLSTATE received from the remote node is
+ * preserved via MAKE_SQLSTATE; errdetail/errhint are forwarded only when
+ * they were actually received (four combinations below).
+ */
+static void
+pgxc_node_report_error(ResponseCombiner *combiner)
+{
+	/* If no combiner, nothing to do */
+	if (!combiner)
+		return;
+	if (combiner->errorMessage)
+	{
+		char *code = combiner->errorCode;
+		/* Neither detail nor hint received */
+		if ((combiner->errorDetail == NULL) && (combiner->errorHint == NULL))
+			ereport(ERROR,
+					(errcode(MAKE_SQLSTATE(code[0], code[1], code[2], code[3], code[4])),
+					 errmsg("%s", combiner->errorMessage)));
+		/* Both detail and hint received */
+		else if ((combiner->errorDetail != NULL) && (combiner->errorHint != NULL))
+			ereport(ERROR,
+					(errcode(MAKE_SQLSTATE(code[0], code[1], code[2], code[3], code[4])),
+					 errmsg("%s", combiner->errorMessage),
+					 errdetail("%s", combiner->errorDetail),
+					 errhint("%s", combiner->errorHint)));
+		/* Only detail received */
+		else if (combiner->errorDetail != NULL)
+			ereport(ERROR,
+					(errcode(MAKE_SQLSTATE(code[0], code[1], code[2], code[3], code[4])),
+					 errmsg("%s", combiner->errorMessage),
+					 errdetail("%s", combiner->errorDetail)));
+		/* Only hint received */
+		else
+			ereport(ERROR,
+					(errcode(MAKE_SQLSTATE(code[0], code[1], code[2], code[3], code[4])),
+					 errmsg("%s", combiner->errorMessage),
+					 errhint("%s", combiner->errorHint)));
+	}
+}
+
+
+/*
+ * get_success_nodes:
+ * Currently called to print a user-friendly message about
+ * which nodes the query failed on.
+ * Gets all the nodes where no 'E' (error) messages were received; i.e. where
+ * the query ran successfully.  For each failed node a "coordinator#N" /
+ * "datanode#N" entry is appended to 'failednodes'.  Returns NULL when no
+ * node succeeded.
+ */
+static ExecNodes *
+get_success_nodes(int node_count, PGXCNodeHandle **handles, char node_type, StringInfo failednodes)
+{
+	ExecNodes *success_nodes = NULL;
+	int i;
+
+	for (i = 0; i < node_count; i++)
+	{
+		PGXCNodeHandle *handle = handles[i];
+		/* Translate the node Oid into its index within this node type */
+		int nodenum = PGXCNodeGetNodeId(handle->nodeoid, &node_type);
+
+		if (!handle->error)
+		{
+			/* Allocate the result lazily, on the first successful node */
+			if (!success_nodes)
+				success_nodes = makeNode(ExecNodes);
+			success_nodes->nodeList = lappend_int(success_nodes->nodeList, nodenum);
+		}
+		else
+		{
+			if (failednodes->len == 0)
+				appendStringInfo(failednodes, "Error message received from nodes:");
+			/* Node numbers are reported 1-based to the user */
+			appendStringInfo(failednodes, " %s#%d",
+				(node_type == PGXC_NODE_COORDINATOR ? "coordinator" : "datanode"),
+				nodenum + 1);
+		}
+	}
+	return success_nodes;
+}
+
<br>
+/*
+ * pgxc_all_success_nodes: Uses get_success_nodes() to collect the
+ * user-friendly message from coordinator as well as datanode.
+ *
+ * On return, *d_nodes / *c_nodes hold the successful datanodes and
+ * coordinators (possibly NULL), and *failednodes_msg is either NULL or a
+ * palloc'd message listing the failed nodes (caller keeps ownership).
+ */
+void
+pgxc_all_success_nodes(ExecNodes **d_nodes, ExecNodes **c_nodes, char **failednodes_msg)
+{
+	/* Acquire handles for every coordinator and datanode in the cluster */
+	PGXCNodeAllHandles *connections = get_exec_connections(NULL, NULL, EXEC_ON_ALL_NODES, true);
+	StringInfoData failednodes;
+	initStringInfo(&failednodes);
+
+	*d_nodes = get_success_nodes(connections->dn_conn_count,
+								 connections->datanode_handles,
+								 PGXC_NODE_DATANODE,
+								 &failednodes);
+
+	*c_nodes = get_success_nodes(connections->co_conn_count,
+								 connections->coord_handles,
+								 PGXC_NODE_COORDINATOR,
+								 &failednodes);
+
+	if (failednodes.len == 0)
+		*failednodes_msg = NULL;
+	else
+		*failednodes_msg = failednodes.data;
+
+	pfree_pgxc_all_handles(connections);
+}
+
+
+/*
+ * set_dbcleanup_callback:
+ * Register a callback function which does some non-critical cleanup tasks
+ * on xact success or abort, such as tablespace/database directory cleanup.
+ *
+ * The paraminfo blob is copied into TopMemoryContext so it survives until
+ * the transaction-end callback fires.  Only one callback is tracked:
+ * a later call overwrites the earlier registration.
+ * NOTE(review): a previously registered fparams is overwritten here
+ * without being pfree'd -- confirm whether that leak is intentional.
+ */
+void set_dbcleanup_callback(xact_callback function, void *paraminfo, int paraminfo_size)
+{
+	void *fparams;
+
+	/* Copy the parameters into long-lived memory */
+	fparams = MemoryContextAlloc(TopMemoryContext, paraminfo_size);
+	memcpy(fparams, paraminfo, paraminfo_size);
+
+	dbcleanup_info.function = function;
+	dbcleanup_info.fparams = fparams;
+}
+
+/*
+ * AtEOXact_DBCleanup: To be called at post-commit or pre-abort.
+ * Calls the cleanup function registered during this transaction, if any,
+ * passing it whether the transaction committed, then unconditionally
+ * clears the registration and frees the copied parameters.
+ */
+void AtEOXact_DBCleanup(bool isCommit)
+{
+	if (dbcleanup_info.function)
+		(*dbcleanup_info.function)(isCommit, dbcleanup_info.fparams);
+
+	/*
+	 * Just reset the callbackinfo. We anyway don't want this to be called again,
+	 * until explicitly set.
+	 */
+	dbcleanup_info.function = NULL;
+	if (dbcleanup_info.fparams)
+	{
+		pfree(dbcleanup_info.fparams);
+		dbcleanup_info.fparams = NULL;
+	}
+}
+
+/*
+ * GetImplicit2PCGID
+ * Build the GID used for an implicit two-phase commit.  The GID encodes,
+ * in order: the supplied prefix, top transaction id, local node name, a
+ * T/F flag for a local write, the datanode count, the coordinator count
+ * (counting the local node when it wrote), then the participating node
+ * ids.  The string is allocated in TopTransactionContext so it remains
+ * valid until end of transaction.
+ */
+char *
+GetImplicit2PCGID(const char *implicit2PC_head, bool localWrite)
+{
+	int dnCount = 0, coordCount = 0;
+	int dnNodeIds[MaxDataNodes];
+	int coordNodeIds[MaxCoords];
+	/* NOTE(review): initializer is redundant -- overwritten just below */
+	MemoryContext oldContext = CurrentMemoryContext;
+	StringInfoData str;
+	int i;
+
+	oldContext = MemoryContextSwitchTo(TopTransactionContext);
+	initStringInfo(&str);
+	/*
+	 * Check how many coordinators and datanodes are involved in this
+	 * transaction
+	 */
+	pgxc_node_remote_count(&dnCount, dnNodeIds, &coordCount, coordNodeIds);
+	appendStringInfo(&str, "%s%u:%s:%c:%d:%d",
+			implicit2PC_head,
+			GetTopTransactionId(),
+			PGXCNodeName,
+			localWrite ? 'T' : 'F',
+			dnCount,
+			coordCount + (localWrite ? 1 : 0));
+
+	/* Append the ids of all participating datanodes, then coordinators */
+	for (i = 0; i < dnCount; i++)
+		appendStringInfo(&str, ":%d", dnNodeIds[i]);
+	for (i = 0; i < coordCount; i++)
+		appendStringInfo(&str, ":%d", coordNodeIds[i]);
+
+	/* The local node participates too when it performed a write */
+	if (localWrite)
+		appendStringInfo(&str, ":%d", PGXCNodeIdentifier);
+
+	MemoryContextSwitchTo(oldContext);
+
+	return str.data;
+}
--- /dev/null
- PG_RETURN_NAME(PGXCNodeName);
+/*-------------------------------------------------------------------------
+ *
+ * pgxcnode.c
+ *
+ * Functions for the Coordinator communicating with the PGXC nodes:
+ * Datanodes and Coordinators
+ *
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
+ *
+ * IDENTIFICATION
+ * $$
+ *
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+#include <poll.h>
+
+#ifdef __sun
+#include <sys/filio.h>
+#endif
+
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include "access/gtm.h"
+#include "access/transam.h"
+#include "access/xact.h"
+#include "access/htup_details.h"
+#include "catalog/pg_type.h"
+#include "commands/prepare.h"
+#include "gtm/gtm_c.h"
+#include "nodes/nodes.h"
+#include "pgxc/pgxcnode.h"
+#include "pgxc/execRemote.h"
+#include "catalog/pgxc_node.h"
+#include "catalog/pg_collation.h"
+#include "pgxc/locator.h"
+#include "pgxc/nodemgr.h"
+#include "pgxc/pgxc.h"
+#include "pgxc/poolmgr.h"
+#include "tcop/dest.h"
+#include "storage/lwlock.h"
+#include "utils/builtins.h"
+#include "utils/elog.h"
+#include "utils/memutils.h"
+#include "utils/fmgroids.h"
+#include "utils/snapmgr.h"
+#include "utils/syscache.h"
+#include "utils/lsyscache.h"
+#include "utils/formatting.h"
+#include "utils/tqual.h"
+#include "../interfaces/libpq/libpq-fe.h"
+#ifdef XCP
+#include "miscadmin.h"
+#include "storage/ipc.h"
+#include "pgxc/pause.h"
+#include "utils/snapmgr.h"
+#endif
+
+#define CMD_ID_MSG_LEN 8
+
+/* Number of connections held */
+static int datanode_count = 0;
+static int coord_count = 0;
+
+/*
+ * Datanode handles saved in Transaction memory context
+ * when PostgresMain is launched.
+ * Those handles are used inside a transaction by Coordinator to Datanodes.
+ */
+static PGXCNodeHandle *dn_handles = NULL;
+
+/*
+ * Coordinator handles saved in Transaction memory context
+ * when PostgresMain is launched.
+ * Those handles are used inside a transaction by Coordinator to Coordinators
+ */
+static PGXCNodeHandle *co_handles = NULL;
+
+/* Current size of dn_handles and co_handles */
+int NumDataNodes;
+int NumCoords;
+
+
+#ifdef XCP
+/* Set when the cached node handles must be invalidated or refreshed */
+volatile bool HandlesInvalidatePending = false;
+volatile bool HandlesRefreshPending = false;
+
+/*
+ * Session and transaction parameters need to be set on newly connected
+ * remote nodes.
+ */
+static List *session_param_list = NIL;
+static List *local_param_list = NIL;
+static StringInfo session_params;
+static StringInfo local_params;
+
+/* One tracked SET parameter: name, value, and flags */
+typedef struct
+{
+	NameData name;
+	NameData value;
+	int flags;
+} ParamEntry;
+
+
+static bool DoInvalidateRemoteHandles(void);
+static bool DoRefreshRemoteHandles(void);
+#endif
+
+#ifdef XCP
+static void pgxc_node_init(PGXCNodeHandle *handle, int sock,
+		bool global_session, int pid);
+#else
+static void pgxc_node_init(PGXCNodeHandle *handle, int sock);
+#endif
+static void pgxc_node_free(PGXCNodeHandle *handle);
+static void pgxc_node_all_free(void);
+
+static int get_int(PGXCNodeHandle * conn, size_t len, int *out);
+static int get_char(PGXCNodeHandle * conn, char *out);
+
+
+/*
+ * Initialize PGXCNodeHandle struct
+ *
+ * Allocates the 16KB input and output buffers and resets all cursor and
+ * state fields.  The socket is set to NO_SOCKET to mark the handle as not
+ * yet connected.
+ */
+static void
+init_pgxc_handle(PGXCNodeHandle *pgxc_handle)
+{
+	/*
+	 * Socket descriptor is small non-negative integer,
+	 * Indicate the handle is not initialized yet
+	 */
+	pgxc_handle->sock = NO_SOCKET;
+
+	/* Initialise buffers */
+	pgxc_handle->error = NULL;
+	pgxc_handle->outSize = 16 * 1024;
+	pgxc_handle->outBuffer = (char *) palloc(pgxc_handle->outSize);
+	pgxc_handle->inSize = 16 * 1024;
+
+	pgxc_handle->inBuffer = (char *) palloc(pgxc_handle->inSize);
+	pgxc_handle->combiner = NULL;
+	pgxc_handle->inStart = 0;
+	pgxc_handle->inEnd = 0;
+	pgxc_handle->inCursor = 0;
+	pgxc_handle->outEnd = 0;
+	pgxc_handle->needSync = false;
+
+	/*
+	 * NOTE(review): palloc ereports on OOM rather than returning NULL, so
+	 * this check appears unreachable -- kept as defensive belt-and-braces.
+	 */
+	if (pgxc_handle->outBuffer == NULL || pgxc_handle->inBuffer == NULL)
+	{
+		ereport(ERROR,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("out of memory")));
+	}
+}
+
+
+/*
+ * Allocate and initialize memory to store Datanode and Coordinator handles.
+ *
+ * Handle arrays live in TopMemoryContext so they survive for the whole
+ * session.  When is_force is true any existing handles are freed first;
+ * otherwise repeated calls (possible because of sigjmp) return early once
+ * both arrays exist.  Also determines PGXCNodeId, this node's 1-based
+ * position among nodes of its own type.
+ */
+void
+InitMultinodeExecutor(bool is_force)
+{
+	int count;
+	Oid *coOids, *dnOids;
+#ifdef XCP
+	MemoryContext oldcontext;
+#endif
+
+
+	/* Free all the existing information first */
+	if (is_force)
+		pgxc_node_all_free();
+
+	/* This function could get called multiple times because of sigjmp */
+	if (dn_handles != NULL &&
+		co_handles != NULL)
+		return;
+
+	/* Update node table in the shared memory */
+	PgxcNodeListAndCount();
+
+	/* Get classified list of node Oids */
+	PgxcNodeGetOids(&coOids, &dnOids, &NumCoords, &NumDataNodes, true);
+
+#ifdef XCP
+	/*
+	 * Coordinator and datanode handles should be available during all the
+	 * session lifetime
+	 */
+	oldcontext = MemoryContextSwitchTo(TopMemoryContext);
+#endif
+
+	/* Do proper initialization of handles */
+	if (NumDataNodes > 0)
+		dn_handles = (PGXCNodeHandle *)
+			palloc(NumDataNodes * sizeof(PGXCNodeHandle));
+	if (NumCoords > 0)
+		co_handles = (PGXCNodeHandle *)
+			palloc(NumCoords * sizeof(PGXCNodeHandle));
+
+	if ((!dn_handles && NumDataNodes > 0) ||
+		(!co_handles && NumCoords > 0))
+		ereport(ERROR,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("out of memory for node handles")));
+
+	/* Initialize new empty slots */
+	for (count = 0; count < NumDataNodes; count++)
+	{
+		init_pgxc_handle(&dn_handles[count]);
+		dn_handles[count].nodeoid = dnOids[count];
+		dn_handles[count].nodeid = get_pgxc_node_id(dnOids[count]);
+		/* NOTE(review): strncpy does not NUL-terminate on truncation */
+		strncpy(dn_handles[count].nodename, get_pgxc_nodename(dnOids[count]),
+				NAMEDATALEN);
+		strncpy(dn_handles[count].nodehost, get_pgxc_nodehost(dnOids[count]),
+				NAMEDATALEN);
+		dn_handles[count].nodeport = get_pgxc_nodeport(dnOids[count]);
+	}
+	for (count = 0; count < NumCoords; count++)
+	{
+		init_pgxc_handle(&co_handles[count]);
+		co_handles[count].nodeoid = coOids[count];
+		co_handles[count].nodeid = get_pgxc_node_id(coOids[count]);
+		strncpy(co_handles[count].nodename, get_pgxc_nodename(coOids[count]),
+				NAMEDATALEN);
+		strncpy(co_handles[count].nodehost, get_pgxc_nodehost(coOids[count]),
+				NAMEDATALEN);
+		co_handles[count].nodeport = get_pgxc_nodeport(coOids[count]);
+	}
+
+	datanode_count = 0;
+	coord_count = 0;
+	PGXCNodeId = 0;
+
+	/*
+	 * NOTE(review): oldcontext is declared only under #ifdef XCP but is
+	 * restored unconditionally here -- this would not compile with XCP
+	 * undefined.  Confirm XCP is always defined in this tree.
+	 */
+	MemoryContextSwitchTo(oldcontext);
+
+	/* Find this node's own 1-based index among nodes of its type */
+	if (IS_PGXC_COORDINATOR)
+	{
+		for (count = 0; count < NumCoords; count++)
+		{
+			if (pg_strcasecmp(PGXCNodeName,
+					get_pgxc_nodename(co_handles[count].nodeoid)) == 0)
+				PGXCNodeId = count + 1;
+		}
+	}
+	else /* DataNode */
+	{
+		for (count = 0; count < NumDataNodes; count++)
+		{
+			if (pg_strcasecmp(PGXCNodeName,
+					get_pgxc_nodename(dn_handles[count].nodeoid)) == 0)
+				PGXCNodeId = count + 1;
+		}
+	}
+}
+
+/*
+ * Builds up a connection string
+ *
+ * Returns a palloc'd libpq conninfo string, or NULL if the parameters do
+ * not fit the 1024-byte work buffer.  remote_type and parent_node are
+ * passed to the remote server through the startup options; parent_node is
+ * also embedded in the application_name.
+ */
+char *
+PGXCNodeConnStr(char *host, int port, char *dbname,
+		char *user, char *pgoptions, char *remote_type, char *parent_node)
+{
+	char *out,
+		connstr[1024];
+	int num;
+
+	/*
+	 * Build up connection string
+	 * remote type can be Coordinator, Datanode or application.
+	 */
+	num = snprintf(connstr, sizeof(connstr),
+			"host=%s port=%d dbname=%s user=%s application_name='pgxc:%s' sslmode=disable options='-c remotetype=%s -c parentnode=%s %s'",
+			host, port, dbname, user, parent_node, remote_type, parent_node,
+			pgoptions);
+
+	/* Check for overflow */
+	if (num > 0 && num < sizeof(connstr))
+	{
+		/* Output result */
+		out = (char *) palloc(num + 1);
+		strcpy(out, connstr);
+		return out;
+	}
+
+	/* return NULL if we have problem */
+	return NULL;
+}
+
+
+/*
+ * Connect to a Datanode using a connection string
+ *
+ * Thin wrapper around PQconnectdb; returns the libpq connection cast to
+ * the opaque NODE_CONNECTION type.  Caller must check connection status.
+ */
+NODE_CONNECTION *
+PGXCNodeConnect(char *connstr)
+{
+	PGconn *conn;
+
+	/* Delegate call to the pglib */
+	conn = PQconnectdb(connstr);
+	return (NODE_CONNECTION *) conn;
+}
+
+/*
+ * Ping the node identified by the connection string.
+ * Returns 0 when the server answers PQPING_OK, 1 for any other ping
+ * result, and -1 when the connection string is empty.
+ */
+int PGXCNodePing(const char *connstr)
+{
+	if (connstr[0])
+	{
+		PGPing status = PQping(connstr);
+		if (status == PQPING_OK)
+			return 0;
+		else
+			return 1;
+	}
+	else
+		return -1;
+}
+
+/*
+ * Close specified connection and release its libpq resources.
+ */
+void
+PGXCNodeClose(NODE_CONNECTION *conn)
+{
+	/* Delegate call to the pglib */
+	PQfinish((PGconn *) conn);
+}
+
+/*
+ * Checks if connection active.
+ * Returns non-zero when the libpq connection exists and reports
+ * CONNECTION_OK.
+ */
+int
+PGXCNodeConnected(NODE_CONNECTION *conn)
+{
+	/* Delegate call to the pglib */
+	PGconn *pgconn = (PGconn *) conn;
+
+	/*
+	 * Simple check, want to do more comprehensive -
+	 * check if it is ready for query
+	 */
+	return pgconn && PQstatus(pgconn) == CONNECTION_OK;
+}
+
+
+
+/* Close the socket handle (this process' copy) and free occupied memory
+ *
+ * Note that we do not free the handle and its members. This will be
+ * taken care of when the transaction ends, when TopTransactionContext
+ * is destroyed in xact.c.
+ */
+static void
+pgxc_node_free(PGXCNodeHandle *handle)
+{
+	/* Only close a socket that was actually open */
+	if (handle->sock != NO_SOCKET)
+		close(handle->sock);
+	handle->sock = NO_SOCKET;
+}
+
+/*
+ * Free all the node handles cached
+ *
+ * Closes every coordinator and datanode socket, pfrees both handle
+ * arrays, and clears the pending invalidate/refresh flags.
+ */
+static void
+pgxc_node_all_free(void)
+{
+	int i, j;
+
+	/* Iteration 0 covers coordinators, iteration 1 covers datanodes */
+	for (i = 0; i < 2; i++)
+	{
+		int num_nodes = 0;
+		PGXCNodeHandle *array_handles;
+
+		switch (i)
+		{
+			case 0:
+				num_nodes = NumCoords;
+				array_handles = co_handles;
+				break;
+			case 1:
+				num_nodes = NumDataNodes;
+				array_handles = dn_handles;
+				break;
+			default:
+				/* NOTE(review): Assert is a no-op in release builds and
+				 * array_handles would stay uninitialized -- unreachable
+				 * with the loop bound above, but fragile. */
+				Assert(0);
+		}
+
+		for (j = 0; j < num_nodes; j++)
+		{
+			PGXCNodeHandle *handle = &array_handles[j];
+			pgxc_node_free(handle);
+		}
+		if (array_handles)
+			pfree(array_handles);
+	}
+
+	co_handles = NULL;
+	dn_handles = NULL;
+	HandlesInvalidatePending = false;
+	HandlesRefreshPending = false;
+}
+
+/*
+ * Create and initialise internal structure to communicate to
+ * Datanode via supplied socket descriptor.
+ * Structure stores state info and I/O buffers
+ *
+ * 'pid' is the backend pid on the remote node; transaction status starts
+ * as 'I' (idle).  When global_session is set, the accumulated session
+ * parameters (if any) are replayed on the new connection.
+ */
+static void
+pgxc_node_init(PGXCNodeHandle *handle, int sock, bool global_session, int pid)
+{
+	char *init_str;
+
+	handle->sock = sock;
+	handle->backend_pid = pid;
+	handle->transaction_status = 'I';
+	PGXCNodeSetConnectionState(handle, DN_CONNECTION_STATE_IDLE);
+	handle->read_only = true;
+	handle->ck_resp_rollback = false;
+	handle->combiner = NULL;
+#ifdef DN_CONNECTION_DEBUG
+	handle->have_row_desc = false;
+#endif
+	handle->error = NULL;
+	/* Reset all buffer cursors for the fresh connection */
+	handle->outEnd = 0;
+	handle->inStart = 0;
+	handle->inEnd = 0;
+	handle->inCursor = 0;
+	handle->needSync = false;
+	/*
+	 * We got a new connection, set on the remote node the session parameters
+	 * if defined. The transaction parameter should be sent after BEGIN
+	 */
+	if (global_session)
+	{
+		init_str = PGXCNodeGetSessionParamStr();
+		if (init_str)
+		{
+			pgxc_node_set_query(handle, init_str);
+		}
+	}
+}
+
+
+/*
+ * Wait while at least one of specified connections has data available and read
+ * the data into the buffer
+ *
+ * Connections that are idle or already have a buffered message are skipped.
+ * Returns ERROR_OCCURED (true) on failure, NO_ERROR_OCCURED (false)
+ * otherwise.  timeout == NULL means wait indefinitely.
+ */
+bool
+pgxc_node_receive(const int conn_count,
+		PGXCNodeHandle ** connections, struct timeval * timeout)
+{
+#define ERROR_OCCURED true
+#define NO_ERROR_OCCURED false
+	int i,
+		sockets_to_poll,
+		poll_val;
+	bool is_msg_buffered;
+	long timeout_ms;
+	struct pollfd pool_fd[conn_count];
+
+	/* sockets to be polled index */
+	sockets_to_poll = 0;
+
+	/* Check whether any connection already has a complete buffered message */
+	is_msg_buffered = false;
+	for (i = 0; i < conn_count; i++)
+	{
+		/* If connection has a buffered message */
+		if (HAS_MESSAGE_BUFFERED(connections[i]))
+		{
+			is_msg_buffered = true;
+			break;
+		}
+	}
+
+	for (i = 0; i < conn_count; i++)
+	{
+		/* If connection finished sending do not wait input from it */
+		if (connections[i]->state == DN_CONNECTION_STATE_IDLE || HAS_MESSAGE_BUFFERED(connections[i]))
+		{
+			pool_fd[i].fd = -1;
+			pool_fd[i].events = 0;
+			continue;
+		}
+
+		/* prepare select params */
+		if (connections[i]->sock > 0)
+		{
+			pool_fd[i].fd = connections[i]->sock;
+			pool_fd[i].events = POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND;
+			sockets_to_poll++;
+		}
+		else
+		{
+			/* flag as bad, it will be removed from the list */
+			PGXCNodeSetConnectionState(connections[i],
+					DN_CONNECTION_STATE_ERROR_FATAL);
+			pool_fd[i].fd = -1;
+			pool_fd[i].events = 0;
+		}
+	}
+
+	/*
+	 * Return if we do not have connections to receive input
+	 */
+	if (sockets_to_poll == 0)
+	{
+		if (is_msg_buffered)
+			return NO_ERROR_OCCURED;
+		return ERROR_OCCURED;
+	}
+
+	/* do conversion from the select behaviour */
+	if ( timeout == NULL )
+		timeout_ms = -1;
+	else
+		timeout_ms = (timeout->tv_sec * (uint64_t) 1000) + (timeout->tv_usec / 1000);
+
+retry:
+	CHECK_FOR_INTERRUPTS();
+	poll_val = poll(pool_fd, conn_count, timeout_ms);
+	if (poll_val < 0)
+	{
+		/* error - retry if EINTR */
+		if (errno == EINTR || errno == EAGAIN)
+			goto retry;
+
+		elog(WARNING, "poll() error: %d", errno);
+		if (errno)
+			return ERROR_OCCURED;
+		return NO_ERROR_OCCURED;
+	}
+
+	if (poll_val == 0)
+	{
+		/* Handle timeout */
+		elog(DEBUG1, "timeout %ld while waiting for any response from %d connections", timeout_ms,conn_count);
+		/*
+		 * NOTE(review): on timeout every connection is marked FATAL yet the
+		 * function still returns NO_ERROR_OCCURED -- confirm callers expect
+		 * that combination.
+		 */
+		for (i = 0; i < conn_count; i++)
+			PGXCNodeSetConnectionState(connections[i],
+					DN_CONNECTION_STATE_ERROR_FATAL);
+		return NO_ERROR_OCCURED;
+	}
+
+	/* read data */
+	for (i = 0; i < conn_count; i++)
+	{
+		PGXCNodeHandle *conn = connections[i];
+
+		/* This connection was excluded from polling above */
+		if( pool_fd[i].fd == -1 )
+			continue;
+
+		if ( pool_fd[i].fd == conn->sock )
+		{
+			if( pool_fd[i].revents & POLLIN )
+			{
+				int read_status = pgxc_node_read_data(conn, true);
+				if ( read_status == EOF || read_status < 0 )
+				{
+					/* Can not read - no more actions, just discard connection */
+					PGXCNodeSetConnectionState(conn,
+							DN_CONNECTION_STATE_ERROR_FATAL);
+					add_error_message(conn, "unexpected EOF on datanode connection.");
+					elog(WARNING, "unexpected EOF on datanode oid connection: %d", conn->nodeoid);
+
+					/*
+					 * before returning, also update the shared health
+					 * status field to indicate that this node could be
+					 * possibly unavailable.
+					 *
+					 * Note that this error could be due to a stale handle
+					 * and it's possible that another backend might have
+					 * already updated the health status OR the node
+					 * might have already come back since the last disruption
+					 */
+					PoolPingNodeRecheck(conn->nodeoid);
+
+					/* Should we read from the other connections before returning? */
+					return ERROR_OCCURED;
+				}
+
+			}
+			else if (
+					(pool_fd[i].revents & POLLERR) ||
+					(pool_fd[i].revents & POLLHUP) ||
+					(pool_fd[i].revents & POLLNVAL)
+					)
+			{
+				PGXCNodeSetConnectionState(connections[i],
+						DN_CONNECTION_STATE_ERROR_FATAL);
+				add_error_message(conn, "unexpected network error on datanode connection");
+				elog(WARNING, "unexpected EOF on datanode oid connection: %d with event %d", conn->nodeoid,pool_fd[i].revents);
+				/* Should we check/read from the other connections before returning? */
+				return ERROR_OCCURED;
+			}
+		}
+	}
+	return NO_ERROR_OCCURED;
+}
+
+/*
+ * Is there any data enqueued in the TCP input buffer, sent by the PGXC
+ * node connection and waiting to be read?
+ * Returns the number of bytes pending per FIONREAD, or 0 on a bad socket
+ * or ioctl failure.
+ */
+
+int
+pgxc_node_is_data_enqueued(PGXCNodeHandle *conn)
+{
+	int ret;
+	int enqueued;
+
+	if (conn->sock < 0)
+		return 0;
+	ret = ioctl(conn->sock, FIONREAD, &enqueued);
+	if (ret != 0)
+		return 0;
+
+	return enqueued;
+}
+
+/*
+ * Read up incoming messages from the PGXC node connection
+ *
+ * Compacts the input buffer, enlarges it if nearly full, then recv()s as
+ * much as available.  Returns 1 when data was read, 0 when nothing was
+ * available (would block), EOF on closed connection / bad socket, and -1
+ * on other errors.  When close_if_error is set, errors are recorded on
+ * the handle (and on ECONNRESET the socket is closed).
+ */
+int
+pgxc_node_read_data(PGXCNodeHandle *conn, bool close_if_error)
+{
+	int someread = 0;
+	int nread;
+
+	if (conn->sock < 0)
+	{
+		if (close_if_error)
+			add_error_message(conn, "bad socket");
+		return EOF;
+	}
+
+	/* Left-justify any data in the buffer to make room */
+	if (conn->inStart < conn->inEnd)
+	{
+		if (conn->inStart > 0)
+		{
+			memmove(conn->inBuffer, conn->inBuffer + conn->inStart,
+					conn->inEnd - conn->inStart);
+			conn->inEnd -= conn->inStart;
+			conn->inCursor -= conn->inStart;
+			conn->inStart = 0;
+		}
+	}
+	else
+	{
+		/* buffer is logically empty, reset it */
+		conn->inStart = conn->inCursor = conn->inEnd = 0;
+	}
+
+	/*
+	 * If the buffer is fairly full, enlarge it. We need to be able to enlarge
+	 * the buffer in case a single message exceeds the initial buffer size. We
+	 * enlarge before filling the buffer entirely so as to avoid asking the
+	 * kernel for a partial packet. The magic constant here should be large
+	 * enough for a TCP packet or Unix pipe bufferload. 8K is the usual pipe
+	 * buffer size, so...
+	 */
+	if (conn->inSize - conn->inEnd < 8192)
+	{
+		if (ensure_in_buffer_capacity(conn->inEnd + (size_t) 8192, conn) != 0)
+		{
+			/*
+			 * We don't insist that the enlarge worked, but we need some room
+			 */
+			if (conn->inSize - conn->inEnd < 100)
+			{
+				if (close_if_error)
+					add_error_message(conn, "can not allocate buffer");
+				return -1;
+			}
+		}
+	}
+
+retry:
+	nread = recv(conn->sock, conn->inBuffer + conn->inEnd,
+			conn->inSize - conn->inEnd, 0);
+
+	if (nread < 0)
+	{
+		if (errno == EINTR)
+			goto retry;
+		/* Some systems return EAGAIN/EWOULDBLOCK for no data */
+#ifdef EAGAIN
+		if (errno == EAGAIN)
+			return someread;
+#endif
+#if defined(EWOULDBLOCK) && (!defined(EAGAIN) || (EWOULDBLOCK != EAGAIN))
+		if (errno == EWOULDBLOCK)
+			return someread;
+#endif
+		/* We might get ECONNRESET here if using TCP and backend died */
+#ifdef ECONNRESET
+		if (errno == ECONNRESET)
+		{
+			/*
+			 * OK, we are getting a zero read even though select() says ready. This
+			 * means the connection has been closed. Cope.
+			 */
+			if (close_if_error)
+			{
+				add_error_message(conn,
+						"Datanode closed the connection unexpectedly\n"
+						"\tThis probably means the Datanode terminated abnormally\n"
+						"\tbefore or while processing the request.\n");
+				PGXCNodeSetConnectionState(conn,
+						DN_CONNECTION_STATE_ERROR_FATAL); /* No more connection to
+														   * backend */
+				closesocket(conn->sock);
+				conn->sock = NO_SOCKET;
+			}
+			return -1;
+		}
+#endif
+		if (close_if_error)
+			add_error_message(conn, "could not receive data from server");
+		return -1;
+
+	}
+
+	if (nread > 0)
+	{
+		conn->inEnd += nread;
+
+		/*
+		 * Hack to deal with the fact that some kernels will only give us back
+		 * 1 packet per recv() call, even if we asked for more and there is
+		 * more available. If it looks like we are reading a long message,
+		 * loop back to recv() again immediately, until we run out of data or
+		 * buffer space. Without this, the block-and-restart behavior of
+		 * libpq's higher levels leads to O(N^2) performance on long messages.
+		 *
+		 * Since we left-justified the data above, conn->inEnd gives the
+		 * amount of data already read in the current message. We consider
+		 * the message "long" once we have acquired 32k ...
+		 */
+		if (conn->inEnd > 32768 &&
+			(conn->inSize - conn->inEnd) >= 8192)
+		{
+			someread = 1;
+			goto retry;
+		}
+		return 1;
+	}
+
+	if (nread == 0)
+	{
+		/* recv() returning 0 means the peer closed the connection */
+		if (close_if_error)
+			elog(DEBUG1, "nread returned 0");
+		return EOF;
+	}
+
+	if (someread)
+		return 1;				/* got a zero read after successful tries */
+
+	return 0;
+}
+
+
+/*
+ * Get one character from the connection buffer and advance cursor.
+ * Returns 0 on success, EOF when the buffer holds no unread data.
+ */
+static int
+get_char(PGXCNodeHandle * conn, char *out)
+{
+	if (conn->inCursor < conn->inEnd)
+	{
+		*out = conn->inBuffer[conn->inCursor++];
+		return 0;
+	}
+	return EOF;
+}
+
+/*
+ * Read an integer from the connection buffer and advance cursor.
+ * Supports 2- and 4-byte network-order integers; converts to host order.
+ * Returns 0 on success, EOF when not enough data is buffered or the
+ * requested size is unsupported.
+ */
+static int
+get_int(PGXCNodeHandle *conn, size_t len, int *out)
+{
+	unsigned short tmp2;
+	unsigned int tmp4;
+
+	if (conn->inCursor + len > conn->inEnd)
+		return EOF;
+
+	switch (len)
+	{
+		case 2:
+			memcpy(&tmp2, conn->inBuffer + conn->inCursor, 2);
+			conn->inCursor += 2;
+			*out = (int) ntohs(tmp2);
+			break;
+		case 4:
+			memcpy(&tmp4, conn->inBuffer + conn->inCursor, 4);
+			conn->inCursor += 4;
+			*out = (int) ntohl(tmp4);
+			break;
+		default:
+			add_error_message(conn, "not supported int size");
+			return EOF;
+	}
+
+	return 0;
+}
+
+
+/*
+ * get_message
+ * If connection has enough data read entire message from the connection buffer
+ * and returns message type. Message data and data length are returned as
+ * var parameters.
+ * If buffer does not have enough data leaves cursor unchanged, changes
+ * connection status to DN_CONNECTION_STATE_QUERY indicating it needs to
+ * receive more and returns \0
+ * conn - connection to read from
+ * len - returned length of the data where msg is pointing to
+ * msg - returns pointer to memory in the incoming buffer. The buffer probably
+ * will be overwritten upon next receive, so if caller wants to refer it later
+ * it should make a copy.
+ */
+char
+get_message(PGXCNodeHandle *conn, int *len, char **msg)
+{
+	char msgtype;
+
+	if (get_char(conn, &msgtype) || get_int(conn, 4, len))
+	{
+		/* Successful get_char would move cursor, restore position */
+		conn->inCursor = conn->inStart;
+		return '\0';
+	}
+
+	/* The wire length includes its own 4 bytes; keep only payload length */
+	*len -= 4;
+
+	if (conn->inCursor + *len > conn->inEnd)
+	{
+		/*
+		 * Not enough data in the buffer, we should read more.
+		 * Reading function will discard already consumed data in the buffer
+		 * till conn->inStart. Then we want the message that is partly in the
+		 * buffer now has been read completely, to avoid extra read/handle
+		 * cycles. The space needed is 1 byte for message type, 4 bytes for
+		 * message length and message itself which size is currently in *len.
+		 * The buffer may already be large enough, in this case the function
+		 * ensure_in_buffer_capacity() will immediately return
+		 */
+		ensure_in_buffer_capacity(5 + (size_t) *len, conn);
+		conn->inCursor = conn->inStart;
+		return '\0';
+	}
+
+	/* Whole message is buffered: hand out a pointer and consume it */
+	*msg = conn->inBuffer + conn->inCursor;
+	conn->inCursor += *len;
+	conn->inStart = conn->inCursor;
+	return msgtype;
+}
+
+
+/*
+ * Release all Datanode and Coordinator connections
+ * back to pool and release occupied memory
+ *
+ * Does nothing while a cluster lock is held, while prepared statements
+ * exist on remote nodes, or when no connections are in use.  Connections
+ * that are not cleanly idle are closed and flagged for destruction in the
+ * pooler instead of being reused.
+ */
+void
+release_handles(void)
+{
+	bool destroy = false;
+	int i;
+
+	if (HandlesInvalidatePending)
+	{
+		DoInvalidateRemoteHandles();
+		return;
+	}
+
+	/* don't free connection if holding a cluster lock */
+	if (cluster_ex_lock_held)
+		return;
+
+	if (datanode_count == 0 && coord_count == 0)
+		return;
+
+	/* Do not release connections if we have prepared statements on nodes */
+	if (HaveActiveDatanodeStatements())
+		return;
+
+	/* Free Datanodes handles */
+	for (i = 0; i < NumDataNodes; i++)
+	{
+		PGXCNodeHandle *handle = &dn_handles[i];
+
+		if (handle->sock != NO_SOCKET)
+		{
+			/*
+			 * Connections at this point should be completely inactive,
+			 * otherwise abandon them. We can not allow a connection that
+			 * was not cleaned up to be returned to the pool.
+			 */
+			if (handle->state != DN_CONNECTION_STATE_IDLE ||
+				handle->transaction_status != 'I')
+			{
+				destroy = true;
+				elog(DEBUG1, "Connection to Datanode %d has unexpected state %d and will be dropped",
+						handle->nodeoid, handle->state);
+			}
+			pgxc_node_free(handle);
+		}
+	}
+
+	if (IS_PGXC_COORDINATOR)
+	{
+		/* Collect Coordinator handles */
+		for (i = 0; i < NumCoords; i++)
+		{
+			PGXCNodeHandle *handle = &co_handles[i];
+
+			if (handle->sock != NO_SOCKET)
+			{
+				/*
+				 * Connections at this point should be completely inactive,
+				 * otherwise abandon them. We can not allow a connection that
+				 * was not cleaned up to be returned to the pool.
+				 */
+				if (handle->state != DN_CONNECTION_STATE_IDLE ||
+					handle->transaction_status != 'I')
+				{
+					destroy = true;
+					elog(DEBUG1, "Connection to Coordinator %d has unexpected state %d and will be dropped",
+							handle->nodeoid, handle->state);
+				}
+				pgxc_node_free(handle);
+			}
+		}
+	}
+
+	/* And finally release all the connections on pooler */
+	PoolManagerReleaseConnections(destroy);
+
+	datanode_count = 0;
+	coord_count = 0;
+}
+
+/*
+ * Ensure that the supplied buffer has enough capacity and if not, it's
+ * extended to an appropriate size.
+ *
+ * currbuf is the currently used buffer of currsize. bytes_needed is the
+ * minimum size required. We shall return the new buffer, if allocated
+ * successfully and set newsize_p to contain the size of the repalloced buffer.
+ * If allocation fails, NULL is returned.
+ *
+ * The function checks for requests beyond MaxAllocSize and throw an error.
+ */
+static char *
+ensure_buffer_capacity(char *currbuf, size_t currsize, size_t bytes_needed, size_t *newsize_p)
+{
+	char *newbuf;
+	Size newsize = (Size) currsize;
+
+	/*
+	 * NOTE(review): ENOSPC is an errno constant, not a MAKE_SQLSTATE-style
+	 * code produced by errcode() -- confirm this reports the intended
+	 * SQLSTATE.
+	 */
+	if (((Size) bytes_needed) >= MaxAllocSize)
+		ereport(ERROR,
+				(ENOSPC,
+				 errmsg("out of memory"),
+				 errdetail("Cannot enlarge buffer containing %ld bytes by %ld more bytes.",
+						currsize, bytes_needed)));
+
+	/* Fast path: the current buffer is already large enough */
+	if (bytes_needed <= newsize)
+	{
+		*newsize_p = currsize;
+		return currbuf;
+	}
+
+	/*
+	 * The current size of the buffer should never be zero (init_pgxc_handle
+	 * guarantees that.
+	 */
+	Assert(newsize > 0);
+
+	/*
+	 * Double the buffer size until we have enough space to hold bytes_needed
+	 */
+	while (bytes_needed > newsize)
+		newsize = 2 * newsize;
+
+	/*
+	 * Clamp to MaxAllocSize in case we went past it. Note we are assuming
+	 * here that MaxAllocSize <= INT_MAX/2, else the above loop could
+	 * overflow. We will still have newsize >= bytes_needed.
+	 */
+	if (newsize > (int) MaxAllocSize)
+		newsize = (int) MaxAllocSize;
+
+	/*
+	 * NOTE(review): repalloc ereports on failure rather than returning
+	 * NULL, so the fallback paths below look unreachable -- kept as
+	 * defensive code.
+	 */
+	newbuf = repalloc(currbuf, newsize);
+	if (newbuf)
+	{
+		/* repalloc succeeded, set new size and return the buffer */
+		*newsize_p = newsize;
+		return newbuf;
+	}
+
+	/*
+	 * If we fail to double the buffer, try to repalloc a buffer of the given
+	 * size, rounded to the next multiple of 8192 and see if that works.
+	 */
+	newsize = bytes_needed;
+	newsize = ((bytes_needed / 8192) + 1) * 8192;
+
+	newbuf = repalloc(currbuf, newsize);
+	if (newbuf)
+	{
+		/* repalloc succeeded, set new size and return the buffer */
+		*newsize_p = newsize;
+		return newbuf;
+	}
+
+	/* repalloc failed */
+	return NULL;
+}
+
+/*
+ * Ensure specified amount of data can fit to the incoming buffer and
+ * increase it if necessary.
+ * Returns 0 on success, EOF if the buffer could not be grown.
+ */
+int
+ensure_in_buffer_capacity(size_t bytes_needed, PGXCNodeHandle *handle)
+{
+	size_t newsize;
+	char *newbuf = ensure_buffer_capacity(handle->inBuffer, handle->inSize,
+			bytes_needed, &newsize);
+	if (newbuf)
+	{
+		handle->inBuffer = newbuf;
+		handle->inSize = newsize;
+		return 0;
+	}
+	return EOF;
+}
+
+/*
+ * Ensure specified amount of data can fit to the outgoing buffer and
+ * increase it if necessary.
+ * Returns 0 on success, EOF if the buffer could not be grown.
+ */
+int
+ensure_out_buffer_capacity(size_t bytes_needed, PGXCNodeHandle *handle)
+{
+	size_t newsize;
+	char *newbuf = ensure_buffer_capacity(handle->outBuffer, handle->outSize,
+			bytes_needed, &newsize);
+	if (newbuf)
+	{
+		handle->outBuffer = newbuf;
+		handle->outSize = newsize;
+		return 0;
+	}
+	return EOF;
+}
+
+
+/*
+ * Send specified amount of data from the outgoing buffer over the connection
+ *
+ * Sends up to 'len' bytes from the head of handle->outBuffer, shifts any
+ * unsent tail back to the start of the buffer and updates handle->outEnd.
+ * Returns 0 on success, -1 on error (the error text is recorded on the
+ * handle via add_error_message and outEnd is reset to 0 so the partially
+ * built message is discarded).
+ */
+int
+send_some(PGXCNodeHandle *handle, int len)
+{
+	char	   *ptr = handle->outBuffer;
+	int			remaining = handle->outEnd;
+	int			result = 0;		/* always 0; error paths return -1 directly */
+
+	/* while there's still data to send */
+	while (len > 0)
+	{
+		int			sent;
+
+#ifndef WIN32
+		sent = send(handle->sock, ptr, len, 0);
+#else
+		/*
+		 * Windows can fail on large sends, per KB article Q201213. The failure-point
+		 * appears to be different in different versions of Windows, but 64k should
+		 * always be safe.
+		 */
+		sent = send(handle->sock, ptr, Min(len, 65536), 0);
+#endif
+
+		if (sent < 0)
+		{
+			/*
+			 * Anything except EAGAIN/EWOULDBLOCK/EINTR is trouble. If it's
+			 * EPIPE or ECONNRESET, assume we've lost the backend connection
+			 * permanently.
+			 */
+			switch (errno)
+			{
+#ifdef EAGAIN
+				case EAGAIN:
+					break;
+#endif
+#if defined(EWOULDBLOCK) && (!defined(EAGAIN) || (EWOULDBLOCK != EAGAIN))
+				case EWOULDBLOCK:
+					break;
+#endif
+				case EINTR:
+					/* interrupted before sending anything: retry immediately */
+					continue;
+
+				case EPIPE:
+#ifdef ECONNRESET
+				case ECONNRESET:
+#endif
+					add_error_message(handle, "server closed the connection unexpectedly\n"
+					"\tThis probably means the server terminated abnormally\n"
+					"\tbefore or while processing the request.\n");
+
+					/*
+					 * We used to close the socket here, but that's a bad idea
+					 * since there might be unread data waiting (typically, a
+					 * NOTICE message from the backend telling us it's
+					 * committing hara-kiri...). Leave the socket open until
+					 * pqReadData finds no more data can be read. But abandon
+					 * attempt to send data.
+					 */
+					handle->outEnd = 0;
+					return -1;
+
+				default:
+					add_error_message(handle, "could not send data to server");
+					/* We don't assume it's a fatal error... */
+					handle->outEnd = 0;
+					return -1;
+			}
+		}
+		else
+		{
+			/* advance past the bytes the kernel accepted */
+			ptr += sent;
+			len -= sent;
+			remaining -= sent;
+		}
+
+		if (len > 0)
+		{
+			struct pollfd pool_fd;
+			int			poll_ret;
+
+			/*
+			 * Wait for the socket to become ready again to receive more data.
+			 * For some cases, especially while writing large sums of data
+			 * during COPY protocol and when the remote node is not capable of
+			 * handling data at the same speed, we might otherwise go in a
+			 * useless tight loop, consuming all available local resources
+			 *
+			 * Use a small timeout of 1s to avoid infinite wait
+			 */
+			pool_fd.fd = handle->sock;
+			pool_fd.events = POLLOUT;
+
+			poll_ret = poll(&pool_fd, 1, 1000);
+			if (poll_ret < 0)
+			{
+				if (errno == EAGAIN || errno == EINTR)
+					continue;
+				else
+				{
+					add_error_message(handle, "poll failed ");
+					handle->outEnd = 0;
+					return -1;
+				}
+			}
+			else if (poll_ret == 1)
+			{
+				if (pool_fd.revents & POLLHUP)
+				{
+					add_error_message(handle, "remote end disconnected");
+					handle->outEnd = 0;
+					return -1;
+				}
+			}
+			/* poll_ret == 0 is a plain 1s timeout: loop around and retry */
+		}
+	}
+
+	/* shift the remaining contents of the buffer */
+	if (remaining > 0)
+		memmove(handle->outBuffer, ptr, remaining);
+	handle->outEnd = remaining;
+
+	return result;
+}
+
+/*
+ * Send PARSE message with specified statement down to the Datanode
+ *
+ * 'P' message: statement name (empty when NULL), query text, and the type
+ * names of num_params parameters.  Type names rather than OIDs are sent
+ * because type OIDs can differ between nodes (see comment below).
+ * Returns 0 on success, EOF when the output buffer cannot be grown.
+ */
+int
+pgxc_node_send_parse(PGXCNodeHandle * handle, const char* statement,
+					 const char *query, short num_params, Oid *param_types)
+{
+	/* statement name size (allow NULL) */
+	int			stmtLen = statement ? strlen(statement) + 1 : 1;
+	/* size of query string */
+	int			strLen = strlen(query) + 1;
+	char	  **paramTypes = (char **)palloc(sizeof(char *) * num_params);
+	/* total size of parameter type names */
+	int			paramTypeLen;
+	/* message length */
+	int			msgLen;
+	int			cnt_params;
+	/* 16-bit parameter count, converted to network byte order */
+	short		tmp_num_params;
+#ifdef USE_ASSERT_CHECKING
+	size_t		old_outEnd = handle->outEnd;
+#endif
+
+	/* if there are parameters, param_types should exist */
+	Assert(num_params <= 0 || param_types);
+	/* 2 bytes for number of parameters, preceding the type names */
+	paramTypeLen = 2;
+	/* find names of the types of parameters */
+	for (cnt_params = 0; cnt_params < num_params; cnt_params++)
+	{
+		Oid			typeoid;
+
+		/* Parameters with no types are simply ignored */
+		if (OidIsValid(param_types[cnt_params]))
+			typeoid = param_types[cnt_params];
+		else
+			typeoid = INT4OID;
+
+		paramTypes[cnt_params] = format_type_be(typeoid);
+		paramTypeLen += strlen(paramTypes[cnt_params]) + 1;
+	}
+
+	/* size + stmtLen + strlen + paramTypeLen */
+	msgLen = 4 + stmtLen + strLen + paramTypeLen;
+
+	/* msgType + msgLen */
+	if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0)
+	{
+		add_error_message(handle, "out of memory");
+		return EOF;
+	}
+
+	handle->outBuffer[handle->outEnd++] = 'P';
+	/* size */
+	msgLen = htonl(msgLen);
+	memcpy(handle->outBuffer + handle->outEnd, &msgLen, 4);
+	handle->outEnd += 4;
+	/* statement name */
+	if (statement)
+	{
+		memcpy(handle->outBuffer + handle->outEnd, statement, stmtLen);
+		handle->outEnd += stmtLen;
+	}
+	else
+		handle->outBuffer[handle->outEnd++] = '\0';
+	/* query */
+	memcpy(handle->outBuffer + handle->outEnd, query, strLen);
+	handle->outEnd += strLen;
+
+	/*
+	 * parameter count: write via memcpy rather than a 2-byte store through a
+	 * cast pointer -- outEnd is not guaranteed to be even, and an unaligned
+	 * short store is undefined behavior on strict-alignment platforms.  This
+	 * also matches how pgxc_node_send_plan writes the same field.
+	 */
+	Assert(sizeof(tmp_num_params) == 2);
+	tmp_num_params = htons(num_params);
+	memcpy(handle->outBuffer + handle->outEnd, &tmp_num_params, sizeof(tmp_num_params));
+	handle->outEnd += sizeof(tmp_num_params);
+	/*
+	 * instead of parameter ids we should send parameter names (qualified by
+	 * schema name if required). The OIDs of types can be different on
+	 * Datanodes.
+	 */
+	for (cnt_params = 0; cnt_params < num_params; cnt_params++)
+	{
+		memcpy(handle->outBuffer + handle->outEnd, paramTypes[cnt_params],
+			   strlen(paramTypes[cnt_params]) + 1);
+		handle->outEnd += strlen(paramTypes[cnt_params]) + 1;
+		pfree(paramTypes[cnt_params]);
+	}
+	pfree(paramTypes);
+	Assert(old_outEnd + ntohl(msgLen) + 1 == handle->outEnd);
+
+	return 0;
+}
+
+/*
+ * Send PLAN message down to the Data node
+ *
+ * 'p' message: statement name, source query text, serialized plan string and
+ * the type names of num_params parameters (names rather than OIDs, because
+ * type OIDs can differ between nodes).  Returns 0 on success, EOF when the
+ * connection is not idle or the output buffer cannot be grown.
+ */
+int
+pgxc_node_send_plan(PGXCNodeHandle * handle, const char *statement,
+					const char *query, const char *planstr,
+					short num_params, Oid *param_types)
+{
+	int			stmtLen;
+	int			queryLen;
+	int			planLen;
+	int			paramTypeLen;
+	int			msgLen;
+	char	  **paramTypes = (char **)palloc(sizeof(char *) * num_params);
+	int			i;
+	short		tmp_num_params;
+
+	/* Invalid connection state, return error */
+	if (handle->state != DN_CONNECTION_STATE_IDLE)
+		return EOF;
+
+	/* statement name size (do not allow NULL) */
+	stmtLen = strlen(statement) + 1;
+	/* source query size (do not allow NULL) */
+	queryLen = strlen(query) + 1;
+	/* query plan size (do not allow NULL) */
+	planLen = strlen(planstr) + 1;
+	/* 2 bytes for number of parameters, preceding the type names */
+	paramTypeLen = 2;
+	/* find names of the types of parameters */
+	for (i = 0; i < num_params; i++)
+	{
+		paramTypes[i] = format_type_be(param_types[i]);
+		paramTypeLen += strlen(paramTypes[i]) + 1;
+	}
+	/* size + pnameLen + queryLen + parameters */
+	msgLen = 4 + queryLen + stmtLen + planLen + paramTypeLen;
+
+	/* msgType + msgLen */
+	if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0)
+	{
+		add_error_message(handle, "out of memory");
+		return EOF;
+	}
+
+	handle->outBuffer[handle->outEnd++] = 'p';
+	/* size */
+	msgLen = htonl(msgLen);
+	memcpy(handle->outBuffer + handle->outEnd, &msgLen, 4);
+	handle->outEnd += 4;
+	/* statement name */
+	memcpy(handle->outBuffer + handle->outEnd, statement, stmtLen);
+	handle->outEnd += stmtLen;
+	/* source query */
+	memcpy(handle->outBuffer + handle->outEnd, query, queryLen);
+	handle->outEnd += queryLen;
+	/* query plan */
+	memcpy(handle->outBuffer + handle->outEnd, planstr, planLen);
+	handle->outEnd += planLen;
+	/* parameter types (16-bit count in network order, then the names) */
+	tmp_num_params = htons(num_params);
+	memcpy(handle->outBuffer + handle->outEnd, &tmp_num_params, sizeof(tmp_num_params));
+	handle->outEnd += sizeof(tmp_num_params);
+	/*
+	 * instead of parameter ids we should send parameter names (qualified by
+	 * schema name if required). The OIDs of types can be different on
+	 * datanodes.
+	 */
+	for (i = 0; i < num_params; i++)
+	{
+		int			plen = strlen(paramTypes[i]) + 1;
+		memcpy(handle->outBuffer + handle->outEnd, paramTypes[i], plen);
+		handle->outEnd += plen;
+		pfree(paramTypes[i]);
+	}
+	pfree(paramTypes);
+
+	/* this connection now runs the extended-query protocol */
+	handle->in_extended_query = true;
+	return 0;
+}
+
+/*
+ * Send BIND message down to the Datanode
+ *
+ * 'B' message: portal name, statement name, an empty parameter-format-code
+ * array, the pre-serialized parameter values ('params' holding 'paramlen'
+ * bytes, or an empty 2-byte count when paramlen is 0), and an empty
+ * result-format-code array.  Returns 0 on success, EOF when the connection
+ * is not idle or the output buffer cannot be grown.
+ */
+int
+pgxc_node_send_bind(PGXCNodeHandle * handle, const char *portal,
+					const char *statement, int paramlen, char *params)
+{
+	int			pnameLen;
+	int			stmtLen;
+	int			paramCodeLen;
+	int			paramValueLen;
+	int			paramOutLen;
+	int			msgLen;
+
+	/* Invalid connection state, return error */
+	if (handle->state != DN_CONNECTION_STATE_IDLE)
+		return EOF;
+
+	/* portal name size (allow NULL) */
+	pnameLen = portal ? strlen(portal) + 1 : 1;
+	/* statement name size (allow NULL) */
+	stmtLen = statement ? strlen(statement) + 1 : 1;
+	/* size of parameter codes array (always empty for now) */
+	paramCodeLen = 2;
+	/* size of parameter values array, 2 if no params */
+	paramValueLen = paramlen ? paramlen : 2;
+	/* size of output parameter codes array (always empty for now) */
+	paramOutLen = 2;
+	/* size + pnameLen + stmtLen + parameters */
+	msgLen = 4 + pnameLen + stmtLen + paramCodeLen + paramValueLen + paramOutLen;
+
+	/* msgType + msgLen */
+	if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0)
+	{
+		add_error_message(handle, "out of memory");
+		return EOF;
+	}
+
+	handle->outBuffer[handle->outEnd++] = 'B';
+	/* size */
+	msgLen = htonl(msgLen);
+	memcpy(handle->outBuffer + handle->outEnd, &msgLen, 4);
+	handle->outEnd += 4;
+	/* portal name (empty string when NULL) */
+	if (portal)
+	{
+		memcpy(handle->outBuffer + handle->outEnd, portal, pnameLen);
+		handle->outEnd += pnameLen;
+	}
+	else
+		handle->outBuffer[handle->outEnd++] = '\0';
+	/* statement name (empty string when NULL) */
+	if (statement)
+	{
+		memcpy(handle->outBuffer + handle->outEnd, statement, stmtLen);
+		handle->outEnd += stmtLen;
+	}
+	else
+		handle->outBuffer[handle->outEnd++] = '\0';
+	/* parameter codes (none): 16-bit zero count */
+	handle->outBuffer[handle->outEnd++] = 0;
+	handle->outBuffer[handle->outEnd++] = 0;
+	/* parameter values: caller-serialized blob, or 16-bit zero count */
+	if (paramlen)
+	{
+		memcpy(handle->outBuffer + handle->outEnd, params, paramlen);
+		handle->outEnd += paramlen;
+	}
+	else
+	{
+		handle->outBuffer[handle->outEnd++] = 0;
+		handle->outBuffer[handle->outEnd++] = 0;
+	}
+	/* output parameter codes (none): 16-bit zero count */
+	handle->outBuffer[handle->outEnd++] = 0;
+	handle->outBuffer[handle->outEnd++] = 0;
+
+	/* this connection now runs the extended-query protocol */
+	handle->in_extended_query = true;
+	return 0;
+}
+
+
+/*
+ * Send DESCRIBE message (portal or statement) down to the Datanode
+ *
+ * 'D' message: a one-byte flag ('S' for statement, 'P' for portal) followed
+ * by the object name (empty string when name is NULL).
+ */
+int
+pgxc_node_send_describe(PGXCNodeHandle * handle, bool is_statement,
+						const char *name)
+{
+	int			nameLen = name ? strlen(name) + 1 : 1;
+	int			msgLen = 4 + 1 + nameLen;	/* size word + flag + name */
+	uint32		netLen;
+
+	/* refuse to queue anything on a busy or broken connection */
+	if (handle->state != DN_CONNECTION_STATE_IDLE)
+		return EOF;
+
+	/* make room for the message type byte plus the message itself */
+	if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0)
+	{
+		add_error_message(handle, "out of memory");
+		return EOF;
+	}
+
+	handle->outBuffer[handle->outEnd++] = 'D';
+	netLen = htonl(msgLen);
+	memcpy(handle->outBuffer + handle->outEnd, &netLen, 4);
+	handle->outEnd += 4;
+	handle->outBuffer[handle->outEnd++] = is_statement ? 'S' : 'P';
+	if (name == NULL)
+		handle->outBuffer[handle->outEnd++] = '\0';
+	else
+	{
+		memcpy(handle->outBuffer + handle->outEnd, name, nameLen);
+		handle->outEnd += nameLen;
+	}
+
+	handle->in_extended_query = true;
+	return 0;
+}
+
+
+/*
+ * Send CLOSE message (portal or statement) down to the Datanode
+ *
+ * 'C' message: a one-byte flag ('S' for statement, 'P' for portal) followed
+ * by the object name (empty string when name is NULL).
+ */
+int
+pgxc_node_send_close(PGXCNodeHandle * handle, bool is_statement,
+					 const char *name)
+{
+	int			nameLen = name ? strlen(name) + 1 : 1;
+	int			msgLen = 4 + 1 + nameLen;	/* size word + flag + name */
+	uint32		netLen;
+
+	/* make room for the message type byte plus the message itself */
+	if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0)
+	{
+		add_error_message(handle, "out of memory");
+		return EOF;
+	}
+
+	handle->outBuffer[handle->outEnd++] = 'C';
+	netLen = htonl(msgLen);
+	memcpy(handle->outBuffer + handle->outEnd, &netLen, 4);
+	handle->outEnd += 4;
+	handle->outBuffer[handle->outEnd++] = is_statement ? 'S' : 'P';
+	if (name == NULL)
+		handle->outBuffer[handle->outEnd++] = '\0';
+	else
+	{
+		memcpy(handle->outBuffer + handle->outEnd, name, nameLen);
+		handle->outEnd += nameLen;
+	}
+
+	handle->in_extended_query = true;
+	return 0;
+}
+
+/*
+ * Send EXECUTE message down to the Datanode
+ *
+ * 'E' message: portal name (empty string when NULL) and the row fetch count,
+ * both in PostgreSQL wire format.  Marks the connection as running a query.
+ */
+int
+pgxc_node_send_execute(PGXCNodeHandle * handle, const char *portal, int fetch)
+{
+	int			pnameLen = portal ? strlen(portal) + 1 : 1;
+	int			msgLen = 4 + pnameLen + 4;	/* size word + portal + fetch */
+	uint32		netVal;
+
+	/* make room for the message type byte plus the message itself */
+	if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0)
+	{
+		add_error_message(handle, "out of memory");
+		return EOF;
+	}
+
+	handle->outBuffer[handle->outEnd++] = 'E';
+	netVal = htonl(msgLen);
+	memcpy(handle->outBuffer + handle->outEnd, &netVal, 4);
+	handle->outEnd += 4;
+	if (portal == NULL)
+		handle->outBuffer[handle->outEnd++] = '\0';
+	else
+	{
+		memcpy(handle->outBuffer + handle->outEnd, portal, pnameLen);
+		handle->outEnd += pnameLen;
+	}
+	/* fetch count, network byte order */
+	netVal = htonl(fetch);
+	memcpy(handle->outBuffer + handle->outEnd, &netVal, 4);
+	handle->outEnd += 4;
+
+	PGXCNodeSetConnectionState(handle, DN_CONNECTION_STATE_QUERY);
+
+	handle->in_extended_query = true;
+	return 0;
+}
+
+
+/*
+ * Send FLUSH message down to the Datanode and push all buffered data
+ * out on the socket.
+ */
+int
+pgxc_node_send_flush(PGXCNodeHandle * handle)
+{
+	int			msgLen = 4;		/* header-only message: just the length word */
+	uint32		netLen;
+
+	/* make room for the message type byte plus the length word */
+	if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0)
+	{
+		add_error_message(handle, "out of memory");
+		return EOF;
+	}
+
+	handle->outBuffer[handle->outEnd++] = 'H';
+	netLen = htonl(msgLen);
+	memcpy(handle->outBuffer + handle->outEnd, &netLen, 4);
+	handle->outEnd += 4;
+
+	handle->in_extended_query = true;
+	return pgxc_node_flush(handle);
+}
+
+
+/*
+ * Send SYNC message down to the Datanode and push all buffered data
+ * out on the socket.
+ */
+int
+pgxc_node_send_sync(PGXCNodeHandle * handle)
+{
+	int			msgLen = 4;		/* header-only message: just the length word */
+	uint32		netLen;
+
+	/* make room for the message type byte plus the length word */
+	if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0)
+	{
+		add_error_message(handle, "out of memory");
+		return EOF;
+	}
+
+	handle->outBuffer[handle->outEnd++] = 'S';
+	netLen = htonl(msgLen);
+	memcpy(handle->outBuffer + handle->outEnd, &netLen, 4);
+	handle->outEnd += 4;
+
+	/* Sync ends the extended-query sequence and satisfies any pending sync */
+	handle->in_extended_query = false;
+	handle->needSync = false;
+
+	return pgxc_node_flush(handle);
+}
+
+
+/*
+ * Send series of Extended Query protocol messages to the data node
+ *
+ * Emits Parse (skipped when query is NULL, meaning the statement is already
+ * prepared), Bind, optionally Describe of the portal, optionally Execute
+ * (when fetch_size >= 0), and finally Flush.  Returns 0 on success, EOF as
+ * soon as any single message cannot be sent.
+ */
+int
+pgxc_node_send_query_extended(PGXCNodeHandle *handle, const char *query,
+							  const char *statement, const char *portal,
+							  int num_params, Oid *param_types,
+							  int paramlen, char *params,
+							  bool send_describe, int fetch_size)
+{
+	if (query != NULL &&
+		pgxc_node_send_parse(handle, statement, query, num_params, param_types) != 0)
+		return EOF;
+
+	if (pgxc_node_send_bind(handle, portal, statement, paramlen, params) != 0)
+		return EOF;
+
+	if (send_describe &&
+		pgxc_node_send_describe(handle, false, portal) != 0)
+		return EOF;
+
+	if (fetch_size >= 0 &&
+		pgxc_node_send_execute(handle, portal, fetch_size) != 0)
+		return EOF;
+
+	if (pgxc_node_send_flush(handle) != 0)
+		return EOF;
+
+	return 0;
+}
+
+
+/*
+ * This method won't return until connection buffer is empty or error occurs
+ * To ensure all data are on the wire before waiting for response
+ */
+int
+pgxc_node_flush(PGXCNodeHandle *handle)
+{
+	while (handle->outEnd != 0)
+	{
+		if (send_some(handle, handle->outEnd) >= 0)
+			continue;
+
+		add_error_message(handle, "failed to send data to datanode");
+
+		/*
+		 * before returning, also update the shared health
+		 * status field to indicate that this node could be
+		 * possibly unavailable.
+		 *
+		 * Note that this error could be due to a stale handle
+		 * and it's possible that another backend might have
+		 * already updated the health status OR the node
+		 * might have already come back since the last disruption
+		 */
+		PoolPingNodeRecheck(handle->nodeoid);
+		return EOF;
+	}
+	return 0;
+}
+
+/*
+ * This method won't return until network buffer is empty or error occurs
+ * To ensure all data in network buffers is read and wasted
+ */
+void
+pgxc_node_flush_read(PGXCNodeHandle *handle)
+{
+	if (handle == NULL)
+		return;
+
+	/*
+	 * Before reading input send Sync to make sure
+	 * we will eventually receive ReadyForQuery
+	 */
+	pgxc_node_send_sync(handle);
+
+	for (;;)
+	{
+		/* stop on read error ... */
+		if (pgxc_node_read_data(handle, false) < 0)
+			return;
+		/* ... or as soon as the node reports itself ready */
+		if (is_data_node_ready(handle))
+			return;
+	}
+}
+
+/*
+ * Send specified statement down to the PGXC node
+ *
+ * Queues a simple-protocol 'Q' message and flushes the connection.  Only a
+ * rollback may go out over a connection in fatal error state; everything
+ * else requires an idle connection.
+ */
+static int
+pgxc_node_send_query_internal(PGXCNodeHandle * handle, const char *query,
+		bool rollback)
+{
+	int			queryLen;
+	int			msgLen;
+
+	/*
+	 * Its appropriate to send ROLLBACK commands on a failed connection, but
+	 * for everything else we expect the connection to be in a sane state
+	 */
+	elog(DEBUG5, "pgxc_node_send_query - handle->state %d, node %s, query %s",
+			handle->state, handle->nodename, query);
+	if (handle->state != DN_CONNECTION_STATE_IDLE &&
+		!(rollback && handle->state == DN_CONNECTION_STATE_ERROR_FATAL))
+		return EOF;
+
+	queryLen = strlen(query) + 1;
+	/* length word plus NUL-terminated query text */
+	msgLen = 4 + queryLen;
+
+	/* make room for the message type byte plus the message itself */
+	if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0)
+	{
+		add_error_message(handle, "out of memory");
+		return EOF;
+	}
+
+	handle->outBuffer[handle->outEnd++] = 'Q';
+	msgLen = htonl(msgLen);
+	memcpy(handle->outBuffer + handle->outEnd, &msgLen, 4);
+	handle->outEnd += 4;
+	memcpy(handle->outBuffer + handle->outEnd, query, queryLen);
+	handle->outEnd += queryLen;
+
+	PGXCNodeSetConnectionState(handle, DN_CONNECTION_STATE_QUERY);
+
+	handle->in_extended_query = false;
+	return pgxc_node_flush(handle);
+}
+
+/*
+ * Send a statement over a connection that may be in fatal error state;
+ * used for ROLLBACK commands (see pgxc_node_send_query_internal).
+ */
+int
+pgxc_node_send_rollback(PGXCNodeHandle *handle, const char *query)
+{
+	return pgxc_node_send_query_internal(handle, query, true);
+}
+
+/*
+ * Send specified statement down to the PGXC node; the connection must be
+ * idle (see pgxc_node_send_query_internal).
+ */
+int
+pgxc_node_send_query(PGXCNodeHandle *handle, const char *query)
+{
+	return pgxc_node_send_query_internal(handle, query, false);
+}
+
+
+/*
+ * Send the GXID down to the PGXC node
+ *
+ * 'g' message: a 4-byte length word followed by the TransactionId, which is
+ * copied raw (no byte-order conversion is applied to the xid itself).
+ */
+int
+pgxc_node_send_gxid(PGXCNodeHandle *handle, GlobalTransactionId gxid)
+{
+	int			msglen = 8;		/* length word + TransactionId */
+	uint32		netlen;
+	char	   *wptr;
+
+	/* Invalid connection state, return error */
+	if (handle->state != DN_CONNECTION_STATE_IDLE)
+		return EOF;
+
+	/* make room for the message type byte plus the message itself */
+	if (ensure_out_buffer_capacity(handle->outEnd + 1 + msglen, handle) != 0)
+	{
+		add_error_message(handle, "out of memory");
+		return EOF;
+	}
+
+	/* build the message through a write cursor, then commit the new end */
+	wptr = handle->outBuffer + handle->outEnd;
+	*wptr++ = 'g';
+	netlen = htonl(msglen);
+	memcpy(wptr, &netlen, 4);
+	wptr += 4;
+	memcpy(wptr, &gxid, sizeof(TransactionId));
+	wptr += sizeof(TransactionId);
+	handle->outEnd = wptr - handle->outBuffer;
+
+	return 0;
+}
+
+/*
+ * Send the Command ID down to the PGXC node
+ *
+ * 'M' message carrying the current command id in network byte order.
+ * Silently succeeds (returns 0) when command-id sending is disabled.
+ */
+int
+pgxc_node_send_cmd_id(PGXCNodeHandle *handle, CommandId cid)
+{
+	int			msglen = CMD_ID_MSG_LEN;
+	int			netCid;
+
+	/* No need to send command ID if its sending flag is not enabled */
+	if (!IsSendCommandId())
+		return 0;
+
+	/* Invalid connection state, return error */
+	if (handle->state != DN_CONNECTION_STATE_IDLE)
+		return EOF;
+
+	/* make room for the message type byte plus the message itself */
+	if (ensure_out_buffer_capacity(handle->outEnd + 1 + msglen, handle) != 0)
+	{
+		add_error_message(handle, "out of memory");
+		return EOF;
+	}
+
+	handle->outBuffer[handle->outEnd++] = 'M';
+	msglen = htonl(msglen);
+	memcpy(handle->outBuffer + handle->outEnd, &msglen, 4);
+	handle->outEnd += 4;
+	netCid = htonl(cid);
+	memcpy(handle->outBuffer + handle->outEnd, &netCid, 4);
+	handle->outEnd += 4;
+
+	return 0;
+}
+
+/*
+ * Send the snapshot down to the PGXC node
+ *
+ * 's' message: xmin, xmax, RecentGlobalXmin (each copied as a raw
+ * TransactionId), then the xip count in network byte order, then the xip
+ * array (again raw TransactionIds).
+ *
+ * NOTE(review): the xid fields are written without htonl while xcnt is
+ * converted -- presumably both ends share endianness; verify against the
+ * receiving side before changing anything here.
+ */
+int
+pgxc_node_send_snapshot(PGXCNodeHandle *handle, Snapshot snapshot)
+{
+	int			msglen;
+	int			nval;
+	int			i;
+
+	/* Invalid connection state, return error */
+	if (handle->state != DN_CONNECTION_STATE_IDLE)
+		return EOF;
+
+	/* calculate message length: 4 length + 3 xids + 4 count, plus xip array */
+	msglen = 20;
+	if (snapshot->xcnt > 0)
+		msglen += snapshot->xcnt * 4;
+
+	/* msgType + msgLen */
+	if (ensure_out_buffer_capacity(handle->outEnd + 1 + msglen, handle) != 0)
+	{
+		add_error_message(handle, "out of memory");
+		return EOF;
+	}
+
+	handle->outBuffer[handle->outEnd++] = 's';
+	msglen = htonl(msglen);
+	memcpy(handle->outBuffer + handle->outEnd, &msglen, 4);
+	handle->outEnd += 4;
+
+	memcpy(handle->outBuffer + handle->outEnd, &snapshot->xmin, sizeof (TransactionId));
+	handle->outEnd += sizeof (TransactionId);
+
+	memcpy(handle->outBuffer + handle->outEnd, &snapshot->xmax, sizeof (TransactionId));
+	handle->outEnd += sizeof (TransactionId);
+
+	memcpy(handle->outBuffer + handle->outEnd, &RecentGlobalXmin, sizeof (TransactionId));
+	handle->outEnd += sizeof (TransactionId);
+
+	/* number of in-progress xids, network byte order */
+	nval = htonl(snapshot->xcnt);
+	memcpy(handle->outBuffer + handle->outEnd, &nval, 4);
+	handle->outEnd += 4;
+
+	for (i = 0; i < snapshot->xcnt; i++)
+	{
+		memcpy(handle->outBuffer + handle->outEnd, &snapshot->xip[i], sizeof
+				(TransactionId));
+		handle->outEnd += sizeof (TransactionId);
+	}
+
+	return 0;
+}
+
+/*
+ * Send the timestamp down to the PGXC node
+ *
+ * 't' message: the 8-byte timestamp value split into two 4-byte words in
+ * network byte order, high-order half first.
+ */
+int
+pgxc_node_send_timestamp(PGXCNodeHandle *handle, TimestampTz timestamp)
+{
+	int			msglen = 12; /* 4 bytes for msglen and 8 bytes for timestamp (int64) */
+	uint32		n32;
+	int64		i = (int64) timestamp;
+
+	/* Invalid connection state, return error */
+	if (handle->state != DN_CONNECTION_STATE_IDLE)
+		return EOF;
+
+	/* msgType + msgLen */
+	if (ensure_out_buffer_capacity(handle->outEnd + 1 + msglen, handle) != 0)
+	{
+		add_error_message(handle, "out of memory");
+		return EOF;
+	}
+	handle->outBuffer[handle->outEnd++] = 't';
+	msglen = htonl(msglen);
+	memcpy(handle->outBuffer + handle->outEnd, &msglen, 4);
+	handle->outEnd += 4;
+
+	/* High order half first */
+#ifdef INT64_IS_BUSTED
+	/* don't try a right shift of 32 on a 32-bit word */
+	n32 = (i < 0) ? -1 : 0;
+#else
+	n32 = (uint32) (i >> 32);
+#endif
+	n32 = htonl(n32);
+	memcpy(handle->outBuffer + handle->outEnd, &n32, 4);
+	handle->outEnd += 4;
+
+	/* Now the low order half */
+	n32 = (uint32) i;
+	n32 = htonl(n32);
+	memcpy(handle->outBuffer + handle->outEnd, &n32, 4);
+	handle->outEnd += 4;
+
+	return 0;
+}
+
+
+/*
+ * Add another message to the list of errors to be returned back to the client
+ * at the convenient time
+ *
+ * Also logs the message and marks the handle's transaction as failed ('E').
+ * Only the first error is kept on the handle; later ones are logged but
+ * otherwise dropped (see PGXCTODO below).
+ */
+void
+add_error_message(PGXCNodeHandle *handle, const char *message)
+{
+	elog(LOG, "Remote node \"%s\", running with pid %d returned an error: %s",
+			handle->nodename, handle->backend_pid, message);
+	handle->transaction_status = 'E';
+	if (handle->error)
+	{
+		/* PGXCTODO append */
+	}
+	else
+		handle->error = pstrdup(message);
+}
+
+/* Round-robin cursor over datanode handles, used for basic load balancing */
+static int load_balancer = 0;
+/*
+ * Get one of the specified nodes to query replicated data source.
+ * If session already owns one or more of the requested connection,
+ * the function returns existing one to avoid contacting pooler.
+ * Performs basic load balancing.
+ */
+PGXCNodeHandle *
+get_any_handle(List *datanodelist)
+{
+	ListCell   *lc1;
+	int			i, node;
+
+	/* sanity check */
+	Assert(list_length(datanodelist) > 0);
+
+	/* a pending cluster reconfiguration aborts the transaction */
+	if (HandlesInvalidatePending)
+		if (DoInvalidateRemoteHandles())
+			ereport(ERROR,
+					(errcode(ERRCODE_QUERY_CANCELED),
+					 errmsg("canceling transaction due to cluster configuration reset by administrator command")));
+
+	if (HandlesRefreshPending)
+		if (DoRefreshRemoteHandles())
+			ereport(ERROR,
+					(errcode(ERRCODE_QUERY_CANCELED),
+					 errmsg("canceling transaction due to cluster configuration reset by administrator command")));
+
+	/* loop through local datanode handles */
+	for (i = 0, node = load_balancer; i < NumDataNodes; i++, node++)
+	{
+		/* At the moment node is an index in the array, and we may need to wrap it */
+		if (node >= NumDataNodes)
+			node -= NumDataNodes;
+		/* See if handle is already used */
+		if (dn_handles[node].sock != NO_SOCKET)
+		{
+			foreach(lc1, datanodelist)
+			{
+				if (lfirst_int(lc1) == node)
+				{
+					/*
+					 * The node is in the list of requested nodes,
+					 * set load_balancer for next time and return the handle
+					 */
+					load_balancer = node + 1;
+					return &dn_handles[node];
+				}
+			}
+		}
+	}
+
+	/*
+	 * None of requested nodes is in use, need to get one from the pool.
+	 * Choose one.
+	 */
+	for (i = 0, node = load_balancer; i < NumDataNodes; i++, node++)
+	{
+		/* At the moment node is an index in the array, and we may need to wrap it */
+		if (node >= NumDataNodes)
+			node -= NumDataNodes;
+		/* Look only at empty slots, we have already checked existing handles */
+		if (dn_handles[node].sock == NO_SOCKET)
+		{
+			foreach(lc1, datanodelist)
+			{
+				if (lfirst_int(lc1) == node)
+				{
+					/* The node is requested */
+					List	   *allocate = list_make1_int(node);
+					int		   *pids;
+					int		   *fds = PoolManagerGetConnections(allocate, NIL,
+							&pids);
+					PGXCNodeHandle		*node_handle;
+
+					if (!fds)
+					{
+						Assert(pids != NULL);
+						ereport(ERROR,
+								(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+								 errmsg("Failed to get pooled connections"),
+								 errhint("This may happen because one or more nodes are "
+									 "currently unreachable, either because of node or "
+									 "network failure.\n Its also possible that the target node "
+									 "may have hit the connection limit or the pooler is "
+									 "configured with low connections.\n Please check "
+									 "if all nodes are running fine and also review "
+									 "max_connections and max_pool_size configuration "
+									 "parameters")));
+					}
+					node_handle = &dn_handles[node];
+					pgxc_node_init(node_handle, fds[0], true, pids[0]);
+					datanode_count++;
+
+					/*
+					 * NOTE(review): the fds/pids arrays returned by
+					 * PoolManagerGetConnections are not pfree'd here --
+					 * presumably reclaimed with the memory context; verify.
+					 */
+					elog(DEBUG1, "Established a connection with datanode \"%s\","
+							"remote backend PID %d, socket fd %d, global session %c",
+							node_handle->nodename, (int) pids[0], fds[0], 'T');
+
+					/*
+					 * set load_balancer for next time and return the handle
+					 */
+					load_balancer = node + 1;
+					return &dn_handles[node];
+				}
+			}
+		}
+	}
+
+	/* We should not get here, one of the cases should be met */
+	Assert(false);
+	/* Keep compiler quiet */
+	return NULL;
+}
+
+/*
+ * for specified list return array of PGXCNodeHandles
+ * acquire from pool if needed.
+ * the length of returned array is the same as of nodelist
+ * For Datanodes, Special case is empty or NIL nodeList, in this case return all the nodes.
+ * The returned list should be pfree'd when no longer needed.
+ * For Coordinator, do not get a connection if Coordinator list is NIL,
+ * Coordinator fds is returned only if transaction uses a DDL
+ */
+PGXCNodeAllHandles *
+get_handles(List *datanodelist, List *coordlist, bool is_coord_only_query, bool is_global_session)
+{
+	PGXCNodeAllHandles	*result;
+	ListCell   *node_list_item;
+	List	   *dn_allocate = NIL;
+	List	   *co_allocate = NIL;
+	PGXCNodeHandle		*node_handle;
+
+	/* index of the result array */
+	int			i = 0;
+
+	/* a pending cluster reconfiguration aborts the transaction */
+	if (HandlesInvalidatePending)
+		if (DoInvalidateRemoteHandles())
+			ereport(ERROR,
+					(errcode(ERRCODE_QUERY_CANCELED),
+					 errmsg("canceling transaction due to cluster configuration reset by administrator command")));
+
+	if (HandlesRefreshPending)
+		if (DoRefreshRemoteHandles())
+			ereport(ERROR,
+					(errcode(ERRCODE_QUERY_CANCELED),
+					 errmsg("canceling transaction due to cluster configuration reset by administrator command")));
+
+	result = (PGXCNodeAllHandles *) palloc(sizeof(PGXCNodeAllHandles));
+	if (!result)
+	{
+		ereport(ERROR,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("out of memory")));
+	}
+
+	/*
+	 * NOTE(review): for a NIL datanodelist all NumDataNodes handles are
+	 * returned below, yet dn_conn_count stays list_length(datanodelist) == 0;
+	 * verify callers of the NIL case rely on that.
+	 */
+	result->primary_handle = NULL;
+	result->datanode_handles = NULL;
+	result->coord_handles = NULL;
+	result->co_conn_count = list_length(coordlist);
+	result->dn_conn_count = list_length(datanodelist);
+
+	/*
+	 * Get Handles for Datanodes
+	 * If node list is empty execute request on current nodes.
+	 * It is also possible that the query has to be launched only on Coordinators.
+	 */
+	if (!is_coord_only_query)
+	{
+		if (list_length(datanodelist) == 0)
+		{
+			/*
+			 * We do not have to zero the array - on success all items will be set
+			 * to correct pointers, on error the array will be freed
+			 */
+			result->datanode_handles = (PGXCNodeHandle **)
+				palloc(NumDataNodes * sizeof(PGXCNodeHandle *));
+			if (!result->datanode_handles)
+			{
+				ereport(ERROR,
+						(errcode(ERRCODE_OUT_OF_MEMORY),
+						 errmsg("out of memory")));
+			}
+
+			for (i = 0; i < NumDataNodes; i++)
+			{
+				node_handle = &dn_handles[i];
+				result->datanode_handles[i] = node_handle;
+				if (node_handle->sock == NO_SOCKET)
+					dn_allocate = lappend_int(dn_allocate, i);
+			}
+		}
+		else
+		{
+			/*
+			 * We do not have to zero the array - on success all items will be set
+			 * to correct pointers, on error the array will be freed
+			 */
+
+			result->datanode_handles = (PGXCNodeHandle **)
+				palloc(list_length(datanodelist) * sizeof(PGXCNodeHandle *));
+			if (!result->datanode_handles)
+			{
+				ereport(ERROR,
+						(errcode(ERRCODE_OUT_OF_MEMORY),
+						 errmsg("out of memory")));
+			}
+
+			i = 0;
+			foreach(node_list_item, datanodelist)
+			{
+				int			node = lfirst_int(node_list_item);
+
+				if (node < 0 || node >= NumDataNodes)
+				{
+					ereport(ERROR,
+							(errcode(ERRCODE_OUT_OF_MEMORY),
+							 errmsg("Invalid Datanode number")));
+				}
+
+				node_handle = &dn_handles[node];
+				result->datanode_handles[i++] = node_handle;
+				if (node_handle->sock == NO_SOCKET)
+					dn_allocate = lappend_int(dn_allocate, node);
+			}
+		}
+	}
+
+	/*
+	 * Get Handles for Coordinators
+	 * If node list is empty execute request on current nodes
+	 * There are transactions where the Coordinator list is NULL Ex:COPY
+	 */
+
+	if (coordlist)
+	{
+		if (list_length(coordlist) == 0)
+		{
+			/*
+			 * We do not have to zero the array - on success all items will be set
+			 * to correct pointers, on error the array will be freed
+			 */
+			result->coord_handles = (PGXCNodeHandle **)palloc(NumCoords * sizeof(PGXCNodeHandle *));
+			if (!result->coord_handles)
+			{
+				ereport(ERROR,
+						(errcode(ERRCODE_OUT_OF_MEMORY),
+						 errmsg("out of memory")));
+			}
+
+			for (i = 0; i < NumCoords; i++)
+			{
+				node_handle = &co_handles[i];
+				result->coord_handles[i] = node_handle;
+				if (node_handle->sock == NO_SOCKET)
+					co_allocate = lappend_int(co_allocate, i);
+			}
+		}
+		else
+		{
+			/*
+			 * We do not have to zero the array - on success all items will be set
+			 * to correct pointers, on error the array will be freed
+			 */
+			result->coord_handles = (PGXCNodeHandle **)
+				palloc(list_length(coordlist) * sizeof(PGXCNodeHandle *));
+			if (!result->coord_handles)
+			{
+				ereport(ERROR,
+						(errcode(ERRCODE_OUT_OF_MEMORY),
+						 errmsg("out of memory")));
+			}
+
+			i = 0;
+			/* Some transactions do not need Coordinators, ex: COPY */
+			foreach(node_list_item, coordlist)
+			{
+				int			node = lfirst_int(node_list_item);
+
+				if (node < 0 || node >= NumCoords)
+				{
+					ereport(ERROR,
+							(errcode(ERRCODE_OUT_OF_MEMORY),
+							 errmsg("Invalid coordinator number")));
+				}
+
+				node_handle = &co_handles[node];
+
+				result->coord_handles[i++] = node_handle;
+				if (node_handle->sock == NO_SOCKET)
+					co_allocate = lappend_int(co_allocate, node);
+			}
+		}
+	}
+
+	/*
+	 * Pooler can get activated even if list of Coordinator or Datanode is NULL
+	 * If both lists are NIL, we don't need to call Pooler.
+	 */
+	if (dn_allocate || co_allocate)
+	{
+		int			j = 0;
+		int		   *pids;
+		int		   *fds = PoolManagerGetConnections(dn_allocate, co_allocate, &pids);
+
+		if (!fds)
+		{
+			if (coordlist)
+				if (result->coord_handles)
+					pfree(result->coord_handles);
+			if (datanodelist)
+				if (result->datanode_handles)
+					pfree(result->datanode_handles);
+
+			pfree(result);
+			if (dn_allocate)
+				list_free(dn_allocate);
+			if (co_allocate)
+				list_free(co_allocate);
+			ereport(ERROR,
+					(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+					 errmsg("Failed to get pooled connections"),
+					 errhint("This may happen because one or more nodes are "
+						 "currently unreachable, either because of node or "
+						 "network failure.\n Its also possible that the target node "
+						 "may have hit the connection limit or the pooler is "
+						 "configured with low connections.\n Please check "
+						 "if all nodes are running fine and also review "
+						 "max_connections and max_pool_size configuration "
+						 "parameters")));
+		}
+		/* Initialisation for Datanodes */
+		if (dn_allocate)
+		{
+			foreach(node_list_item, dn_allocate)
+			{
+				int			node = lfirst_int(node_list_item);
+				int			fdsock = fds[j];
+				int			be_pid = pids[j++];
+
+				if (node < 0 || node >= NumDataNodes)
+				{
+					ereport(ERROR,
+							(errcode(ERRCODE_OUT_OF_MEMORY),
+							 errmsg("Invalid Datanode number")));
+				}
+
+				/*
+				 * node_handle points directly into dn_handles[], so
+				 * initializing it in place is all that is needed (the old
+				 * "dn_handles[node] = *node_handle" self-assignment was a
+				 * no-op and has been removed).
+				 */
+				node_handle = &dn_handles[node];
+				pgxc_node_init(node_handle, fdsock, is_global_session, be_pid);
+				datanode_count++;
+
+				elog(DEBUG1, "Established a connection with datanode \"%s\","
+						"remote backend PID %d, socket fd %d, global session %c",
+						node_handle->nodename, (int) be_pid, fdsock,
+						is_global_session ? 'T' : 'F');
+			}
+		}
+		/* Initialisation for Coordinators */
+		if (co_allocate)
+		{
+			foreach(node_list_item, co_allocate)
+			{
+				int			node = lfirst_int(node_list_item);
+				int			be_pid = pids[j];
+				int			fdsock = fds[j++];
+
+				if (node < 0 || node >= NumCoords)
+				{
+					ereport(ERROR,
+							(errcode(ERRCODE_OUT_OF_MEMORY),
+							 errmsg("Invalid coordinator number")));
+				}
+
+				/* in-place init; the old self-assignment was a no-op */
+				node_handle = &co_handles[node];
+				pgxc_node_init(node_handle, fdsock, is_global_session, be_pid);
+				coord_count++;
+
+				elog(DEBUG1, "Established a connection with coordinator \"%s\","
+						"remote backend PID %d, socket fd %d, global session %c",
+						node_handle->nodename, (int) be_pid, fdsock,
+						is_global_session ? 'T' : 'F');
+			}
+		}
+
+		pfree(fds);
+
+		if (co_allocate)
+			list_free(co_allocate);
+		if (dn_allocate)
+			list_free(dn_allocate);
+	}
+
+	return result;
+}
+
+/*
+ * get_current_handles
+ *	Build a PGXCNodeAllHandles set referencing every Coordinator and
+ *	Datanode handle whose socket is currently open (sock != NO_SOCKET).
+ *	The result and its pointer arrays are palloc'ed in the current memory
+ *	context; release with pfree_pgxc_all_handles().
+ */
+PGXCNodeAllHandles *
+get_current_handles(void)
+{
+	PGXCNodeAllHandles *result;
+	PGXCNodeHandle	   *node_handle;
+	int					i;
+
+	result = (PGXCNodeAllHandles *) palloc(sizeof(PGXCNodeAllHandles));
+	/* NOTE(review): palloc ereports on failure, so this check is belt-and-braces */
+	if (!result)
+	{
+		ereport(ERROR,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("out of memory")));
+	}
+
+	result->primary_handle = NULL;
+	result->co_conn_count = 0;
+	result->dn_conn_count = 0;
+
+	/* Arrays are sized for the full node count; only connected entries are filled */
+	result->datanode_handles = (PGXCNodeHandle **)
+							   palloc(NumDataNodes * sizeof(PGXCNodeHandle *));
+	if (!result->datanode_handles)
+	{
+		ereport(ERROR,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("out of memory")));
+	}
+
+	for (i = 0; i < NumDataNodes; i++)
+	{
+		node_handle = &dn_handles[i];
+		if (node_handle->sock != NO_SOCKET)
+			result->datanode_handles[result->dn_conn_count++] = node_handle;
+	}
+
+	result->coord_handles = (PGXCNodeHandle **)
+							palloc(NumCoords * sizeof(PGXCNodeHandle *));
+	if (!result->coord_handles)
+	{
+		ereport(ERROR,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("out of memory")));
+	}
+
+	for (i = 0; i < NumCoords; i++)
+	{
+		node_handle = &co_handles[i];
+		if (node_handle->sock != NO_SOCKET)
+			result->coord_handles[result->co_conn_count++] = node_handle;
+	}
+
+	return result;
+}
+
+/*
+ * Free PGXCNodeAllHandles structure.
+ *
+ * Only the pointer arrays and the container itself are released; the
+ * PGXCNodeHandle structs they point at live in the static handle arrays
+ * and are not owned by this structure.  A NULL argument is a no-op.
+ */
+void
+pfree_pgxc_all_handles(PGXCNodeAllHandles *pgxc_handles)
+{
+	if (!pgxc_handles)
+		return;
+
+	if (pgxc_handles->primary_handle)
+		pfree(pgxc_handles->primary_handle);
+	if (pgxc_handles->datanode_handles)
+		pfree(pgxc_handles->datanode_handles);
+	if (pgxc_handles->coord_handles)
+		pfree(pgxc_handles->coord_handles);
+
+	pfree(pgxc_handles);
+}
+
+/*
+ * PGXCNodeGetNodeId
+ *		Look at the data cached for handles and return node position
+ * If node type is PGXC_NODE_COORDINATOR look only in coordinator list,
+ * if node type is PGXC_NODE_DATANODE look only in datanode list,
+ * if other (assume PGXC_NODE_NODE) search both, in last case return actual
+ * node type.
+ *
+ * Returns -1 (and sets *node_type to PGXC_NODE_NONE, if provided) when the
+ * Oid is not found in the requested list(s).
+ */
+int
+PGXCNodeGetNodeId(Oid nodeoid, char *node_type)
+{
+	int i;
+
+	/* First check datanodes, they referenced more often */
+	if (node_type == NULL || *node_type != PGXC_NODE_COORDINATOR)
+	{
+		for (i = 0; i < NumDataNodes; i++)
+		{
+			if (dn_handles[i].nodeoid == nodeoid)
+			{
+				if (node_type)
+					*node_type = PGXC_NODE_DATANODE;
+				return i;
+			}
+		}
+	}
+	/* Then check coordinators */
+	if (node_type == NULL || *node_type != PGXC_NODE_DATANODE)
+	{
+		for (i = 0; i < NumCoords; i++)
+		{
+			if (co_handles[i].nodeoid == nodeoid)
+			{
+				if (node_type)
+					*node_type = PGXC_NODE_COORDINATOR;
+				return i;
+			}
+		}
+	}
+	/* Not found, have caller handling it */
+	if (node_type)
+		*node_type = PGXC_NODE_NONE;
+	return -1;
+}
+
+/*
+ * PGXCNodeGetNodeOid
+ *		Look at the data cached for handles and return node Oid
+ *
+ * NOTE(review): nodeid is not range-checked against NumCoords/NumDataNodes;
+ * callers must pass a valid index -- confirm all call sites do.
+ */
+Oid
+PGXCNodeGetNodeOid(int nodeid, char node_type)
+{
+	PGXCNodeHandle *handles;
+
+	switch (node_type)
+	{
+		case PGXC_NODE_COORDINATOR:
+			handles = co_handles;
+			break;
+		case PGXC_NODE_DATANODE:
+			handles = dn_handles;
+			break;
+		default:
+			/* Should not happen */
+			Assert(0);
+			return InvalidOid;
+	}
+
+	return handles[nodeid].nodeoid;
+}
+
+/*
+ * pgxc_node_str
+ *
+ * SQL-callable function returning the name of the local node
+ * (PGXCNodeName) as text.
+ */
+Datum
+pgxc_node_str(PG_FUNCTION_ARGS)
+{
+	PG_RETURN_TEXT_P(cstring_to_text(PGXCNodeName));
+}
+
+/*
+ * PGXCNodeGetNodeIdFromName
+ *		Return node position in handles array
+ *
+ * The name is case-folded before the catalog lookup.  Returns -1 (and sets
+ * *node_type to PGXC_NODE_NONE when provided) for a NULL or unknown name.
+ */
+int
+PGXCNodeGetNodeIdFromName(char *node_name, char *node_type)
+{
+	char *nm;
+	Oid nodeoid;
+
+	if (node_name == NULL)
+	{
+		if (node_type)
+			*node_type = PGXC_NODE_NONE;
+		return -1;
+	}
+
+	/* str_tolower palloc's a lower-cased copy; freed below */
+	nm = str_tolower(node_name, strlen(node_name), DEFAULT_COLLATION_OID);
+
+	nodeoid = get_pgxc_nodeoid(nm);
+	pfree(nm);
+	if (!OidIsValid(nodeoid))
+	{
+		if (node_type)
+			*node_type = PGXC_NODE_NONE;
+		return -1;
+	}
+
+	return PGXCNodeGetNodeId(nodeoid, node_type);
+}
+
+/*
+ * paramlist_delete_param
+ *	Remove (and pfree) every ParamEntry with the given name from the list,
+ *	returning the possibly-modified list head.
+ */
+static List *
+paramlist_delete_param(List *param_list, const char *name)
+{
+	ListCell   *cur_item;
+	ListCell   *prev_item;
+
+	prev_item = NULL;
+	cur_item = list_head(param_list);
+
+	while (cur_item != NULL)
+	{
+		ParamEntry *entry = (ParamEntry *) lfirst(cur_item);
+
+		if (strcmp(NameStr(entry->name), name) == 0)
+		{
+			/* cur_item must be removed */
+			param_list = list_delete_cell(param_list, cur_item, prev_item);
+			pfree(entry);
+			/* Restart from the cell after prev_item (or the new head) */
+			if (prev_item)
+				cur_item = lnext(prev_item);
+			else
+				cur_item = list_head(param_list);
+		}
+		else
+		{
+			prev_item = cur_item;
+			cur_item = lnext(prev_item);
+		}
+	}
+
+	return param_list;
+}
+
+/*
+ * Remember new value of a session or transaction parameter, and set same
+ * values on newly connected remote nodes.
+ *
+ * local = true targets the transaction-scope list (TopTransactionContext);
+ * otherwise the session-scope list (TopMemoryContext).  A NULL value means
+ * RESET: the parameter is only removed.  Any cached SET command string for
+ * the affected scope is invalidated so it is rebuilt on next use.
+ */
+void
+PGXCNodeSetParam(bool local, const char *name, const char *value, int flags)
+{
+	List *param_list;
+	MemoryContext oldcontext;
+
+	/* Get the target hash table and invalidate command string */
+	if (local)
+	{
+		param_list = local_param_list;
+		if (local_params)
+			resetStringInfo(local_params);
+		oldcontext = MemoryContextSwitchTo(TopTransactionContext);
+	}
+	else
+	{
+		param_list = session_param_list;
+		if (session_params)
+			resetStringInfo(session_params);
+		oldcontext = MemoryContextSwitchTo(TopMemoryContext);
+	}
+
+	/* Delete any existing entry first, then re-add with the new value */
+	param_list = paramlist_delete_param(param_list, name);
+	if (value)
+	{
+		ParamEntry *entry;
+		entry = (ParamEntry *) palloc(sizeof (ParamEntry));
+		strlcpy((char *) (&entry->name), name, NAMEDATALEN);
+		strlcpy((char *) (&entry->value), value, NAMEDATALEN);
+		entry->flags = flags;
+
+		param_list = lappend(param_list, entry);
+	}
+
+	/*
+	 * Special case for
+	 *	RESET SESSION AUTHORIZATION
+	 *	SET SESSION AUTHORIZATION TO DEFAULT
+	 *
+	 * We must also forget any SET ROLE commands since RESET SESSION
+	 * AUTHORIZATION also resets current role to session default
+	 */
+	if ((strcmp(name, "session_authorization") == 0) && (value == NULL))
+		param_list = paramlist_delete_param(param_list, "role");
+
+	if (local)
+		local_param_list = param_list;
+	else
+		session_param_list = param_list;
+
+	MemoryContextSwitchTo(oldcontext);
+}
+
+
+/*
+ * Forget all parameter values set either for transaction or both transaction
+ * and session.
+ *
+ * only_local = true keeps the session-level parameters; the transaction-level
+ * ones are always dropped.
+ */
+void
+PGXCNodeResetParams(bool only_local)
+{
+	if (!only_local && session_param_list)
+	{
+		/* need to explicitly pfree session stuff, it is in TopMemoryContext */
+		list_free_deep(session_param_list);
+		session_param_list = NIL;
+		if (session_params)
+		{
+			pfree(session_params->data);
+			pfree(session_params);
+			session_params = NULL;
+		}
+	}
+	/*
+	 * no need to explicitly destroy the local_param_list and local_params,
+	 * it will gone with the transaction memory context.
+	 */
+	local_param_list = NIL;
+	local_params = NULL;
+}
+
+/*
+ * get_set_command
+ *	Append one "SET [LOCAL] name TO value;" statement per ParamEntry in
+ *	param_list to the command buffer.  Values are quoted per entry flags;
+ *	an empty value is emitted as '' so the statement stays parseable.
+ */
+static void
+get_set_command(List *param_list, StringInfo command, bool local)
+{
+	ListCell		   *lc;
+
+	if (param_list == NIL)
+		return;
+
+	foreach (lc, param_list)
+	{
+		ParamEntry *entry = (ParamEntry *) lfirst(lc);
+		char *value = NameStr(entry->value);
+
+		if (strlen(value) == 0)
+			value = "''";
+
+		value = quote_guc_value(value, entry->flags);
+
+		/* When not local the %s is empty, leaving a harmless double space */
+		appendStringInfo(command, "SET %s %s TO %s;", local ? "LOCAL" : "",
+			 NameStr(entry->name), value);
+	}
+}
+
+
+/*
+ * Returns SET commands needed to initialize remote session.
+ * The command may already be built and valid, return it right away if that is
+ * the case. Otherwise build it up.
+ * To support Distributed Session machinery coordinator should generate and
+ * send a distributed session identifier to remote nodes. Generate it here.
+ */
+char *
+PGXCNodeGetSessionParamStr(void)
+{
+	/*
+	 * If no session parameters are set and that is a coordinator we need to set
+	 * global_session anyway, even if there were no other parameters.
+	 * We do not want this string to disappear, so create it in the
+	 * TopMemoryContext. However if we add first session parameter we will need
+	 * to free the buffer and recreate it in the same context as the hash table
+	 * to avoid memory leakage.
+	 */
+	if (session_params == NULL)
+	{
+		MemoryContext oldcontext = MemoryContextSwitchTo(TopMemoryContext);
+		session_params = makeStringInfo();
+		MemoryContextSwitchTo(oldcontext);
+	}
+
+	/* If the paramstr invalid build it up */
+	if (session_params->len == 0)
+	{
+		/* Distributed session id: local node name + backend PID */
+		if (IS_PGXC_COORDINATOR)
+			appendStringInfo(session_params, "SET global_session TO %s_%d;",
+							 PGXCNodeName, MyProcPid);
+		get_set_command(session_param_list, session_params, false);
+		appendStringInfo(session_params, "SET parentPGXCPid TO %d;",
+							 MyProcPid);
+	}
+	return session_params->len == 0 ? NULL : session_params->data;
+}
+
+
+/*
+ * Returns SET commands needed to initialize transaction on a remote session.
+ * The command may already be built and valid, return it right away if that is
+ * the case. Otherwise build it up.
+ */
+char *
+PGXCNodeGetTransactionParamStr(void)
+{
+	/* If no local parameters defined there is nothing to return */
+	if (local_param_list == NIL)
+		return NULL;
+
+	/*
+	 * If the paramstr invalid build it up.
+	 */
+	if (local_params == NULL)
+	{
+		/* Buffer lives only as long as the transaction */
+		MemoryContext oldcontext = MemoryContextSwitchTo(TopTransactionContext);
+		local_params = makeStringInfo();
+		MemoryContextSwitchTo(oldcontext);
+	}
+	/*
+	 * If parameter string exists it is valid, it is truncated when parameters
+	 * are modified.
+	 */
+	if (local_params->len == 0)
+	{
+		get_set_command(local_param_list, local_params, true);
+	}
+	return local_params->len == 0 ? NULL : local_params->data;
+}
+
+
+/*
+ * Send down specified query, read and discard all responses until ReadyForQuery
+ */
+void
+pgxc_node_set_query(PGXCNodeHandle *handle, const char *set_query)
+{
+	pgxc_node_send_query(handle, set_query);
+	/*
+	 * Now read responses until ReadyForQuery.
+	 * XXX We may need to handle possible errors here.
+	 */
+	for (;;)
+	{
+		char	msgtype;
+		int		msglen;
+		char   *msg;
+		/*
+		 * If we are in the process of shutting down, we
+		 * may be rolling back, and the buffer may contain other messages.
+		 * We want to avoid a procarray exception
+		 * as well as an error stack overflow.
+		 */
+		if (proc_exit_inprogress)
+			PGXCNodeSetConnectionState(handle, DN_CONNECTION_STATE_ERROR_FATAL);
+
+		/* don't read from the connection if there is a fatal error */
+		if (handle->state == DN_CONNECTION_STATE_ERROR_FATAL)
+			break;
+
+		/* No data available, read more */
+		if (!HAS_MESSAGE_BUFFERED(handle))
+		{
+			pgxc_node_receive(1, &handle, NULL);
+			continue;
+		}
+		msgtype = get_message(handle, &msglen, &msg);
+
+		/*
+		 * Ignore any response except ErrorResponse and ReadyForQuery
+		 */
+
+		if (msgtype == 'E')	/* ErrorResponse */
+		{
+			handle->error = pstrdup(msg);
+			PGXCNodeSetConnectionState(handle, DN_CONNECTION_STATE_ERROR_FATAL);
+			break;
+		}
+
+		if (msgtype == 'Z') /* ReadyForQuery */
+		{
+			/* First byte of ReadyForQuery payload is transaction status */
+			handle->transaction_status = msg[0];
+			PGXCNodeSetConnectionState(handle, DN_CONNECTION_STATE_IDLE);
+			handle->combiner = NULL;
+			break;
+		}
+	}
+}
+
+
+/* Flag that all remote handles must be invalidated at next service point */
+void
+RequestInvalidateRemoteHandles(void)
+{
+	HandlesInvalidatePending = true;
+}
+
+/* Flag that remote handles must be refreshed at next service point */
+void
+RequestRefreshRemoteHandles(void)
+{
+	HandlesRefreshPending = true;
+}
+
+/*
+ * PoolerMessagesPending
+ *	Report whether a pooler-originated message is awaiting service.
+ *
+ * NOTE(review): only HandlesRefreshPending is consulted here;
+ * HandlesInvalidatePending is not -- confirm that is intentional
+ * before folding it in.
+ */
+bool
+PoolerMessagesPending(void)
+{
+	return HandlesRefreshPending;
+}
+
+/*
+ * For all handles, mark as they are not in use and discard pending input/output
+ *
+ * Returns true when at least one handle had an open socket (i.e. something
+ * was actually invalidated).  Finishes by re-running InitMultinodeExecutor
+ * to rebuild the handle arrays.
+ */
+static bool
+DoInvalidateRemoteHandles(void)
+{
+	int 			i;
+	PGXCNodeHandle	*handle;
+	bool			result = false;
+
+	HandlesInvalidatePending = false;
+	HandlesRefreshPending = false;
+
+	for (i = 0; i < NumCoords; i++)
+	{
+		handle = &co_handles[i];
+		if (handle->sock != NO_SOCKET)
+			result = true;
+		/* Drop the socket and reset both I/O buffers */
+		handle->sock = NO_SOCKET;
+		handle->inStart = handle->inEnd = handle->inCursor = 0;
+		handle->outEnd = 0;
+	}
+	for (i = 0; i < NumDataNodes; i++)
+	{
+		handle = &dn_handles[i];
+		if (handle->sock != NO_SOCKET)
+			result = true;
+		handle->sock = NO_SOCKET;
+		handle->inStart = handle->inEnd = handle->inCursor = 0;
+		handle->outEnd = 0;
+	}
+
+	InitMultinodeExecutor(true);
+
+	return result;
+}
+
+/*
+ * Diff handles using shmem, and remove ALTERed handles
+ *
+ * Compares the backend's cached node handles against the shared-memory node
+ * table.  ALTERed nodes get their handles refreshed in place; added or
+ * deleted nodes require a full reload (return false).
+ *
+ * NOTE(review): when nodes were added/deleted (res = false) but no node was
+ * altered, the "altered == NIL" branch resets res back to true, losing the
+ * reload-needed signal -- confirm whether that is intentional.
+ */
+static bool
+DoRefreshRemoteHandles(void)
+{
+	List			*altered = NIL, *deleted = NIL, *added = NIL;
+	Oid				*coOids, *dnOids;
+	int				numCoords, numDNodes, total_nodes;
+	bool			res = true;
+
+	HandlesRefreshPending = false;
+
+	PgxcNodeGetOids(&coOids, &dnOids, &numCoords, &numDNodes, false);
+
+	total_nodes = numCoords + numDNodes;
+	if (total_nodes > 0)
+	{
+		int		i;
+		List   *shmoids = NIL;
+		Oid	   *allOids = (Oid *)palloc(total_nodes * sizeof(Oid));
+
+		/* build array with Oids of all nodes (coordinators first) */
+		memcpy(allOids, coOids, numCoords * sizeof(Oid));
+		memcpy(allOids + numCoords, dnOids, numDNodes * sizeof(Oid));
+
+		LWLockAcquire(NodeTableLock, LW_SHARED);
+
+		for (i = 0; i < total_nodes; i++)
+		{
+			NodeDefinition	*nodeDef;
+			PGXCNodeHandle	*handle;
+
+			int nid;
+			Oid nodeoid;
+			char ntype = PGXC_NODE_NONE;
+
+			nodeoid = allOids[i];
+			shmoids = lappend_oid(shmoids, nodeoid);
+
+			nodeDef = PgxcNodeGetDefinition(nodeoid);
+			/*
+			 * identify an entry with this nodeoid. If found
+			 * compare the name/host/port entries. If the name is
+			 * same and other info is different, it's an ALTER.
+			 * If the local entry does not exist in the shmem, it's
+			 * a DELETE. If the entry from shmem does not exist
+			 * locally, it's an ADDITION
+			 */
+			nid = PGXCNodeGetNodeId(nodeoid, &ntype);
+
+			if (nid == -1)
+			{
+				/* a new node has been added to the shmem */
+				added = lappend_oid(added, nodeoid);
+				elog(LOG, "Node added: name (%s) host (%s) port (%d)",
+					 NameStr(nodeDef->nodename), NameStr(nodeDef->nodehost),
+					 nodeDef->nodeport);
+			}
+			else
+			{
+				if (ntype == PGXC_NODE_COORDINATOR)
+					handle = &co_handles[nid];
+				else if (ntype == PGXC_NODE_DATANODE)
+					handle = &dn_handles[nid];
+				else
+					elog(ERROR, "Node with non-existent node type!");
+
+				/*
+				 * compare name, host, port to see if this node
+				 * has been ALTERed
+				 */
+				if (strncmp(handle->nodename, NameStr(nodeDef->nodename), NAMEDATALEN) != 0 ||
+					strncmp(handle->nodehost, NameStr(nodeDef->nodehost), NAMEDATALEN) != 0 ||
+					handle->nodeport != nodeDef->nodeport)
+				{
+					elog(LOG, "Node altered: old name (%s) old host (%s) old port (%d)"
+							" new name (%s) new host (%s) new port (%d)",
+							handle->nodename, handle->nodehost, handle->nodeport,
+							NameStr(nodeDef->nodename), NameStr(nodeDef->nodehost),
+							nodeDef->nodeport);
+					altered = lappend_oid(altered, nodeoid);
+				}
+				/* else do nothing */
+			}
+			pfree(nodeDef);
+		}
+
+		/*
+		 * Any entry in backend area but not in shmem means that it has
+		 * been deleted
+		 */
+		for (i = 0; i < NumCoords; i++)
+		{
+			PGXCNodeHandle	*handle = &co_handles[i];
+			Oid nodeoid = handle->nodeoid;
+
+			if (!list_member_oid(shmoids, nodeoid))
+			{
+				deleted = lappend_oid(deleted, nodeoid);
+				elog(LOG, "Node deleted: name (%s) host (%s) port (%d)",
+					 handle->nodename, handle->nodehost, handle->nodeport);
+			}
+		}
+
+		for (i = 0; i < NumDataNodes; i++)
+		{
+			PGXCNodeHandle	*handle = &dn_handles[i];
+			Oid nodeoid = handle->nodeoid;
+
+			if (!list_member_oid(shmoids, nodeoid))
+			{
+				deleted = lappend_oid(deleted, nodeoid);
+				elog(LOG, "Node deleted: name (%s) host (%s) port (%d)",
+					 handle->nodename, handle->nodehost, handle->nodeport);
+			}
+		}
+
+		LWLockRelease(NodeTableLock);
+
+		/* Release palloc'ed memory */
+		pfree(coOids);
+		pfree(dnOids);
+		pfree(allOids);
+		list_free(shmoids);
+	}
+
+	if (deleted != NIL || added != NIL)
+	{
+		elog(LOG, "Nodes added/deleted. Reload needed!");
+		res = false;
+	}
+
+	if (altered == NIL)
+	{
+		elog(LOG, "No nodes altered. Returning");
+		res = true;
+	}
+	else
+		PgxcNodeRefreshBackendHandlesShmem(altered);
+
+	list_free(altered);
+	list_free(added);
+	list_free(deleted);
+
+	return res;
+}
+
+/*
+ * PGXCNodeSetConnectionState
+ *	Single point for changing a handle's connection state, with DEBUG5
+ *	tracing of the transition.
+ */
+void
+PGXCNodeSetConnectionState(PGXCNodeHandle *handle, DNConnectionState new_state)
+{
+	elog(DEBUG5, "Changing connection state for node %s, old state %d, "
+			"new state %d", handle->nodename, handle->state, new_state);
+	handle->state = new_state;
+}
+
+/*
+ * Do a "Diff" of backend NODE metadata and the one present in catalog
+ *
+ * We do this in order to identify if we should do a destructive
+ * cleanup or just invalidation of some specific handles
+ *
+ * Scans pgxc_node under NodeTableLock and classifies each node as altered,
+ * added or deleted relative to the cached handles.  The three out-lists (if
+ * requested) receive the corresponding node Oids.  Returns false when the
+ * local node itself was altered, i.e. a full reload is required.
+ */
+bool
+PgxcNodeDiffBackendHandles(List **nodes_alter,
+			   List **nodes_delete, List **nodes_add)
+{
+	Relation rel;
+	HeapScanDesc scan;
+	HeapTuple   tuple;
+	int	i;
+	List *altered = NIL, *added = NIL, *deleted = NIL;
+	List *catoids = NIL;
+	PGXCNodeHandle *handle;
+	Oid	nodeoid;
+	bool res = true;
+
+	LWLockAcquire(NodeTableLock, LW_SHARED);
+
+	rel = heap_open(PgxcNodeRelationId, AccessShareLock);
+	/* SnapshotSelf so our own uncommitted node DDL is visible in the scan */
+	scan = heap_beginscan(rel, SnapshotSelf, 0, NULL);
+	while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
+	{
+		Form_pgxc_node  nodeForm = (Form_pgxc_node) GETSTRUCT(tuple);
+		int nid;
+		Oid nodeoid;
+		char ntype = PGXC_NODE_NONE;
+
+		nodeoid = HeapTupleGetOid(tuple);
+		catoids = lappend_oid(catoids, nodeoid);
+
+		/*
+		 * identify an entry with this nodeoid. If found
+		 * compare the name/host/port entries. If the name is
+		 * same and other info is different, it's an ALTER.
+		 * If the local entry does not exist in the catalog, it's
+		 * a DELETE. If the entry from catalog does not exist
+		 * locally, it's an ADDITION
+		 */
+		nid = PGXCNodeGetNodeId(nodeoid, &ntype);
+
+		if (nid == -1)
+		{
+			/* a new node has been added to the catalog */
+			added = lappend_oid(added, nodeoid);
+			elog(LOG, "Node added: name (%s) host (%s) port (%d)",
+				 NameStr(nodeForm->node_name), NameStr(nodeForm->node_host),
+				 nodeForm->node_port);
+		}
+		else
+		{
+			if (ntype == PGXC_NODE_COORDINATOR)
+				handle = &co_handles[nid];
+			else if (ntype == PGXC_NODE_DATANODE)
+				handle = &dn_handles[nid];
+			else
+				elog(ERROR, "Node with non-existent node type!");
+
+			/*
+			 * compare name, host, port to see if this node
+			 * has been ALTERed
+			 */
+			if (strncmp(handle->nodename, NameStr(nodeForm->node_name), NAMEDATALEN)
+				!= 0 ||
+				strncmp(handle->nodehost, NameStr(nodeForm->node_host), NAMEDATALEN)
+				!= 0 ||
+				handle->nodeport != nodeForm->node_port)
+			{
+				elog(LOG, "Node altered: old name (%s) old host (%s) old port (%d)"
+						" new name (%s) new host (%s) new port (%d)",
+					 handle->nodename, handle->nodehost, handle->nodeport,
+					 NameStr(nodeForm->node_name), NameStr(nodeForm->node_host),
+					 nodeForm->node_port);
+				/*
+				 * If this node itself is being altered, then we need to
+				 * resort to a reload. Check so..
+				 */
+				if (pg_strcasecmp(PGXCNodeName,
+								  NameStr(nodeForm->node_name)) == 0)
+				{
+					res = false;
+				}
+				altered = lappend_oid(altered, nodeoid);
+			}
+			/* else do nothing */
+		}
+	}
+	heap_endscan(scan);
+
+	/*
+	 * Any entry in backend area but not in catalog means that it has
+	 * been deleted
+	 */
+	for (i = 0; i < NumCoords; i++)
+	{
+		handle = &co_handles[i];
+		nodeoid = handle->nodeoid;
+		if (!list_member_oid(catoids, nodeoid))
+		{
+			deleted = lappend_oid(deleted, nodeoid);
+			elog(LOG, "Node deleted: name (%s) host (%s) port (%d)",
+				 handle->nodename, handle->nodehost, handle->nodeport);
+		}
+	}
+	for (i = 0; i < NumDataNodes; i++)
+	{
+		handle = &dn_handles[i];
+		nodeoid = handle->nodeoid;
+		if (!list_member_oid(catoids, nodeoid))
+		{
+			deleted = lappend_oid(deleted, nodeoid);
+			elog(LOG, "Node deleted: name (%s) host (%s) port (%d)",
+				 handle->nodename, handle->nodehost, handle->nodeport);
+		}
+	}
+	heap_close(rel, AccessShareLock);
+	LWLockRelease(NodeTableLock);
+
+	/* Hand the classification lists back to the caller, if requested */
+	if (nodes_alter)
+		*nodes_alter = altered;
+	if (nodes_delete)
+		*nodes_delete = deleted;
+	if (nodes_add)
+		*nodes_add = added;
+
+	if (catoids)
+		list_free(catoids);
+
+	return res;
+}
+
+/*
+ * Refresh specific backend handles associated with
+ * nodes in the "nodes_alter" list below
+ *
+ * The handles are refreshed using shared memory
+ *
+ * Each affected handle is freed and its name/host/port replaced by the
+ * current NodeDefinition from shmem.  Raises ERROR if a listed Oid can no
+ * longer be resolved (metadata changed concurrently).
+ */
+void
+PgxcNodeRefreshBackendHandlesShmem(List *nodes_alter)
+{
+	ListCell *lc;
+	Oid nodeoid;
+	int nid;
+	PGXCNodeHandle *handle = NULL;
+
+	foreach(lc, nodes_alter)
+	{
+		char ntype = PGXC_NODE_NONE;
+		NodeDefinition *nodedef;
+
+		nodeoid = lfirst_oid(lc);
+		nid = PGXCNodeGetNodeId(nodeoid, &ntype);
+
+		if (nid == -1)
+			elog(ERROR, "Looks like node metadata changed again");
+		else
+		{
+			if (ntype == PGXC_NODE_COORDINATOR)
+				handle = &co_handles[nid];
+			else if (ntype == PGXC_NODE_DATANODE)
+				handle = &dn_handles[nid];
+			else
+				elog(ERROR, "Node with non-existent node type!");
+		}
+
+		/*
+		 * Update the local backend handle data with data from catalog
+		 * Free the handle first..
+		 */
+		pgxc_node_free(handle);
+		elog(LOG, "Backend (%u), Node (%s) updated locally",
+			 MyBackendId, handle->nodename);
+		nodedef = PgxcNodeGetDefinition(nodeoid);
+		strncpy(handle->nodename, NameStr(nodedef->nodename), NAMEDATALEN);
+		strncpy(handle->nodehost, NameStr(nodedef->nodehost), NAMEDATALEN);
+		handle->nodeport = nodedef->nodeport;
+		pfree(nodedef);
+	}
+	return;
+}
+
+/*
+ * HandlePoolerMessages
+ *	Service any pending pooler requests; currently only the handle-refresh
+ *	flag set by RequestRefreshRemoteHandles().
+ */
+void
+HandlePoolerMessages(void)
+{
+	if (HandlesRefreshPending)
+	{
+		DoRefreshRemoteHandles();
+
+		elog(LOG, "Backend (%u), doing handles refresh",
+			 MyBackendId);
+	}
+	return;
+}
--- /dev/null
+/*-------------------------------------------------------------------------
+ *
+ * poolmgr.c
+ *
+ * Connection pool manager handles connections to Datanodes
+ *
+ * The pooler runs as a separate process and is forked off from a
+ * Coordinator postmaster. If the Coordinator needs a connection from a
+ * Datanode, it asks for one from the pooler, which maintains separate
+ * pools for each Datanode. A group of connections can be requested in
+ * a single request, and the pooler returns a list of file descriptors
+ * to use for the connections.
+ *
+ * Note the current implementation does not yet shrink the pool over time
+ * as connections are idle. Also, it does not queue requests; if a
+ * connection is unavailable, it will simply fail. This should be implemented
+ * one day, although there is a chance for deadlocks. For now, limiting
+ * connections should be done between the application and Coordinator.
+ * Still, this is useful to avoid having to re-establish connections to the
+ * Datanodes all the time for multiple Coordinator backend sessions.
+ *
+ * The term "agent" here refers to a session manager, one for each backend
+ * Coordinator connection to the pooler. It will contain a list of connections
+ * allocated to a session, at most one per Datanode.
+ *
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
+ *
+ * IDENTIFICATION
+ * $$
+ *
+ *-------------------------------------------------------------------------
+ */
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <poll.h>
+#include <math.h>
+
+#include "postgres.h"
+
+#include "access/xact.h"
+#include "catalog/pgxc_node.h"
+#include "commands/dbcommands.h"
+#include "libpq/pqsignal.h"
+#include "miscadmin.h"
+#include "nodes/nodes.h"
+#include "utils/builtins.h"
+#include "utils/guc.h"
+#include "utils/memutils.h"
+#include "utils/lsyscache.h"
+#include "utils/resowner.h"
+#include "lib/stringinfo.h"
+#include "libpq/pqformat.h"
+#include "pgxc/locator.h"
+#include "pgxc/nodemgr.h"
+#include "pgxc/pause.h"
+#include "pgxc/pgxc.h"
+#include "pgxc/poolmgr.h"
+#include "pgxc/poolutils.h"
+#include "postmaster/postmaster.h" /* For UnixSocketDir */
+#include "storage/procarray.h"
+#include "utils/varlena.h"
+
+#include "../interfaces/libpq/libpq-fe.h"
+#include "../interfaces/libpq/libpq-int.h"
+
+
+/* Configuration options */
+int PoolConnKeepAlive = 600;
+int PoolMaintenanceTimeout = 30;
+int MaxPoolSize = 100;
+int PoolerPort = 6667;
+
+bool PersistentConnections = false;
+
+/* Flag to tell if we are Postgres-XC pooler process */
+static bool am_pgxc_pooler = false;
+
+/* Connection information cached */
+typedef struct
+{
+ Oid nodeoid;
+ char *host;
+ int port;
+} PGXCNodeConnectionInfo;
+
+/* Handle to the pool manager (Session's side) */
+typedef struct
+{
+ /* communication channel */
+ PoolPort port;
+} PoolHandle;
+
+/* The root memory context */
+static MemoryContext PoolerMemoryContext = NULL;
+/*
+ * Allocations of core objects: Datanode connections, upper level structures,
+ * connection strings, etc.
+ */
+static MemoryContext PoolerCoreContext = NULL;
+/*
+ * Memory to store Agents
+ */
+static MemoryContext PoolerAgentContext = NULL;
+
+/* Pool to all the databases (linked list) */
+static DatabasePool *databasePools = NULL;
+
+/* PoolAgents and the poll array*/
+static int agentCount = 0;
+static PoolAgent **poolAgents;
+
+static PoolHandle *poolHandle = NULL;
+
+static int is_pool_locked = false;
+static int server_fd = -1;
+
+static int node_info_check(PoolAgent *agent);
+static void agent_init(PoolAgent *agent, const char *database, const char *user_name,
+ const char *pgoptions);
+static void agent_destroy(PoolAgent *agent);
+static void agent_create(void);
+static void agent_handle_input(PoolAgent *agent, StringInfo s);
+static DatabasePool *create_database_pool(const char *database, const char *user_name, const char *pgoptions);
+static void insert_database_pool(DatabasePool *pool);
+static int destroy_database_pool(const char *database, const char *user_name);
+static void reload_database_pools(PoolAgent *agent);
+static int refresh_database_pools(PoolAgent *agent);
+static bool remove_all_agent_references(Oid nodeoid);
+static DatabasePool *find_database_pool(const char *database, const char *user_name, const char *pgoptions);
+static DatabasePool *remove_database_pool(const char *database, const char *user_name);
+static int *agent_acquire_connections(PoolAgent *agent, List *datanodelist,
+ List *coordlist, int **connectionpids);
+static int cancel_query_on_connections(PoolAgent *agent, List *datanodelist, List *coordlist);
+static PGXCNodePoolSlot *acquire_connection(DatabasePool *dbPool, Oid node);
+static void agent_release_connections(PoolAgent *agent, bool force_destroy);
+static void release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot,
+ Oid node, bool force_destroy);
+static void destroy_slot(PGXCNodePoolSlot *slot);
+static PGXCNodePool *grow_pool(DatabasePool *dbPool, Oid node);
+static void destroy_node_pool(PGXCNodePool *node_pool);
+static void PoolerLoop(void);
+static int clean_connection(List *node_discard,
+ const char *database,
+ const char *user_name);
+static int *abort_pids(int *count,
+ int pid,
+ const char *database,
+ const char *user_name);
+static char *build_node_conn_str(Oid node, DatabasePool *dbPool);
+/* Signal handlers */
+static void pooler_die(SIGNAL_ARGS);
+static void pooler_quickdie(SIGNAL_ARGS);
+static void PoolManagerConnect(const char *database, const char *user_name,
+ const char *pgoptions);
+static void pooler_sighup(SIGNAL_ARGS);
+static bool shrink_pool(DatabasePool *pool);
+static void pools_maintenance(void);
+static void TryPingUnhealthyNode(Oid nodeoid);
+
+/*
+ * Flags set by interrupt handlers for later service in the main loop.
+ */
+static volatile sig_atomic_t got_SIGHUP = false;
+static volatile sig_atomic_t shutdown_requested = false;
+
+/* Mark the current process as the Postgres-XC pooler process */
+void
+PGXCPoolerProcessIam(void)
+{
+	am_pgxc_pooler = true;
+}
+
+/* Return true if the current process is the Postgres-XC pooler process */
+bool
+IsPGXCPoolerProcess(void)
+{
+	return am_pgxc_pooler;
+}
+
+/*
+ * Initialize internal structures
+ *
+ * Sets up the pooler's memory contexts and signal handlers, allocates the
+ * agent array (one slot per possible backend), then enters PoolerLoop().
+ * Does not return in normal operation; the return 0 only satisfies the
+ * signature.
+ */
+int
+PoolManagerInit()
+{
+	elog(DEBUG1, "Pooler process is started: %d", getpid());
+
+	/*
+	 * Set up memory contexts for the pooler objects
+	 */
+	PoolerMemoryContext = AllocSetContextCreate(TopMemoryContext,
+												"PoolerMemoryContext",
+												ALLOCSET_DEFAULT_MINSIZE,
+												ALLOCSET_DEFAULT_INITSIZE,
+												ALLOCSET_DEFAULT_MAXSIZE);
+	PoolerCoreContext = AllocSetContextCreate(PoolerMemoryContext,
+											  "PoolerCoreContext",
+											  ALLOCSET_DEFAULT_MINSIZE,
+											  ALLOCSET_DEFAULT_INITSIZE,
+											  ALLOCSET_DEFAULT_MAXSIZE);
+	PoolerAgentContext = AllocSetContextCreate(PoolerMemoryContext,
+											   "PoolerAgentContext",
+											   ALLOCSET_DEFAULT_MINSIZE,
+											   ALLOCSET_DEFAULT_INITSIZE,
+											   ALLOCSET_DEFAULT_MAXSIZE);
+
+	ForgetLockFiles();
+
+	/*
+	 * Properly accept or ignore signals the postmaster might send us
+	 */
+	pqsignal(SIGINT, pooler_die);
+	pqsignal(SIGTERM, pooler_die);
+	pqsignal(SIGQUIT, pooler_quickdie);
+	pqsignal(SIGHUP, pooler_sighup);
+	/* TODO other signal handlers */
+
+	/* We allow SIGQUIT (quickdie) at all times */
+	sigdelset(&BlockSig, SIGQUIT);
+
+	/*
+	 * Unblock signals (they were blocked when the postmaster forked us)
+	 */
+	PG_SETMASK(&UnBlockSig);
+
+	/* Allocate pooler structures in the Pooler context */
+	MemoryContextSwitchTo(PoolerMemoryContext);
+
+	/* One agent slot per possible backend connection */
+	poolAgents = (PoolAgent **) palloc(MaxConnections * sizeof(PoolAgent *));
+	if (poolAgents == NULL)
+	{
+		ereport(ERROR,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("out of memory while initializing pool agents")));
+	}
+
+	PoolerLoop();
+	return 0;
+}
+
+
+/*
+ * Check connection info consistency with system catalogs
+ *
+ * Returns POOL_CHECK_SUCCESS when the agent's cached node Oids match the
+ * shared-memory node table AND every pooled connection string still matches
+ * the node definition it was built from; POOL_CHECK_FAILED otherwise.
+ */
+static int
+node_info_check(PoolAgent *agent)
+{
+	DatabasePool   *dbPool = databasePools;
+	List 		   *checked = NIL;
+	int 			res = POOL_CHECK_SUCCESS;
+	Oid			   *coOids;
+	Oid			   *dnOids;
+	int				numCo;
+	int				numDn;
+
+	/*
+	 * First check if agent's node information matches to current content of the
+	 * shared memory table.
+	 */
+	PgxcNodeGetOids(&coOids, &dnOids, &numCo, &numDn, false);
+
+	if (agent->num_coord_connections != numCo ||
+			agent->num_dn_connections != numDn ||
+			memcmp(agent->coord_conn_oids, coOids, numCo * sizeof(Oid)) ||
+			memcmp(agent->dn_conn_oids, dnOids, numDn * sizeof(Oid)))
+		res = POOL_CHECK_FAILED;
+
+	/* Release palloc'ed memory */
+	pfree(coOids);
+	pfree(dnOids);
+
+	/*
+	 * Iterate over all dbnode pools and check if connection strings
+	 * are matching node definitions.
+	 */
+	while (res == POOL_CHECK_SUCCESS && dbPool)
+	{
+		HASH_SEQ_STATUS hseq_status;
+		PGXCNodePool   *nodePool;
+
+		hash_seq_init(&hseq_status, dbPool->nodePools);
+		while ((nodePool = (PGXCNodePool *) hash_seq_search(&hseq_status)))
+		{
+			char		   *connstr_chk;
+
+			/* No need to check same Datanode twice */
+			if (list_member_oid(checked, nodePool->nodeoid))
+				continue;
+			checked = lappend_oid(checked, nodePool->nodeoid);
+
+			connstr_chk = build_node_conn_str(nodePool->nodeoid, dbPool);
+			if (connstr_chk == NULL)
+			{
+				/* Problem of constructing connection string */
+				hash_seq_term(&hseq_status);
+				res = POOL_CHECK_FAILED;
+				break;
+			}
+			/* return error if there is difference */
+			if (strcmp(connstr_chk, nodePool->connstr))
+			{
+				pfree(connstr_chk);
+				hash_seq_term(&hseq_status);
+				res = POOL_CHECK_FAILED;
+				break;
+			}
+
+			pfree(connstr_chk);
+		}
+		dbPool = dbPool->next;
+	}
+	list_free(checked);
+	return res;
+}
+
+/*
+ * Destroy internal structures
+ *
+ * Deleting PoolerMemoryContext releases every pooler allocation (core
+ * objects and agents are child contexts).  Always returns 0.
+ */
+int
+PoolManagerDestroy(void)
+{
+	int			status = 0;
+
+	if (PoolerMemoryContext)
+	{
+		MemoryContextDelete(PoolerMemoryContext);
+		PoolerMemoryContext = NULL;
+	}
+
+	return status;
+}
+
+/*
+ * Connect to the pooler process
+ *
+ * Tries each directory in unix_socket_directories until a connection to the
+ * pooler succeeds, then allocates the session's PoolHandle.  No-op when a
+ * handle already exists.  The handle is malloc'ed (not palloc'ed) because it
+ * must survive for the whole session lifetime.
+ */
+static void
+GetPoolManagerHandle(void)
+{
+	PoolHandle *handle;
+	int			fdsock = -1;
+
+	if (poolHandle)
+		/* already connected */
+		return;
+
+#ifdef HAVE_UNIX_SOCKETS
+	if (Unix_socket_directories)
+	{
+		char	   *rawstring;
+		List	   *elemlist;
+		ListCell   *l;
+		int			success = 0;
+
+		/* Need a modifiable copy of Unix_socket_directories */
+		rawstring = pstrdup(Unix_socket_directories);
+
+		/* Parse string into list of directories */
+		if (!SplitDirectoriesString(rawstring, ',', &elemlist))
+		{
+			/* syntax error in list */
+			ereport(FATAL,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("invalid list syntax in parameter \"%s\"",
+							"unix_socket_directories")));
+		}
+
+		foreach(l, elemlist)
+		{
+			char	   *socketdir = (char *) lfirst(l);
+			int			saved_errno;
+
+			/* Connect to the pooler */
+			fdsock = pool_connect(PoolerPort, socketdir);
+			if (fdsock < 0)
+			{
+				saved_errno = errno;
+				ereport(WARNING,
+						(errmsg("could not create Unix-domain socket in directory \"%s\", errno: %d",
+								socketdir, saved_errno)));
+			}
+			else
+			{
+				/* Stop at the first directory that works */
+				success++;
+				break;
+			}
+		}
+
+		if (!success && elemlist != NIL)
+			ereport(ERROR,
+					(errmsg("failed to connect to pool manager: %m")));
+
+		list_free_deep(elemlist);
+		pfree(rawstring);
+	}
+#endif
+
+	/*
+	 * Actual connection errors should be reported by the block above,
+	 * but perhaps we haven't actually executed it - either because
+	 * the Unix_socket_directories is not set, or because there's no
+	 * support for UNIX_SOCKETS. Just bail out in that case.
+	 */
+	if (fdsock < 0)
+		ereport(ERROR,
+				(errmsg("failed to connect to pool manager: %m")));
+
+	/*
+	 * Allocate handle
+	 *
+	 * XXX we may change malloc here to palloc but first ensure
+	 * the CurrentMemoryContext is properly set.
+	 * The handle allocated just before new session is forked off and
+	 * inherited by the session process. It should remain valid for all
+	 * the session lifetime.
+	 */
+	handle = (PoolHandle *) malloc(sizeof(PoolHandle));
+	if (!handle)
+		ereport(ERROR,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("out of memory")));
+
+	handle->port.fdsock = fdsock;
+	handle->port.RecvLength = 0;
+	handle->port.RecvPointer = 0;
+	handle->port.SendPointer = 0;
+
+	poolHandle = handle;
+}
+
+/*
+ * Create agent
+ *
+ * Accepts a pending connection on the pooler server socket and registers
+ * a new PoolAgent for it in the poolAgents[] array.  On accept() failure
+ * the error is logged and no agent is created.
+ */
+static void
+agent_create(void)
+{
+	MemoryContext oldcontext;
+	int			new_fd;
+	PoolAgent  *agent;
+
+	new_fd = accept(server_fd, NULL, NULL);
+	if (new_fd < 0)
+	{
+		int			saved_errno = errno;
+
+		ereport(LOG,
+				(errcode(ERRCODE_CONNECTION_FAILURE),
+				 errmsg("pool manager failed to accept connection: %m")));
+		errno = saved_errno;
+		return;
+	}
+
+	/* Agent structures persist in the dedicated agent context */
+	oldcontext = MemoryContextSwitchTo(PoolerAgentContext);
+
+	/* Allocate agent */
+	agent = (PoolAgent *) palloc(sizeof(PoolAgent));
+	if (!agent)
+	{
+		/*
+		 * NOTE(review): palloc ereports on OOM and never returns NULL,
+		 * so this branch (including the close) is effectively dead code;
+		 * on a real OOM new_fd would leak via the ereport longjmp.
+		 */
+		close(new_fd);
+		ereport(ERROR,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("out of memory")));
+		return;
+	}
+
+	/* Wrap the accepted socket in the agent's communication port */
+	agent->port.fdsock = new_fd;
+	agent->port.RecvLength = 0;
+	agent->port.RecvPointer = 0;
+	agent->port.SendPointer = 0;
+	agent->pool = NULL;
+	/* Per-agent memory context; deleted wholesale in agent_destroy() */
+	agent->mcxt = AllocSetContextCreate(CurrentMemoryContext,
+										"Agent",
+										ALLOCSET_DEFAULT_MINSIZE,
+										ALLOCSET_DEFAULT_INITSIZE,
+										ALLOCSET_DEFAULT_MAXSIZE);
+	agent->num_dn_connections = 0;
+	agent->num_coord_connections = 0;
+	agent->dn_conn_oids = NULL;
+	agent->coord_conn_oids = NULL;
+	agent->dn_connections = NULL;
+	agent->coord_connections = NULL;
+	agent->pid = 0;
+
+	/* Append new agent to the list */
+	poolAgents[agentCount++] = agent;
+
+	MemoryContextSwitchTo(oldcontext);
+}
+
+
+/*
+ * session_options
+ * Returns the pgoptions string generated using a particular
+ * list of parameters that are required to be propagated to Datanodes.
+ * These parameters then become default values for the pooler sessions.
+ * For e.g., a psql user sets PGDATESTYLE. This value should be set
+ * as the default connection parameter in the pooler session that is
+ * connected to the Datanodes. There are various parameters which need to
+ * be analysed individually to determine whether these should be set on
+ * Datanodes.
+ *
+ * Note: These parameters values are the default values of the particular
+ * Coordinator backend session, and not the new values set by SET command.
+ *
+ * The returned string is palloc'ed in the current memory context.
+ */
+char *session_options(void)
+{
+	int			i;
+	char	   *pgoptions[] = {"DateStyle", "timezone", "geqo", "intervalstyle", "lc_monetary"};
+	StringInfoData options;
+	List	   *value_list;
+	ListCell   *l;
+
+	initStringInfo(&options);
+
+	for (i = 0; i < sizeof(pgoptions)/sizeof(char*); i++)
+	{
+		const char *value;
+		char	   *value_copy;
+
+		appendStringInfo(&options, " -c %s=", pgoptions[i]);
+
+		value = GetConfigOptionResetString(pgoptions[i]);
+
+		/* lc_monetary does not accept lower case values */
+		if (strcmp(pgoptions[i], "lc_monetary") == 0)
+		{
+			appendStringInfoString(&options, value);
+			continue;
+		}
+
+		/*
+		 * SplitIdentifierString scribbles on its input, so hand it a
+		 * palloc'ed copy that we free afterwards.  The previous code
+		 * strdup'ed the value and leaked the malloc'ed copy (and the
+		 * split list) on every call.
+		 */
+		value_copy = pstrdup(value);
+		(void) SplitIdentifierString(value_copy, ',', &value_list);
+		foreach(l, value_list)
+		{
+			char	   *item = (char *) lfirst(l);
+
+			appendStringInfoString(&options, item);
+			if (lnext(l))
+				appendStringInfoChar(&options, ',');
+		}
+		list_free(value_list);
+		pfree(value_copy);
+	}
+
+	return options.data;
+}
+
+
+/*
+ * Associate session with specified database and respective connection pool
+ * Invoked from Session process
+ *
+ * Sends the 'c' (connect) message carrying our PID, database, user name
+ * and pgoptions; each string is preceded by its length and followed by
+ * an explicit \0 terminator.
+ */
+static void
+PoolManagerConnect(const char *database, const char *user_name,
+		const char *pgoptions)
+{
+	int			n32;
+	char		msgtype = 'c';
+	int			unamelen = strlen(user_name);
+	int			dbnamelen = strlen(database);
+	int			pgoptionslen = strlen(pgoptions);
+	char		atchar = ' ';
+
+	/* Connect to the pooler process if not yet connected */
+	GetPoolManagerHandle();
+	if (poolHandle == NULL)
+		ereport(ERROR,
+				(errcode(ERRCODE_INTERNAL_ERROR),
+				 errmsg("failed to connect to the pooler process")));
+
+	/* fixed: the message previously lacked the closing parenthesis */
+	elog(DEBUG1, "Connecting to PoolManager (user_name %s, database %s, "
+			"pgoptions %s)", user_name, database, pgoptions);
+
+	/*
+	 * Special handling for db_user_namespace=on
+	 * We need to handle per-db users and global users. The per-db users will
+	 * arrive with @dbname and global users just as username. Handle both of
+	 * them appropriately
+	 */
+	if (strcmp(GetConfigOption("db_user_namespace", false, false), "on") == 0)
+	{
+		if (strchr(user_name, '@') != NULL)
+		{
+			/* per-db user: strip the "@dbname" suffix from the length */
+			Assert(unamelen > dbnamelen + 1);
+			unamelen -= (dbnamelen + 1);
+		}
+		else
+		{
+			/* global user: a trailing '@' will be appended below */
+			atchar = '@';
+			unamelen++;
+		}
+	}
+
+	/* Message type */
+	pool_putbytes(&poolHandle->port, &msgtype, 1);
+
+	/* Message length: self (4) + PID (4) + 3 length words + 3 strings + \0s */
+	n32 = htonl(dbnamelen + unamelen + pgoptionslen + 23);
+	pool_putbytes(&poolHandle->port, (char *) &n32, 4);
+
+	/* PID number */
+	n32 = htonl(MyProcPid);
+	pool_putbytes(&poolHandle->port, (char *) &n32, 4);
+
+	/* Length of Database string */
+	n32 = htonl(dbnamelen + 1);
+	pool_putbytes(&poolHandle->port, (char *) &n32, 4);
+
+	/* Send database name followed by \0 terminator */
+	pool_putbytes(&poolHandle->port, database, dbnamelen);
+	pool_putbytes(&poolHandle->port, "\0", 1);
+
+	/* Length of user name string */
+	n32 = htonl(unamelen + 1);
+	pool_putbytes(&poolHandle->port, (char *) &n32, 4);
+
+	/* Send user name followed by \0 terminator */
+	/* Send the '@' char if needed. Already accounted for in len */
+	if (atchar == '@')
+	{
+		pool_putbytes(&poolHandle->port, user_name, unamelen - 1);
+		pool_putbytes(&poolHandle->port, "@", 1);
+	}
+	else
+		pool_putbytes(&poolHandle->port, user_name, unamelen);
+	pool_putbytes(&poolHandle->port, "\0", 1);
+
+	/* Length of pgoptions string */
+	n32 = htonl(pgoptionslen + 1);
+	pool_putbytes(&poolHandle->port, (char *) &n32, 4);
+
+	/* Send pgoptions followed by \0 terminator */
+	pool_putbytes(&poolHandle->port, pgoptions, pgoptionslen);
+	pool_putbytes(&poolHandle->port, "\0", 1);
+	pool_flush(&poolHandle->port);
+}
+
+/*
+ * Reconnect to pool manager
+ *
+ * Drops any existing pooler connection, then connects again using the
+ * current database, user and default session options.
+ */
+void
+PoolManagerReconnect(void)
+{
+	elog(DEBUG1, "Reconnecting to PoolManager");
+
+	if (poolHandle != NULL)
+		PoolManagerDisconnect();
+
+	PoolManagerConnect(get_database_name(MyDatabaseId),
+					   GetClusterUserName(),
+					   session_options());
+}
+
+/*
+ * Lock/unlock pool manager
+ * During locking, the only operations not permitted are abort, connection and
+ * connection obtention.
+ *
+ * Sends an 'o' message whose payload is the boolean lock flag.
+ */
+void
+PoolManagerLock(bool is_lock)
+{
+	char msgtype = 'o';
+	int n32;
+	int msglen = 8;	/* length word (4) + lock flag (4) */
+	/* Connect on demand; note that no session options are passed here */
+	if (poolHandle == NULL)
+		PoolManagerConnect(get_database_name(MyDatabaseId),
+				GetClusterUserName(), "");
+
+	elog(DEBUG1, "Locking PoolManager");
+
+	/* Message type */
+	pool_putbytes(&poolHandle->port, &msgtype, 1);
+
+	/* Message length (includes the length word itself) */
+	n32 = htonl(msglen);
+	pool_putbytes(&poolHandle->port, (char *) &n32, 4);
+
+	/* Lock information */
+	n32 = htonl((int) is_lock);
+	pool_putbytes(&poolHandle->port, (char *) &n32, 4);
+	pool_flush(&poolHandle->port);
+}
+
+/*
+ * Init PoolAgent
+ *
+ * Binds the agent to the database pool matching (database, user_name,
+ * pgoptions), creating that pool if it does not exist yet, and allocates
+ * the per-node connection slot arrays sized from the node catalog.
+ */
+static void
+agent_init(PoolAgent *agent, const char *database, const char *user_name,
+           const char *pgoptions)
+{
+	MemoryContext oldcontext;
+
+	Assert(agent);
+	Assert(database);
+	Assert(user_name);
+
+	/* disconnect if we are still connected */
+	if (agent->pool)
+		agent_release_connections(agent, false);
+
+	/* All per-agent allocations live in the agent's private context */
+	oldcontext = MemoryContextSwitchTo(agent->mcxt);
+
+	/* Get needed info and allocate memory */
+	PgxcNodeGetOids(&agent->coord_conn_oids, &agent->dn_conn_oids,
+			&agent->num_coord_connections, &agent->num_dn_connections, false);
+
+	agent->coord_connections = (PGXCNodePoolSlot **)
+			palloc0(agent->num_coord_connections * sizeof(PGXCNodePoolSlot *));
+	agent->dn_connections = (PGXCNodePoolSlot **)
+			palloc0(agent->num_dn_connections * sizeof(PGXCNodePoolSlot *));
+	/* find database */
+	agent->pool = find_database_pool(database, user_name, pgoptions);
+
+	/* create if not found */
+	if (agent->pool == NULL)
+		agent->pool = create_database_pool(database, user_name, pgoptions);
+
+	MemoryContextSwitchTo(oldcontext);
+
+	return;
+}
+
+/*
+ * Destroy PoolAgent
+ *
+ * Closes the agent's socket, force-releases any connections it still
+ * holds, frees all agent memory and compacts the poolAgents[] array.
+ * The agent pointer must not be used after this returns.
+ */
+static void
+agent_destroy(PoolAgent *agent)
+{
+	int			i;
+
+	Assert(agent);
+
+	close(Socket(agent->port));
+
+	/* Discard connections if any remaining */
+	if (agent->pool)
+	{
+		/*
+		 * If session is disconnecting while there are active connections
+		 * we can not know if they clean or not, so force destroy them
+		 */
+		agent_release_connections(agent, true);
+	}
+
+	/* find agent in the list */
+	for (i = 0; i < agentCount; i++)
+	{
+		if (poolAgents[i] == agent)
+		{
+			/* Free memory. All connection slots are NULL at this point */
+			MemoryContextDelete(agent->mcxt);
+
+			pfree(agent);
+			/* shrink the list and move last agent into the freed slot */
+			if (i < --agentCount)
+				poolAgents[i] = poolAgents[agentCount];
+			/* only one match is expected so exit */
+			break;
+		}
+	}
+}
+
+/*
+ * Ping an UNHEALTHY node and if it succeeds, update SHARED node
+ * information
+ *
+ * No-op when the node definition cannot be found or the node is already
+ * marked healthy.
+ */
+static void
+TryPingUnhealthyNode(Oid nodeoid)
+{
+	int			status;
+	NodeDefinition *nodeDef;
+	char		connstr[MAXPGPATH * 2 + 256];
+
+	nodeDef = PgxcNodeGetDefinition(nodeoid);
+	if (nodeDef == NULL)
+	{
+		/* No such definition, node dropped? */
+		elog(DEBUG1, "Could not find node (%u) definition,"
+			 " skipping health check", nodeoid);
+		return;
+	}
+	if (nodeDef->nodeishealthy)
+	{
+		/* hmm, can this happen? */
+		elog(DEBUG1, "node (%u) healthy!"
+			 " skipping health check", nodeoid);
+		/* nodeDef is palloc'ed; the original code leaked it here */
+		pfree(nodeDef);
+		return;
+	}
+
+	elog(LOG, "node (%s:%u) down! Trying ping",
+		 NameStr(nodeDef->nodename), nodeoid);
+	/* snprintf guards against overflow should host strings ever grow */
+	snprintf(connstr, sizeof(connstr),
+			 "host=%s port=%d", NameStr(nodeDef->nodehost),
+			 nodeDef->nodeport);
+	status = PGXCNodePing(connstr);
+	if (status != 0)
+	{
+		/* still down: leave the UNHEALTHY flag as is */
+		pfree(nodeDef);
+		return;
+	}
+
+	elog(DEBUG1, "Node (%s) back online!", NameStr(nodeDef->nodename));
+	if (!PgxcNodeUpdateHealth(nodeoid, true))
+		elog(WARNING, "Could not update health status of node (%s)",
+			 NameStr(nodeDef->nodename));
+	else
+		elog(LOG, "Health map updated to reflect HEALTHY node (%s)",
+			 NameStr(nodeDef->nodename));
+	pfree(nodeDef);
+}
+
+/*
+ * Check if a node is indeed down and if it is update its UNHEALTHY
+ * status
+ *
+ * Pings the node and updates the shared health flag only when the
+ * observed state differs from the recorded one.
+ */
+void
+PoolPingNodeRecheck(Oid nodeoid)
+{
+	int			status;
+	NodeDefinition *nodeDef;
+	char		connstr[MAXPGPATH * 2 + 256];
+	bool		healthy;
+
+	nodeDef = PgxcNodeGetDefinition(nodeoid);
+	if (nodeDef == NULL)
+	{
+		/* No such definition, node dropped? */
+		elog(DEBUG1, "Could not find node (%u) definition,"
+			 " skipping health check", nodeoid);
+		return;
+	}
+
+	/* snprintf guards against overflow should host strings ever grow */
+	snprintf(connstr, sizeof(connstr),
+			 "host=%s port=%d", NameStr(nodeDef->nodehost),
+			 nodeDef->nodeport);
+	status = PGXCNodePing(connstr);
+	healthy = (status == 0);
+
+	/* if no change in health bit, return */
+	if (healthy == nodeDef->nodeishealthy)
+	{
+		pfree(nodeDef);
+		return;
+	}
+
+	if (!PgxcNodeUpdateHealth(nodeoid, healthy))
+		elog(WARNING, "Could not update health status of node (%s)",
+			 NameStr(nodeDef->nodename));
+	else
+		elog(LOG, "Health map updated to reflect (%s) node (%s)",
+			 healthy ? "HEALTHY" : "UNHEALTHY", NameStr(nodeDef->nodename));
+	pfree(nodeDef);
+}
+
+/*
+ * Ping UNHEALTHY nodes as part of the maintenance window
+ *
+ * Reads the shared health map and re-pings every node currently flagged
+ * unhealthy, Datanodes first, then Coordinators.
+ */
+void
+PoolPingNodes()
+{
+	Oid			coOids[MaxCoords];
+	Oid			dnOids[MaxDataNodes];
+	bool		coHealthMap[MaxCoords];
+	bool		dnHealthMap[MaxDataNodes];
+	int			numCo;
+	int			numDn;
+	int			idx;
+
+	PgxcNodeGetHealthMap(coOids, dnOids, &numCo, &numDn,
+						 coHealthMap, dnHealthMap);
+
+	/* Re-ping every Datanode currently marked unhealthy */
+	for (idx = 0; idx < numDn; idx++)
+	{
+		if (!dnHealthMap[idx])
+			TryPingUnhealthyNode(dnOids[idx]);
+	}
+
+	/* Likewise for Coordinators */
+	for (idx = 0; idx < numCo; idx++)
+	{
+		if (!coHealthMap[idx])
+			TryPingUnhealthyNode(coOids[idx]);
+	}
+}
+
+/*
+ * Release handle to pool manager
+ *
+ * Sends a 'd' (disconnect) message, closes the socket and frees the
+ * malloc'ed handle.  No-op when not connected.
+ */
+void
+PoolManagerDisconnect(void)
+{
+	if (poolHandle == NULL)
+		return;					/* not even connected */
+
+	/* Tell the pooler we are going away */
+	pool_putmessage(&poolHandle->port, 'd', NULL, 0);
+	pool_flush(&poolHandle->port);
+
+	close(Socket(poolHandle->port));
+
+	/* The handle was malloc'ed, so free() rather than pfree() */
+	free(poolHandle);
+	poolHandle = NULL;
+}
+
+
+/*
+ * Get pooled connections
+ *
+ * Sends the requested Datanode and Coordinator index lists to the pooler
+ * and receives the matching socket FDs (returned as a palloc'ed array,
+ * Datanodes first) plus the remote backend PIDs in *pids.  Returns NULL
+ * on failure.
+ */
+int *
+PoolManagerGetConnections(List *datanodelist, List *coordlist, int **pids)
+{
+	int			i;
+	ListCell   *nodelist_item;
+	int		   *fds;
+	int			totlen = list_length(datanodelist) + list_length(coordlist);
+	int			nodes[totlen + 2];
+
+	/* Connect on demand */
+	if (poolHandle == NULL)
+		PoolManagerConnect(get_database_name(MyDatabaseId),
+						   GetClusterUserName(), session_options());
+
+	/*
+	 * Prepare end send message to pool manager.
+	 * First with Datanode list.
+	 * This list can be NULL for a query that does not need
+	 * Datanode Connections (Sequence DDLs)
+	 */
+	nodes[0] = htonl(list_length(datanodelist));
+	i = 1;
+	if (list_length(datanodelist) != 0)
+	{
+		foreach(nodelist_item, datanodelist)
+		{
+			nodes[i++] = htonl(lfirst_int(nodelist_item));
+		}
+	}
+	/* Then with Coordinator list (can be nul) */
+	nodes[i++] = htonl(list_length(coordlist));
+	if (list_length(coordlist) != 0)
+	{
+		foreach(nodelist_item, coordlist)
+		{
+			nodes[i++] = htonl(lfirst_int(nodelist_item));
+		}
+	}
+
+	pool_putmessage(&poolHandle->port, 'g', (char *) nodes, sizeof(int) * (totlen + 2));
+	pool_flush(&poolHandle->port);
+
+	/* Receive response (palloc ereports on OOM, no NULL check needed) */
+	fds = (int *) palloc(sizeof(int) * totlen);
+	if (pool_recvfds(&poolHandle->port, fds, totlen))
+	{
+		pfree(fds);
+		fds = NULL;
+	}
+
+	if (pool_recvpids(&poolHandle->port, pids) != totlen)
+	{
+		/* also release fds, which the original code leaked here */
+		if (fds)
+			pfree(fds);
+		if (*pids)
+			pfree(*pids);
+		*pids = NULL;
+		return NULL;
+	}
+
+	return fds;
+}
+
+/*
+ * Abort active transactions using pooler.
+ * Take a lock forbidding access to Pooler for new transactions.
+ *
+ * Returns the number of remote backend PIDs placed in *proc_pids.
+ */
+int
+PoolManagerAbortTransactions(char *dbname, char *username, int **proc_pids)
+{
+	int			num_proc_ids = 0;
+	int			n32, msglen;
+	char		msgtype = 'a';
+	/* lengths include the \0 terminator; 0 means "not specified" */
+	int			dblen = dbname ? strlen(dbname) + 1 : 0;
+	int			userlen = username ? strlen(username) + 1 : 0;
+
+	/*
+	 * New connection may be established to clean connections to
+	 * specified nodes and databases.
+	 */
+	if (poolHandle == NULL)
+		PoolManagerConnect(get_database_name(MyDatabaseId),
+				GetClusterUserName(), session_options());
+
+	/* Message type */
+	pool_putbytes(&poolHandle->port, &msgtype, 1);
+
+	/* Message length: self (4) + two length words (8) + both strings */
+	msglen = dblen + userlen + 12;
+	n32 = htonl(msglen);
+	pool_putbytes(&poolHandle->port, (char *) &n32, 4);
+
+	/* Length of Database string */
+	n32 = htonl(dblen);
+	pool_putbytes(&poolHandle->port, (char *) &n32, 4);
+
+	/* Send database name, followed by \0 terminator if necessary */
+	if (dbname)
+		pool_putbytes(&poolHandle->port, dbname, dblen);
+
+	/* Length of Username string */
+	n32 = htonl(userlen);
+	pool_putbytes(&poolHandle->port, (char *) &n32, 4);
+
+	/* Send user name, followed by \0 terminator if necessary */
+	if (username)
+		pool_putbytes(&poolHandle->port, username, userlen);
+
+	pool_flush(&poolHandle->port);
+
+	/* Then Get back Pids from Pooler */
+	num_proc_ids = pool_recvpids(&poolHandle->port, proc_pids);
+
+	return num_proc_ids;
+}
+
+
+/*
+ * Clean up Pooled connections
+ *
+ * Sends an 'f' message listing node indexes plus optional database and
+ * user name filters, then raises an error unless the pooler confirms
+ * the cleanup completed.
+ */
+void
+PoolManagerCleanConnection(List *datanodelist, List *coordlist, char *dbname, char *username)
+{
+	int			totlen = list_length(datanodelist) + list_length(coordlist);
+	int			nodes[totlen + 2];
+	ListCell   *nodelist_item;
+	int			i, n32, msglen;
+	char		msgtype = 'f';
+	/* lengths include the \0 terminator; 0 means "not specified" */
+	int			userlen = username ? strlen(username) + 1 : 0;
+	int			dblen = dbname ? strlen(dbname) + 1 : 0;
+
+	/*
+	 * New connection may be established to clean connections to
+	 * specified nodes and databases.
+	 */
+	if (poolHandle == NULL)
+		PoolManagerConnect(get_database_name(MyDatabaseId),
+				GetClusterUserName(), session_options());
+
+	/* Datanode list first (may be empty); values in network byte order */
+	nodes[0] = htonl(list_length(datanodelist));
+	i = 1;
+	if (list_length(datanodelist) != 0)
+	{
+		foreach(nodelist_item, datanodelist)
+		{
+			nodes[i++] = htonl(lfirst_int(nodelist_item));
+		}
+	}
+	/* Then with Coordinator list (can be nul) */
+	nodes[i++] = htonl(list_length(coordlist));
+	if (list_length(coordlist) != 0)
+	{
+		foreach(nodelist_item, coordlist)
+		{
+			nodes[i++] = htonl(lfirst_int(nodelist_item));
+		}
+	}
+
+	/* Message type */
+	pool_putbytes(&poolHandle->port, &msgtype, 1);
+
+	/* Message length (includes the length word itself) */
+	msglen = sizeof(int) * (totlen + 2) + dblen + userlen + 12;
+	n32 = htonl(msglen);
+	pool_putbytes(&poolHandle->port, (char *) &n32, 4);
+
+	/* Send list of nodes */
+	pool_putbytes(&poolHandle->port, (char *) nodes, sizeof(int) * (totlen + 2));
+
+	/* Length of Database string */
+	n32 = htonl(dblen);
+	pool_putbytes(&poolHandle->port, (char *) &n32, 4);
+
+	/* Send database name, followed by \0 terminator if necessary */
+	if (dbname)
+		pool_putbytes(&poolHandle->port, dbname, dblen);
+
+	/* Length of Username string */
+	n32 = htonl(userlen);
+	pool_putbytes(&poolHandle->port, (char *) &n32, 4);
+
+	/* Send user name, followed by \0 terminator if necessary */
+	if (username)
+		pool_putbytes(&poolHandle->port, username, userlen);
+
+	pool_flush(&poolHandle->port);
+
+	/* Receive result message */
+	if (pool_recvres(&poolHandle->port) != CLEAN_CONNECTION_COMPLETED)
+		ereport(ERROR,
+				(errcode(ERRCODE_INTERNAL_ERROR),
+				 errmsg("Clean connections not completed")));
+}
+
+
+/*
+ * Check connection information consistency cached in pooler with catalog information
+ *
+ * Returns true when the pooler reports its cached node info matches the
+ * catalog, false otherwise.
+ */
+bool
+PoolManagerCheckConnectionInfo(void)
+{
+	int			res;
+
+	/* Establish the pooler connection on demand */
+	if (poolHandle == NULL)
+		PoolManagerConnect(get_database_name(MyDatabaseId),
+						   GetClusterUserName(), session_options());
+
+	PgxcNodeListAndCount();
+
+	/* 'q' asks the pooler to verify its cached node information */
+	pool_putmessage(&poolHandle->port, 'q', NULL, 0);
+	pool_flush(&poolHandle->port);
+
+	res = pool_recvres(&poolHandle->port);
+	return (res == POOL_CHECK_SUCCESS);
+}
+
+
+/*
+ * Reload connection data in pooler and drop all the existing connections of pooler
+ *
+ * NOTE(review): unlike most senders in this file, this one Asserts that
+ * poolHandle is set rather than connecting on demand; callers must
+ * already be connected.
+ */
+void
+PoolManagerReloadConnectionInfo(void)
+{
+	Assert(poolHandle);
+	PgxcNodeListAndCount();
+	/* 'p' asks the pooler to rebuild all pools from the catalog */
+	pool_putmessage(&poolHandle->port, 'p', NULL, 0);
+	pool_flush(&poolHandle->port);
+}
+
+/*
+ * Refresh connection data in pooler and drop connections for those nodes
+ * that have changed. Thus, this operation is less destructive as compared
+ * to PoolManagerReloadConnectionInfo and should typically be called when
+ * NODE ALTER has been performed
+ *
+ * Returns nonzero (true) on success, 0 on failure.
+ */
+int
+PoolManagerRefreshConnectionInfo(void)
+{
+	int			res;
+
+	Assert(poolHandle);
+
+	PgxcNodeListAndCount();
+
+	/* 'R' asks the pooler to refresh, dropping only altered node pools */
+	pool_putmessage(&poolHandle->port, 'R', NULL, 0);
+	pool_flush(&poolHandle->port);
+
+	res = pool_recvres(&poolHandle->port);
+	return (res == POOL_CHECK_SUCCESS) ? true : false;
+}
+
+/*
+ * Handle an 'a' (abort transactions) message.
+ *
+ * Optional database and user name narrow the set of sessions whose
+ * transactions are aborted; the PIDs of the aborted backends are sent
+ * back to the requestor.
+ */
+static void
+handle_abort(PoolAgent * agent, StringInfo s)
+{
+	int		len;
+	int	       *pids;
+	const char *database = NULL;
+	const char *user_name = NULL;
+
+	pool_getmessage(&agent->port, s, 0);
+	/* zero length means "no filter" for both fields */
+	len = pq_getmsgint(s, 4);
+	if (len > 0)
+		database = pq_getmsgbytes(s, len);
+
+	len = pq_getmsgint(s, 4);
+	if (len > 0)
+		user_name = pq_getmsgbytes(s, len);
+
+	pq_getmsgend(s);
+
+	pids = abort_pids(&len, agent->pid, database, user_name);
+
+	/* len was overwritten by abort_pids with the number of PIDs */
+	pool_sendpids(&agent->port, pids, len);
+	if (pids)
+		pfree(pids);
+}
+
+/*
+ * Handle a 'c' (connect) message: bind this agent to the given database,
+ * user and pgoptions by initializing its database pool.
+ */
+static void
+handle_connect(PoolAgent * agent, StringInfo s)
+{
+	int	len;
+	const char *database = NULL;
+	const char *user_name = NULL;
+	const char *pgoptions = NULL;
+
+	pool_getmessage(&agent->port, s, 0);
+	agent->pid = pq_getmsgint(s, 4);
+
+	/* each length includes the \0 terminator appended by the sender */
+	len = pq_getmsgint(s, 4);
+	database = pq_getmsgbytes(s, len);
+
+	len = pq_getmsgint(s, 4);
+	user_name = pq_getmsgbytes(s, len);
+
+	len = pq_getmsgint(s, 4);
+	pgoptions = pq_getmsgbytes(s, len);
+
+	/*
+	 * Coordinator pool is not initialized.
+	 * With that it would be impossible to create a Database by default.
+	 */
+	agent_init(agent, database, user_name, pgoptions);
+	pq_getmsgend(s);
+}
+
+/*
+ * Handle an 'f' (clean connection) message: drop pooled connections for
+ * the listed nodes / database / user and report the result.
+ */
+static void
+handle_clean_connection(PoolAgent * agent, StringInfo s)
+{
+	int		i, len, res;
+	int		datanodecount, coordcount;
+	const char *database = NULL;
+	const char *user_name = NULL;
+	List	       *nodelist = NIL;
+
+	pool_getmessage(&agent->port, s, 0);
+
+	/* It is possible to clean up only datanode connections */
+	datanodecount = pq_getmsgint(s, 4);
+	for (i = 0; i < datanodecount; i++)
+	{
+		/* Translate index to Oid */
+		/* NOTE(review): index is not range-checked against num_dn_connections */
+		int index = pq_getmsgint(s, 4);
+		Oid node = agent->dn_conn_oids[index];
+		nodelist = lappend_oid(nodelist, node);
+	}
+
+	/* It is possible to clean up only coordinator connections */
+	coordcount = pq_getmsgint(s, 4);
+	for (i = 0; i < coordcount; i++)
+	{
+		/* Translate index to Oid */
+		/* NOTE(review): index is not range-checked against num_coord_connections */
+		int index = pq_getmsgint(s, 4);
+		Oid node = agent->coord_conn_oids[index];
+		nodelist = lappend_oid(nodelist, node);
+	}
+
+	/* Optional database and user name filters (0 length = absent) */
+	len = pq_getmsgint(s, 4);
+	if (len > 0)
+		database = pq_getmsgbytes(s, len);
+
+	len = pq_getmsgint(s, 4);
+	if (len > 0)
+		user_name = pq_getmsgbytes(s, len);
+
+	pq_getmsgend(s);
+
+	/* Clean up connections here */
+	res = clean_connection(nodelist, database, user_name);
+
+	list_free(nodelist);
+
+	/* Send success result */
+	pool_sendres(&agent->port, res);
+}
+
+/*
+ * Handle a 'g' (get connections) message: acquire the requested Datanode
+ * and Coordinator connections and send their FDs and backend PIDs back.
+ */
+static void
+handle_get_connections(PoolAgent * agent, StringInfo s)
+{
+	int		i;
+	int	       *fds, *pids = NULL;
+	int		datanodecount, coordcount;
+	List	       *datanodelist = NIL;
+	List	       *coordlist = NIL;
+
+	/*
+	 * Length of message is caused by:
+	 * - Message header = 4bytes
+	 * - List of Datanodes = NumPoolDataNodes * 4bytes (max)
+	 * - List of Coordinators = NumPoolCoords * 4bytes (max)
+	 * - Number of Datanodes sent = 4bytes
+	 * - Number of Coordinators sent = 4bytes
+	 * It is better to send in a same message the list of Co and Dn at the same
+	 * time, this permits to reduce interactions between postmaster and pooler
+	 */
+	pool_getmessage(&agent->port, s, 4 * agent->num_dn_connections + 4 * agent->num_coord_connections + 12);
+
+	datanodecount = pq_getmsgint(s, 4);
+	for (i = 0; i < datanodecount; i++)
+		datanodelist = lappend_int(datanodelist, pq_getmsgint(s, 4));
+
+	/* It is possible that no Coordinators are involved in the transaction */
+	coordcount = pq_getmsgint(s, 4);
+	for (i = 0; i < coordcount; i++)
+		coordlist = lappend_int(coordlist, pq_getmsgint(s, 4));
+
+	pq_getmsgend(s);
+
+	Assert(datanodecount >= 0 && coordcount >= 0);
+
+	/*
+	 * In case of error agent_acquire_connections will log the error and
+	 * return NULL.
+	 */
+	fds = agent_acquire_connections(agent, datanodelist, coordlist, &pids);
+
+	list_free(datanodelist);
+	list_free(coordlist);
+
+	/* A NULL fds array is reported to the backend as zero FDs */
+	pool_sendfds(&agent->port, fds, fds ? datanodecount + coordcount : 0);
+	if (fds)
+		pfree(fds);
+
+	/*
+	 * Also send the PIDs of the remote backend processes serving
+	 * these connections
+	 */
+	pool_sendpids(&agent->port, pids, pids ? datanodecount + coordcount : 0);
+	if (pids)
+		pfree(pids);
+}
+
+/*
+ * Handle an 'h' (query cancel) message: forward a cancel request to the
+ * listed Datanode and Coordinator connections and confirm completion.
+ */
+static void
+handle_query_cancel(PoolAgent * agent, StringInfo s)
+{
+	int		i;
+	int		datanodecount, coordcount;
+	List	       *datanodelist = NIL;
+	List	       *coordlist = NIL;
+
+	/*
+	 * Length of message is caused by:
+	 * - Message header = 4bytes
+	 * - List of Datanodes = NumPoolDataNodes * 4bytes (max)
+	 * - List of Coordinators = NumPoolCoords * 4bytes (max)
+	 * - Number of Datanodes sent = 4bytes
+	 * - Number of Coordinators sent = 4bytes
+	 */
+	pool_getmessage(&agent->port, s, 4 * agent->num_dn_connections + 4 * agent->num_coord_connections + 12);
+
+	datanodecount = pq_getmsgint(s, 4);
+	for (i = 0; i < datanodecount; i++)
+		datanodelist = lappend_int(datanodelist, pq_getmsgint(s, 4));
+
+	coordcount = pq_getmsgint(s, 4);
+	/* It is possible that no Coordinators are involved in the transaction */
+	for (i = 0; i < coordcount; i++)
+		coordlist = lappend_int(coordlist, pq_getmsgint(s, 4));
+
+	pq_getmsgend(s);
+
+	/* Return value (number of cancels sent) is deliberately ignored */
+	cancel_query_on_connections(agent, datanodelist, coordlist);
+	list_free(datanodelist);
+	list_free(coordlist);
+
+	/* Send success result */
+	pool_sendres(&agent->port, QUERY_CANCEL_COMPLETED);
+}
+
+/*
+ * Handle messages to agent
+ *
+ * Dispatches on the one-byte message type, looping while more data is
+ * already buffered.  Whenever the agent is destroyed ('d', EOF, or a
+ * protocol violation) we must return immediately and never touch the
+ * agent again.
+ */
+static void
+agent_handle_input(PoolAgent * agent, StringInfo s)
+{
+	/* read byte from the buffer (and recv if empty) */
+	int			qtype = pool_getbyte(&agent->port);
+
+	/*
+	 * We can have multiple messages, so handle them all
+	 */
+	for (;;)
+	{
+		/*
+		 * During a pool cleaning, Abort, Connect and Get Connections messages
+		 * are not allowed on pooler side.
+		 * It avoids to have new backends taking connections
+		 * while remaining transactions are aborted during FORCE and then
+		 * Pools are being shrinked.
+		 *
+		 * NOTE(review): this only warns -- the message is still processed
+		 * below; confirm whether it should be rejected instead.
+		 */
+		if (is_pool_locked && (qtype == 'a' || qtype == 'c' || qtype == 'g'))
+			elog(WARNING,"Pool operation cannot run during pool lock");
+
+		elog(DEBUG1, "Pooler is handling command %c from %d", (char) qtype, agent->pid);
+
+		switch (qtype)
+		{
+			case 'a':			/* ABORT */
+				handle_abort(agent, s);
+				break;
+			case 'c':			/* CONNECT */
+				handle_connect(agent, s);
+				break;
+			case 'd':			/* DISCONNECT */
+				pool_getmessage(&agent->port, s, 4);
+				pq_getmsgend(s);
+
+				/*
+				 * agent_destroy() frees the agent, so return right away:
+				 * the previous code broke out of the switch and then
+				 * polled agent->port, reading freed memory.
+				 */
+				agent_destroy(agent);
+				return;
+			case 'f':			/* CLEAN CONNECTION */
+				handle_clean_connection(agent, s);
+				break;
+			case 'g':			/* GET CONNECTIONS */
+				handle_get_connections(agent, s);
+				break;
+
+			case 'h':			/* Cancel SQL Command in progress on specified connections */
+				handle_query_cancel(agent, s);
+				break;
+			case 'o':			/* Lock/unlock pooler */
+				pool_getmessage(&agent->port, s, 8);
+				is_pool_locked = pq_getmsgint(s, 4);
+				pq_getmsgend(s);
+				break;
+			case 'p':			/* Reload connection info */
+				pool_getmessage(&agent->port, s, 4);
+				pq_getmsgend(s);
+
+				/* First update all the pools */
+				reload_database_pools(agent);
+				break;
+			case 'R':			/* Refresh connection info */
+				pool_getmessage(&agent->port, s, 4);
+				pq_getmsgend(s);
+
+				/* Refresh pools and report the result to the requestor */
+				pool_sendres(&agent->port, refresh_database_pools(agent));
+				break;
+			case 'P':			/* Ping connection info */
+				/*
+				 * Ping unhealthy nodes in the pools. If any of the
+				 * nodes come up, update SHARED memory to
+				 * indicate the same.
+				 */
+				pool_getmessage(&agent->port, s, 4);
+				pq_getmsgend(s);
+
+				/* Ping all the pools */
+				PoolPingNodes();
+
+				break;
+			case 'q':			/* Check connection info consistency */
+				pool_getmessage(&agent->port, s, 4);
+				pq_getmsgend(s);
+
+				/* Check cached info consistency */
+				pool_sendres(&agent->port, node_info_check(agent));
+				break;
+			case 'r':			/* RELEASE CONNECTIONS */
+				{
+					bool		destroy;
+
+					pool_getmessage(&agent->port, s, 8);
+					destroy = (bool) pq_getmsgint(s, 4);
+					pq_getmsgend(s);
+					agent_release_connections(agent, destroy);
+				}
+				break;
+			case EOF:			/* EOF */
+				agent_destroy(agent);
+				return;
+			default:			/* protocol violation */
+				agent_destroy(agent);
+				ereport(WARNING,
+						(errmsg("agent protocol violation, received byte %c", qtype)));
+				return;
+		}
+
+		/*
+		 * check if there are more data in the buffer (but don't recv
+		 * additional data), to avoid reading from a closed connection
+		 */
+		if ((qtype = pool_pollbyte(&agent->port)) == EOF)
+			break;
+	}
+}
+
+/*
+ * acquire connection
+ *
+ * Returns a palloc'ed array with the socket FDs of the requested Datanode
+ * and Coordinator connections (Datanode fds first), or NULL on failure.
+ * On success *pids receives a parallel palloc'ed array with the PIDs of
+ * the remote backends serving those connections.
+ */
+static int *
+agent_acquire_connections(PoolAgent *agent, List *datanodelist,
+		List *coordlist, int **pids)
+{
+	int			i;
+	int		   *result;
+	ListCell   *nodelist_item;
+	MemoryContext oldcontext;
+
+	Assert(agent);
+
+	/* Check if pooler can accept those requests */
+	if (list_length(datanodelist) > agent->num_dn_connections ||
+			list_length(coordlist) > agent->num_coord_connections)
+	{
+		elog(LOG, "agent_acquire_connections called with invalid arguments -"
+				"list_length(datanodelist) %d, num_dn_connections %d,"
+				"list_length(coordlist) %d, num_coord_connections %d",
+				list_length(datanodelist), agent->num_dn_connections,
+				list_length(coordlist), agent->num_coord_connections);
+		return NULL;
+	}
+
+	/*
+	 * File descriptors of Datanodes and Coordinators are saved in the same
+	 * array, which is sent back to the postmaster: Datanode fds first, then
+	 * Coordinator fds.  palloc ereports on OOM, so no NULL checks needed.
+	 */
+	result = (int *) palloc((list_length(datanodelist) + list_length(coordlist)) * sizeof(int));
+	*pids = (int *) palloc((list_length(datanodelist) + list_length(coordlist)) * sizeof(int));
+
+	/*
+	 * There are possible memory allocations in the core pooler, we want
+	 * these allocations in the contect of the database pool
+	 */
+	oldcontext = MemoryContextSwitchTo(agent->pool->mcxt);
+
+
+	/* Initialize result */
+	i = 0;
+	/* Save in array fds of Datanodes first */
+	foreach(nodelist_item, datanodelist)
+	{
+		int			node = lfirst_int(nodelist_item);
+
+		/* Acquire from the pool if none */
+		if (agent->dn_connections[node] == NULL)
+		{
+			PGXCNodePoolSlot *slot = acquire_connection(agent->pool,
+														agent->dn_conn_oids[node]);
+
+			/* Handle failure: free both output arrays (pids used to leak) */
+			if (slot == NULL)
+			{
+				pfree(result);
+				pfree(*pids);
+				*pids = NULL;
+				MemoryContextSwitchTo(oldcontext);
+				elog(LOG, "Pooler could not open a connection to node %u",
+						agent->dn_conn_oids[node]);
+				return NULL;
+			}
+
+			/* Store in the descriptor */
+			agent->dn_connections[node] = slot;
+
+			/*
+			 * Update newly-acquired slot with session parameters.
+			 * Local parameters are fired only once BEGIN has been launched on
+			 * remote nodes.
+			 */
+		}
+
+		result[i] = PQsocket((PGconn *) agent->dn_connections[node]->conn);
+		(*pids)[i++] = ((PGconn *) agent->dn_connections[node]->conn)->be_pid;
+	}
+
+	/* Save then in the array fds for Coordinators */
+	foreach(nodelist_item, coordlist)
+	{
+		int			node = lfirst_int(nodelist_item);
+
+		/* Acquire from the pool if none */
+		if (agent->coord_connections[node] == NULL)
+		{
+			PGXCNodePoolSlot *slot = acquire_connection(agent->pool, agent->coord_conn_oids[node]);
+
+			/* Handle failure: free both output arrays (pids used to leak) */
+			if (slot == NULL)
+			{
+				pfree(result);
+				pfree(*pids);
+				*pids = NULL;
+				MemoryContextSwitchTo(oldcontext);
+				elog(LOG, "Pooler could not open a connection to node %u",
+						agent->coord_conn_oids[node]);
+				return NULL;
+			}
+
+			/* Store in the descriptor */
+			agent->coord_connections[node] = slot;
+
+			/*
+			 * Update newly-acquired slot with session parameters.
+			 * Local parameters are fired only once BEGIN has been launched on
+			 * remote nodes.
+			 */
+		}
+
+		result[i] = PQsocket((PGconn *) agent->coord_connections[node]->conn);
+		(*pids)[i++] = ((PGconn *) agent->coord_connections[node]->conn)->be_pid;
+	}
+
+	MemoryContextSwitchTo(oldcontext);
+
+	return result;
+}
+
+/*
+ * Send query-cancel requests to the remote backends attached to this agent.
+ *
+ * Walks the given Datanode and Coordinator index lists and fires PQcancel
+ * on every active connection found.  Out-of-range indexes and empty slots
+ * are skipped silently.  Returns the number of backends successfully
+ * signalled.
+ */
+static int
+cancel_query_on_connections(PoolAgent *agent, List *datanodelist, List *coordlist)
+{
+ ListCell *lc;
+ char errbuf[256];
+ int cancelled = 0;
+ bool ok;
+
+ if (agent == NULL)
+ return 0;
+
+ /* Datanodes first */
+ foreach(lc, datanodelist)
+ {
+ int node = lfirst_int(lc);
+
+ if (node < 0 || node >= agent->num_dn_connections)
+ continue;
+
+ if (agent->dn_connections == NULL)
+ break;
+
+ if (agent->dn_connections[node] == NULL)
+ continue;
+
+ elog(DEBUG1, "Canceling query on connection to remote node %d, remote pid %d",
+ agent->dn_conn_oids[node],
+ ((PGconn *) agent->dn_connections[node]->conn)->be_pid);
+ ok = PQcancel((PGcancel *) agent->dn_connections[node]->xc_cancelConn, errbuf, sizeof(errbuf));
+ if (ok)
+ {
+ elog(DEBUG1, "Cancelled query on connection to remote node %d, remote pid %d",
+ agent->dn_conn_oids[node],
+ ((PGconn *) agent->dn_connections[node]->conn)->be_pid);
+ cancelled++;
+ }
+ }
+
+ /* Coordinators too, e.g. if DDL was in progress */
+ foreach(lc, coordlist)
+ {
+ int node = lfirst_int(lc);
+
+ if (node < 0 || node >= agent->num_coord_connections)
+ continue;
+
+ if (agent->coord_connections == NULL)
+ break;
+
+ if (agent->coord_connections[node] == NULL)
+ continue;
+
+ elog(DEBUG1, "Canceling query on connection to remote node %d, remote pid %d",
+ agent->coord_conn_oids[node],
+ ((PGconn *) agent->coord_connections[node]->conn)->be_pid);
+ ok = PQcancel((PGcancel *) agent->coord_connections[node]->xc_cancelConn, errbuf, sizeof(errbuf));
+ if (ok)
+ {
+ elog(DEBUG1, "Cancelled query on connection to remote node %d, remote pid %d",
+ agent->coord_conn_oids[node],
+ ((PGconn *) agent->coord_connections[node]->conn)->be_pid);
+ cancelled++;
+ }
+ }
+
+ return cancelled;
+}
+
+/*
+ * Return connections back to the pool.
+ *
+ * Sends a 'r' (release) message to the pooler process over the agent
+ * socket.  'force' is forwarded in the payload; the pooler side decides
+ * whether released connections are destroyed or kept for reuse.
+ * The bytes must go out in exactly this order: type, length, payload.
+ */
+void
+PoolManagerReleaseConnections(bool force)
+{
+ char msgtype = 'r';
+ int n32;
+ /* length word covers itself (4 bytes) plus the 4-byte force flag */
+ int msglen = 8;
+
+ /* If disconnected from pooler all the connections already released */
+ if (!poolHandle)
+ return;
+
+ elog(DEBUG1, "Returning connections back to the pool");
+
+ /* Message type */
+ pool_putbytes(&poolHandle->port, &msgtype, 1);
+
+ /* Message length (network byte order) */
+ n32 = htonl(msglen);
+ pool_putbytes(&poolHandle->port, (char *) &n32, 4);
+
+ /* Lock information: the force-destroy flag */
+ n32 = htonl((int) force);
+ pool_putbytes(&poolHandle->port, (char *) &n32, 4);
+ pool_flush(&poolHandle->port);
+}
+
+/*
+ * Cancel Query
+ *
+ * Ask the pooler ('h' message) to send query-cancel requests to the
+ * listed Datanode and Coordinator backends, then wait for the result.
+ * Payload layout: dn_count, dn_list[], co_count, co_list[], every item
+ * a network-order uint32.
+ */
+void
+PoolManagerCancelQuery(int dn_count, int* dn_list, int co_count, int* co_list)
+{
+ /*
+ * Buffer holds both node lists plus the two list-length words.
+ */
+ uint32 buf[2 + dn_count + co_count];
+ int i;
+
+ if (poolHandle == NULL)
+ return;
+
+ /* Nothing to cancel */
+ if (dn_count == 0 && co_count == 0)
+ return;
+
+ /* A non-zero count must come with an actual list */
+ if (dn_count != 0 && dn_list == NULL)
+ return;
+
+ if (co_count != 0 && co_list == NULL)
+ return;
+
+ /* Datanode section: count followed by the node indexes */
+ buf[0] = htonl((uint32) dn_count);
+ for (i = 0; i < dn_count; i++)
+ buf[1 + i] = htonl((uint32) dn_list[i]);
+
+ /* Coordinator section: count followed by the node indexes */
+ buf[1 + dn_count] = htonl((uint32) co_count);
+ for (i = 0; i < co_count; i++)
+ buf[2 + dn_count + i] = htonl((uint32) co_list[i]);
+
+ pool_putmessage(&poolHandle->port, 'h', (char *) buf, (2 + dn_count + co_count) * sizeof(uint32));
+ pool_flush(&poolHandle->port);
+
+ /* Receive result message */
+ if (pool_recvres(&poolHandle->port) != QUERY_CANCEL_COMPLETED)
+ ereport(WARNING,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Query cancel not completed")));
+}
+
+/*
+ * Return all Datanode and Coordinator connections held by the agent to
+ * the per-database pool, or destroy them when force_destroy is set.
+ * Does nothing while the cluster-wide lock is held (unless forced).
+ */
+static void
+agent_release_connections(PoolAgent *agent, bool force_destroy)
+{
+ MemoryContext oldcontext;
+ int n;
+
+ if (!agent->dn_connections && !agent->coord_connections)
+ return;
+ if (!force_destroy && cluster_ex_lock_held)
+ {
+ elog(LOG, "Not releasing connection with cluster lock");
+ return;
+ }
+
+ /*
+ * The core pooler may allocate while releasing; make sure any such
+ * allocation lands in the database pool's memory context.
+ */
+ oldcontext = MemoryContextSwitchTo(agent->pool->mcxt);
+
+ /*
+ * Remaining connections are assumed to be clean.
+ * Datanode connections go back first.
+ */
+ for (n = 0; n < agent->num_dn_connections; n++)
+ {
+ PGXCNodePoolSlot *slot = agent->dn_connections[n];
+
+ /* Return the slot; a forced release destroys it instead */
+ if (slot)
+ release_connection(agent->pool, slot, agent->dn_conn_oids[n], force_destroy);
+ agent->dn_connections[n] = NULL;
+ elog(DEBUG1, "Released connection to node %d", agent->dn_conn_oids[n]);
+ }
+ /* Then the Coordinator connections */
+ for (n = 0; n < agent->num_coord_connections; n++)
+ {
+ PGXCNodePoolSlot *slot = agent->coord_connections[n];
+
+ if (slot)
+ release_connection(agent->pool, slot, agent->coord_conn_oids[n], force_destroy);
+ agent->coord_connections[n] = NULL;
+ elog(DEBUG1, "Released connection to node %d", agent->coord_conn_oids[n]);
+ }
+
+ /*
+ * Released connections now sit idle in the pool; remember when the
+ * oldest went idle so maintenance can close them eventually, unless
+ * a timestamp is already recorded.
+ */
+ if (!force_destroy && agent->pool->oldest_idle == (time_t) 0)
+ agent->pool->oldest_idle = time(NULL);
+
+ MemoryContextSwitchTo(oldcontext);
+}
+
+/*
+ * Create new empty pool for a database/user combination.
+ * By default Database Pools have a size null so as to avoid interactions
+ * between PGXC nodes in the cluster (Co/Co, Dn/Dn and Co/Dn).
+ * Pool is increased at the first GET_CONNECTION message received.
+ * Returns the new DatabasePool (errors are raised via ereport(ERROR)).
+ *
+ * The pool and everything it references live in their own memory
+ * context, so destroying the pool is a single MemoryContextDelete().
+ * palloc/pstrdup ereport(ERROR) on out-of-memory and never return NULL,
+ * so the previous NULL checks (and the code after ereport(ERROR), which
+ * was unreachable) have been removed.
+ */
+static DatabasePool *
+create_database_pool(const char *database, const char *user_name, const char *pgoptions)
+{
+ MemoryContext oldcontext;
+ MemoryContext dbcontext;
+ DatabasePool *databasePool;
+ HASHCTL hinfo;
+
+ elog(DEBUG1, "Creating a connection pool for database %s, user %s,"
+ " with pgoptions %s", database, user_name, pgoptions);
+
+ dbcontext = AllocSetContextCreate(PoolerCoreContext,
+ "DB Context",
+ ALLOCSET_DEFAULT_MINSIZE,
+ ALLOCSET_DEFAULT_INITSIZE,
+ ALLOCSET_DEFAULT_MAXSIZE);
+ oldcontext = MemoryContextSwitchTo(dbcontext);
+
+ /* Allocate the pool descriptor and copy its identifying information */
+ databasePool = (DatabasePool *) palloc(sizeof(DatabasePool));
+ databasePool->mcxt = dbcontext;
+ /* Copy the database name */
+ databasePool->database = pstrdup(database);
+ /* Copy the user name */
+ databasePool->user_name = pstrdup(user_name);
+ /* Copy the pgoptions */
+ databasePool->pgoptions = pstrdup(pgoptions);
+ /* Reset the oldest_idle value */
+ databasePool->oldest_idle = (time_t) 0;
+ /* Init next reference */
+ databasePool->next = NULL;
+
+ /* Init node hashtable, keyed by node Oid */
+ MemSet(&hinfo, 0, sizeof(hinfo));
+
+ hinfo.keysize = sizeof(Oid);
+ hinfo.entrysize = sizeof(PGXCNodePool);
+ hinfo.hcxt = dbcontext;
+
+ databasePool->nodePools = hash_create("Node Pool", MaxDataNodes + MaxCoords,
+ &hinfo,
+ HASH_ELEM | HASH_CONTEXT | HASH_BLOBS);
+
+ MemoryContextSwitchTo(oldcontext);
+
+ /* Insert into the list */
+ insert_database_pool(databasePool);
+
+ return databasePool;
+}
+
+
+/*
+ * Destroy the pool for the given database/user and free its memory.
+ * Returns 1 when a matching pool was found and destroyed, 0 otherwise.
+ */
+static int
+destroy_database_pool(const char *database, const char *user_name)
+{
+ DatabasePool *pool;
+ HASH_SEQ_STATUS hseq_status;
+ PGXCNodePool *nodePool;
+
+ elog(DEBUG1, "Destroy a connection pool to database %s, user %s",
+ database, user_name);
+
+ /* Unlink from the global list first */
+ pool = remove_database_pool(database, user_name);
+ if (pool == NULL)
+ return 0;
+
+ /* Close every pooled connection of every node pool */
+ hash_seq_init(&hseq_status, pool->nodePools);
+ while ((nodePool = (PGXCNodePool *) hash_seq_search(&hseq_status)) != NULL)
+ destroy_node_pool(nodePool);
+
+ /* Everything was allocated in the pool's own context; drop it all */
+ MemoryContextDelete(pool->mcxt);
+ return 1;
+}
+
+
+/*
+ * Push a newly created database pool onto the head of the global list.
+ */
+static void
+insert_database_pool(DatabasePool *databasePool)
+{
+ Assert(databasePool);
+
+ /*
+ * Link in front of the current head.  When the list is empty,
+ * databasePools is NULL, which correctly terminates the chain.
+ */
+ databasePool->next = databasePools;
+ databasePools = databasePool;
+}
+
+/*
+ * reload_database_pools
+ * rebuild connection information for all database pools
+ *
+ * A database pool is reloaded as follows for each remote node:
+ *
+ * - node pool is deleted if the node has been deleted from catalog.
+ * Subsequently all its connections are dropped.
+ *
+ * - node pool is deleted if its port or host information is changed.
+ * Subsequently all its connections are dropped.
+ *
+ * - node pool is kept unchanged with existing connection information
+ * is not changed. However its index position in node pool is changed
+ * according to the alphabetical order of the node name in new
+ * cluster configuration.
+ *
+ * Backend sessions are responsible to reconnect to the pooler to update
+ * their agent with newest connection information.
+ *
+ * The session invoking connection information reload is reconnected
+ * and uploaded automatically after database pool reload. Other server
+ * sessions are signaled to reconnect to pooler and update their
+ * connection information separately.
+ *
+ * During reload process done internally on pooler, pooler is locked
+ * to forbid new connection requests.
+ */
+static void
+reload_database_pools(PoolAgent *agent)
+{
+ DatabasePool *databasePool;
+
+ elog(DEBUG1, "Reloading database pools");
+
+ /*
+ * Release node connections if any held. It is not guaranteed client session
+ * does the same so don't ever try to return them to pool and reuse.
+ * force_destroy = true: the slots are closed, not pooled.
+ */
+ agent_release_connections(agent, true);
+
+ /* Forget previously allocated node info (oid and connection arrays) */
+ MemoryContextReset(agent->mcxt);
+
+ /* and allocate new */
+ PgxcNodeGetOids(&agent->coord_conn_oids, &agent->dn_conn_oids,
+ &agent->num_coord_connections, &agent->num_dn_connections, false);
+
+ /*
+ * NOTE(review): this assumes CurrentMemoryContext is (or outlives)
+ * agent->mcxt at this point so the arrays survive until the next
+ * reload — confirm against the caller.
+ */
+ agent->coord_connections = (PGXCNodePoolSlot **)
+ palloc0(agent->num_coord_connections * sizeof(PGXCNodePoolSlot *));
+ agent->dn_connections = (PGXCNodePoolSlot **)
+ palloc0(agent->num_dn_connections * sizeof(PGXCNodePoolSlot *));
+
+ /*
+ * Scan the list and destroy any altered pool. They will be recreated
+ * upon subsequent connection acquisition.
+ */
+ databasePool = databasePools;
+ while (databasePool)
+ {
+ /* Update each database pool slot with new connection information */
+ HASH_SEQ_STATUS hseq_status;
+ PGXCNodePool *nodePool;
+
+ hash_seq_init(&hseq_status, databasePool->nodePools);
+ while ((nodePool = (PGXCNodePool *) hash_seq_search(&hseq_status)))
+ {
+ /* Rebuild the connection string and compare with the stored one */
+ char *connstr_chk = build_node_conn_str(nodePool->nodeoid, databasePool);
+
+ if (connstr_chk == NULL || strcmp(connstr_chk, nodePool->connstr))
+ {
+ /* Node has been removed (NULL) or altered (string differs) */
+ destroy_node_pool(nodePool);
+ hash_search(databasePool->nodePools, &nodePool->nodeoid,
+ HASH_REMOVE, NULL);
+ }
+
+ if (connstr_chk)
+ pfree(connstr_chk);
+ }
+
+ databasePool = databasePool->next;
+ }
+}
+
+/*
+ * refresh_database_pools
+ * refresh information for all database pools
+ *
+ * Connection information refresh concerns all the database pools.
+ * A database pool is refreshed as follows for each remote node:
+ *
+ * - node pool is deleted if its port or host information is changed.
+ * Subsequently all its connections are dropped.
+ *
+ * If any other type of activity is found, we error out.
+ *
+ * XXX I don't see any cases that would error out. Isn't the comment
+ * simply obsolete?
+ *
+ * Returns POOL_REFRESH_SUCCESS, or POOL_REFRESH_FAILED when the agent's
+ * node set no longer matches the catalog or a node was deleted/unknown.
+ */
+static int
+refresh_database_pools(PoolAgent *agent)
+{
+ DatabasePool *databasePool;
+ Oid *coOids;
+ Oid *dnOids;
+ int numCo;
+ int numDn;
+ int res = POOL_REFRESH_SUCCESS;
+
+ elog(LOG, "Refreshing database pools");
+
+ /*
+ * re-check if agent's node information matches current contents of the
+ * shared memory table.
+ */
+ PgxcNodeGetOids(&coOids, &dnOids, &numCo, &numDn, false);
+
+ if (agent->num_coord_connections != numCo ||
+ agent->num_dn_connections != numDn ||
+ memcmp(agent->coord_conn_oids, coOids, numCo * sizeof(Oid)) ||
+ memcmp(agent->dn_conn_oids, dnOids, numDn * sizeof(Oid)))
+ res = POOL_REFRESH_FAILED;
+
+ /* Release palloc'ed memory */
+ pfree(coOids);
+ pfree(dnOids);
+
+ /*
+ * Scan the list and destroy any altered pool. They will be recreated
+ * upon subsequent connection acquisition.
+ */
+ databasePool = databasePools;
+ while (res == POOL_REFRESH_SUCCESS && databasePool)
+ {
+ HASH_SEQ_STATUS hseq_status;
+ PGXCNodePool *nodePool;
+
+ hash_seq_init(&hseq_status, databasePool->nodePools);
+ while ((nodePool = (PGXCNodePool *) hash_seq_search(&hseq_status)))
+ {
+ char *connstr_chk = build_node_conn_str(nodePool->nodeoid, databasePool);
+
+ /*
+ * Since we re-checked the numbers above, we should not get
+ * the case of an ADDED or a DELETED node here..
+ */
+ if (connstr_chk == NULL)
+ {
+ elog(LOG, "Found a deleted node (%u)", nodePool->nodeoid);
+ /* terminate the seq scan before breaking out of it */
+ hash_seq_term(&hseq_status);
+ res = POOL_REFRESH_FAILED;
+ break;
+ }
+
+ if (strcmp(connstr_chk, nodePool->connstr))
+ {
+ elog(LOG, "Found an altered node (%u)", nodePool->nodeoid);
+ /*
+ * Node has been altered. First remove
+ * all references to this node from ALL the
+ * agents before destroying it..
+ *
+ * NOTE: this break leaves the seq scan unterminated;
+ * hash_seq_term is not called on this path — review.
+ */
+ if (!remove_all_agent_references(nodePool->nodeoid))
+ {
+ res = POOL_REFRESH_FAILED;
+ break;
+ }
+
+ destroy_node_pool(nodePool);
+ hash_search(databasePool->nodePools, &nodePool->nodeoid,
+ HASH_REMOVE, NULL);
+ }
+
+ if (connstr_chk)
+ pfree(connstr_chk);
+ }
+
+ databasePool = databasePool->next;
+ }
+ return res;
+}
+
+/*
+ * Drop every agent's reference to the given node, releasing the slot
+ * each agent holds for it.  Returns false when some agent knows neither
+ * a Datanode nor a Coordinator with this oid.
+ */
+static bool
+remove_all_agent_references(Oid nodeoid)
+{
+ int i, j;
+ bool res = true;
+
+ /*
+ * Identify if it's a coordinator or datanode first
+ * and get its index
+ */
+ for (i = 1; i <= agentCount; i++)
+ {
+ bool found = false;
+
+ PoolAgent *agent = poolAgents[i - 1];
+ /* Look for the node among the agent's Datanodes first */
+ for (j = 0; j < agent->num_dn_connections; j++)
+ {
+ if (agent->dn_conn_oids[j] == nodeoid)
+ {
+ found = true;
+ break;
+ }
+ }
+ if (found)
+ {
+ PGXCNodePoolSlot *slot = agent->dn_connections[j];
+ if (slot)
+ release_connection(agent->pool, slot, agent->dn_conn_oids[j], false);
+ agent->dn_connections[j] = NULL;
+ }
+ else
+ {
+ /* Not a Datanode for this agent; try the Coordinators */
+ for (j = 0; j < agent->num_coord_connections; j++)
+ {
+ if (agent->coord_conn_oids[j] == nodeoid)
+ {
+ found = true;
+ break;
+ }
+ }
+ if (found)
+ {
+ PGXCNodePoolSlot *slot = agent->coord_connections[j];
+ /*
+ * NOTE(review): the Datanode branch above releases with
+ * force_destroy = false while this one uses true — the
+ * asymmetry looks unintentional; confirm which is wanted.
+ */
+ if (slot)
+ release_connection(agent->pool, slot, agent->coord_conn_oids[j], true);
+ agent->coord_connections[j] = NULL;
+ }
+ else
+ {
+ elog(LOG, "Node not found! (%u)", nodeoid);
+ res = false;
+ }
+ }
+ }
+ return res;
+}
+
+/*
+ * Find the pool matching the given database, user and pgoptions triple.
+ * Returns NULL when no such pool exists.
+ */
+static DatabasePool *
+find_database_pool(const char *database, const char *user_name, const char *pgoptions)
+{
+ DatabasePool *pool;
+
+ /* Walk the global list until all three identifiers match */
+ for (pool = databasePools; pool != NULL; pool = pool->next)
+ {
+ if (strcmp(database, pool->database) == 0 &&
+ strcmp(user_name, pool->user_name) == 0 &&
+ strcmp(pgoptions, pool->pgoptions) == 0)
+ break;
+ }
+ return pool;
+}
+
+
+/*
+ * Unlink the pool for the given database/user from the global list and
+ * return it; the caller owns the pool afterwards.  Returns NULL when no
+ * matching pool is found.  Note that, unlike find_database_pool(),
+ * pgoptions does not participate in the match.
+ */
+static DatabasePool *
+remove_database_pool(const char *database, const char *user_name)
+{
+ DatabasePool *pool = databasePools;
+ DatabasePool *prev = NULL;
+
+ /* Locate the matching pool, remembering its predecessor */
+ while (pool != NULL &&
+ (strcmp(database, pool->database) != 0 ||
+ strcmp(user_name, pool->user_name) != 0))
+ {
+ prev = pool;
+ pool = pool->next;
+ }
+
+ if (pool != NULL)
+ {
+ /* Unchain the entry, updating the list head when needed */
+ if (prev)
+ prev->next = pool->next;
+ else
+ databasePools = pool->next;
+
+ pool->next = NULL;
+ }
+ return pool;
+}
+
+/*
+ * Acquire connection
+ *
+ * Hand out a free connection slot to the given node from the database
+ * pool, growing the pool when needed.  Each candidate slot is health
+ * checked before being handed out; broken slots are destroyed and the
+ * next candidate is tried.  Updates the shared node health map, and
+ * returns NULL when no usable connection could be obtained.
+ */
+static PGXCNodePoolSlot *
+acquire_connection(DatabasePool *dbPool, Oid node)
+{
+ PGXCNodePool *nodePool;
+ PGXCNodePoolSlot *slot;
+
+ Assert(dbPool);
+
+ nodePool = (PGXCNodePool *) hash_search(dbPool->nodePools, &node, HASH_FIND,
+ NULL);
+
+ /*
+ * When a Coordinator pool is initialized by a Coordinator Postmaster,
+ * it has a NULL size and is below minimum size that is 1
+ * This is to avoid problems of connections between Coordinators
+ * when creating or dropping Databases.
+ */
+ if (nodePool == NULL || nodePool->freeSize == 0)
+ nodePool = grow_pool(dbPool, node);
+
+ slot = NULL;
+ /* Check available connections */
+ while (nodePool && nodePool->freeSize > 0)
+ {
+ int poll_result;
+
+ /* Pop the last free slot as the candidate */
+ slot = nodePool->slot[--(nodePool->freeSize)];
+
+ retry:
+ if (PQsocket((PGconn *) slot->conn) > 0)
+ {
+ /*
+ * Make sure connection is ok, destroy connection slot if there is a
+ * problem.
+ */
+ poll_result = pqReadReady((PGconn *) slot->conn);
+
+ if (poll_result == 0)
+ break; /* ok, no data — candidate accepted */
+ else if (poll_result < 0)
+ {
+ /* transient poll interruption: re-check the same slot */
+ if (errno == EAGAIN || errno == EINTR)
+ goto retry;
+
+ elog(WARNING, "Error in checking connection, errno = %d", errno);
+ }
+ else
+ elog(WARNING, "Unexpected data on connection, cleaning.");
+ }
+
+ /* Candidate failed the health check: drop it and try to refill */
+ destroy_slot(slot);
+ slot = NULL;
+
+ /* Decrement current max pool size */
+ (nodePool->size)--;
+ /* Ensure we are not below minimum size */
+ nodePool = grow_pool(dbPool, node);
+ }
+
+ if (slot == NULL)
+ {
+ elog(WARNING, "can not connect to node %u", node);
+
+ /*
+ * before returning, also update the shared health
+ * status field to indicate that this node is down
+ */
+ if (!PgxcNodeUpdateHealth(node, false))
+ elog(WARNING, "Could not update health status of node %u", node);
+ else
+ elog(WARNING, "Health map updated to reflect DOWN node (%u)", node);
+ }
+ else
+ PgxcNodeUpdateHealth(node, true);
+
+ return slot;
+}
+
+
+/*
+ * Give a connection slot back to the node pool it came from, or close
+ * it outright when force_destroy is set or the node pool is gone.
+ */
+static void
+release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot,
+ Oid node, bool force_destroy)
+{
+ PGXCNodePool *nodePool;
+
+ Assert(dbPool);
+ Assert(slot);
+
+ nodePool = (PGXCNodePool *) hash_search(dbPool->nodePools, &node, HASH_FIND,
+ NULL);
+ if (nodePool == NULL)
+ {
+ /* Node was altered or dropped; the slot can no longer be reused */
+ destroy_slot(slot);
+ return;
+ }
+
+ if (force_destroy)
+ {
+ elog(DEBUG1, "Cleaning up connection from pool %s, closing", nodePool->connstr);
+ destroy_slot(slot);
+ /* Decrement pool size */
+ (nodePool->size)--;
+ /* Ensure we are not below minimum size */
+ grow_pool(dbPool, node);
+ }
+ else
+ {
+ /* Put the slot back into the free array and stamp its idle time */
+ nodePool->slot[(nodePool->freeSize)++] = slot;
+ slot->released = time(NULL);
+ }
+}
+
+
+/*
+ * Increase database pool size, create new if does not exist.
+ *
+ * Ensures the node pool for the given node exists and, while no free
+ * connection is available and the pool is below MaxPoolSize, opens new
+ * connections.  Returns the node pool; errors out on failure to build
+ * the connection string.
+ */
+static PGXCNodePool *
+grow_pool(DatabasePool *dbPool, Oid node)
+{
+ /* if error try to release idle connections and try again */
+ bool tryagain = true;
+ PGXCNodePool *nodePool;
+ bool found;
+ char *connstr;
+
+ Assert(dbPool);
+
+ nodePool = (PGXCNodePool *) hash_search(dbPool->nodePools, &node,
+ HASH_ENTER, &found);
+
+ /*
+ * Rebuild the connection string so it reflects the current catalog
+ * state.  Free the previous copy first: the old code overwrote the
+ * pointer unconditionally, leaking the old string every time this was
+ * called for an existing pool.
+ */
+ connstr = build_node_conn_str(node, dbPool);
+ if (!connstr)
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("could not build connection string for node %u", node)));
+ }
+ if (found && nodePool->connstr)
+ pfree(nodePool->connstr);
+ nodePool->connstr = connstr;
+
+ if (!found)
+ {
+ /*
+ * Fresh hash entry: set up the (initially empty) slot array.
+ * palloc0 ereports on out-of-memory, so no NULL check is needed.
+ */
+ nodePool->slot = (PGXCNodePoolSlot **) palloc0(MaxPoolSize * sizeof(PGXCNodePoolSlot *));
+ nodePool->freeSize = 0;
+ nodePool->size = 0;
+ }
+
+ while (nodePool->freeSize == 0 && nodePool->size < MaxPoolSize)
+ {
+ PGXCNodePoolSlot *slot;
+
+ /* Allocate new slot (palloc ereports on out-of-memory) */
+ slot = (PGXCNodePoolSlot *) palloc(sizeof(PGXCNodePoolSlot));
+
+ /* If connection fails, be sure that slot is destroyed cleanly */
+ slot->xc_cancelConn = NULL;
+
+ /* Establish connection */
+ slot->conn = PGXCNodeConnect(nodePool->connstr);
+ if (!PGXCNodeConnected(slot->conn))
+ {
+ ereport(LOG,
+ (errcode(ERRCODE_CONNECTION_FAILURE),
+ errmsg("failed to connect to node, connection string (%s),"
+ " connection error (%s)",
+ nodePool->connstr,
+ PQerrorMessage((PGconn*) slot->conn))));
+ destroy_slot(slot);
+ /*
+ * If we failed to connect probably number of connections on the
+ * target node reached max_connections. Try and release idle
+ * connections and try again.
+ * We do not want to enter endless loop here and run maintenance
+ * procedure only once.
+ * It is not safe to run the maintenance procedure if no connections
+ * from that pool currently in use - the node pool may be destroyed
+ * in that case.
+ */
+ if (tryagain && nodePool->size > nodePool->freeSize)
+ {
+ pools_maintenance();
+ tryagain = false;
+ continue;
+ }
+ break;
+ }
+
+ /* Connection is up: arm the cancel handle and stamp the idle time */
+ slot->xc_cancelConn = (NODE_CANCEL *) PQgetCancel((PGconn *)slot->conn);
+ slot->released = time(NULL);
+ if (dbPool->oldest_idle == (time_t) 0)
+ dbPool->oldest_idle = slot->released;
+
+ /* Insert at the end of the pool */
+ nodePool->slot[(nodePool->freeSize)++] = slot;
+
+ /* Increase count of pool size */
+ (nodePool->size)++;
+ elog(DEBUG1, "Pooler: increased pool size to %d for pool %s",
+ nodePool->size,
+ nodePool->connstr);
+ }
+ return nodePool;
+}
+
+
+/*
+ * Close and free a single pool slot; safe to call with NULL.
+ */
+static void
+destroy_slot(PGXCNodePoolSlot *slot)
+{
+ if (slot == NULL)
+ return;
+
+ /* PQfreeCancel ignores a NULL cancel handle */
+ PQfreeCancel((PGcancel *)slot->xc_cancelConn);
+ PGXCNodeClose(slot->conn);
+ pfree(slot);
+}
+
+
+/*
+ * Close all free connections of a node pool and release its memory.
+ * The PGXCNodePool struct itself is a hash-table entry and is not
+ * freed here; the caller removes it from the hash.
+ */
+static void
+destroy_node_pool(PGXCNodePool *node_pool)
+{
+ int n;
+
+ if (!node_pool)
+ return;
+
+ /*
+ * At this point all agents using connections from this pool should
+ * already be closed.  If not, the Datanode connections assigned to
+ * them stay open and keep consuming Datanode resources.
+ */
+ elog(DEBUG1, "About to destroy node pool %s, current size is %d, %d connections are in use",
+ node_pool->connstr, node_pool->freeSize, node_pool->size - node_pool->freeSize);
+ if (node_pool->connstr)
+ pfree(node_pool->connstr);
+
+ if (node_pool->slot)
+ {
+ /* Only the free slots can be closed; busy ones are with agents */
+ for (n = 0; n < node_pool->freeSize; n++)
+ destroy_slot(node_pool->slot[n]);
+ pfree(node_pool->slot);
+ }
+}
+
+
+/*
+ * Main handling loop
+ *
+ * Sets up the pooler's Unix-domain listening socket(s), then loops:
+ * polls the listen socket plus one socket per agent, dispatches agent
+ * requests, accepts new agents, and runs pool maintenance when the
+ * poll times out.  Exits on postmaster death or shutdown request.
+ */
+static void
+PoolerLoop(void)
+{
+ StringInfoData input_message;
+ time_t last_maintenance = (time_t) 0;
+ int maintenance_timeout;
+ struct pollfd *pool_fd;
+
+#ifdef HAVE_UNIX_SOCKETS
+ if (Unix_socket_directories)
+ {
+ char *rawstring;
+ List *elemlist;
+ ListCell *l;
+ int success = 0;
+
+ /* Need a modifiable copy of Unix_socket_directories */
+ rawstring = pstrdup(Unix_socket_directories);
+
+ /* Parse string into list of directories */
+ if (!SplitDirectoriesString(rawstring, ',', &elemlist))
+ {
+ /* syntax error in list */
+ ereport(FATAL,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("invalid list syntax in parameter \"%s\"",
+ "unix_socket_directories")));
+ }
+
+ /* Try to listen in every configured socket directory */
+ foreach(l, elemlist)
+ {
+ char *socketdir = (char *) lfirst(l);
+ int saved_errno;
+
+ /* Connect to the pooler */
+ server_fd = pool_listen(PoolerPort, socketdir);
+ if (server_fd < 0)
+ {
+ saved_errno = errno;
+ ereport(WARNING,
+ (errmsg("could not create Unix-domain socket in directory \"%s\", errno %d, server_fd %d",
+ socketdir, saved_errno, server_fd)));
+ }
+ else
+ {
+ success++;
+ }
+ }
+
+ /*
+ * NOTE(review): %m here reports the current errno, which may be
+ * stale by the time this ereport runs — confirm intent.
+ */
+ if (!success && elemlist != NIL)
+ ereport(ERROR,
+ (errmsg("failed to start listening on Unix-domain socket for pooler: %m")));
+
+ list_free_deep(elemlist);
+ pfree(rawstring);
+ }
+#endif
+
+ /* One pollfd per possible agent, plus one for the listen socket */
+ pool_fd = (struct pollfd *) palloc((MaxConnections + 1) * sizeof(struct pollfd));
+
+ if (server_fd == -1)
+ {
+ /* log error */
+ return;
+ }
+
+ initStringInfo(&input_message);
+
+ /* Slot 0 always watches the listen socket for new agents */
+ pool_fd[0].fd = server_fd;
+ pool_fd[0].events = POLLIN;
+
+ for (;;)
+ {
+
+ int retval;
+ int i;
+
+ /*
+ * Emergency bailout if postmaster has died. This is to avoid the
+ * necessity for manual cleanup of all postmaster children.
+ */
+ if (!PostmasterIsAlive())
+ exit(1);
+
+ /* watch for incoming messages on every agent socket */
+ for (i = 1; i <= agentCount; i++)
+ {
+ PoolAgent *agent = poolAgents[i - 1];
+ int sockfd = Socket(agent->port);
+ pool_fd[i].fd = sockfd;
+ pool_fd[i].events = POLLIN;
+ }
+
+ if (PoolMaintenanceTimeout > 0)
+ {
+ int timeout_val;
+ double timediff;
+
+ /*
+ * Decide the timeout value based on when the last
+ * maintenance activity was carried out. If the last
+ * maintenance was done quite a while ago schedule the select
+ * with no timeout. It will serve any incoming activity
+ * and if there's none it will cause the maintenance
+ * to be scheduled as soon as possible
+ */
+ timediff = difftime(time(NULL), last_maintenance);
+
+ if (timediff > PoolMaintenanceTimeout)
+ timeout_val = 0;
+ else
+ timeout_val = PoolMaintenanceTimeout - rint(timediff);
+
+ /* poll() takes milliseconds */
+ maintenance_timeout = timeout_val * 1000;
+ }
+ else
+ maintenance_timeout = -1;
+ /*
+ * Emergency bailout if postmaster has died. This is to avoid the
+ * necessity for manual cleanup of all postmaster children.
+ *
+ * NOTE(review): duplicate of the check at the top of the loop;
+ * looks redundant — confirm before removing.
+ */
+ if (!PostmasterIsAlive())
+ exit(1);
+
+ /*
+ * Process any requests or signals received recently.
+ */
+ if (got_SIGHUP)
+ {
+ got_SIGHUP = false;
+ ProcessConfigFile(PGC_SIGHUP);
+ }
+
+ if (shutdown_requested)
+ {
+ /* Destroy agents back-to-front; agent_destroy shrinks the array */
+ for (i = agentCount - 1; agentCount > 0 && i >= 0; i--)
+ {
+ PoolAgent *agent = poolAgents[i];
+ agent_destroy(agent);
+ }
+
+ /* Drop every database pool; stop if one fails to be destroyed */
+ while (databasePools)
+ if (destroy_database_pool(databasePools->database,
+ databasePools->user_name) == 0)
+ break;
+
+ close(server_fd);
+ exit(0);
+ }
+
+ /* wait for event */
+ retval = poll(pool_fd, agentCount + 1, maintenance_timeout);
+ if (retval < 0)
+ {
+ if (errno == EINTR || errno == EAGAIN)
+ continue;
+ elog(FATAL, "poll returned with error %d", retval);
+ }
+
+ if (retval > 0)
+ {
+ /*
+ * Agent may be removed from the array while processing
+ * and trailing items are shifted, so scroll downward
+ * to avoid problem
+ */
+ for (i = agentCount - 1; agentCount > 0 && i >= 0; i--)
+ {
+ PoolAgent *agent = poolAgents[i];
+ int sockfd = Socket(agent->port);
+
+ /* Only dispatch if this pollfd still maps to the same socket */
+ if ((sockfd == pool_fd[i + 1].fd) &&
+ (pool_fd[i + 1].revents & POLLIN))
+ agent_handle_input(agent, &input_message);
+ }
+
+ /* New agent knocking on the listen socket */
+ if (pool_fd[0].revents & POLLIN)
+ agent_create();
+ }
+ else if (retval == 0)
+ {
+ /* maintenance timeout: shrink pools and ping nodes */
+ pools_maintenance();
+ PoolPingNodes();
+ last_maintenance = time(NULL);
+ }
+ }
+}
+
+/*
+ * Clean Connection in all Database Pools for given Datanode and Coordinator list.
+ *
+ * Drops the idle pooled connections to each listed node in every pool
+ * that matches the optional database/user filters, and unlocks the
+ * pooler.  Returns CLEAN_CONNECTION_COMPLETED, or
+ * CLEAN_CONNECTION_NOT_COMPLETED when connections to a listed node were
+ * still handed out to agents.
+ */
+int
+clean_connection(List *node_discard, const char *database, const char *user_name)
+{
+ DatabasePool *pool;
+ int res = CLEAN_CONNECTION_COMPLETED;
+
+ for (pool = databasePools; pool != NULL; pool = pool->next)
+ {
+ ListCell *lc;
+
+ /* Skip pools that do not match the requested database/user */
+ if ((database && strcmp(database, pool->database)) ||
+ (user_name && strcmp(user_name, pool->user_name)))
+ continue;
+
+ /*
+ * Clean each requested node pool
+ */
+ foreach(lc, node_discard)
+ {
+ PGXCNodePool *nodePool;
+ Oid node = lfirst_oid(lc);
+
+ nodePool = hash_search(pool->nodePools, &node, HASH_FIND,
+ NULL);
+ if (nodePool == NULL)
+ continue;
+
+ /* Connections currently handed out to agents cannot be dropped */
+ if (nodePool->freeSize < nodePool->size)
+ {
+ elog(WARNING, "Pool of Database %s is using Datanode %u connections",
+ pool->database, node);
+ res = CLEAN_CONNECTION_NOT_COMPLETED;
+ }
+
+ /* Destroy the idle connections currently in the node pool */
+ if (nodePool->slot)
+ {
+ int n;
+
+ for (n = 0; n < nodePool->freeSize; n++)
+ destroy_slot(nodePool->slot[n]);
+ }
+ nodePool->size -= nodePool->freeSize;
+ nodePool->freeSize = 0;
+ }
+ }
+
+ /* Release lock on Pooler, to allow transactions to connect again. */
+ is_pool_locked = false;
+ return res;
+}
+
+/*
+ * Take a Lock on Pooler.
+ * Abort PIDs registered with the agents for the given database.
+ * Send back to client list of PIDs signaled to watch them.
+ *
+ * 'pid' is the caller's own backend PID and is excluded from the kill
+ * list; 'database' and 'user_name' further filter the targeted agents
+ * when non-NULL.  The returned array is palloc'ed and *len receives the
+ * number of PIDs actually signalled.
+ */
+int *
+abort_pids(int *len, int pid, const char *database, const char *user_name)
+{
+ int *pids = NULL;
+ int i = 0;
+ int count;
+
+ Assert(!is_pool_locked);
+ Assert(agentCount > 0);
+
+ is_pool_locked = true;
+
+ /*
+ * Allocate room for one entry per agent: when the caller's PID does
+ * not belong to any agent, every agent may be signalled, so the
+ * previous (agentCount - 1) allocation could be overrun by one entry.
+ */
+ pids = (int *) palloc(agentCount * sizeof(int));
+
+ /* Send a SIGTERM signal to all processes of Pooler agents except this one */
+ for (count = 0; count < agentCount; count++)
+ {
+ if (poolAgents[count]->pid == pid)
+ continue;
+
+ if (database && strcmp(poolAgents[count]->pool->database, database) != 0)
+ continue;
+
+ if (user_name && strcmp(poolAgents[count]->pool->user_name, user_name) != 0)
+ continue;
+
+ if (kill(poolAgents[count]->pid, SIGTERM) < 0)
+ elog(ERROR, "kill(%ld,%d) failed: %m",
+ (long) poolAgents[count]->pid, SIGTERM);
+
+ pids[i++] = poolAgents[count]->pid;
+ }
+
+ *len = i;
+
+ return pids;
+}
+
+/*
+ * Graceful-shutdown signal handler (presumably SIGTERM — the handler
+ * registration is not visible in this file chunk).  Only sets a flag;
+ * PoolerLoop notices shutdown_requested and tears down agents and
+ * database pools before exiting.
+ */
+static void
+pooler_die(SIGNAL_ARGS)
+{
+ shutdown_requested = true;
+}
+
+
+/*
+ * Emergency-exit signal handler: block any further signals and
+ * terminate immediately with status 2, performing no cleanup at all.
+ */
+static void
+pooler_quickdie(SIGNAL_ARGS)
+{
+ PG_SETMASK(&BlockSig);
+ exit(2);
+}
+
+
+/*
+ * SIGHUP handler: flag a pending configuration reload; PoolerLoop
+ * calls ProcessConfigFile(PGC_SIGHUP) when it sees got_SIGHUP set.
+ */
+static void
+pooler_sighup(SIGNAL_ARGS)
+{
+ got_SIGHUP = true;
+}
+
+/*
+ * Given node identifier, dbname and user name build connection string.
+ * Get node connection details from the shared memory node table.
+ * Returns a palloc'ed string, or NULL when the node has no definition
+ * any more (e.g. it was dropped).
+ */
+static char *
+build_node_conn_str(Oid node, DatabasePool *dbPool)
+{
+ NodeDefinition *nodeDef;
+ char *connstr;
+
+ nodeDef = PgxcNodeGetDefinition(node);
+ if (nodeDef == NULL)
+ {
+ /* No such definition, node is dropped? */
+ return NULL;
+ }
+
+ /* Assemble the libpq connection string from the node definition */
+ connstr = PGXCNodeConnStr(NameStr(nodeDef->nodehost),
+ nodeDef->nodeport,
+ dbPool->database,
+ dbPool->user_name,
+ dbPool->pgoptions,
+ IS_PGXC_COORDINATOR ? "coordinator" : "datanode",
+ PGXCNodeName);
+ pfree(nodeDef);
+
+ return connstr;
+}
+
+/*
+ * Check all pooled connections, and close those that have been released
+ * more than PooledConnKeepAlive seconds ago.
+ * Return true if the shrink operation closed all the connections and the
+ * pool can be destroyed, false if there are still connections or the
+ * pool is in use.
+ */
+static bool
+shrink_pool(DatabasePool *pool)
+{
+ time_t now = time(NULL);
+ HASH_SEQ_STATUS hseq_status;
+ PGXCNodePool *nodePool;
+ int i;
+ bool empty = true;
+
+ /* Negative PooledConnKeepAlive disables automatic connection cleanup */
+ if (PoolConnKeepAlive < 0)
+ return false;
+
+ /* Recomputed below from the slots that survive the shrink */
+ pool->oldest_idle = (time_t) 0;
+ hash_seq_init(&hseq_status, pool->nodePools);
+ while ((nodePool = (PGXCNodePool *) hash_seq_search(&hseq_status)))
+ {
+ /* Go thru the free slots and destroy those that are free too long */
+ for (i = 0; i < nodePool->freeSize; )
+ {
+ PGXCNodePoolSlot *slot = nodePool->slot[i];
+
+ if (difftime(now, slot->released) > PoolConnKeepAlive)
+ {
+ /* connection is idle for long, close it */
+ destroy_slot(slot);
+ /* reduce pool size and total number of connections */
+ (nodePool->freeSize)--;
+ (nodePool->size)--;
+ /*
+ * Swap-remove: move the last free connection into the freed
+ * position (if not already last).  i is deliberately NOT
+ * advanced so the moved slot gets examined on the next pass.
+ */
+ if (i < nodePool->freeSize)
+ nodePool->slot[i] = nodePool->slot[nodePool->freeSize];
+ }
+ else
+ {
+ /* Keeper: track the earliest release time seen so far */
+ if (pool->oldest_idle == (time_t) 0 ||
+ difftime(pool->oldest_idle, slot->released) > 0)
+ pool->oldest_idle = slot->released;
+
+ i++;
+ }
+ }
+ if (nodePool->size > 0)
+ empty = false;
+ else
+ {
+ /* Node pool fully drained: remove it from the hash */
+ destroy_node_pool(nodePool);
+ hash_search(pool->nodePools, &nodePool->nodeoid, HASH_REMOVE, NULL);
+ }
+ }
+
+ /*
+ * Last check, if any active agent is referencing the pool do not allow to
+ * destroy it, because there will be a problem if session wakes up and try
+ * to get a connection from non existing pool.
+ * If all such sessions will eventually disconnect the pool will be
+ * destroyed during next maintenance procedure.
+ */
+ if (empty)
+ {
+ for (i = 0; i < agentCount; i++)
+ {
+ if (poolAgents[i]->pool == pool)
+ return false;
+ }
+ }
+
+ return empty;
+}
+
+
+/*
+ * Scan connection pools and release connections which are idle for long.
+ * If a pool gets empty after releasing connections it is destroyed.
+ */
+static void
+pools_maintenance(void)
+{
+	DatabasePool *prev = NULL;
+	DatabasePool *curr = databasePools;
+	time_t		now = time(NULL);
+	int			count = 0;
+
+	/* Iterate over the pools */
+	while (curr)
+	{
+		/*
+		 * If current pool has connections to close and it is emptied after
+		 * shrink remove the pool and free memory.
+		 * Otherwise move to next pool.
+		 */
+		if (curr->oldest_idle != (time_t) 0 &&
+				difftime(now, curr->oldest_idle) > PoolConnKeepAlive &&
+				shrink_pool(curr))
+		{
+			/* unlink the pool from the list before deleting its memory */
+			MemoryContext mem = curr->mcxt;
+			curr = curr->next;
+			if (prev)
+				prev->next = curr;
+			else
+				databasePools = curr;
+			MemoryContextDelete(mem);
+			count++;
+		}
+		else
+		{
+			prev = curr;
+			curr = curr->next;
+		}
+	}
+	elog(DEBUG1, "Pool maintenance, done in %f seconds, removed %d pools",
+			difftime(time(NULL), now), count);
+}
+
+/*
+ * check_persistent_connections
+ *	GUC check hook for persistent_datanode_connections: the setting is not
+ *	supported on datanodes, so silently force it to off there (with a
+ *	warning).  Always accepts the value, hence always returns true.
+ */
+bool
+check_persistent_connections(bool *newval, void **extra, GucSource source)
+{
+	if (*newval && IS_PGXC_DATANODE)
+	{
+		elog(WARNING, "persistent_datanode_connections = ON is currently not "
+				"supported on datanodes - ignoring");
+		*newval = false;
+	}
+	return true;
+}
--- /dev/null
- static LWLockTranche SharedQueueLocksTranche;
+/*-------------------------------------------------------------------------
+ *
+ * squeue.c
+ *
+ * Shared queue is for data exchange in shared memory between sessions,
+ * one of which is a producer, providing data rows. Others are consumer agents -
+ * sessions initiated from other datanodes, the main purpose of them is to read
+ * rows from the shared queue and send them to the parent data node.
+ * The producer is usually a consumer at the same time, it sends back tuples
+ * to the parent node without putting it to the queue.
+ *
+ * Copyright (c) 2012-2014, TransLattice, Inc.
+ *
+ * IDENTIFICATION
+ * $$
+ *
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include <sys/time.h>
+#include "postgres.h"
+
+#include "miscadmin.h"
+#include "access/gtm.h"
+#include "catalog/pgxc_node.h"
+#include "commands/prepare.h"
+#include "executor/executor.h"
+#include "nodes/pg_list.h"
+#include "pgxc/nodemgr.h"
+#include "pgxc/pgxc.h"
+#include "pgxc/pgxcnode.h"
+#include "pgxc/squeue.h"
+#include "storage/latch.h"
+#include "storage/lwlock.h"
+#include "storage/shmem.h"
+#include "utils/hsearch.h"
+#include "utils/resowner.h"
++#include "pgstat.h"
+
+
+/* Number of shared queues and per-queue size; defaults overridable via GUCs */
+int			NSQueues = 64;
+int			SQueueSize = 64;
+
+/*
+ * Special tuple-length marker; presumably used by the long-tuple transfer
+ * protocol (sq_push_long_tuple/sq_pull_long_tuple) — usage not visible in
+ * this chunk, confirm against the rest of the file.
+ */
+#define LONG_TUPLE -42
+
+/* Per-consumer synchronization objects */
+typedef struct ConsumerSync
+{
+	LWLock	   *cs_lwlock;		/* Synchronize access to the consumer queue */
+	Latch		cs_latch;		/* The latch consumer is waiting on */
+} ConsumerSync;
+
+
+/*
+ * Shared memory structure to store synchronization info to access shared queues
+ */
+typedef struct SQueueSync
+{
+	void	   *queue;			/* NULL if not assigned to any queue */
+	Latch		sqs_producer_latch;	/* the latch producer is waiting on */
+	ConsumerSync sqs_consumer_sync[0];	/* actual length is MaxDataNodes-1,
+										 * which is not known at compile time */
+} SQueueSync;
+
+/* Consumer queue states (stored in ConsState.cs_status) */
+/* Both producer and consumer are working */
+#define CONSUMER_ACTIVE 0
+/* Producer has finished work successfully and waits for consumer */
+#define CONSUMER_EOF 1
+/* Producer encountered error and waits for consumer to disconnect */
+#define CONSUMER_ERROR 2
+/* Consumer is finished with the query, OK to unbind */
+#define CONSUMER_DONE 3
+
+
+/* State of a single consumer */
+typedef struct
+{
+	int			cs_pid;			/* Process id of the consumer session */
+	int			cs_node;		/* Node id of the consumer parent */
+	/*
+	 * Queue state.  The queue is a cyclic buffer storing tuples in the
+	 * DataRow format: each entry is the tuple length in host byte order
+	 * (the length is never sent over the network) followed by the tuple
+	 * bytes.
+	 */
+	int			cs_ntuples;		/* Number of tuples in the queue */
+	int			cs_status;		/* See CONSUMER_* defines above */
+	char	   *cs_qstart;		/* Where consumer queue begins */
+	int			cs_qlength;		/* The size of the consumer queue */
+	int			cs_qreadpos;	/* The read position in the consumer queue */
+	int			cs_qwritepos;	/* The write position in the consumer queue */
+#ifdef SQUEUE_STAT
+	long		stat_writes;
+	long		stat_reads;
+	long		stat_buff_writes;
+	long		stat_buff_reads;
+	long		stat_buff_returns;
+#endif
+} ConsState;
+
+/* Shared queue header */
+typedef struct SQueueHeader
+{
+	char		sq_key[SQUEUE_KEYSIZE]; /* Hash entry key should be at the
+								 * beginning of the hash entry */
+	int			sq_pid; 		/* Process id of the producer session */
+	int			sq_nodeid;		/* Node id of the producer parent */
+	SQueueSync *sq_sync;		/* Associated synchronization objects */
+	int			sq_refcnt;		/* Reference count to this entry */
+#ifdef SQUEUE_STAT
+	bool		stat_finish;
+	long		stat_paused;
+#endif
+	int			sq_nconsumers;	/* Number of consumers */
+	ConsState	sq_consumers[0];/* variable length array */
+} SQueueHeader;
+
+
+/*
+ * Hash table where all shared queues are stored. Key is the queue name, value
+ * is SharedQueue
+ */
+static HTAB *SharedQueues = NULL;
+/* LWLock array backing the per-consumer locks; allocated in shared memory */
+static LWLockPadded *SQueueLocks = NULL;
-	SharedQueueLocksTranche.name = "Shared Queue Locks";
-	SharedQueueLocksTranche.array_base = SQueueLocks;
-	SharedQueueLocksTranche.array_stride = sizeof(LWLockPadded);
-
+
+/*
+ * Pool of synchronization items
+ */
+static void *SQueueSyncs;
+
+/* Bytes occupied by one SQueueSync entry (header plus consumer array) */
+#define SQUEUE_SYNC_SIZE \
+	(sizeof(SQueueSync) + (MaxDataNodes-1) * sizeof(ConsumerSync))
+
+/* Address of the idx'th entry in the SQueueSyncs pool */
+#define GET_SQUEUE_SYNC(idx) \
+	((SQueueSync *) (((char *) SQueueSyncs) + (idx) * SQUEUE_SYNC_SIZE))
+
+/* Size of a SharedQueue header for the given number of consumers */
+#define SQUEUE_HDR_SIZE(nconsumers) \
+	(sizeof(SQueueHeader) + (nconsumers) * sizeof(ConsState))
+
+/*
+ * Free bytes remaining in a consumer's cyclic queue: the gap between the
+ * write and read positions (accounting for wrap-around) when the queue
+ * holds tuples, or the whole buffer when it is empty.
+ */
+#define QUEUE_FREE_SPACE(cstate) \
+	((cstate)->cs_ntuples > 0 ? \
+		((cstate)->cs_qreadpos >= (cstate)->cs_qwritepos ? \
+			(cstate)->cs_qreadpos - (cstate)->cs_qwritepos : \
+			(cstate)->cs_qlength + (cstate)->cs_qreadpos \
+				- (cstate)->cs_qwritepos) \
+		: (cstate)->cs_qlength)
+
+/* Copy len bytes from buf into the cyclic queue, wrapping at the buffer end */
+#define QUEUE_WRITE(cstate, len, buf) \
+	do \
+	{ \
+		if ((cstate)->cs_qwritepos + (len) <= (cstate)->cs_qlength) \
+		{ \
+			memcpy((cstate)->cs_qstart + (cstate)->cs_qwritepos, buf, len); \
+			(cstate)->cs_qwritepos += (len); \
+			if ((cstate)->cs_qwritepos == (cstate)->cs_qlength) \
+				(cstate)->cs_qwritepos = 0; \
+		} \
+		else \
+		{ \
+			int part = (cstate)->cs_qlength - (cstate)->cs_qwritepos; \
+			memcpy((cstate)->cs_qstart + (cstate)->cs_qwritepos, buf, part); \
+			(cstate)->cs_qwritepos = (len) - part; \
+			memcpy((cstate)->cs_qstart, (buf) + part, (cstate)->cs_qwritepos); \
+		} \
+	} while(0)
+
+
+/* Copy len bytes out of the cyclic queue into buf, wrapping at the buffer end */
+#define QUEUE_READ(cstate, len, buf) \
+	do \
+	{ \
+		if ((cstate)->cs_qreadpos + (len) <= (cstate)->cs_qlength) \
+		{ \
+			memcpy(buf, (cstate)->cs_qstart + (cstate)->cs_qreadpos, len); \
+			(cstate)->cs_qreadpos += (len); \
+			if ((cstate)->cs_qreadpos == (cstate)->cs_qlength) \
+				(cstate)->cs_qreadpos = 0; \
+		} \
+		else \
+		{ \
+			int part = (cstate)->cs_qlength - (cstate)->cs_qreadpos; \
+			memcpy(buf, (cstate)->cs_qstart + (cstate)->cs_qreadpos, part); \
+			(cstate)->cs_qreadpos = (len) - part; \
+			memcpy((buf) + part, (cstate)->cs_qstart, (cstate)->cs_qreadpos); \
+		} \
+	} while(0)
+
+
+/* Helpers for transferring tuples larger than the consumer queue */
+static bool sq_push_long_tuple(ConsState *cstate, RemoteDataRow datarow);
+static void sq_pull_long_tuple(ConsState *cstate, RemoteDataRow datarow,
+					ConsumerSync *sync);
+
+/*
+ * SharedQueuesInit
+ *	Initialize the reference on the shared memory hash table where all shared
+ *	queues are stored. Invoked during postmaster initialization.
+ */
+void
+SharedQueuesInit(void)
+{
+	HASHCTL		info;
+	int			hash_flags;
+	bool		found;
+
+	info.keysize = SQUEUE_KEYSIZE;
+	info.entrysize = SQUEUE_SIZE;
+
+	/*
+	 * Create hash table of fixed size to avoid running out of
+	 * SQueueSyncs
+	 */
+	hash_flags = HASH_ELEM | HASH_FIXED_SIZE;
+
+	SharedQueues = ShmemInitHash("Shared Queues", NUM_SQUEUES,
+								 NUM_SQUEUES, &info, hash_flags);
+
+	/*
+	 * Synchronization stuff is in separate structure because we need to
+	 * initialize all items now while in the postmaster.
+	 * The structure is actually an array, each array entry is assigned to
+	 * each instance of SharedQueue in use.
+	 */
+	SQueueSyncs = ShmemInitStruct("Shared Queues Sync",
+								  SQUEUE_SYNC_SIZE * NUM_SQUEUES,
+								  &found);
+	if (!found)
+	{
+		int	i, l;
+		int	nlocks = (NUM_SQUEUES * (MaxDataNodes-1));
+		bool	foundLocks;
+
+		/* Initialize LWLocks for queues */
+		SQueueLocks = (LWLockPadded *) ShmemInitStruct("Shared Queue Locks",
+								sizeof(LWLockPadded) * nlocks, &foundLocks);
+
+		/* either both syncs and locks, or none of them */
+		Assert(! foundLocks);
+
-		LWLockRegisterTranche(LWTRANCHE_SHARED_QUEUES, &SharedQueueLocksTranche);
+		/* Register the tranche in the main tranches array */
-		WaitLatch(&sqsync->sqs_consumer_sync[consumerIdx].cs_latch, WL_LATCH_SET | WL_POSTMASTER_DEATH, -1);
++		LWLockRegisterTranche(LWTRANCHE_SHARED_QUEUES, "Shared Queue Locks");
+
+		/* Hand one lock and one latch to every (queue, consumer) pair */
+		l = 0;
+		for (i = 0; i < NUM_SQUEUES; i++)
+		{
+			SQueueSync *sqs = GET_SQUEUE_SYNC(i);
+			int			j;
+
+			sqs->queue = NULL;
+			InitSharedLatch(&sqs->sqs_producer_latch);
+			for (j = 0; j < MaxDataNodes-1; j++)
+			{
+				InitSharedLatch(&sqs->sqs_consumer_sync[j].cs_latch);
+
+				LWLockInitialize(&(SQueueLocks[l]).lock,
+								 LWTRANCHE_SHARED_QUEUES);
+
+				sqs->sqs_consumer_sync[j].cs_lwlock = &(SQueueLocks[l++]).lock;
+			}
+		}
+	}
+}
+
+
+/*
+ * SharedQueueShmemSize
+ *	Estimate the shared-memory space needed by this module: the pool of
+ *	SQueueSync entries plus the shared queue hash table.
+ */
+Size
+SharedQueueShmemSize(void)
+{
+	Size		sqs_size;
+
+	sqs_size = mul_size(NUM_SQUEUES, SQUEUE_SYNC_SIZE);
+	return add_size(sqs_size, hash_estimate_size(NUM_SQUEUES, SQUEUE_SIZE));
+}
+
+/*
+ * SharedQueueAcquire
+ *	Reserve a named shared queue for future data exchange between processes
+ * supplying tuples to remote Datanodes. Invoked when a remote query plan is
+ * registered on the Datanode. The number of consumers is known at this point,
+ * so shared queue may be formatted during reservation. The first process that
+ * is acquiring the shared queue on the Datanode does the formatting.
+ */
+void
+SharedQueueAcquire(const char *sqname, int ncons)
+{
+	bool		found;
+	SharedQueue sq;
+	int			trycount = 0;
+
+	Assert(IsConnFromDatanode());
+	Assert(ncons > 0);
+
+/*
+ * Retry point: we come back here (after releasing SQueuesLock and sleeping)
+ * when the queue name is still bound by the producer of a previous execution.
+ */
+tryagain:
+	LWLockAcquire(SQueuesLock, LW_EXCLUSIVE);
+
+	sq = (SharedQueue) hash_search(SharedQueues, sqname, HASH_ENTER, &found);
+	if (!sq)
+		ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR),
+				errmsg("out of shared queue, please increase shared_queues")));
+
+	/* First process acquiring queue should format it */
+	if (!found)
+	{
+		int		qsize;   /* Size of one queue */
+		int		i;
+		char   *heapPtr;
+
+		elog(DEBUG1, "Create a new SQueue %s and format it for %d consumers", sqname, ncons);
+
+		/* Initialize the shared queue */
+		sq->sq_pid = 0;
+		sq->sq_nodeid = -1;
+		sq->sq_refcnt = 1;
+#ifdef SQUEUE_STAT
+		sq->stat_finish = false;
+		sq->stat_paused = 0;
+#endif
+		/*
+		 * Assign sync object (latches to wait on)
+		 * XXX We may want to optimize this and do smart search instead of
+		 * iterating the array.
+		 */
+		for (i = 0; i < NUM_SQUEUES; i++)
+		{
+			SQueueSync *sqs = GET_SQUEUE_SYNC(i);
+			if (sqs->queue == NULL)
+			{
+				sqs->queue = (void *) sq;
+				sq->sq_sync = sqs;
+				break;
+			}
+		}
+
+		/* hash table and sync pool have equal capacity, so one must be free */
+		Assert(sq->sq_sync != NULL);
+
+		sq->sq_nconsumers = ncons;
+		/* Determine queue size for a single consumer */
+		qsize = (SQUEUE_SIZE - SQUEUE_HDR_SIZE(sq->sq_nconsumers)) / sq->sq_nconsumers;
+
+		heapPtr = (char *) sq;
+		/* Skip header */
+		heapPtr += SQUEUE_HDR_SIZE(sq->sq_nconsumers);
+		/* Set up consumer queues */
+		for (i = 0; i < ncons; i++)
+		{
+			ConsState  *cstate = &(sq->sq_consumers[i]);
+
+			cstate->cs_pid = 0;
+			cstate->cs_node = -1;
+			cstate->cs_ntuples = 0;
+			cstate->cs_status = CONSUMER_ACTIVE;
+			cstate->cs_qstart = heapPtr;
+			cstate->cs_qlength = qsize;
+			cstate->cs_qreadpos = 0;
+			cstate->cs_qwritepos = 0;
+			heapPtr += qsize;
+		}
+		Assert(heapPtr <= ((char *) sq) + SQUEUE_SIZE);
+	}
+	else
+	{
+		int		i;
+
+		elog(DEBUG1, "Found an existing SQueue %s - (sq_pid:%d, sq_nodeid:%d,"
+				" sq_nconsumers:%d",
+				sqname, sq->sq_pid, sq->sq_nodeid, sq->sq_nconsumers);
+
+		for (i = 0; i < sq->sq_nconsumers; i++)
+		{
+			elog(DEBUG1, "SQueue %s, consumer (%d) information (cs_pid:%d,"
+					" cs_node:%d, cs_ntuples:%d, cs_status: %d",
+					sqname, i,
+					sq->sq_consumers[i].cs_pid,
+					sq->sq_consumers[i].cs_node,
+					sq->sq_consumers[i].cs_ntuples,
+					sq->sq_consumers[i].cs_status);
+		}
+
+		/*
+		 * A race condition is possible here. The previous operation might use
+		 * the same Shared Queue name if that was different execution of the
+		 * same Portal. So here we should try to determine if that Shared Queue
+		 * belongs to this execution or that is not-yet-released Shared Queue
+		 * of previous operation.
+		 * Though at the moment I am not sure, but I believe the BIND stage is
+		 * only happening after completion of ACQUIRE stage, so it is enough
+		 * to verify the producer (the very first node that binds) is not bound
+		 * yet. If it is bound, sleep for a moment and try again. No reason to
+		 * sleep longer, the producer needs just a quantum of CPU time to UNBIND
+		 * itself.
+		 */
+		if (sq->sq_pid != 0)
+		{
+			int		i;
+			bool	old_squeue = true;
+
+			PGXC_PARENT_NODE_ID = PGXCNodeGetNodeIdFromName(PGXC_PARENT_NODE,
+					&PGXC_PARENT_NODE_TYPE);
+			for (i = 0; i < sq->sq_nconsumers; i++)
+			{
+				ConsState *cstate = &(sq->sq_consumers[i]);
+				if (cstate->cs_node == PGXC_PARENT_NODE_ID)
+				{
+					SQueueSync *sqsync = sq->sq_sync;
+
+					LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock,
+								  LW_EXCLUSIVE);
+					/* verify status */
+					if (cstate->cs_status != CONSUMER_DONE)
+						old_squeue = false;
+
+					LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
+					break;
+				}
+			}
+			if (old_squeue)
+			{
+				LWLockRelease(SQueuesLock);
+				pg_usleep(1000000L);
+				elog(DEBUG1, "SQueue race condition, give the old producer to "
+						"finish the work and retry again");
+				trycount++;
+				if (trycount >= 10)
+					elog(ERROR, "Couldn't resolve SQueue race condition after"
+							" %d tries", trycount);
+				goto tryagain;
+			}
+		}
+		sq->sq_refcnt++;
+	}
+	LWLockRelease(SQueuesLock);
+}
+
+
+/*
+ * SharedQueueBind
+ *	Bind to the shared queue specified by sqname either as a consumer or as a
+ * producer. The first process that binds to the shared queue becomes a producer
+ * and receives the consumer map, others become consumers and receive queue
+ * indexes to read tuples from.
+ * The consNodes int list identifies the nodes involved in the current step.
+ * The distNodes int list describes result distribution of the current step.
+ * The consNodes should be a subset of distNodes.
+ * The myindex and consMap parameters are binding results. If caller process
+ * is bound to the query as a producer myindex is set to -1 and index of the
+ * each consumer (order number in the consNodes) is stored to the consMap array
+ * at the position of the node in the distNodes. For the producer node
+ * SQ_CONS_SELF is stored, nodes from distNodes list which are not members of
+ * consNodes or if it was reported they won't read results, they are represented
+ * as SQ_CONS_NONE.
+ */
+SharedQueue
+SharedQueueBind(const char *sqname, List *consNodes,
+				List *distNodes, int *myindex, int *consMap)
+{
+	bool		found;
+	SharedQueue sq;
+
+	LWLockAcquire(SQueuesLock, LW_EXCLUSIVE);
+
+	PGXC_PARENT_NODE_ID = PGXCNodeGetNodeIdFromName(PGXC_PARENT_NODE,
+			&PGXC_PARENT_NODE_TYPE);
+	sq = (SharedQueue) hash_search(SharedQueues, sqname, HASH_FIND, &found);
+	if (!found)
+		elog(PANIC, "Shared queue %s not found", sqname);
+	if (sq->sq_pid == 0)
+	{
+		/* Producer */
+		int			i;
+		ListCell   *lc;
+
+		Assert(consMap);
+
+		elog(DEBUG1, "Bind node %s to squeue of step %s as a producer",
+			 PGXC_PARENT_NODE, sqname);
+
+		/* Initialize the shared queue */
+		sq->sq_pid = MyProcPid;
+		sq->sq_nodeid = PGXC_PARENT_NODE_ID;
+		OwnLatch(&sq->sq_sync->sqs_producer_latch);
+
+		i = 0;
+		foreach(lc, distNodes)
+		{
+			int			nodeid = lfirst_int(lc);
+
+			/*
+			 * Producer won't go to shared queue to hand off tuple to itself,
+			 * so we do not need to create queue for that entry.
+			 */
+			if (nodeid == PGXC_PARENT_NODE_ID)
+			{
+				/* Producer must be in the consNodes list */
+				Assert(list_member_int(consNodes, nodeid));
+				elog(DEBUG1, "SQueue %s consumer @%d is set to self",
+					 sqname, i);
+				consMap[i++] = SQ_CONS_SELF;
+			}
+			/*
+			 * This node may connect as a consumer, store consumer id to the map
+			 * and initialize consumer queue
+			 */
+			else if (list_member_int(consNodes, nodeid))
+			{
+				ConsState  *cstate;
+				int			j;
+
+				for (j = 0; j < sq->sq_nconsumers; j++)
+				{
+					cstate = &(sq->sq_consumers[j]);
+					if (cstate->cs_node == nodeid)
+					{
+						/* The process already reported that queue won't read */
+						elog(DEBUG1, "Node %d of SQueue %s is released already "
+								"at consumer %d, cs_status %d",
+								nodeid, sqname, j, cstate->cs_status);
+						consMap[i++] = SQ_CONS_NONE;
+						break;
+					}
+					else if (cstate->cs_node == -1)
+					{
+						/* found unused slot, assign the consumer to it */
+						elog(DEBUG1, "Node %d of SQueue %s is bound at consumer "
+								"%d, cs_status %d",
+								nodeid, sqname, j, cstate->cs_status);
+						consMap[i++] = j;
+						cstate->cs_node = nodeid;
+						break;
+					}
+				}
+			}
+			/*
+			 * Consumer from this node won't ever connect as upper level step
+			 * is not executed on the node. Discard results that may go to that
+			 * node, if any.
+			 */
+			else
+			{
+				elog(DEBUG1, "Node %d of SQueue %s is not in the "
+						"redistribution list and hence would never connect",
+						nodeid, sqname);
+				consMap[i++] = SQ_CONS_NONE;
+			}
+		}
+
+		if (myindex)
+			*myindex = -1;
+
+		/*
+		 * Increment the refcnt only when producer binds. This is a bit
+		 * asymmetrical, but the way things are currently setup, a consumer
+		 * though calls SharedQueueBind, never calls SharedQueueUnBind. The
+		 * unbinding is done only by the producer after it waits for all
+		 * consumers to finish.
+		 *
+		 * XXX This ought to be fixed someday to simplify things in Shared
+		 * Queue handling
+		 */
+		sq->sq_refcnt++;
+	}
+	else
+	{
+		int			nconsumers;
+		ListCell   *lc;
+
+		/* Producer should be different process */
+		Assert(sq->sq_pid != MyProcPid);
+
+		elog(DEBUG1, "SQueue %s has a bound producer from node %d, pid %d",
+			 sqname, sq->sq_nodeid, sq->sq_pid);
+		elog(DEBUG1, "Bind node %s to SQueue %s as a consumer %d", PGXC_PARENT_NODE, sqname, sq->sq_pid);
+
+		/* Sanity checks */
+		Assert(myindex);
+		*myindex = -1;
+		/* Ensure the passed in consumer list matches the queue */
+		nconsumers = 0;
+		foreach (lc, consNodes)
+		{
+			int			nodeid = lfirst_int(lc);
+			int			i;
+
+			if (nodeid == sq->sq_nodeid)
+			{
+				/*
+				 * This node is a producer, it should be in the consumer list,
+				 * but no consumer queue for it
+				 */
+				continue;
+			}
+
+			/* find consumer queue for the node */
+			for (i = 0; i < sq->sq_nconsumers; i++)
+			{
+				ConsState *cstate = &(sq->sq_consumers[i]);
+				if (cstate->cs_node == nodeid)
+				{
+					nconsumers++;
+					if (nodeid == PGXC_PARENT_NODE_ID)
+					{
+						/*
+						 * Current consumer queue is that from which current
+						 * session will be sending out data rows.
+						 * Initialize the queue to let producer know we are
+						 * here and running.
+						 */
+						SQueueSync *sqsync = sq->sq_sync;
+
+						elog(DEBUG1, "SQueue %s, consumer node %d is same as "
+								"the parent node", sqname, nodeid);
+						LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock,
+									  LW_EXCLUSIVE);
+						/* Make sure no consumer bound to the queue already */
+						Assert(cstate->cs_pid == 0);
+						/* make sure the queue is ready to read */
+						Assert(cstate->cs_qlength > 0);
+						/* verify status */
+						if (cstate->cs_status == CONSUMER_ERROR ||
+								cstate->cs_status == CONSUMER_DONE)
+						{
+							int status = cstate->cs_status;
+							/*
+							 * Producer failed by the time the consumer connect.
+							 * Change status to "Done" to allow producer unbind
+							 * and report problem to the parent.
+							 */
+							cstate->cs_status = CONSUMER_DONE;
+							/* Producer may be waiting for status change */
+							SetLatch(&sqsync->sqs_producer_latch);
+							LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
+							LWLockRelease(SQueuesLock);
+							ereport(ERROR,
+									(errcode(ERRCODE_PRODUCER_ERROR),
+									 errmsg("Producer failed while we were waiting - status was %d", status)));
+						}
+						/*
+						 * Any other status is acceptable. Normally it would be
+						 * ACTIVE. If producer have had only few rows to emit
+						 * and it is already done the status would be EOF.
+						 */
+
+						/* Set up the consumer */
+						cstate->cs_pid = MyProcPid;
+
+						elog(DEBUG1, "SQueue %s, consumer at %d, status %d - "
+								"setting up consumer node %d, pid %d",
+								sqname, i, cstate->cs_status, cstate->cs_node,
+								cstate->cs_pid);
+						/* return found index */
+						*myindex = i;
+						OwnLatch(&sqsync->sqs_consumer_sync[i].cs_latch);
+						LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
+					}
+					else
+						elog(DEBUG1, "SQueue %s, consumer node %d is not same as "
+								"the parent node %d", sqname, nodeid,
+								PGXC_PARENT_NODE_ID);
+					break;
+				}
+			}
+			/* Check if entry was found and therefore loop was broken */
+			Assert(i < sq->sq_nconsumers);
+		}
+		/* Check the consumer is found */
+		Assert(*myindex != -1);
+		Assert(sq->sq_nconsumers == nconsumers);
+	}
+	LWLockRelease(SQueuesLock);
+	return sq;
+}
+
+
+/*
+ * Push data from the local tuplestore to the queue for specified consumer.
+ * Return true if succeeded and the tuplestore is now empty. Return false
+ * if specified queue has not enough room for the next tuple.
+ */
+static bool
+SharedQueueDump(SharedQueue squeue, int consumerIdx,
+				TupleTableSlot *tmpslot, Tuplestorestate *tuplestore)
+{
+	ConsState  *cstate = &(squeue->sq_consumers[consumerIdx]);
+
+	elog(DEBUG3, "Dumping SQueue %s data for consumer at %d, "
+			"producer - node %d, pid %d, "
+			"consumer - node %d, pid %d, status %d",
+			squeue->sq_key, consumerIdx,
+			squeue->sq_nodeid, squeue->sq_pid,
+			cstate->cs_node, cstate->cs_pid, cstate->cs_status);
+
+	/* discard stored data if consumer is not active */
+	if (cstate->cs_status != CONSUMER_ACTIVE)
+	{
+		elog(DEBUG3, "Discarding SQueue %s data for consumer at %d not active",
+				squeue->sq_key, consumerIdx);
+		tuplestore_clear(tuplestore);
+		return true;
+	}
+
+	/*
+	 * Tuplestore does not clear eof flag on the active read pointer, causing
+	 * the store to remain in EOF state once reached when there is a single
+	 * read pointer. We do not want behavior like this and workaround by using
+	 * secondary read pointer. Primary read pointer (0) is active when we are
+	 * writing to the tuple store, also it is used to bookmark current position
+	 * when reading to be able to roll back and return just read tuple back to
+	 * the store if we failed to write it out to the queue.
+	 * Secondary read pointer is for reading, and its eof flag is cleared if a
+	 * tuple is written to the store.
+	 */
+	tuplestore_select_read_pointer(tuplestore, 1);
+
+	/* If we have something in the tuplestore try to push this to the queue */
+	while (!tuplestore_ateof(tuplestore))
+	{
+		/* save position */
+		tuplestore_copy_read_pointer(tuplestore, 1, 0);
+
+		/* Try to get next tuple to the temporary slot */
+		if (!tuplestore_gettupleslot(tuplestore, true, false, tmpslot))
+		{
+			/* false means the tuplestore in EOF state */
+			elog(DEBUG3, "Tuplestore for SQueue %s returned EOF",
+					squeue->sq_key);
+			break;
+		}
+#ifdef SQUEUE_STAT
+		cstate->stat_buff_reads++;
+#endif
+
+		/* The slot should contain a data row */
+		Assert(tmpslot->tts_datarow);
+
+		/* check if queue has enough room for the data */
+		if (QUEUE_FREE_SPACE(cstate) < sizeof(int) + tmpslot->tts_datarow->msglen)
+		{
+			/*
+			 * If stored tuple does not fit empty queue we are entering special
+			 * procedure of pushing it through.
+			 */
+			if (cstate->cs_ntuples <= 0)
+			{
+				/*
+				 * If pushing through is completed wake up and proceed to next
+				 * tuple, there could be enough space in the consumer queue to
+				 * fit more.
+				 */
+				bool done = sq_push_long_tuple(cstate, tmpslot->tts_datarow);
+
+				/*
+				 * sq_push_long_tuple writes some data anyway, so wake up
+				 * the consumer.
+				 */
+				SetLatch(&squeue->sq_sync->sqs_consumer_sync[consumerIdx].cs_latch);
+
+				if (done)
+					continue;
+			}
+
+			/* Restore read position to get same tuple next time */
+			tuplestore_copy_read_pointer(tuplestore, 0, 1);
+#ifdef SQUEUE_STAT
+			cstate->stat_buff_returns++;
+#endif
+
+			/* We might advance the mark, try to truncate */
+			tuplestore_trim(tuplestore);
+
+			/* Prepare for writing, set proper read pointer */
+			tuplestore_select_read_pointer(tuplestore, 0);
+
+			/* ... and exit */
+			return false;
+		}
+		else
+		{
+			/* Enqueue data */
+			QUEUE_WRITE(cstate, sizeof(int), (char *) &tmpslot->tts_datarow->msglen);
+			QUEUE_WRITE(cstate, tmpslot->tts_datarow->msglen, tmpslot->tts_datarow->msg);
+
+			/* Increment tuple counter. If it was 0 consumer may be waiting for
+			 * data so try to wake it up */
+			if ((cstate->cs_ntuples)++ == 0)
+				SetLatch(&squeue->sq_sync->sqs_consumer_sync[consumerIdx].cs_latch);
+		}
+	}
+
+	/* Remove rows we have just read */
+	tuplestore_trim(tuplestore);
+
+	/* prepare for writes, set read pointer 0 as active */
+	tuplestore_select_read_pointer(tuplestore, 0);
+
+	return true;
+}
+
+
+/*
+ * SharedQueueWrite
+ *	  Write data from the specified slot to the specified queue. If the
+ *	  tuplestore passed in has tuples try and write them first.
+ *	  If specified queue is full the tuple is put into the tuplestore which is
+ *	  created if necessary
+ */
+void
+SharedQueueWrite(SharedQueue squeue, int consumerIdx,
+				 TupleTableSlot *slot, Tuplestorestate **tuplestore,
+				 MemoryContext tmpcxt)
+{
+	ConsState  *cstate = &(squeue->sq_consumers[consumerIdx]);
+	SQueueSync *sqsync = squeue->sq_sync;
+	LWLockId	clwlock = sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock;
+	RemoteDataRow datarow;
+	bool		free_datarow;
+
+	Assert(cstate->cs_qlength > 0);
+
+	LWLockAcquire(clwlock, LW_EXCLUSIVE);
+
+#ifdef SQUEUE_STAT
+	cstate->stat_writes++;
+#endif
+
+	/*
+	 * If we have anything in the local storage try to dump this first,
+	 * but do not try to dump often to avoid overhead of creating temporary
+	 * tuple slot. It should be OK to dump if queue is half empty.
+	 */
+	if (*tuplestore)
+	{
+		bool		dumped = false;
+
+		if (QUEUE_FREE_SPACE(cstate) > cstate->cs_qlength / 2)
+		{
+			TupleTableSlot *tmpslot;
+
+			tmpslot = MakeSingleTupleTableSlot(slot->tts_tupleDescriptor);
+			dumped = SharedQueueDump(squeue, consumerIdx, tmpslot, *tuplestore);
+			ExecDropSingleTupleTableSlot(tmpslot);
+		}
+		if (!dumped)
+		{
+			/* No room to even dump local store, append the tuple to the store
+			 * and exit */
+#ifdef SQUEUE_STAT
+			cstate->stat_buff_writes++;
+#endif
+			LWLockRelease(clwlock);
+			tuplestore_puttupleslot(*tuplestore, slot);
+			return;
+		}
+	}
+
+	/* Get datarow from the tuple slot */
+	if (slot->tts_datarow)
+	{
+		/*
+		 * The function ExecCopySlotDatarow always make a copy, but here we
+		 * can optimize and avoid copying the data, so we just get the reference
+		 */
+		datarow = slot->tts_datarow;
+		free_datarow = false;
+	}
+	else
+	{
+		datarow = ExecCopySlotDatarow(slot, tmpcxt);
+		free_datarow = true;
+	}
+	if (QUEUE_FREE_SPACE(cstate) < sizeof(int) + datarow->msglen)
+	{
+		/* Not enough room, store tuple locally */
+		LWLockRelease(clwlock);
+
+		/* clean up */
+		if (free_datarow)
+			pfree(datarow);
+
+		/* Create tuplestore if does not exist */
+		if (*tuplestore == NULL)
+		{
+			int			ptrno;
+			char		storename[64];
+
+#ifdef SQUEUE_STAT
+			elog(DEBUG1, "Start buffering %s node %d, %d tuples in queue, %ld writes and %ld reads so far",
+					squeue->sq_key, cstate->cs_node, cstate->cs_ntuples, cstate->stat_writes, cstate->stat_reads);
+#endif
+			*tuplestore = tuplestore_begin_datarow(false, work_mem, tmpcxt);
+			/* We need it to be able to remember/restore the read position */
+			snprintf(storename, 64, "%s node %d", squeue->sq_key, cstate->cs_node);
+			tuplestore_collect_stat(*tuplestore, storename);
+			/*
+			 * Allocate a second read pointer to read from the store. We know
+			 * it must have index 1, so needn't store that.
+			 */
+			ptrno = tuplestore_alloc_read_pointer(*tuplestore, 0);
+			Assert(ptrno == 1);
+		}
+
+#ifdef SQUEUE_STAT
+		cstate->stat_buff_writes++;
+#endif
+		/* Append the slot to the store... */
+		tuplestore_puttupleslot(*tuplestore, slot);
+
+		/* ... and exit */
+		return;
+	}
+	else
+	{
+		/* do not supply data to closed consumer */
+		if (cstate->cs_status == CONSUMER_ACTIVE)
+		{
+			elog(DEBUG3, "SQueue %s, consumer is active, writing data",
+					squeue->sq_key);
+			/* write out the data */
+			QUEUE_WRITE(cstate, sizeof(int), (char *) &datarow->msglen);
+			QUEUE_WRITE(cstate, datarow->msglen, datarow->msg);
+			/* Increment tuple counter. If it was 0 consumer may be waiting for
+			 * data so try to wake it up */
+			if ((cstate->cs_ntuples)++ == 0)
+				SetLatch(&sqsync->sqs_consumer_sync[consumerIdx].cs_latch);
+		}
+		else
+			elog(DEBUG2, "SQueue %s, consumer is not active, no need to supply data",
+					squeue->sq_key);
+
+		/* clean up */
+		if (free_datarow)
+			pfree(datarow);
+	}
+	LWLockRelease(clwlock);
+}
+
+
+/*
+ * SharedQueueRead
+ * Read one data row from the specified queue into the provided tupleslot.
+ * Returns true if EOF is reached on the specified consumer queue.
+ * If the queue is empty, behavior is controlled by the canwait parameter.
+ * If canwait is true it is waiting while row is available or EOF or error is
+ * reported, if it is false, the slot is emptied and false is returned.
+ */
+bool
+SharedQueueRead(SharedQueue squeue, int consumerIdx,
+ TupleTableSlot *slot, bool canwait)
+{
+ ConsState *cstate = &(squeue->sq_consumers[consumerIdx]);
+ SQueueSync *sqsync = squeue->sq_sync;
+ RemoteDataRow datarow;
+ int datalen;
+
+ Assert(cstate->cs_qlength > 0);
+
+ LWLockAcquire(sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock, LW_EXCLUSIVE);
+
+ Assert(cstate->cs_status != CONSUMER_DONE);
+ while (cstate->cs_ntuples <= 0)
+ {
+ elog(DEBUG3, "SQueue %s, consumer node %d, pid %d, status %d - "
+ "no tuples in the queue", squeue->sq_key,
+ cstate->cs_node, cstate->cs_pid, cstate->cs_status);
+
+ if (cstate->cs_status == CONSUMER_EOF)
+ {
+ elog(DEBUG1, "SQueue %s, consumer node %d, pid %d, status %d - "
+ "EOF marked. Informing produer by setting CONSUMER_DONE",
+ squeue->sq_key,
+ cstate->cs_node, cstate->cs_pid, cstate->cs_status);
+
+ /* Inform producer the consumer have done the job */
+ cstate->cs_status = CONSUMER_DONE;
+ /* no need to receive notifications */
+ DisownLatch(&sqsync->sqs_consumer_sync[consumerIdx].cs_latch);
+ /* producer done the job and no more rows expected, clean up */
+ LWLockRelease(sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock);
+ ExecClearTuple(slot);
+ /*
+ * notify the producer, it may be waiting while consumers
+ * are finishing
+ */
+ SetLatch(&sqsync->sqs_producer_latch);
+ return true;
+ }
+ else if (cstate->cs_status == CONSUMER_ERROR)
+ {
+ elog(DEBUG1, "SQueue %s, consumer node %d, pid %d, status %d - "
+ "CONSUMER_ERROR set",
+ squeue->sq_key,
+ cstate->cs_node, cstate->cs_pid, cstate->cs_status);
+ /*
+ * There was a producer error while waiting.
+ * Release all the locks and report problem to the caller.
+ */
+ LWLockRelease(sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock);
+ /*
+ * Reporting error will cause transaction rollback and clean up of
+ * all portals. We can not mark the portal so it does not access
+ * the queue so we should hold it for now. We should prevent queue
+ * unbound in between.
+ */
+ ereport(ERROR,
+ (errcode(ERRCODE_PRODUCER_ERROR),
+ errmsg("Failed to read from SQueue %s, "
+ "consumer (node %d, pid %d, status %d) - "
+ "CONSUMER_ERROR set",
+ squeue->sq_key,
+ cstate->cs_node, cstate->cs_pid, cstate->cs_status)));
+ }
+ if (canwait)
+ {
+ /* Prepare waiting on empty buffer */
+ ResetLatch(&sqsync->sqs_consumer_sync[consumerIdx].cs_latch);
+ LWLockRelease(sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock);
+
+ elog(DEBUG3, "SQueue %s, consumer (node %d, pid %d, status %d) - "
+ "no queued tuples to read, waiting "
+ "for producer to produce more data",
+ squeue->sq_key,
+ cstate->cs_node, cstate->cs_pid, cstate->cs_status);
+
+ /* Wait for notification about available info */
- 10000L);
++ WaitLatch(&sqsync->sqs_consumer_sync[consumerIdx].cs_latch,
++ WL_LATCH_SET | WL_POSTMASTER_DEATH, -1,
++ WAIT_EVENT_MQ_INTERNAL);
+ /* got the notification, restore lock and try again */
+ LWLockAcquire(sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock, LW_EXCLUSIVE);
+ }
+ else
+ {
+ LWLockRelease(sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock);
+
+ elog(DEBUG3, "SQueue %s, consumer (node %d, pid %d, status %d) - "
+ "no queued tuples to read, caller can't wait ",
+ squeue->sq_key,
+ cstate->cs_node, cstate->cs_pid, cstate->cs_status);
+ ExecClearTuple(slot);
+ return false;
+ }
+ }
+
+ elog(DEBUG3, "SQueue %s, consumer (node %d, pid %d, status %d) - "
+ "%d queued tuples to read",
+ squeue->sq_key,
+ cstate->cs_node, cstate->cs_pid, cstate->cs_status,
+ cstate->cs_ntuples);
+
+ /* have at least one row, read it in and store to slot */
+ QUEUE_READ(cstate, sizeof(int), (char *) (&datalen));
+ datarow = (RemoteDataRow) palloc(sizeof(RemoteDataRowData) + datalen);
+ datarow->msgnode = InvalidOid;
+ datarow->msglen = datalen;
+ if (datalen > cstate->cs_qlength - sizeof(int))
+ sq_pull_long_tuple(cstate, datarow,
+ &sqsync->sqs_consumer_sync[consumerIdx]);
+ else
+ QUEUE_READ(cstate, datalen, datarow->msg);
+ ExecStoreDataRowTuple(datarow, slot, true);
+ (cstate->cs_ntuples)--;
+#ifdef SQUEUE_STAT
+ cstate->stat_reads++;
+#endif
+ /* sanity check */
+ Assert((cstate->cs_ntuples == 0) == (cstate->cs_qreadpos == cstate->cs_qwritepos));
+ LWLockRelease(sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock);
+ return false;
+}
+
+
+/*
+ * Mark specified consumer as closed discarding all input which may already be
+ * in the queue.
+ * If consumerIdx is -1 the producer is cleaned up. Producer need to wait for
+ * consumers before releasing the queue, so if there are yet active consumers,
+ * they are notified about the problem and they should disconnect from the
+ * queue as soon as possible.
+ */
+void
+SharedQueueReset(SharedQueue squeue, int consumerIdx)
+{
+ SQueueSync *sqsync = squeue->sq_sync;
+
+ /*
+ * We may have already cleaned up, but then an abort signalled us to clean up.
+ * Avoid segmentation fault on abort
+ */
+ if (!sqsync)
+ return;
+
+ if (consumerIdx == -1)
+ {
+ int i;
+
+ elog(DEBUG1, "SQueue %s, requested to reset producer node %d, pid %d - "
+ "Now also resetting all consumers",
+ squeue->sq_key, squeue->sq_nodeid, squeue->sq_pid);
+
+ /* check queue states */
+ for (i = 0; i < squeue->sq_nconsumers; i++)
+ {
+ ConsState *cstate = &squeue->sq_consumers[i];
+ LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock, LW_EXCLUSIVE);
+
+ /*
+ * If producer being reset before it is reached the end of the
+ * result set, that means consumer probably would not get all
+ * the rows and it should report error if the consumer's parent ever
+ * try to read. No need to raise error if consumer is just closed.
+ * If consumer is done already we do not need to change the status.
+ */
+ if (cstate->cs_status != CONSUMER_EOF &&
+ cstate->cs_status != CONSUMER_DONE)
+ {
+ elog(DEBUG1, "SQueue %s, reset consumer at %d, "
+ "consumer node %d, pid %d, status %d - marking CONSUMER_ERROR",
+ squeue->sq_key, i, cstate->cs_node, cstate->cs_pid,
+ cstate->cs_status);
+
+ cstate->cs_status = CONSUMER_ERROR;
+ /* discard tuples which may already be in the queue */
+ cstate->cs_ntuples = 0;
+ /* keep consistent with cs_ntuples*/
+ cstate->cs_qreadpos = cstate->cs_qwritepos = 0;
+
+ /* wake up consumer if it is sleeping */
+ SetLatch(&sqsync->sqs_consumer_sync[i].cs_latch);
+ }
+ LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
+ }
+ }
+ else
+ {
+ ConsState *cstate = &(squeue->sq_consumers[consumerIdx]);
+
+ elog(DEBUG1, "SQueue %s, requested to reset consumer at %d, "
+ "consumer node %d, pid %d, status %d",
+ squeue->sq_key, consumerIdx, cstate->cs_node, cstate->cs_pid,
+ cstate->cs_status);
+
+ LWLockAcquire(sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock,
+ LW_EXCLUSIVE);
+
+ if (cstate->cs_status != CONSUMER_DONE)
+ {
+ elog(DEBUG1, "SQueue %s, consumer at %d, "
+ "consumer node %d, pid %d, status %d - marking CONSUMER_DONE",
+ squeue->sq_key, consumerIdx, cstate->cs_node, cstate->cs_pid,
+ cstate->cs_status);
+
+ /* Inform producer the consumer have done the job */
+ cstate->cs_status = CONSUMER_DONE;
+ /*
+ * No longer need to receive notifications. If consumer has not
+ * connected the latch is not owned
+ */
+ if (cstate->cs_pid > 0)
+ DisownLatch(&sqsync->sqs_consumer_sync[consumerIdx].cs_latch);
+ /*
+ * notify the producer, it may be waiting while consumers
+ * are finishing
+ */
+ SetLatch(&sqsync->sqs_producer_latch);
+ }
+
+ LWLockRelease(sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock);
+ }
+}
+
+
+/*
+ * Assume that not yet connected consumers won't connect and reset them.
+ * That should allow to Finish/UnBind the queue gracefully and prevent
+ * producer hanging.
+ */
+void
+SharedQueueResetNotConnected(SharedQueue squeue)
+{
+ SQueueSync *sqsync = squeue->sq_sync;
+ int result = 0;
+ int i;
+
+ elog(DEBUG1, "SQueue %s, resetting all unconnected consumers",
+ squeue->sq_key);
+
+ /* check queue states */
+ for (i = 0; i < squeue->sq_nconsumers; i++)
+ {
+ ConsState *cstate = &squeue->sq_consumers[i];
+ LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock, LW_EXCLUSIVE);
+
+ if (cstate->cs_pid == 0 &&
+ cstate->cs_status != CONSUMER_EOF &&
+ cstate->cs_status != CONSUMER_DONE)
+ {
+ result++;
+ elog(DEBUG1, "SQueue %s, consumer at %d, consumer node %d, pid %d, "
+ "status %d is cancelled - marking CONSUMER_ERROR", squeue->sq_key, i,
+ cstate->cs_node, cstate->cs_pid, cstate->cs_status);
+ cstate->cs_status = CONSUMER_ERROR;
+ /* discard tuples which may already be in the queue */
+ cstate->cs_ntuples = 0;
+ /* keep consistent with cs_ntuples*/
+ cstate->cs_qreadpos = cstate->cs_qwritepos = 0;
+
+ /* wake up consumer if it is sleeping */
+ SetLatch(&sqsync->sqs_consumer_sync[i].cs_latch);
+ }
+ LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
+ }
+}
+
+
+/*
+ * Determine if producer can safely pause work.
+ * The producer can pause if all consumers have enough data to read while
+ * producer is sleeping.
+ * Obvoius case when the producer can not pause if at least one queue is empty.
+ */
+bool
+SharedQueueCanPause(SharedQueue squeue)
+{
+ SQueueSync *sqsync = squeue->sq_sync;
+ bool result = true;
+ int usedspace;
+ int ncons;
+ int i;
+
+ usedspace = 0;
+ ncons = 0;
+ for (i = 0; result && (i < squeue->sq_nconsumers); i++)
+ {
+ ConsState *cstate = &(squeue->sq_consumers[i]);
+ LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock, LW_SHARED);
+ /*
+ * Count only consumers that may be blocked.
+ * If producer has finished scanning and pushing local buffers some
+ * consumers may be finished already.
+ */
+ if (cstate->cs_status == CONSUMER_ACTIVE)
+ {
+ /* can not pause if some queue is empty */
+ result = (cstate->cs_ntuples > 0);
+ usedspace += (cstate->cs_qwritepos > cstate->cs_qreadpos ?
+ cstate->cs_qwritepos - cstate->cs_qreadpos :
+ cstate->cs_qlength + cstate->cs_qwritepos
+ - cstate->cs_qreadpos);
+ ncons++;
+ }
+ LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
+ }
+
+ if (!ncons)
+ return false;
+
+ /*
+ * Pause only if average consumer queue is full more then on half.
+ */
+ if (result)
+ result = (usedspace / ncons > squeue->sq_consumers[0].cs_qlength / 2);
+#ifdef SQUEUE_STAT
+ if (result)
+ squeue->stat_paused++;
+#endif
+ return result;
+}
+
+
+int
+SharedQueueFinish(SharedQueue squeue, TupleDesc tupDesc,
+ Tuplestorestate **tuplestore)
+{
+ SQueueSync *sqsync = squeue->sq_sync;
+ TupleTableSlot *tmpslot = NULL;
+ int i;
+ int nstores = 0;
+
+ elog(DEBUG1, "SQueue %s, finishing the SQueue - producer node %d, "
+ "pid %d, nconsumers %d", squeue->sq_key, squeue->sq_nodeid,
+ squeue->sq_pid, squeue->sq_nconsumers);
+
+ for (i = 0; i < squeue->sq_nconsumers; i++)
+ {
+ ConsState *cstate = &squeue->sq_consumers[i];
+ LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock, LW_EXCLUSIVE);
+#ifdef SQUEUE_STAT
+ if (!squeue->stat_finish)
+ elog(DEBUG1, "Finishing %s node %d, %ld writes and %ld reads so far, %ld buffer writes, %ld buffer reads, %ld tuples returned to buffer",
+ squeue->sq_key, cstate->cs_node, cstate->stat_writes, cstate->stat_reads, cstate->stat_buff_writes, cstate->stat_buff_reads, cstate->stat_buff_returns);
+#endif
+ elog(DEBUG1, "SQueue %s finishing, consumer at %d, consumer node %d, pid %d, "
+ "status %d", squeue->sq_key, i,
+ cstate->cs_node, cstate->cs_pid, cstate->cs_status);
+ /*
+ * if the tuplestore has data and consumer queue has space for some
+ * try to push rows to the queue. We do not want to do that often
+ * to avoid overhead of temp tuple slot allocation.
+ */
+ if (tuplestore[i])
+ {
+ /* If the consumer is not reading just destroy the tuplestore */
+ if (cstate->cs_status != CONSUMER_ACTIVE)
+ {
+ tuplestore_end(tuplestore[i]);
+ tuplestore[i] = NULL;
+ }
+ else
+ {
+ nstores++;
+ /*
+ * Attempt to dump tuples from the store require tuple slot
+ * allocation, that is not a cheap operation, so proceed if
+ * target queue has enough space.
+ */
+ if (QUEUE_FREE_SPACE(cstate) > cstate->cs_qlength / 2)
+ {
+ if (tmpslot == NULL)
+ tmpslot = MakeSingleTupleTableSlot(tupDesc);
+ if (SharedQueueDump(squeue, i, tmpslot, tuplestore[i]))
+ {
+ tuplestore_end(tuplestore[i]);
+ tuplestore[i] = NULL;
+ cstate->cs_status = CONSUMER_EOF;
+ nstores--;
+ }
+ /* Consumer may be sleeping, wake it up */
+ SetLatch(&sqsync->sqs_consumer_sync[i].cs_latch);
+ }
+ }
+ }
+ else
+ {
+ /* it set eof if not yet set */
+ if (cstate->cs_status == CONSUMER_ACTIVE)
+ {
+ cstate->cs_status = CONSUMER_EOF;
+ SetLatch(&sqsync->sqs_consumer_sync[i].cs_latch);
+ }
+ }
+ LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
+ }
+ if (tmpslot)
+ ExecDropSingleTupleTableSlot(tmpslot);
+
+#ifdef SQUEUE_STAT
+ squeue->stat_finish = true;
+#endif
+
+ return nstores;
+}
+
+
+/*
+ * SharedQueueUnBind
+ * Cancel binding of current process to the shared queue. If the process
+ * was a producer it should pass in the array of tuplestores where tuples were
+ * queueed when it was unsafe to block. If any of the tuplestores holds data
+ * rows they are written to the queue. The length of the array of the
+ * tuplestores should be the same as the count of consumers. It is OK if some
+ * entries are NULL. When a consumer unbinds from the shared queue it should
+ * set the tuplestore parameter to NULL.
+ */
+void
+SharedQueueUnBind(SharedQueue squeue, bool failed)
+{
+ SQueueSync *sqsync = squeue->sq_sync;
+ int wait_result = 0;
+ int i = 0;
+ int consumer_running = 0;
+
+ elog(DEBUG1, "SQueue %s, unbinding the SQueue (failed: %c) - producer node %d, "
+ "pid %d, nconsumers %d", squeue->sq_key, failed ? 'T' : 'F',
+ squeue->sq_nodeid, squeue->sq_pid, squeue->sq_nconsumers);
+
+CHECK:
+
+ /* loop while there are active consumers */
+ for (;;)
+ {
+ int i;
+ int c_count = 0;
+ int unbound_count = 0;
+
+ /* check queue states */
+ for (i = 0; i < squeue->sq_nconsumers; i++)
+ {
+ ConsState *cstate = &squeue->sq_consumers[i];
+ LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock, LW_EXCLUSIVE);
+
+ elog(DEBUG1, "SQueue %s unbinding, check consumer at %d, consumer node %d, pid %d, "
+ "status %d", squeue->sq_key, i,
+ cstate->cs_node, cstate->cs_pid, cstate->cs_status);
+
+ /* is consumer working yet ? */
+ if (cstate->cs_status == CONSUMER_ACTIVE && failed)
+ {
+ elog(DEBUG1, "SQueue %s, consumer status CONSUMER_ACTIVE, but "
+ "the operation has failed - marking CONSUMER_ERROR",
+ squeue->sq_key);
+
+ cstate->cs_status = CONSUMER_ERROR;
+ }
+
+ if (cstate->cs_status != CONSUMER_DONE)
+ {
+ elog(DEBUG1, "SQueue %s, consumer not yet done, wake it up and "
+ "wait for it to finish reading", squeue->sq_key);
+ c_count++;
+ /* Wake up consumer if it is sleeping */
+ SetLatch(&sqsync->sqs_consumer_sync[i].cs_latch);
+ /* producer will continue waiting */
+ ResetLatch(&sqsync->sqs_producer_latch);
+
+ if (cstate->cs_pid == 0)
+ unbound_count++;
+ }
+
+ LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
+ }
+ if (c_count == 0)
+ break;
+ elog(DEBUG1, "SQueue %s, wait while %d consumers finish, %d consumers"
+ "not yet bound", squeue->sq_key, c_count, unbound_count);
+ /* wait for a notification */
+ wait_result = WaitLatch(&sqsync->sqs_producer_latch,
+ WL_LATCH_SET | WL_POSTMASTER_DEATH | WL_TIMEOUT,
- WaitLatch(&sync->cs_latch, WL_LATCH_SET | WL_POSTMASTER_DEATH, -1);
++ 10000L, WAIT_EVENT_MQ_INTERNAL);
+ if (wait_result & WL_TIMEOUT)
+ {
+ elog(WARNING, "SQueue %s, timeout while waiting for Consumers "
+ "finishing", squeue->sq_key);
+ break;
+ }
+ /* got notification, continue loop */
+ }
+#ifdef SQUEUE_STAT
+ elog(DEBUG1, "Producer %s is done, there were %ld pauses", squeue->sq_key, squeue->stat_paused);
+#endif
+ elog(DEBUG1, "SQueue %s, producer node %d, pid %d - unbound successfully",
+ squeue->sq_key, squeue->sq_nodeid, squeue->sq_pid);
+
+ LWLockAcquire(SQueuesLock, LW_EXCLUSIVE);
+
+ /*
+ * In rear situation, after consumers just bind to the shared queue, the producer timeout and remove the shared queue.
+ * This will cause a SEGV in the consumer. So here recheck if there are some consumers binded to the queue, if so, we need to wait them to
+ * finish.
+ */
+ consumer_running = 0;
+ for (i = 0; i < squeue->sq_nconsumers; i++)
+ {
+ ConsState *cstate = &squeue->sq_consumers[i];
+
+ LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock, LW_EXCLUSIVE);
+
+ /* found a consumer running */
+ if (CONSUMER_ACTIVE == cstate->cs_status && cstate->cs_pid != 0)
+ {
+ elog(DEBUG1, "SQueue %s, consumer node %d, pid %d, status %d, "
+ "started running after we finished unbind", squeue->sq_key,
+ cstate->cs_node, cstate->cs_pid, cstate->cs_status);
+ consumer_running++;
+ }
+
+ LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
+ }
+
+ if (consumer_running)
+ {
+ elog(DEBUG1, "SQueue %s have %d consumers started running after we "
+ "unbound, recheck now", squeue->sq_key, consumer_running);
+ LWLockRelease(SQueuesLock);
+ goto CHECK;
+ }
+
+ /* All is done, clean up */
+ DisownLatch(&sqsync->sqs_producer_latch);
+
+ if (--squeue->sq_refcnt == 0)
+ {
+ /* Now it is OK to remove hash table entry */
+ squeue->sq_sync = NULL;
+ sqsync->queue = NULL;
+ if (hash_search(SharedQueues, squeue->sq_key, HASH_REMOVE, NULL) != squeue)
+ elog(PANIC, "Shared queue data corruption");
+ }
+
+ LWLockRelease(SQueuesLock);
+}
+
+
+/*
+ * If queue with specified name still exists set mark respective consumer as
+ * "Done". Due to executor optimization consumer may never connect the queue,
+ * and should allow producer to finish it up if it is known the consumer will
+ * never connect.
+ */
+void
+SharedQueueRelease(const char *sqname)
+{
+ bool found;
+ volatile SharedQueue sq;
+
+ LWLockAcquire(SQueuesLock, LW_EXCLUSIVE);
+
+ sq = (SharedQueue) hash_search(SharedQueues, sqname, HASH_FIND, &found);
+ if (found)
+ {
+ volatile SQueueSync *sqsync = sq->sq_sync;
+ int i;
+
+ Assert(sqsync && sqsync->queue == sq);
+
+ elog(DEBUG1, "SQueue %s producer node %d, pid %d - requested to release",
+ sqname, sq->sq_nodeid, sq->sq_pid);
+
+ /*
+ * If the SharedQ is not bound, we can't just remove it because
+ * somebody might have just created a fresh entry and is going to bind
+ * to it soon. We assume that the future producer will eventually
+ * release the SharedQ
+ */
+ if (sq->sq_nodeid == -1)
+ {
+ elog(DEBUG1, "SQueue %s, producer not bound ", sqname);
+ goto done;
+ }
+
+ /*
+ * Do not bother releasing producer, all necessary work will be
+ * done upon UnBind.
+ */
+ if (sq->sq_nodeid != PGXC_PARENT_NODE_ID)
+ {
+ elog(DEBUG1, "SQueue %s, we are consumer from node %d", sqname,
+ PGXC_PARENT_NODE_ID);
+ /* find specified node in the consumer lists */
+ for (i = 0; i < sq->sq_nconsumers; i++)
+ {
+ ConsState *cstate = &(sq->sq_consumers[i]);
+ if (cstate->cs_node == PGXC_PARENT_NODE_ID)
+ {
+ LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock,
+ LW_EXCLUSIVE);
+ elog(DEBUG1, "SQueue %s, consumer node %d, pid %d, "
+ "status %d", sq->sq_key, cstate->cs_node,
+ cstate->cs_pid, cstate->cs_status);
+
+ /*
+ * If the consumer pid is not set, we are looking at a race
+ * condition where the old producer (which supplied the
+ * tuples to this remote datanode) may have finished and
+ * marked all consumers as CONSUMER_EOF, the consumers
+ * themeselves consumed all the tuples and marked
+ * themselves as CONSUMER_DONE. The old producer in that
+ * case may have actually removed the SharedQ from shared
+ * memory. But if a new execution for this same portal
+ * comes before the consumer sends a "Close Portal" message
+ * (which subsequently calls this function), we may end up
+ * corrupting state for the upcoming consumer for this new
+ * execution of the portal.
+ *
+ * It seems best to just ignore the release call in such
+ * cases.
+ */
+ if (cstate->cs_pid == 0)
+ {
+ elog(DEBUG1, "SQueue %s, consumer node %d, already released",
+ sq->sq_key, cstate->cs_node);
+ }
+ else if (cstate->cs_status != CONSUMER_DONE)
+ {
+ /* Inform producer the consumer have done the job */
+ cstate->cs_status = CONSUMER_DONE;
+ /* no need to receive notifications */
+ if (cstate->cs_pid > 0)
+ {
+ DisownLatch(&sqsync->sqs_consumer_sync[i].cs_latch);
+ cstate->cs_pid = 0;
+ }
+ /*
+ * notify the producer, it may be waiting while
+ * consumers are finishing
+ */
+ SetLatch(&sqsync->sqs_producer_latch);
+ elog(DEBUG1, "SQueue %s, release consumer at %d, node "
+ "%d, pid %d, status %d ", sqname, i,
+ cstate->cs_node, cstate->cs_pid,
+ cstate->cs_status);
+ }
+ LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
+ /* exit */
+ goto done;
+ }
+ }
+
+ elog(DEBUG1, "SQueue %s, consumer from node %d never bound",
+ sqname, PGXC_PARENT_NODE_ID);
+ /*
+ * The consumer was never bound. Find empty consumer slot and
+ * register node here to let producer know that the node will never
+ * be consuming.
+ */
+ for (i = 0; i < sq->sq_nconsumers; i++)
+ {
+ ConsState *cstate = &(sq->sq_consumers[i]);
+ if (cstate->cs_node == -1)
+ {
+ LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock,
+ LW_EXCLUSIVE);
+ /* Inform producer the consumer have done the job */
+ cstate->cs_status = CONSUMER_DONE;
+ SetLatch(&sqsync->sqs_producer_latch);
+ elog(DEBUG1, "SQueue %s, consumer at %d marking as "
+ "CONSUMER_DONE", sqname, i);
+ LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
+ }
+ }
+ }
+ }
+done:
+ /*
+ * If we are the last holder of the SQueue, remove it from the hash table
+ * to avoid any leak
+ */
+ if (sq && --sq->sq_refcnt == 0)
+ {
+ /* Now it is OK to remove hash table entry */
+ sq->sq_sync->queue = NULL;
+ sq->sq_sync = NULL;
+ if (hash_search(SharedQueues, sq->sq_key, HASH_REMOVE, NULL) != sq)
+ elog(PANIC, "Shared queue data corruption");
+ }
+ LWLockRelease(SQueuesLock);
+}
+
+
+/*
+ * Called when the backend is ending.
+ */
+void
+SharedQueuesCleanup(int code, Datum arg)
+{
+ /* Need to be able to look into catalogs */
+ CurrentResourceOwner = ResourceOwnerCreate(NULL, "SharedQueuesCleanup");
+
+ /*
+ * Release all registered prepared statements.
+ * If a shared queue name is associated with the statement this queue will
+ * be released.
+ */
+ DropAllPreparedStatements();
+
+ /* Release everything */
+ ResourceOwnerRelease(CurrentResourceOwner, RESOURCE_RELEASE_BEFORE_LOCKS, true, true);
+ ResourceOwnerRelease(CurrentResourceOwner, RESOURCE_RELEASE_LOCKS, true, true);
+ ResourceOwnerRelease(CurrentResourceOwner, RESOURCE_RELEASE_AFTER_LOCKS, true, true);
+ CurrentResourceOwner = NULL;
+}
+
+
+/*
+ * sq_push_long_tuple
+ * Routine to push through the consumer state tuple longer the the consumer
+ * queue. Long tuple is written by a producer partially, and only when the
+ * consumer queue is empty.
+ * The consumer can determine that the tuple being read is long if the length
+ * of the tuple which is read before data is exceeding queue length.
+ * Consumers is switching to the long tuple mode and read in the portion of
+ * data which is already in the queue. After reading in each portion of data
+ * consumer sets cs_ntuples to LONG_TUPLE to indicate it is in long tuple
+ * mode, and writes out number of already read bytes to the beginning of the
+ * queue.
+ * While Consumer is reading in tuple data Producer may work on other task:
+ * execute query and send tuples to other Customers. If Producer sees the
+ * LONG_TUPLE indicator it may write out next portion. The tuple remains
+ * current in the tuplestore, and Producer just needs to read offset from
+ * the buffer to know what part of data to write next.
+ * After tuple is completely written the Producer is advancing to next tuple
+ * and continue operation in normal mode.
+ */
+static bool
+sq_push_long_tuple(ConsState *cstate, RemoteDataRow datarow)
+{
+ if (cstate->cs_ntuples == 0)
+ {
+ /* the tuple is too big to fit the queue, start pushing it through */
+ int len;
+ /*
+ * Output actual message size, to prepare consumer:
+ * allocate memory and set up transmission.
+ */
+ QUEUE_WRITE(cstate, sizeof(int), (char *) &datarow->msglen);
+ /* Output as much as possible */
+ len = cstate->cs_qlength - sizeof(int);
+ Assert(datarow->msglen > len);
+ QUEUE_WRITE(cstate, len, datarow->msg);
+ cstate->cs_ntuples = 1;
+ return false;
+ }
+ else
+ {
+ int offset;
+ int len;
+
+ /* Continue pushing through long tuple */
+ Assert(cstate->cs_ntuples == LONG_TUPLE);
+ /*
+ * Consumer outputs number of bytes already read at the beginning of
+ * the queue.
+ */
+ memcpy(&offset, cstate->cs_qstart, sizeof(int));
+
+ Assert(offset > 0 && offset < datarow->msglen);
+
+ /* remaining data */
+ len = datarow->msglen - offset;
+ /*
+ * We are sending remaining lengs just for sanity check at the consumer
+ * side
+ */
+ QUEUE_WRITE(cstate, sizeof(int), (char *) &len);
+ if (len > cstate->cs_qlength - sizeof(int))
+ {
+ /* does not fit yet */
+ len = cstate->cs_qlength - sizeof(int);
+ QUEUE_WRITE(cstate, len, datarow->msg + offset);
+ cstate->cs_ntuples = 1;
+ return false;
+ }
+ else
+ {
+ /* now we are done */
+ QUEUE_WRITE(cstate, len, datarow->msg + offset);
+ cstate->cs_ntuples = 1;
+ return true;
+ }
+ }
+}
+
+
+/*
+ * sq_pull_long_tuple
+ * Read in from the queue data of a long tuple which does not the queue.
+ * See sq_push_long_tuple for more details
+ */
+static void
+sq_pull_long_tuple(ConsState *cstate, RemoteDataRow datarow,
+ ConsumerSync *sync)
+{
+ int offset = 0;
+ int len = datarow->msglen;
+
+ for (;;)
+ {
+ /* determine how many bytes to read */
+ if (len > cstate->cs_qlength - sizeof(int))
+ len = cstate->cs_qlength - sizeof(int);
+
+ /* read data */
+ QUEUE_READ(cstate, len, datarow->msg + offset);
+
+ /* remember how many we read already */
+ offset += len;
+
+ /* check if we are done */
+ if (offset == datarow->msglen)
+ return;
+
+ /* need more, set up queue to accept data from the producer */
+ Assert(cstate->cs_ntuples == 1); /* allow exactly one incomplete tuple */
+ cstate->cs_ntuples = LONG_TUPLE; /* long tuple mode marker */
+ /* Inform producer how many bytes we have already */
+ memcpy(cstate->cs_qstart, &offset, sizeof(int));
+ /* Release locks and wait until producer supply more data */
+ while (cstate->cs_ntuples == LONG_TUPLE)
+ {
+ /* prepare wait */
+ ResetLatch(&sync->cs_latch);
+ LWLockRelease(sync->cs_lwlock);
+ /* Wait for notification about available info */
++ WaitLatch(&sync->cs_latch, WL_LATCH_SET | WL_POSTMASTER_DEATH, -1,
++ WAIT_EVENT_MQ_INTERNAL);
+ /* got the notification, restore lock and try again */
+ LWLockAcquire(sync->cs_lwlock, LW_EXCLUSIVE);
+ }
+ /* Read length of remaining data */
+ QUEUE_READ(cstate, sizeof(int), (char *) &len);
+
+ /* Make sure we are doing the same tuple */
+ Assert(offset + len == datarow->msglen);
+
+ /* next iteration */
+ }
+}
* there is a window (caused by pgstat delay) on which a worker may choose a
* table that was already vacuumed; this is a bug in the current design.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
heap_endscan(relScan);
heap_close(classRel, AccessShareLock);
+#ifdef XCP
+ /*
+ * Coordinator needs to access Datanodes to process distributed table.
+ */
+ if (IS_PGXC_COORDINATOR)
+ {
+ InitMultinodeExecutor(false);
+ }
+#endif
+
+ /*
+ * Recheck orphan temporary tables, and if they still seem orphaned, drop
+ * them. We'll eat a transaction per dropped table, which might seem
+ * excessive, but we should only need to do anything as a result of a
+ * previous backend crash, so this should not happen often enough to
+ * justify "optimizing". Using separate transactions ensures that we
+ * don't bloat the lock table if there are many temp tables to be dropped,
+ * and it ensures that we don't lose work if a deletion attempt fails.
+ */
+ foreach(cell, orphan_oids)
+ {
+ Oid relid = lfirst_oid(cell);
+ Form_pg_class classForm;
+ int backendID;
+ ObjectAddress object;
+
+ /*
+ * Check for user-requested abort.
+ */
+ CHECK_FOR_INTERRUPTS();
+
+ /*
+ * Try to lock the table. If we can't get the lock immediately,
+ * somebody else is using (or dropping) the table, so it's not our
+ * concern anymore. Having the lock prevents race conditions below.
+ */
+ if (!ConditionalLockRelationOid(relid, AccessExclusiveLock))
+ continue;
+
+ /*
+ * Re-fetch the pg_class tuple and re-check whether it still seems to
+ * be an orphaned temp table. If it's not there or no longer the same
+ * relation, ignore it.
+ */
+ tuple = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(relid));
+ if (!HeapTupleIsValid(tuple))
+ {
+ /* be sure to drop useless lock so we don't bloat lock table */
+ UnlockRelationOid(relid, AccessExclusiveLock);
+ continue;
+ }
+ classForm = (Form_pg_class) GETSTRUCT(tuple);
+
+ /*
+ * Make all the same tests made in the loop above. In event of OID
+ * counter wraparound, the pg_class entry we have now might be
+ * completely unrelated to the one we saw before.
+ */
+ if (!((classForm->relkind == RELKIND_RELATION ||
+ classForm->relkind == RELKIND_MATVIEW) &&
+ classForm->relpersistence == RELPERSISTENCE_TEMP))
+ {
+ UnlockRelationOid(relid, AccessExclusiveLock);
+ continue;
+ }
+ backendID = GetTempNamespaceBackendId(classForm->relnamespace);
+ if (!(backendID != InvalidBackendId &&
+ (backendID == MyBackendId ||
+ BackendIdGetProc(backendID) == NULL)))
+ {
+ UnlockRelationOid(relid, AccessExclusiveLock);
+ continue;
+ }
+
+ /* OK, let's delete it */
+ ereport(LOG,
+ (errmsg("autovacuum: dropping orphan temp table \"%s.%s.%s\"",
+ get_database_name(MyDatabaseId),
+ get_namespace_name(classForm->relnamespace),
+ NameStr(classForm->relname))));
+
+ object.classId = RelationRelationId;
+ object.objectId = relid;
+ object.objectSubId = 0;
+ performDeletion(&object, DROP_CASCADE,
+ PERFORM_DELETION_INTERNAL |
+ PERFORM_DELETION_QUIETLY |
+ PERFORM_DELETION_SKIP_EXTENSIONS);
+
+ /*
+ * To commit the deletion, end current transaction and start a new
+ * one. Note this also releases the lock we took.
+ */
+ CommitTransactionCommand();
+ StartTransactionCommand();
+
+ /* StartTransactionCommand changed current memory context */
+ MemoryContextSwitchTo(AutovacMemCxt);
+ }
+
/*
* Create a buffer access strategy object for VACUUM to use. We want to
* use the same one across all the vacuum operations we perform, since the
--- /dev/null
- (nap.tv_sec * 1000L) + (nap.tv_usec / 1000L));
+/*-------------------------------------------------------------------------
+ *
+ * clustermon.c
+ *
+ * Postgres-XL Cluster Monitor
+ *
+ * Portions Copyright (c) 2015, 2ndQuadrant Ltd
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/postmaster/clustermon.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <signal.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <unistd.h>
+
+#include "access/gtm.h"
+#include "access/transam.h"
+#include "access/xact.h"
+#include "gtm/gtm_c.h"
+#include "gtm/gtm_gxid.h"
+#include "libpq/pqsignal.h"
+#include "miscadmin.h"
+#include "pgxc/pgxc.h"
+#include "postmaster/clustermon.h"
+#include "postmaster/fork_process.h"
+#include "postmaster/postmaster.h"
+#include "storage/ipc.h"
+#include "storage/proc.h"
+#include "storage/procarray.h"
+#include "storage/spin.h"
+#include "tcop/tcopprot.h"
+#include "utils/memutils.h"
+#include "utils/ps_status.h"
+#include "utils/timeout.h"
+#include "utils/timestamp.h"
++#include "pgstat.h"
+
+/* Flags to tell if we are in a clustermon process */
+static bool am_clustermon = false;
+
+/* Flags set by signal handlers */
+static volatile sig_atomic_t got_SIGHUP = false;
+static volatile sig_atomic_t got_SIGTERM = false;
+
+/* Memory context for long-lived data */
+static MemoryContext ClusterMonitorMemCxt;
+static ClusterMonitorCtlData *ClusterMonitorCtl = NULL;
+
+static void cm_sighup_handler(SIGNAL_ARGS);
+static void cm_sigterm_handler(SIGNAL_ARGS);
+static void ClusterMonitorSetReportedGlobalXmin(GlobalTransactionId xmin);
+static void ClusterMonitorSetReportingGlobalXmin(GlobalTransactionId xmin);
+
+/* PID of clustser monitoring process */
+int ClusterMonitorPid = 0;
+
+#define CLUSTER_MONITOR_NAPTIME 5
+
+/*
+ * Main loop for the cluster monitor process.
+ */
+int
+ClusterMonitorInit(void)
+{
+ sigjmp_buf local_sigjmp_buf;
+ GTM_PGXCNodeType nodetype = IS_PGXC_DATANODE ?
+ GTM_NODE_DATANODE :
+ GTM_NODE_COORDINATOR;
+ GlobalTransactionId oldestXmin;
+ GlobalTransactionId newOldestXmin;
+ GlobalTransactionId lastGlobalXmin;
+ GlobalTransactionId latestCompletedXid;
+ int status;
+
+ am_clustermon = true;
+
+ /* Identify myself via ps */
+ init_ps_display("cluster monitor process", "", "", "");
+
+ ereport(LOG,
+ (errmsg("cluster monitor started")));
+
+ if (PostAuthDelay)
+ pg_usleep(PostAuthDelay * 1000000L);
+
+ /*
+ * Set up signal handlers. We operate on databases much like a regular
+ * backend, so we use the same signal handling. See equivalent code in
+ * tcop/postgres.c.
+ */
+ pqsignal(SIGHUP, cm_sighup_handler);
+ pqsignal(SIGINT, StatementCancelHandler);
+ pqsignal(SIGTERM, cm_sigterm_handler);
+
+ pqsignal(SIGQUIT, quickdie);
+ InitializeTimeouts(); /* establishes SIGALRM handler */
+
+ pqsignal(SIGPIPE, SIG_IGN);
+ pqsignal(SIGUSR1, procsignal_sigusr1_handler);
+ pqsignal(SIGFPE, FloatExceptionHandler);
+ pqsignal(SIGCHLD, SIG_DFL);
+
+ /*
+ * Create a memory context that we will do all our work in. We do this so
+ * that we can reset the context during error recovery and thereby avoid
+ * possible memory leaks.
+ */
+ ClusterMonitorMemCxt = AllocSetContextCreate(TopMemoryContext,
+ "Cluster Monitor",
+ ALLOCSET_DEFAULT_MINSIZE,
+ ALLOCSET_DEFAULT_INITSIZE,
+ ALLOCSET_DEFAULT_MAXSIZE);
+ MemoryContextSwitchTo(ClusterMonitorMemCxt);
+
+ SetProcessingMode(NormalProcessing);
+
+ if (RegisterGTM(nodetype) < 0)
+ {
+ UnregisterGTM(nodetype);
+ if (RegisterGTM(nodetype) < 0)
+ {
+ ereport(LOG,
+ (errcode(ERRCODE_IO_ERROR),
+ errmsg("Can not register node on GTM")));
+ }
+ }
+
+ /*
+ * If an exception is encountered, processing resumes here.
+ *
+ * This code is a stripped down version of PostgresMain error recovery.
+ */
+ if (sigsetjmp(local_sigjmp_buf, 1) != 0)
+ {
+ /* since not using PG_TRY, must reset error stack by hand */
+ error_context_stack = NULL;
+
+ /* Prevents interrupts while cleaning up */
+ HOLD_INTERRUPTS();
+
+ /* Forget any pending QueryCancel or timeout request */
+ disable_all_timeouts(false);
+ QueryCancelPending = false; /* second to avoid race condition */
+
+ /* Report the error to the server log */
+ EmitErrorReport();
+
+ /*
+ * Now return to normal top-level context and clear ErrorContext for
+ * next time.
+ */
+ MemoryContextSwitchTo(ClusterMonitorMemCxt);
+ FlushErrorState();
+
+ /* Flush any leaked data in the top-level context */
+ MemoryContextResetAndDeleteChildren(ClusterMonitorMemCxt);
+
+ /* Now we can allow interrupts again */
+ RESUME_INTERRUPTS();
+
+ /* if in shutdown mode, no need for anything further; just go away */
+ if (got_SIGTERM)
+ goto shutdown;
+
+ /*
+ * Sleep at least 1 second after any error. We don't want to be
+ * filling the error logs as fast as we can.
+ */
+ pg_usleep(1000000L);
+ }
+
+ /* We can now handle ereport(ERROR) */
+ PG_exception_stack = &local_sigjmp_buf;
+
+ /* must unblock signals before calling rebuild_database_list */
+ PG_SETMASK(&UnBlockSig);
+
+ /*
+ * Force statement_timeout and lock_timeout to zero to avoid letting these
+ * settings prevent regular maintenance from being executed.
+ */
+ SetConfigOption("statement_timeout", "0", PGC_SUSET, PGC_S_OVERRIDE);
+ SetConfigOption("lock_timeout", "0", PGC_SUSET, PGC_S_OVERRIDE);
+
+ /* loop until shutdown request */
+ while (!got_SIGTERM)
+ {
+ struct timeval nap;
+ int rc;
+
+ /*
+ * Repeat at CLUSTER_MONITOR_NAPTIME seconds interval
+ */
+ nap.tv_sec = CLUSTER_MONITOR_NAPTIME;
+ nap.tv_usec = 0;
+
+ /*
+ * Wait until naptime expires or we get some type of signal (all the
+ * signal handlers will wake us by calling SetLatch).
+ */
+ rc = WaitLatch(MyLatch,
+ WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
- oldestXmin = GetOldestXminInternal(NULL, false, true, lastGlobalXmin);
++ (nap.tv_sec * 1000L) + (nap.tv_usec / 1000L),
++ WAIT_EVENT_CLUSTER_MONITOR_MAIN);
+
+ ResetLatch(MyLatch);
+
+ /* Process sinval catchup interrupts that happened while sleeping */
+ ProcessCatchupInterrupt();
+
+ /*
+ * Emergency bailout if postmaster has died. This is to avoid the
+ * necessity for manual cleanup of all postmaster children.
+ */
+ if (rc & WL_POSTMASTER_DEATH)
+ proc_exit(1);
+
+ /* the normal shutdown case */
+ if (got_SIGTERM)
+ break;
+
+ if (got_SIGHUP)
+ {
+ got_SIGHUP = false;
+ ProcessConfigFile(PGC_SIGHUP);
+ }
+
+ /*
+ * Compute RecentGlobalXmin, report it to the GTM and sleep for the set
+ * interval. Keep doing this forever
+ */
+ lastGlobalXmin = ClusterMonitorGetGlobalXmin();
+ LWLockAcquire(ClusterMonitorLock, LW_EXCLUSIVE);
++ oldestXmin = GetOldestXminInternal(NULL, 0, true, lastGlobalXmin);
+ ClusterMonitorSetReportingGlobalXmin(oldestXmin);
+ LWLockRelease(ClusterMonitorLock);
+
+ if ((status = ReportGlobalXmin(oldestXmin, &newOldestXmin,
+ &latestCompletedXid)))
+ {
+ elog(DEBUG1, "Failed (status %d) to report RecentGlobalXmin "
+ "- reported RecentGlobalXmin %d, received "
+ "RecentGlobalXmin %d, " "received latestCompletedXid %d",
+ status, oldestXmin, newOldestXmin,
+ latestCompletedXid);
+ if (status == GTM_ERRCODE_TOO_OLD_XMIN ||
+ status == GTM_ERRCODE_NODE_EXCLUDED)
+ {
+ /*
+ * If we haven't seen a new transaction for a very long time or
+ * were disconnected for a while or excluded from the xmin
+ * computation for any reason, our xmin calculation could be
+ * well in the past, especially because its capped by the
+ * latestCompletedXid which may not advance on an idle server.
+ * In such cases, use the value of latestCompletedXid as
+ * returned by GTM and then recompute local xmin.
+ *
+ * If the GTM's global xmin advances even further while we are
+ * ready with a new xmin, just repeat the entire exercise as
+ * long as GTM keeps returning us a more current value of
+ * latestCompletedXid and thus pushing forward our local xmin
+ * calculation
+ */
+ if (GlobalTransactionIdIsValid(latestCompletedXid) &&
+ TransactionIdPrecedes(oldestXmin, latestCompletedXid))
+ {
+ SetLatestCompletedXid(latestCompletedXid);
+ continue;
+ }
+ }
+ }
+ else
+ {
+ elog(DEBUG1, "Successfully reported xmin to GTM - reported_xmin %d,"
+ "received RecentGlobalXmin %d, "
+ "received latestCompletedXid %d", oldestXmin,
+ newOldestXmin, latestCompletedXid);
+
+ SetLatestCompletedXid(latestCompletedXid);
+ ClusterMonitorSetReportedGlobalXmin(oldestXmin);
+ if (GlobalTransactionIdIsValid(newOldestXmin))
+ ClusterMonitorSetGlobalXmin(newOldestXmin);
+ }
+
+ ClusterMonitorSetReportingGlobalXmin(InvalidGlobalTransactionId);
+
+ }
+
+ /* Normal exit from the cluster monitor is here */
+shutdown:
+ UnregisterGTM(nodetype);
+ ereport(LOG,
+ (errmsg("cluster monitor shutting down")));
+
+ proc_exit(0); /* done */
+}
+
+/* SIGHUP: set flag to re-read config file at next convenient time */
+static void
+cm_sighup_handler(SIGNAL_ARGS)
+{
+ int save_errno = errno;
+
+ got_SIGHUP = true;
+ SetLatch(MyLatch);
+
+ errno = save_errno;
+}
+
+/* SIGTERM: time to die */
+static void
+cm_sigterm_handler(SIGNAL_ARGS)
+{
+ int save_errno = errno;
+
+ got_SIGTERM = true;
+ SetLatch(MyLatch);
+
+ errno = save_errno;
+}
+
+
+/*
+ * IsClusterMonitorProcess
+ * Return whether the current process is the cluster monitor
+ * process.
+ */
+bool
+IsClusterMonitorProcess(void)
+{
+ return am_clustermon;
+}
+
+/* Report shared-memory space needed by ClusterMonitor */
+Size
+ClusterMonitorShmemSize(void)
+{
+ return sizeof (ClusterMonitorCtlData);
+}
+
+void
+ClusterMonitorShmemInit(void)
+{
+ bool found;
+
+ ClusterMonitorCtl = (ClusterMonitorCtlData *)
+ ShmemInitStruct("Cluster Monitor Ctl", ClusterMonitorShmemSize(), &found);
+
+ if (!found)
+ {
+ /* First time through, so initialize */
+ MemSet(ClusterMonitorCtl, 0, ClusterMonitorShmemSize());
+ SpinLockInit(&ClusterMonitorCtl->mutex);
+ }
+}
+
+GlobalTransactionId
+ClusterMonitorGetGlobalXmin(void)
+{
+ GlobalTransactionId xmin;
+
+ SpinLockAcquire(&ClusterMonitorCtl->mutex);
+ xmin = ClusterMonitorCtl->gtm_recent_global_xmin;
+ SpinLockRelease(&ClusterMonitorCtl->mutex);
+
+ return xmin;
+}
+
+void
+ClusterMonitorSetGlobalXmin(GlobalTransactionId xmin)
+{
+ /*
+ * First extend the commit logs. Even though we may not have actually
+ * started any transactions in the new range, we must still extend the logs
+ * so that later operations which rely on the RecentGlobalXmin to truncate
+ * the logs work correctly.
+ */
+ ExtendLogs(xmin);
+
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+ /*
+ * Do a consistency check to ensure that we NEVER have running transactions
+ * with xmin less than what the GTM has already computed. While during
+ * normal execution this should never happen, if we have ever been excluded
+ * from the xmin calculation by the GTM while we are still running old
+ * transactions, PANIC is our best bet to avoid corruption
+ */
+ ProcArrayCheckXminConsistency(xmin);
+
+ SpinLockAcquire(&ClusterMonitorCtl->mutex);
+ ClusterMonitorCtl->gtm_recent_global_xmin = xmin;
+ SpinLockRelease(&ClusterMonitorCtl->mutex);
+
+ LWLockRelease(ProcArrayLock);
+}
+
+static void
+ClusterMonitorSetReportedGlobalXmin(GlobalTransactionId xmin)
+{
+ elog(DEBUG2, "ClusterMonitorSetReportedGlobalXmin - old %d, new %d",
+ ClusterMonitorCtl->reported_recent_global_xmin,
+ xmin);
+ SpinLockAcquire(&ClusterMonitorCtl->mutex);
+ ClusterMonitorCtl->reported_recent_global_xmin = xmin;
+ SpinLockRelease(&ClusterMonitorCtl->mutex);
+}
+
+static void
+ClusterMonitorSetReportingGlobalXmin(GlobalTransactionId xmin)
+{
+ elog(DEBUG2, "ClusterMonitorSetReportingGlobalXmin - old %d, new %d",
+ ClusterMonitorCtl->reporting_recent_global_xmin,
+ xmin);
+ SpinLockAcquire(&ClusterMonitorCtl->mutex);
+ ClusterMonitorCtl->reporting_recent_global_xmin = xmin;
+ SpinLockRelease(&ClusterMonitorCtl->mutex);
+}
+
+GlobalTransactionId
+ClusterMonitorGetReportingGlobalXmin(void)
+{
+ GlobalTransactionId reporting_xmin;
+
+ SpinLockAcquire(&ClusterMonitorCtl->mutex);
+ reporting_xmin = ClusterMonitorCtl->reporting_recent_global_xmin;
+ SpinLockRelease(&ClusterMonitorCtl->mutex);
+
+ return reporting_xmin;
+}
* - Add a pgstat config column to pg_database, so this
* entire thing can be enabled/disabled on a per db basis.
*
- * Copyright (c) 2001-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Copyright (c) 2001-2017, PostgreSQL Global Development Group
*
* src/backend/postmaster/pgstat.c
* ----------
return event_name;
}
+ /* ----------
+ * pgstat_get_wait_activity() -
+ *
+ * Convert WaitEventActivity to string.
+ * ----------
+ */
+ static const char *
+ pgstat_get_wait_activity(WaitEventActivity w)
+ {
+ const char *event_name = "unknown wait event";
+
+ switch (w)
+ {
+ case WAIT_EVENT_ARCHIVER_MAIN:
+ event_name = "ArchiverMain";
+ break;
+ case WAIT_EVENT_AUTOVACUUM_MAIN:
+ event_name = "AutoVacuumMain";
+ break;
+ case WAIT_EVENT_BGWRITER_HIBERNATE:
+ event_name = "BgWriterHibernate";
+ break;
+ case WAIT_EVENT_BGWRITER_MAIN:
+ event_name = "BgWriterMain";
+ break;
+ case WAIT_EVENT_CHECKPOINTER_MAIN:
+ event_name = "CheckpointerMain";
+ break;
+ case WAIT_EVENT_PGSTAT_MAIN:
+ event_name = "PgStatMain";
+ break;
+ case WAIT_EVENT_RECOVERY_WAL_ALL:
+ event_name = "RecoveryWalAll";
+ break;
+ case WAIT_EVENT_RECOVERY_WAL_STREAM:
+ event_name = "RecoveryWalStream";
+ break;
+ case WAIT_EVENT_SYSLOGGER_MAIN:
+ event_name = "SysLoggerMain";
+ break;
+ case WAIT_EVENT_WAL_RECEIVER_MAIN:
+ event_name = "WalReceiverMain";
+ break;
+ case WAIT_EVENT_WAL_SENDER_MAIN:
+ event_name = "WalSenderMain";
+ break;
+ case WAIT_EVENT_WAL_WRITER_MAIN:
+ event_name = "WalWriterMain";
+ break;
+ case WAIT_EVENT_LOGICAL_LAUNCHER_MAIN:
+ event_name = "LogicalLauncherMain";
+ break;
+ case WAIT_EVENT_LOGICAL_APPLY_MAIN:
+ event_name = "LogicalApplyMain";
+ break;
++ case WAIT_EVENT_CLUSTER_MONITOR_MAIN:
++ event_name = "ClusterMonitorMain";
++ break;
+ /* no default case, so that compiler will warn */
+ }
+
+ return event_name;
+ }
+
+ /* ----------
+ * pgstat_get_wait_client() -
+ *
+ * Convert WaitEventClient to string.
+ * ----------
+ */
+ static const char *
+ pgstat_get_wait_client(WaitEventClient w)
+ {
+ const char *event_name = "unknown wait event";
+
+ switch (w)
+ {
+ case WAIT_EVENT_CLIENT_READ:
+ event_name = "ClientRead";
+ break;
+ case WAIT_EVENT_CLIENT_WRITE:
+ event_name = "ClientWrite";
+ break;
+ case WAIT_EVENT_SSL_OPEN_SERVER:
+ event_name = "SSLOpenServer";
+ break;
+ case WAIT_EVENT_WAL_RECEIVER_WAIT_START:
+ event_name = "WalReceiverWaitStart";
+ break;
+ case WAIT_EVENT_LIBPQWALRECEIVER:
+ event_name = "LibPQWalReceiver";
+ break;
+ case WAIT_EVENT_WAL_SENDER_WAIT_WAL:
+ event_name = "WalSenderWaitForWAL";
+ break;
+ case WAIT_EVENT_WAL_SENDER_WRITE_DATA:
+ event_name = "WalSenderWriteData";
+ break;
+ /* no default case, so that compiler will warn */
+ }
+
+ return event_name;
+ }
+
+ /* ----------
+ * pgstat_get_wait_ipc() -
+ *
+ * Convert WaitEventIPC to string.
+ * ----------
+ */
+ static const char *
+ pgstat_get_wait_ipc(WaitEventIPC w)
+ {
+ const char *event_name = "unknown wait event";
+
+ switch (w)
+ {
+ case WAIT_EVENT_BGWORKER_SHUTDOWN:
+ event_name = "BgWorkerShutdown";
+ break;
+ case WAIT_EVENT_BGWORKER_STARTUP:
+ event_name = "BgWorkerStartup";
+ break;
+ case WAIT_EVENT_BTREE_PAGE:
+ event_name = "BtreePage";
+ break;
+ case WAIT_EVENT_EXECUTE_GATHER:
+ event_name = "ExecuteGather";
+ break;
+ case WAIT_EVENT_MQ_INTERNAL:
+ event_name = "MessageQueueInternal";
+ break;
+ case WAIT_EVENT_MQ_PUT_MESSAGE:
+ event_name = "MessageQueuePutMessage";
+ break;
+ case WAIT_EVENT_MQ_RECEIVE:
+ event_name = "MessageQueueReceive";
+ break;
+ case WAIT_EVENT_MQ_SEND:
+ event_name = "MessageQueueSend";
+ break;
+ case WAIT_EVENT_PARALLEL_FINISH:
+ event_name = "ParallelFinish";
+ break;
+ case WAIT_EVENT_PARALLEL_BITMAP_SCAN:
+ event_name = "ParallelBitmapScan";
+ break;
+ case WAIT_EVENT_PROCARRAY_GROUP_UPDATE:
+ event_name = "ProcArrayGroupUpdate";
+ break;
+ case WAIT_EVENT_SAFE_SNAPSHOT:
+ event_name = "SafeSnapshot";
+ break;
+ case WAIT_EVENT_SYNC_REP:
+ event_name = "SyncRep";
+ break;
+ case WAIT_EVENT_LOGICAL_SYNC_DATA:
+ event_name = "LogicalSyncData";
+ break;
+ case WAIT_EVENT_LOGICAL_SYNC_STATE_CHANGE:
+ event_name = "LogicalSyncStateChange";
+ break;
+ /* no default case, so that compiler will warn */
+ }
+
+ return event_name;
+ }
+
+ /* ----------
+ * pgstat_get_wait_timeout() -
+ *
+ * Convert WaitEventTimeout to string.
+ * ----------
+ */
+ static const char *
+ pgstat_get_wait_timeout(WaitEventTimeout w)
+ {
+ const char *event_name = "unknown wait event";
+
+ switch (w)
+ {
+ case WAIT_EVENT_BASE_BACKUP_THROTTLE:
+ event_name = "BaseBackupThrottle";
+ break;
+ case WAIT_EVENT_PG_SLEEP:
+ event_name = "PgSleep";
+ break;
+ case WAIT_EVENT_RECOVERY_APPLY_DELAY:
+ event_name = "RecoveryApplyDelay";
+ break;
+ /* no default case, so that compiler will warn */
+ }
+
+ return event_name;
+ }
+
+ /* ----------
+ * pgstat_get_wait_io() -
+ *
+ * Convert WaitEventIO to string.
+ * ----------
+ */
+ static const char *
+ pgstat_get_wait_io(WaitEventIO w)
+ {
+ const char *event_name = "unknown wait event";
+
+ switch (w)
+ {
+ case WAIT_EVENT_BUFFILE_READ:
+ event_name = "BufFileRead";
+ break;
+ case WAIT_EVENT_BUFFILE_WRITE:
+ event_name = "BufFileWrite";
+ break;
+ case WAIT_EVENT_CONTROL_FILE_READ:
+ event_name = "ControlFileRead";
+ break;
+ case WAIT_EVENT_CONTROL_FILE_SYNC:
+ event_name = "ControlFileSync";
+ break;
+ case WAIT_EVENT_CONTROL_FILE_SYNC_UPDATE:
+ event_name = "ControlFileSyncUpdate";
+ break;
+ case WAIT_EVENT_CONTROL_FILE_WRITE:
+ event_name = "ControlFileWrite";
+ break;
+ case WAIT_EVENT_CONTROL_FILE_WRITE_UPDATE:
+ event_name = "ControlFileWriteUpdate";
+ break;
+ case WAIT_EVENT_COPY_FILE_READ:
+ event_name = "CopyFileRead";
+ break;
+ case WAIT_EVENT_COPY_FILE_WRITE:
+ event_name = "CopyFileWrite";
+ break;
+ case WAIT_EVENT_DATA_FILE_EXTEND:
+ event_name = "DataFileExtend";
+ break;
+ case WAIT_EVENT_DATA_FILE_FLUSH:
+ event_name = "DataFileFlush";
+ break;
+ case WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC:
+ event_name = "DataFileImmediateSync";
+ break;
+ case WAIT_EVENT_DATA_FILE_PREFETCH:
+ event_name = "DataFilePrefetch";
+ break;
+ case WAIT_EVENT_DATA_FILE_READ:
+ event_name = "DataFileRead";
+ break;
+ case WAIT_EVENT_DATA_FILE_SYNC:
+ event_name = "DataFileSync";
+ break;
+ case WAIT_EVENT_DATA_FILE_TRUNCATE:
+ event_name = "DataFileTruncate";
+ break;
+ case WAIT_EVENT_DATA_FILE_WRITE:
+ event_name = "DataFileWrite";
+ break;
+ case WAIT_EVENT_DSM_FILL_ZERO_WRITE:
+ event_name = "DSMFillZeroWrite";
+ break;
+ case WAIT_EVENT_LOCK_FILE_ADDTODATADIR_READ:
+ event_name = "LockFileAddToDataDirRead";
+ break;
+ case WAIT_EVENT_LOCK_FILE_ADDTODATADIR_SYNC:
+ event_name = "LockFileAddToDataDirSync";
+ break;
+ case WAIT_EVENT_LOCK_FILE_ADDTODATADIR_WRITE:
+ event_name = "LockFileAddToDataDirWrite";
+ break;
+ case WAIT_EVENT_LOCK_FILE_CREATE_READ:
+ event_name = "LockFileCreateRead";
+ break;
+ case WAIT_EVENT_LOCK_FILE_CREATE_SYNC:
+ event_name = "LockFileCreateSync";
+ break;
+ case WAIT_EVENT_LOCK_FILE_CREATE_WRITE:
+ event_name = "LockFileCreateWRITE";
+ break;
+ case WAIT_EVENT_LOCK_FILE_RECHECKDATADIR_READ:
+ event_name = "LockFileReCheckDataDirRead";
+ break;
+ case WAIT_EVENT_LOGICAL_REWRITE_CHECKPOINT_SYNC:
+ event_name = "LogicalRewriteCheckpointSync";
+ break;
+ case WAIT_EVENT_LOGICAL_REWRITE_MAPPING_SYNC:
+ event_name = "LogicalRewriteMappingSync";
+ break;
+ case WAIT_EVENT_LOGICAL_REWRITE_MAPPING_WRITE:
+ event_name = "LogicalRewriteMappingWrite";
+ break;
+ case WAIT_EVENT_LOGICAL_REWRITE_SYNC:
+ event_name = "LogicalRewriteSync";
+ break;
+ case WAIT_EVENT_LOGICAL_REWRITE_TRUNCATE:
+ event_name = "LogicalRewriteTruncate";
+ break;
+ case WAIT_EVENT_LOGICAL_REWRITE_WRITE:
+ event_name = "LogicalRewriteWrite";
+ break;
+ case WAIT_EVENT_RELATION_MAP_READ:
+ event_name = "RelationMapRead";
+ break;
+ case WAIT_EVENT_RELATION_MAP_SYNC:
+ event_name = "RelationMapSync";
+ break;
+ case WAIT_EVENT_RELATION_MAP_WRITE:
+ event_name = "RelationMapWrite";
+ break;
+ case WAIT_EVENT_REORDER_BUFFER_READ:
+ event_name = "ReorderBufferRead";
+ break;
+ case WAIT_EVENT_REORDER_BUFFER_WRITE:
+ event_name = "ReorderBufferWrite";
+ break;
+ case WAIT_EVENT_REORDER_LOGICAL_MAPPING_READ:
+ event_name = "ReorderLogicalMappingRead";
+ break;
+ case WAIT_EVENT_REPLICATION_SLOT_READ:
+ event_name = "ReplicationSlotRead";
+ break;
+ case WAIT_EVENT_REPLICATION_SLOT_RESTORE_SYNC:
+ event_name = "ReplicationSlotRestoreSync";
+ break;
+ case WAIT_EVENT_REPLICATION_SLOT_SYNC:
+ event_name = "ReplicationSlotSync";
+ break;
+ case WAIT_EVENT_REPLICATION_SLOT_WRITE:
+ event_name = "ReplicationSlotWrite";
+ break;
+ case WAIT_EVENT_SLRU_FLUSH_SYNC:
+ event_name = "SLRUFlushSync";
+ break;
+ case WAIT_EVENT_SLRU_READ:
+ event_name = "SLRURead";
+ break;
+ case WAIT_EVENT_SLRU_SYNC:
+ event_name = "SLRUSync";
+ break;
+ case WAIT_EVENT_SLRU_WRITE:
+ event_name = "SLRUWrite";
+ break;
+ case WAIT_EVENT_SNAPBUILD_READ:
+ event_name = "SnapbuildRead";
+ break;
+ case WAIT_EVENT_SNAPBUILD_SYNC:
+ event_name = "SnapbuildSync";
+ break;
+ case WAIT_EVENT_SNAPBUILD_WRITE:
+ event_name = "SnapbuildWrite";
+ break;
+ case WAIT_EVENT_TIMELINE_HISTORY_FILE_SYNC:
+ event_name = "TimelineHistoryFileSync";
+ break;
+ case WAIT_EVENT_TIMELINE_HISTORY_FILE_WRITE:
+ event_name = "TimelineHistoryFileWrite";
+ break;
+ case WAIT_EVENT_TIMELINE_HISTORY_READ:
+ event_name = "TimelineHistoryRead";
+ break;
+ case WAIT_EVENT_TIMELINE_HISTORY_SYNC:
+ event_name = "TimelineHistorySync";
+ break;
+ case WAIT_EVENT_TIMELINE_HISTORY_WRITE:
+ event_name = "TimelineHistoryWrite";
+ break;
+ case WAIT_EVENT_TWOPHASE_FILE_READ:
+ event_name = "TwophaseFileRead";
+ break;
+ case WAIT_EVENT_TWOPHASE_FILE_SYNC:
+ event_name = "TwophaseFileSync";
+ break;
+ case WAIT_EVENT_TWOPHASE_FILE_WRITE:
+ event_name = "TwophaseFileWrite";
+ break;
+ case WAIT_EVENT_WALSENDER_TIMELINE_HISTORY_READ:
+ event_name = "WALSenderTimelineHistoryRead";
+ break;
+ case WAIT_EVENT_WAL_BOOTSTRAP_SYNC:
+ event_name = "WALBootstrapSync";
+ break;
+ case WAIT_EVENT_WAL_BOOTSTRAP_WRITE:
+ event_name = "WALBootstrapWrite";
+ break;
+ case WAIT_EVENT_WAL_COPY_READ:
+ event_name = "WALCopyRead";
+ break;
+ case WAIT_EVENT_WAL_COPY_SYNC:
+ event_name = "WALCopySync";
+ break;
+ case WAIT_EVENT_WAL_COPY_WRITE:
+ event_name = "WALCopyWrite";
+ break;
+ case WAIT_EVENT_WAL_INIT_SYNC:
+ event_name = "WALInitSync";
+ break;
+ case WAIT_EVENT_WAL_INIT_WRITE:
+ event_name = "WALInitWrite";
+ break;
+ case WAIT_EVENT_WAL_READ:
+ event_name = "WALRead";
+ break;
+ case WAIT_EVENT_WAL_SYNC_METHOD_ASSIGN:
+ event_name = "WALSyncMethodAssign";
+ break;
+ case WAIT_EVENT_WAL_WRITE:
+ event_name = "WALWrite";
+ break;
+
+ /* no default case, so that compiler will warn */
+ }
+
+ return event_name;
+ }
+
+
/* ----------
* pgstat_get_backend_current_activity() -
*
* clients.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
*
* IDENTIFICATION
#include "utils/dynamic_loader.h"
#include "utils/memutils.h"
#include "utils/ps_status.h"
+#ifdef PGXC
+#include "utils/resowner.h"
+#endif
#include "utils/timeout.h"
+ #include "utils/varlena.h"
#ifdef EXEC_BACKEND
#include "storage/spin.h"
StartupStatus = STARTUP_RUNNING;
pmState = PM_STARTUP;
+#ifdef PGXC /* PGXC_COORD */
+ oldcontext = MemoryContextSwitchTo(TopMemoryContext);
+
+ /*
+ * Initialize the Data Node connection pool
+ */
+ PgPoolerPID = StartPoolManager();
+
+ MemoryContextSwitchTo(oldcontext);
+#endif /* PGXC */
+
/* Some workers may be scheduled to start now */
- maybe_start_bgworker();
+ maybe_start_bgworkers();
status = ServerLoop();
}
/* If we have lost the stats collector, try to start a new one */
- if (PgStatPID == 0 && pmState == PM_RUN)
+ if (PgStatPID == 0 &&
+ (pmState == PM_RUN || pmState == PM_HOT_STANDBY))
PgStatPID = pgstat_start();
+#ifdef PGXC
+ /* If we have lost the pooler, try to start a new one */
+ if (PgPoolerPID == 0 && pmState == PM_RUN)
+ PgPoolerPID = StartPoolManager();
+#endif /* PGXC */
+
+#ifdef XCP
+ /* If we have lost the cluster monitor, try to start a new one */
+ if (ClusterMonPID == 0 && pmState == PM_RUN)
+ ClusterMonPID = StartClusterMonitor();
+#endif
+
/* If we have lost the archiver, try to start a new one. */
if (PgArchPID == 0 && PgArchStartupAllowed())
PgArchPID = pgarch_start();
PgArchPID = pgarch_start();
if (PgStatPID == 0)
PgStatPID = pgstat_start();
+#ifdef PGXC
+ if (PgPoolerPID == 0)
+ PgPoolerPID = StartPoolManager();
+#endif /* PGXC */
+
+#ifdef XCP
+ if (ClusterMonPID == 0)
+ ClusterMonPID = StartClusterMonitor();
+#endif
/* workers may be scheduled to start now */
- maybe_start_bgworker();
+ maybe_start_bgworkers();
/* at this point we are really open for business */
ereport(LOG,
#include "postgres.h"
#include "access/sysattr.h"
+ #include "catalog/dependency.h"
+#include "catalog/namespace.h"
#include "catalog/pg_type.h"
#include "commands/trigger.h"
#include "foreign/fdwapi.h"
const char *attrname;
TargetEntry *tle;
+#ifdef PGXC
+ List *var_list = NIL;
+ ListCell *elt;
+
+ /*
+ * In Postgres-XC, we need to evaluate quals of the parse tree and determine
+ * if they are Coordinator quals. If they are, their attributes need to be
+ * added to the target list for evaluation. In case some are found, add them as
+ * junks in the target list. The junk status will be used by remote UPDATE
+ * planning to associate correct element to a clause.
+ * For DELETE, having such columns in target list helps to evaluate Quals
+ * correctly on Coordinator.
+ * PGXCTODO: This list could be reduced to keep only in target list the
+ * vars using Coordinator Quals.
+ */
+ if (IS_PGXC_COORDINATOR && parsetree->jointree)
+ var_list = pull_qual_vars((Node *) parsetree->jointree, parsetree->resultRelation);
+
+ foreach(elt, var_list)
+ {
+ Form_pg_attribute att_tup;
+ int numattrs = RelationGetNumberOfAttributes(target_relation);
+
+ var = (Var *) lfirst(elt);
+ /* Bypass in case of extra target items like ctid */
+ if (var->varattno < 1 || var->varattno > numattrs)
+ continue;
+
+
+ att_tup = target_relation->rd_att->attrs[var->varattno - 1];
+ tle = makeTargetEntry((Expr *) var,
+ list_length(parsetree->targetList) + 1,
+ pstrdup(NameStr(att_tup->attname)),
+ true);
+
+ parsetree->targetList = lappend(parsetree->targetList, tle);
+ }
+#endif
+
+#ifdef PGXC
+ /*
+ * If relation is non-replicated, we need also to identify the Datanode
+ * from where tuple is fetched.
+ */
+ if (IS_PGXC_COORDINATOR &&
+ !IsConnFromCoord() &&
+ !IsLocatorReplicated(GetRelationLocType(RelationGetRelid(target_relation))) &&
+ (target_relation->rd_rel->relkind == RELKIND_RELATION ||
+ target_relation->rd_rel->relkind == RELKIND_MATVIEW))
+ {
+ var = makeVar(parsetree->resultRelation,
+ XC_NodeIdAttributeNumber,
+ INT4OID,
+ -1,
+ InvalidOid,
+ 0);
+
+ attrname = "xc_node_id";
+
+ tle = makeTargetEntry((Expr *) var,
+ list_length(parsetree->targetList) + 1,
+ pstrdup(attrname),
+ true);
+
+ parsetree->targetList = lappend(parsetree->targetList, tle);
+ }
+#endif
+
if (target_relation->rd_rel->relkind == RELKIND_RELATION ||
- target_relation->rd_rel->relkind == RELKIND_MATVIEW)
+ target_relation->rd_rel->relkind == RELKIND_MATVIEW ||
+ target_relation->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
{
/*
* Emit CTID so that executor can find the row to update or delete.
return results;
}
- ProcessUtility(cparsetree->utilityStmt, cquery.data, PROCESS_UTILITY_QUERY,
- NULL, NULL, false, NULL);
+
+#ifdef PGXC
+/*
+ * Rewrite the CREATE TABLE AS and SELECT INTO queries as a
+ * INSERT INTO .. SELECT query. The target table must be created first using
+ * utility command processing. This takes care of creating the target table on
+ * all the Coordinators and the Datanodes.
+ */
+List *
+QueryRewriteCTAS(Query *parsetree)
+{
+ RangeVar *relation;
+ CreateStmt *create_stmt;
++ PlannedStmt *wrapper;
+ List *tableElts = NIL;
+ StringInfoData cquery;
+ ListCell *col;
+ Query *cparsetree;
+ List *raw_parsetree_list, *tlist;
+ char *selectstr;
+ CreateTableAsStmt *stmt;
+ IntoClause *into;
+ ListCell *lc;
+
+ if (parsetree->commandType != CMD_UTILITY ||
+ !IsA(parsetree->utilityStmt, CreateTableAsStmt))
+ elog(ERROR, "Unexpected commandType or intoClause is not set properly");
+
+ /* Get the target table */
+ stmt = (CreateTableAsStmt *) parsetree->utilityStmt;
+
+ if (stmt->relkind == OBJECT_MATVIEW)
+ return list_make1(parsetree);
+
+ relation = stmt->into->rel;
+
+ if (stmt->if_not_exists)
+ {
+ Oid nspid;
+
+ nspid = RangeVarGetCreationNamespace(stmt->into->rel);
+
+ if (get_relname_relid(stmt->into->rel->relname, nspid))
+ {
+ ereport(NOTICE,
+ (errcode(ERRCODE_DUPLICATE_TABLE),
+ errmsg("relation \"%s\" already exists, skipping",
+ stmt->into->rel->relname)));
+ return NIL;
+ }
+ }
+
+ /* Start building a CreateStmt for creating the target table */
+ create_stmt = makeNode(CreateStmt);
+ create_stmt->relation = relation;
+ create_stmt->islocal = stmt->islocal;
+ create_stmt->if_not_exists = stmt->if_not_exists;
+ into = stmt->into;
+
+ /* Obtain the target list of new table */
+ Assert(IsA(stmt->query, Query));
+ cparsetree = (Query *) stmt->query;
+ tlist = cparsetree->targetList;
+
+ /*
+ * Based on the targetList, populate the column information for the target
+ * table. If a column name list was specified in CREATE TABLE AS, override
+ * the column names derived from the query. (Too few column names are OK, too
+ * many are not.).
+ */
+ lc = list_head(into->colNames);
+ foreach(col, tlist)
+ {
+ TargetEntry *tle = (TargetEntry *)lfirst(col);
+ ColumnDef *coldef;
+ TypeName *typename;
+
+ /* Ignore junk columns from the targetlist */
+ if (tle->resjunk)
+ continue;
+
+ coldef = makeNode(ColumnDef);
+ typename = makeNode(TypeName);
+
+ /* Take the column name specified if any */
+ if (lc)
+ {
+ coldef->colname = strVal(lfirst(lc));
+ lc = lnext(lc);
+ }
+ else
+ coldef->colname = pstrdup(tle->resname);
+
+ coldef->inhcount = 0;
+ coldef->is_local = true;
+ coldef->is_not_null = false;
+ coldef->raw_default = NULL;
+ coldef->cooked_default = NULL;
+ coldef->constraints = NIL;
+
+ /*
+ * Set typeOid and typemod. The name of the type is derived while
+ * generating query
+ */
+ typename->typeOid = exprType((Node *)tle->expr);
+ typename->typemod = exprTypmod((Node *)tle->expr);
+
+ coldef->typeName = typename;
+
+ tableElts = lappend(tableElts, coldef);
+ }
+
+ if (lc != NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("CREATE TABLE AS specifies too many column names")));
+
+ /*
+ * Set column information and the distribution mechanism (which will be
+ * NULL for SELECT INTO and the default mechanism will be picked)
+ */
+ create_stmt->tableElts = tableElts;
+ create_stmt->distributeby = stmt->into->distributeby;
+ create_stmt->subcluster = stmt->into->subcluster;
+
+ create_stmt->tablespacename = stmt->into->tableSpaceName;
+ create_stmt->oncommit = stmt->into->onCommit;
+ create_stmt->options = stmt->into->options;
+
+ /*
+ * Check consistency of arguments
+ */
+ if (create_stmt->oncommit != ONCOMMIT_NOOP
+ && create_stmt->relation->relpersistence != RELPERSISTENCE_TEMP)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_TABLE_DEFINITION),
+ errmsg("ON COMMIT can only be used on temporary tables")));
+
+ /* Get a copy of the parsetree which we can freely modify */
+ cparsetree = copyObject(parsetree);
+
+ /*
+ * Now build a utility statement in order to run the CREATE TABLE DDL on
+ * the local and remote nodes. We keep others fields as it is since they
+ * are ignored anyways by deparse_query.
+ */
+ cparsetree->commandType = CMD_UTILITY;
+ cparsetree->utilityStmt = (Node *) create_stmt;
+
+ initStringInfo(&cquery);
+ deparse_query(cparsetree, &cquery, NIL, false, false);
+
++
++ /* finally, wrap it in a dummy PlannedStmt */
++ wrapper = makeNode(PlannedStmt);
++ wrapper->commandType = CMD_UTILITY;
++ wrapper->canSetTag = false;
++ wrapper->utilityStmt = (Node *) create_stmt;
++ wrapper->stmt_location = -1;
++ wrapper->stmt_len = -1;
++
+ /* Finally, fire off the query to run the DDL */
- NULL, 0);
++ ProcessUtility(wrapper, cquery.data, PROCESS_UTILITY_QUERY,
++ NULL, NULL, NULL, false, NULL);
+
+ /*
+ * Now fold the CTAS statement into an INSERT INTO statement. The
+ * utility is no more required.
+ */
+ parsetree->utilityStmt = NULL;
+
+ /* Get the SELECT query string */
+ initStringInfo(&cquery);
+ deparse_query((Query *)stmt->query, &cquery, NIL, false, false);
+ selectstr = pstrdup(cquery.data);
+
+ /* Now, finally build the INSERT INTO statement */
+ initStringInfo(&cquery);
+
+ appendStringInfo(&cquery, "INSERT INTO %s.%s",
+ quote_identifier(get_namespace_name(RangeVarGetCreationNamespace(relation))),
+ quote_identifier(relation->relname));
+
+ appendStringInfo(&cquery, " %s %s", selectstr,
+ into->skipData ? "LIMIT 0" : "");
+
+ raw_parsetree_list = pg_parse_query(cquery.data);
+ return pg_analyze_and_rewrite(linitial(raw_parsetree_list), cquery.data,
++ NULL, 0, NULL);
+}
+#endif
* bufmgr.c
* buffer manager interface routines
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* ipci.c
* POSTGRES inter-process communication initialization code.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
#include "storage/procsignal.h"
#include "storage/sinvaladt.h"
#include "storage/spin.h"
+#ifdef XCP
+#include "pgxc/pgxc.h"
+#include "pgxc/squeue.h"
+#include "pgxc/pause.h"
+#endif
+ #include "utils/backend_random.h"
#include "utils/snapmgr.h"
-
shmem_startup_hook_type shmem_startup_hook = NULL;
static Size total_addin_request = 0;
size = add_size(size, ReplicationOriginShmemSize());
size = add_size(size, WalSndShmemSize());
size = add_size(size, WalRcvShmemSize());
+#ifdef XCP
+ if (IS_PGXC_DATANODE)
+ size = add_size(size, SharedQueueShmemSize());
+ if (IS_PGXC_COORDINATOR)
+ size = add_size(size, ClusterLockShmemSize());
+ size = add_size(size, ClusterMonitorShmemSize());
+#endif
+ size = add_size(size, ApplyLauncherShmemSize());
size = add_size(size, SnapMgrShmemSize());
size = add_size(size, BTreeShmemSize());
size = add_size(size, SyncScanShmemSize());
size = add_size(size, AsyncShmemSize());
+#ifdef PGXC
+ size = add_size(size, NodeTablesShmemSize());
+#endif
+
+ size = add_size(size, BackendRandomShmemSize());
#ifdef EXEC_BACKEND
size = add_size(size, ShmemBackendArraySize());
#endif
ReplicationOriginShmemInit();
WalSndShmemInit();
WalRcvShmemInit();
+ ApplyLauncherShmemInit();
+#ifdef XCP
+ /*
+ * Set up distributed executor's shared queues
+ */
+ if (IS_PGXC_DATANODE)
+ SharedQueuesInit();
+ if (IS_PGXC_COORDINATOR)
+ ClusterLockShmemInit();
+ ClusterMonitorShmemInit();
+#endif
+
/*
* Set up other modules that need some shared memory space
*/
BTreeShmemInit();
SyncScanShmemInit();
AsyncShmemInit();
+ BackendRandomShmemInit();
+#ifdef PGXC
+ NodeTablesShmemInit();
+#endif
+
+
#ifdef EXEC_BACKEND
/*
* happen, it would tie up KnownAssignedXids indefinitely, so we protect
* ourselves by pruning the array when a valid list of running XIDs arrives.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
*
* IDENTIFICATION
#include "access/xlog.h"
#include "catalog/catalog.h"
#include "miscadmin.h"
+#include "postmaster/clustermon.h"
+ #include "pgstat.h"
#include "storage/proc.h"
#include "storage/procarray.h"
#include "storage/spin.h"
* GetOldestXmin() move backwards, with no consequences for data integrity.
*/
TransactionId
- GetOldestXmin(Relation rel, bool ignoreVacuum)
+ GetOldestXmin(Relation rel, int flags)
+{
- return GetOldestXminInternal(rel, ignoreVacuum, false,
++ return GetOldestXminInternal(rel, flags, false,
+ InvalidTransactionId);
+}
+
+/*
+ * This implements most of the logic that GetOldestXmin needs. In XL, we don't
+ * actually compute OldestXmin unless specifically told to do so by the
+ * computeLocal argument being set to true, which GetOldestXmin never does.
+ * So we just return the
+ * value from the shared memory. The OldestXmin itself is always computed by
+ * the Cluster Monitor process by sending local state information to the GTM,
+ * which then aggregates information from all the nodes and gives out final
+ * OldestXmin or GlobalXmin which is consistent across the entire cluster.
+ *
+ * In addition, Cluster Monitor also passes the last reported xmin (or the one
+ * sent back by GTM in case we were idle) and the last received GlobalXmin. We
+ * must ensure that we don't see an XID or xmin which is beyond these horizons.
+ * Otherwise it signals problems with the GlobalXmin calculation. This can
+ * happen because of network disconnects or extreme load on the machine
+ * (unlikely). In any case, we must restart ourselves to avoid any data
+ * consistency problem. A more careful approach could involve killing only
+ * those backends which are running with old xid or xmin. We can consider
+ * implementing it that way in future
+ */
+TransactionId
- GetOldestXminInternal(Relation rel, bool ignoreVacuum, bool computeLocal,
++GetOldestXminInternal(Relation rel, int flags, bool computeLocal,
+ TransactionId lastGlobalXmin)
{
ProcArrayStruct *arrayP = procArray;
TransactionId result;
* Routines for interprocess signalling
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* lock.c
* POSTGRES primary lock mechanism
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* appropriate value for a free lock. The meaning of the variable is up to
* the caller, the lightweight lock code just assigns and compares it.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
#include "storage/ipc.h"
#include "storage/predicate.h"
#include "storage/proc.h"
+ #include "storage/proclist.h"
#include "storage/spin.h"
+#ifdef XCP
+#include "pgxc/nodemgr.h"
+#include "pgxc/squeue.h"
+#endif
#include "utils/memutils.h"
#ifdef LWLOCK_STATS
if (LWLockTrancheArray == NULL)
{
- LWLockTranchesAllocated = 32;
- LWLockTrancheArray = (LWLockTranche **)
- LWLockTranchesAllocated = 64;
++ LWLockTranchesAllocated = 128; /* XXX PG10MERGE: Not sure why 64 is
++ hardcoded in the PG10 branch. That
++ causes assertion failure */
+ LWLockTrancheArray = (char **)
MemoryContextAllocZero(TopMemoryContext,
- LWLockTranchesAllocated * sizeof(LWLockTranche *));
+ LWLockTranchesAllocated * sizeof(char *));
Assert(LWLockTranchesAllocated >= LWTRANCHE_FIRST_USER_DEFINED);
}
ReplicationOriginLock 40
MultiXactTruncationLock 41
OldSnapshotTimeMapLock 42
-BackendRandomLock 43
-LogicalRepWorkerLock 44
-CLogTruncationLock 45
+BarrierLock 43
+NodeTableLock 44
+SQueuesLock 45
+ClusterMonitorLock 46
++BackendRandomLock 47
++LogicalRepWorkerLock 48
++CLogTruncationLock 49
* proc.c
* routines to manage per-process shared memory data structure
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
#include "access/twophase.h"
#include "access/xact.h"
#include "miscadmin.h"
+ #include "pgstat.h"
#include "postmaster/autovacuum.h"
+#ifdef PGXC
+#include "pgxc/pgxc.h"
+#include "pgxc/poolmgr.h"
+#endif
#include "replication/slot.h"
#include "replication/syncrep.h"
+ #include "storage/condition_variable.h"
#include "storage/standby.h"
#include "storage/ipc.h"
#include "storage/lmgr.h"
MyProc->backendId = InvalidBackendId;
MyProc->databaseId = InvalidOid;
MyProc->roleId = InvalidOid;
+#ifdef XCP
+ MyProc->coordId = InvalidOid;
+ MyProc->coordPid = 0;
+#endif
+ MyProc->isBackgroundWorker = IsBackgroundWorker;
MyPgXact->delayChkpt = false;
MyPgXact->vacuumFlags = 0;
+#ifdef PGXC
+ MyProc->isPooler = false;
+#endif
/* NB -- autovac launcher intentionally does not set IS_AUTOVACUUM */
if (IsAutoVacuumWorkerProcess())
MyPgXact->vacuumFlags |= PROC_IS_AUTOVACUUM;
MyProc->backendId = InvalidBackendId;
MyProc->databaseId = InvalidOid;
MyProc->roleId = InvalidOid;
+#ifdef XCP
+ MyProc->coordId = InvalidOid;
+ MyProc->coordPid = 0;
+#endif
+#ifdef PGXC
+ MyProc->isPooler = false;
+ if (IsPGXCPoolerProcess())
+ MyProc->isPooler = true;
+#endif
+ MyProc->isBackgroundWorker = IsBackgroundWorker;
MyPgXact->delayChkpt = false;
MyPgXact->vacuumFlags = 0;
MyProc->lwWaiting = false;
* support for communication destinations
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* postgres.c
* POSTGRES C Backend Interface
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
*
* IDENTIFICATION
/*
* Run through the raw parsetree(s) and process each one.
*/
- foreach(parsetree_item, parsetree_list)
+ forboth(parsetree_item, parsetree_list, querysource_item, querysource_list)
{
- Node *parsetree = (Node *) lfirst(parsetree_item);
+ RawStmt *parsetree = lfirst_node(RawStmt, parsetree_item);
+ char *querysource = (char *) lfirst(querysource_item);
bool snapshot_set = false;
const char *commandTag;
char completionTag[COMPLETION_TAG_BUFSIZE];
appendStringInfoString(&str, "! system usage stats:\n");
appendStringInfo(&str,
- "!\t%ld.%06ld elapsed %ld.%06ld user %ld.%06ld system sec\n",
- (long) (elapse_t.tv_sec - save_t->tv_sec),
- (long) (elapse_t.tv_usec - save_t->tv_usec),
+ "!\t%ld.%06ld s user, %ld.%06ld s system, %ld.%06ld s elapsed\n",
- (long) (r.ru_utime.tv_sec - Save_r.ru_utime.tv_sec),
- (long) (r.ru_utime.tv_usec - Save_r.ru_utime.tv_usec),
- (long) (r.ru_stime.tv_sec - Save_r.ru_stime.tv_sec),
- (long) (r.ru_stime.tv_usec - Save_r.ru_stime.tv_usec),
- (long) (elapse_t.tv_sec - Save_t.tv_sec),
- (long) (elapse_t.tv_usec - Save_t.tv_usec));
+ (long) (r.ru_utime.tv_sec - save_r->ru_utime.tv_sec),
+ (long) (r.ru_utime.tv_usec - save_r->ru_utime.tv_usec),
+ (long) (r.ru_stime.tv_sec - save_r->ru_stime.tv_sec),
- (long) (r.ru_stime.tv_usec - save_r->ru_stime.tv_usec));
++ (long) (r.ru_stime.tv_usec - save_r->ru_stime.tv_usec),
++ (long) (elapse_t.tv_sec - save_t->tv_sec),
++ (long) (elapse_t.tv_usec - save_t->tv_usec));
appendStringInfo(&str,
- "!\t[%ld.%06ld user %ld.%06ld sys total]\n",
+ "!\t[%ld.%06ld s user, %ld.%06ld s system total]\n",
(long) user.tv_sec,
(long) user.tv_usec,
(long) sys.tv_sec,
* pquery.c
* POSTGRES process query command code
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
qd->planstate = NULL;
qd->totaltime = NULL;
- return qd;
- }
-
- /*
- * CreateUtilityQueryDesc
- */
- QueryDesc *
- CreateUtilityQueryDesc(Node *utilitystmt,
- const char *sourceText,
- Snapshot snapshot,
- DestReceiver *dest,
- ParamListInfo params)
- {
- QueryDesc *qd = (QueryDesc *) palloc(sizeof(QueryDesc));
-
- qd->operation = CMD_UTILITY; /* operation */
- qd->plannedstmt = NULL;
- qd->utilitystmt = utilitystmt; /* utility command */
- qd->sourceText = sourceText; /* query text */
- qd->snapshot = RegisterSnapshot(snapshot); /* snapshot */
- qd->crosscheck_snapshot = InvalidSnapshot; /* RI check snapshot */
- qd->dest = dest; /* output dest */
- qd->params = params; /* parameter values passed into query */
- qd->instrument_options = false; /* uninteresting for utilities */
-
- /* null these fields until set by ExecutorStart */
- qd->tupDesc = NULL;
- qd->estate = NULL;
- qd->planstate = NULL;
- qd->totaltime = NULL;
+#ifdef XCP
+ qd->squeue = NULL;
+ qd->myindex = -1;
+#endif
+
+ /* not yet executed */
+ qd->already_executed = false;
return qd;
}
{
PlannedStmt *pstmt = (PlannedStmt *) stmt;
+#ifdef XCP
+ if (list_length(pstmt->distributionRestrict) > 1)
+ return PORTAL_DISTRIBUTED;
+#endif
+
if (pstmt->canSetTag)
{
- if (pstmt->commandType == CMD_SELECT &&
- pstmt->utilityStmt == NULL)
+ if (pstmt->commandType == CMD_SELECT)
{
if (pstmt->hasModifyingCTE)
return PORTAL_ONE_MOD_WITH;
*/
switch (portal->strategy)
{
+#ifdef XCP
+ case PORTAL_DISTRIBUTED:
+ /* No special ability is needed */
+ eflags = 0;
+ /* Must set snapshot before starting executor. */
+ if (snapshot)
+ PushActiveSnapshot(GetActiveSnapshot());
+ else
+ PushActiveSnapshot(GetTransactionSnapshot());
+
+ /*
+ * Create QueryDesc in portal's context; for the moment, set
+ * the destination to DestNone.
+ */
+ queryDesc = CreateQueryDesc((PlannedStmt *) linitial(portal->stmts),
+ portal->sourceText,
+ GetActiveSnapshot(),
+ InvalidSnapshot,
+ None_Receiver,
+ params,
++ NULL,
+ 0);
+ /*
+ * If parent node have sent down parameters, and at least one
+ * of them is PARAM_EXEC we should avoid "single execution"
+ * model. All parent nodes deliver the same values for
+ * PARAM_EXTERN since these values are provided by client and
+ * they are not changed during the query execution.
+ * On the contrary, values of PARAM_EXEC are results of execution
+ * on the parent node and in general different parents send to
+ * this node different values and executions are not equivalent.
+ * Since PARAM_EXECs are always at the end of the list we just
+ * need to check last item to figure out if there are any
+ * PARAM_EXECs.
+ * NB: Check queryDesc->plannedstmt->nParamExec > 0 is incorrect
+ * here since queryDesc->plannedstmt->nParamExec may be used
+ * just to allocate space for them and no actual values passed.
+ */
+ if (queryDesc->plannedstmt->nParamRemote > 0 &&
+ queryDesc->plannedstmt->remoteparams[queryDesc->plannedstmt->nParamRemote-1].paramkind == PARAM_EXEC)
+ {
+ int *consMap;
+ int len;
+ ListCell *lc;
+ int i;
+ Locator *locator;
+ Oid keytype;
+ DestReceiver *dest;
+
+ len = list_length(queryDesc->plannedstmt->distributionNodes);
+ consMap = (int *) palloc0(len * sizeof(int));
+ queryDesc->squeue = NULL;
+ queryDesc->myindex = -1;
+ PGXC_PARENT_NODE_ID = PGXCNodeGetNodeIdFromName(PGXC_PARENT_NODE,
+ &PGXC_PARENT_NODE_TYPE);
+ i = 0;
+ foreach(lc, queryDesc->plannedstmt->distributionNodes)
+ {
+ if (PGXC_PARENT_NODE_ID == lfirst_int(lc))
+ consMap[i] = SQ_CONS_SELF;
+ else
+ consMap[i] = SQ_CONS_NONE;
+ i++;
+ }
+ /*
+ * Multiple executions of the RemoteSubplan may lead to name
+ * conflict of SharedQueue, if the subplan has more
+ * RemoteSubplan nodes in the execution plan tree.
+ * We need to make them unique.
+ */
+ RemoteSubplanMakeUnique(
+ (Node *) queryDesc->plannedstmt->planTree,
+ PGXC_PARENT_NODE_ID);
+ /*
+ * Call ExecutorStart to prepare the plan for execution
+ */
+ ExecutorStart(queryDesc, eflags);
+
+ /*
+ * Set up locator if result distribution is requested
+ */
+ keytype = queryDesc->plannedstmt->distributionKey == InvalidAttrNumber ?
+ InvalidOid :
+ queryDesc->tupDesc->attrs[queryDesc->plannedstmt->distributionKey-1]->atttypid;
+ locator = createLocator(
+ queryDesc->plannedstmt->distributionType,
+ RELATION_ACCESS_INSERT,
+ keytype,
+ LOCATOR_LIST_INT,
+ len,
+ consMap,
+ NULL,
+ false);
+ dest = CreateDestReceiver(DestProducer);
+ SetProducerDestReceiverParams(dest,
+ queryDesc->plannedstmt->distributionKey,
+ locator, queryDesc->squeue);
+ queryDesc->dest = dest;
+ }
+ else
+ {
+ int *consMap;
+ int len;
+
+ /* Distributed data requested, bind shared queue for data exchange */
+ len = list_length(queryDesc->plannedstmt->distributionNodes);
+ consMap = (int *) palloc(len * sizeof(int));
+ queryDesc->squeue = SharedQueueBind(portal->name,
+ queryDesc->plannedstmt->distributionRestrict,
+ queryDesc->plannedstmt->distributionNodes,
+ &queryDesc->myindex, consMap);
+ if (queryDesc->myindex == -1)
+ {
+ /* producer */
+ Locator *locator;
+ Oid keytype;
+ DestReceiver *dest;
+
+ PG_TRY();
+ {
+ /*
+ * Call ExecutorStart to prepare the plan for execution
+ */
+ ExecutorStart(queryDesc, eflags);
+ }
+ PG_CATCH();
+ {
+ /* Ensure SharedQueue is released */
+ SharedQueueUnBind(queryDesc->squeue, true);
+ queryDesc->squeue = NULL;
+ PG_RE_THROW();
+ }
+ PG_END_TRY();
+
+ /*
+ * This tells PortalCleanup to shut down the executor
+ */
+ portal->queryDesc = queryDesc;
+
+ /*
+ * Some basic sanity checking against invalid remote plans.
+ */
+ Assert((queryDesc->plannedstmt->distributionKey == InvalidAttrNumber) ||
+ (queryDesc->plannedstmt->distributionKey <= queryDesc->tupDesc->natts));
+
+ /*
+ * Set up locator if result distribution is requested
+ */
+ keytype = queryDesc->plannedstmt->distributionKey == InvalidAttrNumber ?
+ InvalidOid :
+ queryDesc->tupDesc->attrs[queryDesc->plannedstmt->distributionKey-1]->atttypid;
+ locator = createLocator(
+ queryDesc->plannedstmt->distributionType,
+ RELATION_ACCESS_INSERT,
+ keytype,
+ LOCATOR_LIST_INT,
+ len,
+ consMap,
+ NULL,
+ false);
+ dest = CreateDestReceiver(DestProducer);
+ SetProducerDestReceiverParams(dest,
+ queryDesc->plannedstmt->distributionKey,
+ locator, queryDesc->squeue);
+ queryDesc->dest = dest;
+
+ addProducingPortal(portal);
+ }
+ else
+ {
+ /*
+ * We do not need to initialize executor, but need
+ * a tuple descriptor
+ */
+ queryDesc->tupDesc = ExecCleanTypeFromTL(
+ queryDesc->plannedstmt->planTree->targetlist,
+ false);
+ }
+ pfree(consMap);
+ }
+
+ portal->queryDesc = queryDesc;
+
+ /*
+ * Remember tuple descriptor (computed by ExecutorStart)
+ */
+ portal->tupDesc = queryDesc->tupDesc;
+
+ /*
+ * Reset cursor position data to "start of query"
+ */
+ portal->atStart = true;
+ portal->atEnd = false; /* allow fetches */
+ portal->portalPos = 0;
+
+ PopActiveSnapshot();
+ break;
+#endif
+
case PORTAL_ONE_SELECT:
/* Must set snapshot before starting executor. */
portal->sourceText,
isTopLevel ? PROCESS_UTILITY_TOPLEVEL : PROCESS_UTILITY_QUERY,
portal->portalParams,
+ portal->queryEnv,
dest,
+#ifdef PGXC
+ false,
+#endif /* PGXC */
completionTag);
/* Some utility statements may change context on us */
ProcessQuery(pstmt,
portal->sourceText,
portal->portalParams,
+ portal->queryEnv,
dest, completionTag);
+#ifdef PGXC
+ /* it's special for INSERT */
+ if (IS_PGXC_COORDINATOR &&
+ pstmt->commandType == CMD_INSERT)
+ HandleCmdComplete(pstmt->commandType, &combine,
+ completionTag, strlen(completionTag));
+#endif
}
else
{
portal->atEnd = false;
portal->portalPos = 0;
}
- ExecutorRun(queryDesc, ForwardScanDirection, PRODUCE_TUPLES);
+
+#ifdef XCP
+/*
+ * AdvanceProducingPortal
+ *		Execute the specified portal's query and distribute tuples to
+ *		consumers.
+ *
+ * portal   - the producing portal to advance
+ * can_wait - if true, a fully completed portal may be torn down here
+ *
+ * Returns 1 if the portal should keep producing, 0 if all consumers have
+ * enough rows in their buffers to pause producing temporarily, -1 if the
+ * query is completed.
+ */
+int
+AdvanceProducingPortal(Portal portal, bool can_wait)
+{
+	Portal		saveActivePortal;
+	ResourceOwner saveResourceOwner;
+	MemoryContext savePortalContext;
+	MemoryContext oldContext;
+	QueryDesc  *queryDesc;
+	SharedQueue squeue;
+	DestReceiver *treceiver;
+	int			result;
+
+	queryDesc = PortalGetQueryDesc(portal);
+	/* Check the pointer before dereferencing it */
+	Assert(queryDesc);
+	squeue = queryDesc->squeue;
+
+	/* Make sure the portal is producing */
+	Assert(squeue && queryDesc->myindex == -1);
+	/* Make sure there is proper receiver */
+	Assert(queryDesc->dest && queryDesc->dest->mydest == DestProducer);
+
+	/*
+	 * Set up global portal context pointers.
+	 */
+	saveActivePortal = ActivePortal;
+	saveResourceOwner = CurrentResourceOwner;
+	savePortalContext = PortalContext;
+	PG_TRY();
+	{
+		ActivePortal = portal;
+		CurrentResourceOwner = portal->resowner;
+		PortalContext = PortalGetHeapMemory(portal);
+
+		oldContext = MemoryContextSwitchTo(PortalGetHeapMemory(portal));
+
+		/*
+		 * This is the first pass through if the hold store is not
+		 * initialized yet; need to initialize stuff.
+		 */
+		if (portal->holdStore == NULL && portal->status != PORTAL_FAILED)
+		{
+			int			idx;
+			char		storename[64];
+
+			PortalCreateProducerStore(portal);
+			treceiver = CreateDestReceiver(DestTuplestore);
+			SetTuplestoreDestReceiverParams(treceiver,
+											portal->holdStore,
+											portal->holdContext,
+											false);
+			SetSelfConsumerDestReceiver(queryDesc->dest, treceiver);
+			SetProducerTempMemory(queryDesc->dest, portal->tmpContext);
+			snprintf(storename, 64, "%s producer store", portal->name);
+			tuplestore_collect_stat(portal->holdStore, storename);
+
+			/*
+			 * Tuplestore does not clear the eof flag on the active read
+			 * pointer, causing the store to stay in EOF state once reached
+			 * when there is a single read pointer.  We do not want behavior
+			 * like this and work around it by using a secondary read
+			 * pointer.  Primary read pointer (0) is active when we are
+			 * writing to the tuple store, secondary read pointer is for
+			 * reading, and its eof flag is cleared if a tuple is written to
+			 * the store.  We know the extra read pointer has index 1, so do
+			 * not store it.
+			 */
+			idx = tuplestore_alloc_read_pointer(portal->holdStore, 0);
+			Assert(idx == 1);
+		}
+
+		if (queryDesc->estate && !queryDesc->estate->es_finished &&
+			portal->status != PORTAL_FAILED)
+		{
+			/*
+			 * If the portal's hold store has tuples available for read and
+			 * all consumer queues are not empty we skip advancing the portal
+			 * (pause it) to prevent buffering too many rows at the producer.
+			 * NB just created portal store would not be in EOF state, but in
+			 * this case consumer queues will be empty and do not allow
+			 * erroneous pause. After the first call to AdvanceProducingPortal
+			 * portal will try to read the hold store and EOF flag will be set
+			 * correctly.
+			 */
+			tuplestore_select_read_pointer(portal->holdStore, 1);
+			if (!tuplestore_ateof(portal->holdStore) &&
+				SharedQueueCanPause(squeue))
+				result = 0;
+			else
+				result = 1;
+			tuplestore_select_read_pointer(portal->holdStore, 0);
+
+			if (result)
+			{
+				/* Execute query and dispatch tuples via dest receiver */
+#define PRODUCE_TUPLES	100
+				PushActiveSnapshot(queryDesc->snapshot);
++				ExecutorRun(queryDesc, ForwardScanDirection, PRODUCE_TUPLES, true);
+				PopActiveSnapshot();
+
+				if (queryDesc->estate->es_processed < PRODUCE_TUPLES)
+				{
+					/*
+					 * Finish the executor, but we may still have some tuples
+					 * in the local storages.  We should keep trying to push
+					 * them into the squeue, so do not remove the portal from
+					 * the list of producers.
+					 */
+					ExecutorFinish(queryDesc);
+				}
+			}
+		}
+
+		/* Try to dump local tuplestores */
+		if ((queryDesc->estate == NULL || queryDesc->estate->es_finished) &&
+			ProducerReceiverPushBuffers(queryDesc->dest))
+		{
+			if (can_wait && queryDesc->estate == NULL)
+			{
+				/* Fully done: tear down the producer and the portal itself */
+				(*queryDesc->dest->rDestroy) (queryDesc->dest);
+				queryDesc->dest = NULL;
+				portal->queryDesc = NULL;
+				squeue = NULL;
+
+				removeProducingPortal(portal);
+				FreeQueryDesc(queryDesc);
+
+				/*
+				 * Current context is the portal context, which is going
+				 * to be deleted
+				 */
+				MemoryContextSwitchTo(TopTransactionContext);
+
+				ActivePortal = saveActivePortal;
+				CurrentResourceOwner = saveResourceOwner;
+				PortalContext = savePortalContext;
+
+				if (portal->resowner)
+				{
+					bool		isCommit = (portal->status != PORTAL_FAILED);
+
+					ResourceOwnerRelease(portal->resowner,
+										 RESOURCE_RELEASE_BEFORE_LOCKS,
+										 isCommit, false);
+					ResourceOwnerRelease(portal->resowner,
+										 RESOURCE_RELEASE_LOCKS,
+										 isCommit, false);
+					ResourceOwnerRelease(portal->resowner,
+										 RESOURCE_RELEASE_AFTER_LOCKS,
+										 isCommit, false);
+					ResourceOwnerDelete(portal->resowner);
+				}
+				portal->resowner = NULL;
+
+				/*
+				 * Delete tuplestore if present. We should do this even under
+				 * error conditions; since the tuplestore would have been
+				 * using cross-transaction storage, its temp files need to be
+				 * explicitly deleted.
+				 */
+				if (portal->holdStore)
+				{
+					MemoryContext oldcontext;
+
+					oldcontext = MemoryContextSwitchTo(portal->holdContext);
+					tuplestore_end(portal->holdStore);
+					MemoryContextSwitchTo(oldcontext);
+					portal->holdStore = NULL;
+				}
+
+				/* delete tuplestore storage, if any */
+				if (portal->holdContext)
+					MemoryContextDelete(portal->holdContext);
+
+				/* release subsidiary storage */
+				MemoryContextDelete(PortalGetHeapMemory(portal));
+
+				/* release portal struct (it's in PortalMemory) */
+				pfree(portal);
+			}
+			/* report portal is not producing */
+			result = -1;
+		}
+		else
+		{
+			result = SharedQueueCanPause(queryDesc->squeue) ? 0 : 1;
+		}
+	}
+	PG_CATCH();
+	{
+		/* Uncaught error while executing portal: mark it dead */
+		portal->status = PORTAL_FAILED;
+
+		/*
+		 * Reset producer to allow consumers to finish, so receiving node
+		 * will handle the error.
+		 */
+		if (squeue)
+			SharedQueueReset(squeue, -1);
+
+		/* Restore global vars and propagate error */
+		ActivePortal = saveActivePortal;
+		CurrentResourceOwner = saveResourceOwner;
+		PortalContext = savePortalContext;
+
+		PG_RE_THROW();
+	}
+	PG_END_TRY();
+
+	MemoryContextSwitchTo(oldContext);
+
+	ActivePortal = saveActivePortal;
+	CurrentResourceOwner = saveResourceOwner;
+	PortalContext = savePortalContext;
+
+	return result;
+}
+
+
+/*
+ * cleanupClosedProducers
+ *		Iterate over the producing portals, find those that are already
+ *		closed, and clean them up, waiting while consumers finish their
+ *		work.  Closed producers should be cleaned up and their resources
+ *		released before proceeding with handling of the next request.
+ */
+void
+cleanupClosedProducers(void)
+{
+	ListCell *lc = list_head(getProducingPortals());
+	while (lc)
+	{
+		Portal p = (Portal) lfirst(lc);
+		/*
+		 * NOTE(review): assumes every portal on the producing list has a
+		 * non-NULL queryDesc — confirm addProducingPortal guarantees that.
+		 */
+		QueryDesc *queryDesc = PortalGetQueryDesc(p);
+		SharedQueue squeue = queryDesc->squeue;
+
+		/*
+		 * Get next already, because next call may remove cell from
+		 * the list and invalidate next reference
+		 */
+		lc = lnext(lc);
+
+		/* When portal is closed executor state is not set */
+		if (queryDesc->estate == NULL)
+		{
+			/*
+			 * Set up global portal context pointers.
+			 */
+			Portal saveActivePortal = ActivePortal;
+			ResourceOwner saveResourceOwner = CurrentResourceOwner;
+			MemoryContext savePortalContext = PortalContext;
+
+			PG_TRY();
+			{
+				MemoryContext oldContext;
+				ActivePortal = p;
+				CurrentResourceOwner = p->resowner;
+				PortalContext = PortalGetHeapMemory(p);
+
+				oldContext = MemoryContextSwitchTo(PortalGetHeapMemory(p));
+
+				/* Destroy the producer receiver and unlink the query desc */
+				(*queryDesc->dest->rDestroy) (queryDesc->dest);
+				queryDesc->dest = NULL;
+				p->queryDesc = NULL;
+				squeue = NULL;
+
+				removeProducingPortal(p);
+				FreeQueryDesc(queryDesc);
+
+				/*
+				 * Current context is the portal context, which is going
+				 * to be deleted
+				 */
+				MemoryContextSwitchTo(TopTransactionContext);
+
+				ActivePortal = saveActivePortal;
+				CurrentResourceOwner = saveResourceOwner;
+				PortalContext = savePortalContext;
+
+				/* Release everything the portal's resource owner holds */
+				if (p->resowner)
+				{
+					bool isCommit = (p->status != PORTAL_FAILED);
+
+					ResourceOwnerRelease(p->resowner,
+										 RESOURCE_RELEASE_BEFORE_LOCKS,
+										 isCommit, false);
+					ResourceOwnerRelease(p->resowner,
+										 RESOURCE_RELEASE_LOCKS,
+										 isCommit, false);
+					ResourceOwnerRelease(p->resowner,
+										 RESOURCE_RELEASE_AFTER_LOCKS,
+										 isCommit, false);
+					ResourceOwnerDelete(p->resowner);
+				}
+				p->resowner = NULL;
+
+				/*
+				 * Delete tuplestore if present. We should do this even under error
+				 * conditions; since the tuplestore would have been using cross-
+				 * transaction storage, its temp files need to be explicitly deleted.
+				 */
+				if (p->holdStore)
+				{
+					MemoryContext oldcontext;
+
+					oldcontext = MemoryContextSwitchTo(p->holdContext);
+					tuplestore_end(p->holdStore);
+					MemoryContextSwitchTo(oldcontext);
+					p->holdStore = NULL;
+				}
+
+				/* delete tuplestore storage, if any */
+				if (p->holdContext)
+					MemoryContextDelete(p->holdContext);
+
+				/* release subsidiary storage */
+				MemoryContextDelete(PortalGetHeapMemory(p));
+
+				/* release portal struct (it's in PortalMemory) */
+				pfree(p);
+
+				MemoryContextSwitchTo(oldContext);
+			}
+			PG_CATCH();
+			{
+				/* Uncaught error while executing portal: mark it dead */
+				p->status = PORTAL_FAILED;
+				/*
+				 * Reset producer to allow consumers to finish, so receiving node will
+				 * handle the error.
+				 */
+				if (squeue)
+					SharedQueueReset(squeue, -1);
+
+				/* Restore global vars and propagate error */
+				ActivePortal = saveActivePortal;
+				CurrentResourceOwner = saveResourceOwner;
+				PortalContext = savePortalContext;
+
+				PG_RE_THROW();
+			}
+			PG_END_TRY();
+
+			ActivePortal = saveActivePortal;
+			CurrentResourceOwner = saveResourceOwner;
+			PortalContext = savePortalContext;
+		}
+	}
+}
+#endif
* commands. At one time acted as an interface between the Lisp and C
* systems.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
*
* IDENTIFICATION
const char *queryString,
ProcessUtilityContext context,
ParamListInfo params,
+ QueryEnvironment *queryEnv,
DestReceiver *dest,
+ bool sentToRemote,
char *completionTag);
+
+#ifdef PGXC
+static void ExecDropStmt(DropStmt *stmt,
+ const char *queryString,
+ bool sentToRemote,
+ bool isTopLevel);
+#else
static void ExecDropStmt(DropStmt *stmt, bool isTopLevel);
+#endif
/*
const char *queryString,
ProcessUtilityContext context,
ParamListInfo params,
+ QueryEnvironment *queryEnv,
DestReceiver *dest,
+#ifdef PGXC
+ bool sentToRemote,
+#endif
char *completionTag)
{
+ Assert(IsA(pstmt, PlannedStmt));
+ Assert(pstmt->commandType == CMD_UTILITY);
Assert(queryString != NULL); /* required as of 8.4 */
/*
* call standard_ProcessUtility().
*/
if (ProcessUtility_hook)
- (*ProcessUtility_hook) (parsetree, queryString,
- context, params,
+ (*ProcessUtility_hook) (pstmt, queryString,
+ context, params, queryEnv,
- dest, completionTag);
+ dest,
- #ifdef PGXC
+ sentToRemote,
- #endif
+ completionTag);
else
- standard_ProcessUtility(parsetree, queryString,
- context, params,
+ standard_ProcessUtility(pstmt, queryString,
+ context, params, queryEnv,
- dest, completionTag);
+ dest,
- #ifdef PGXC
+ sentToRemote,
- #endif
+ completionTag);
}
/*
const char *queryString,
ProcessUtilityContext context,
ParamListInfo params,
+ QueryEnvironment *queryEnv,
DestReceiver *dest,
+#ifdef PGXC
+ bool sentToRemote,
+#endif
char *completionTag)
{
+ Node *parsetree = pstmt->utilityStmt;
bool isTopLevel = (context == PROCESS_UTILITY_TOPLEVEL);
+ ParseState *pstate;
+#ifdef PGXC
+ /*
+ * For more detail see comments in function pgxc_lock_for_backup.
+ *
+ * Consider the following scenario:
+ * Imagine a two coordinator cluster CO1, CO2
+ * Suppose a client connected to CO1 issues select pgxc_lock_for_backup()
+ * Now assume that a client connected to CO2 issues a create table
+ * select pgxc_lock_for_backup() would try to acquire the advisory lock
+ * in exclusive mode, whereas create table would try to acquire the same
+ * lock in shared mode. Both these requests will always try to acquire the
+ * lock in the same order i.e. they would both direct the request first to
+ * CO1 and then to CO2. One of the two requests would therefore pass
+ * and the other would fail.
+ *
+ * Consider another scenario:
+ * Suppose we have a two coordinator cluster CO1 and CO2
+ * Assume one client connected to each coordinator
+ * Further assume one client starts a transaction
+ * and issues a DDL. This is an unfinished transaction.
+ * Now assume the second client issues
+ * select pgxc_lock_for_backup()
+ * This request would fail because the unfinished transaction
+ * would already hold the advisory lock.
+ */
+ if (IS_PGXC_LOCAL_COORDINATOR && IsNormalProcessingMode())
+ {
+ /* Is the statement a prohibited one? */
+ if (!IsStmtAllowedInLockedMode(parsetree, queryString))
+ pgxc_lock_for_utility_stmt(parsetree);
+ }
+#endif
check_xact_readonly(parsetree);
case T_CreatedbStmt:
/* no event triggers for global objects */
+#ifdef PGXC
+ if (IS_PGXC_LOCAL_COORDINATOR)
+#endif
PreventTransactionChain(isTopLevel, "CREATE DATABASE");
- createdb((CreatedbStmt *) parsetree);
+ createdb(pstate, (CreatedbStmt *) parsetree);
+#ifdef PGXC
+ if (IS_PGXC_LOCAL_COORDINATOR)
+ ExecUtilityWithMessage(queryString, sentToRemote, false);
+#endif
break;
case T_AlterDatabaseStmt:
/* no event triggers for global objects */
- AlterDatabase((AlterDatabaseStmt *) parsetree, isTopLevel);
+ AlterDatabase(pstate, (AlterDatabaseStmt *) parsetree, isTopLevel);
+#ifdef PGXC
+ if (IS_PGXC_LOCAL_COORDINATOR)
+ {
+ /*
+ * If this is not a SET TABLESPACE statement, just propagate the
+ * cmd as usual.
+ */
+ if (!IsSetTableSpace((AlterDatabaseStmt*) parsetree))
+ ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, false);
+ else
+ ExecUtilityWithMessage(queryString, sentToRemote, false);
+ }
+#endif
break;
case T_AlterDatabaseSetStmt:
*/
case T_CreateRoleStmt:
/* no event triggers for global objects */
- CreateRole((CreateRoleStmt *) parsetree);
+ CreateRole(pstate, (CreateRoleStmt *) parsetree);
+#ifdef PGXC
+ if (IS_PGXC_LOCAL_COORDINATOR)
+ ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, false);
+#endif
break;
case T_AlterRoleStmt:
GrantStmt *stmt = (GrantStmt *) parsetree;
if (EventTriggerSupportsGrantObjectType(stmt->objtype))
- ProcessUtilitySlow(parsetree, queryString,
- context, params,
+ ProcessUtilitySlow(pstate, pstmt, queryString,
+ context, params, queryEnv,
- dest, completionTag);
+ dest,
+ sentToRemote,
+ completionTag);
else
- ExecuteGrantStmt((GrantStmt *) parsetree);
+ ExecuteGrantStmt(stmt);
+#ifdef PGXC
+ if (IS_PGXC_LOCAL_COORDINATOR)
+ {
+ RemoteQueryExecType remoteExecType = EXEC_ON_ALL_NODES;
+ GrantStmt *stmt = (GrantStmt *) parsetree;
+ bool is_temp = false;
+
+ /* Launch GRANT on Coordinator if object is a sequence */
+ if ((stmt->objtype == ACL_OBJECT_RELATION &&
+ stmt->targtype == ACL_TARGET_OBJECT))
+ {
+ /*
+ * In case object is a relation, differentiate the case
+ * of a sequence, a view and a table
+ */
+ ListCell *cell;
+ /* Check the list of objects */
+ bool first = true;
+ RemoteQueryExecType type_local = remoteExecType;
+
+ foreach (cell, stmt->objects)
+ {
+ RangeVar *relvar = (RangeVar *) lfirst(cell);
+ Oid relid = RangeVarGetRelid(relvar, NoLock, true);
+
+ /* Skip if object does not exist */
+ if (!OidIsValid(relid))
+ continue;
+
+ remoteExecType = ExecUtilityFindNodesRelkind(relid, &is_temp);
+
+ /* Check if object node type corresponds to the first one */
+ if (first)
+ {
+ type_local = remoteExecType;
+ first = false;
+ }
+ else
+ {
+ if (type_local != remoteExecType)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("PGXC does not support GRANT on multiple object types"),
+ errdetail("Grant VIEW/TABLE with separate queries")));
+ }
+ }
+ }
+ ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, remoteExecType, is_temp);
+ }
+#endif
}
break;
DropStmt *stmt = (DropStmt *) parsetree;
if (EventTriggerSupportsObjectType(stmt->removeType))
- ProcessUtilitySlow(parsetree, queryString,
- context, params,
+ ProcessUtilitySlow(pstate, pstmt, queryString,
+ context, params, queryEnv,
- dest, completionTag);
+ dest,
+ sentToRemote,
+ completionTag);
else
- ExecDropStmt(stmt, isTopLevel);
+ ExecDropStmt(stmt, queryString, sentToRemote, isTopLevel);
}
break;
RenameStmt *stmt = (RenameStmt *) parsetree;
if (EventTriggerSupportsObjectType(stmt->renameType))
- ProcessUtilitySlow(parsetree, queryString,
- context, params,
+ ProcessUtilitySlow(pstate, pstmt, queryString,
+ context, params, queryEnv,
- dest, completionTag);
+ dest,
+ sentToRemote,
+ completionTag);
else
ExecRenameStmt(stmt);
}
AlterObjectDependsStmt *stmt = (AlterObjectDependsStmt *) parsetree;
if (EventTriggerSupportsObjectType(stmt->objectType))
- ProcessUtilitySlow(parsetree, queryString,
- context, params,
+ ProcessUtilitySlow(pstate, pstmt, queryString,
+ context, params, queryEnv,
- dest, completionTag);
+ dest,
+ sentToRemote,
+ completionTag);
else
ExecAlterObjectDependsStmt(stmt, NULL);
}
AlterObjectSchemaStmt *stmt = (AlterObjectSchemaStmt *) parsetree;
if (EventTriggerSupportsObjectType(stmt->objectType))
- ProcessUtilitySlow(parsetree, queryString,
- context, params,
+ ProcessUtilitySlow(pstate, pstmt, queryString,
+ context, params, queryEnv,
- dest, completionTag);
+ dest,
+ sentToRemote,
+ completionTag);
else
ExecAlterObjectSchemaStmt(stmt, NULL);
}
AlterOwnerStmt *stmt = (AlterOwnerStmt *) parsetree;
if (EventTriggerSupportsObjectType(stmt->objectType))
- ProcessUtilitySlow(parsetree, queryString,
- context, params,
+ ProcessUtilitySlow(pstate, pstmt, queryString,
+ context, params, queryEnv,
- dest, completionTag);
+ dest,
+ sentToRemote,
+ completionTag);
else
ExecAlterOwnerStmt(stmt);
+
+#ifdef PGXC
+ if (IS_PGXC_LOCAL_COORDINATOR)
+ ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, false);
+#endif
}
break;
CommentStmt *stmt = (CommentStmt *) parsetree;
if (EventTriggerSupportsObjectType(stmt->objtype))
- ProcessUtilitySlow(parsetree, queryString,
- context, params,
+ ProcessUtilitySlow(pstate, pstmt, queryString,
+ context, params, queryEnv,
- dest, completionTag);
+ dest,
+ sentToRemote,
+ completionTag);
else
- CommentObject((CommentStmt *) parsetree);
+ CommentObject(stmt);
+ break;
}
+#ifdef PGXC
+ {
+ /* Comment objects depending on their object and temporary types */
+ if (IS_PGXC_LOCAL_COORDINATOR)
+ {
+ bool is_temp = false;
+ CommentStmt *stmt = (CommentStmt *) parsetree;
+ RemoteQueryExecType exec_type = GetNodesForCommentUtility(stmt, &is_temp);
+ ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, exec_type, is_temp);
+ }
+ }
+#endif
+ break;
case T_SecLabelStmt:
{
SecLabelStmt *stmt = (SecLabelStmt *) parsetree;
if (EventTriggerSupportsObjectType(stmt->objtype))
- ProcessUtilitySlow(parsetree, queryString,
- context, params,
+ ProcessUtilitySlow(pstate, pstmt, queryString,
+ context, params, queryEnv,
- dest, completionTag);
+ dest,
+ sentToRemote,
+ completionTag);
else
ExecSecLabelStmt(stmt);
break;
default:
/* All other statement types have event trigger support */
- ProcessUtilitySlow(parsetree, queryString,
- context, params,
+ ProcessUtilitySlow(pstate, pstmt, queryString,
+ context, params, queryEnv,
- dest, completionTag);
+ dest,
+ sentToRemote,
+ completionTag);
break;
}
+
+ free_parsestate(pstate);
}
/*
const char *queryString,
ProcessUtilityContext context,
ParamListInfo params,
+ QueryEnvironment *queryEnv,
DestReceiver *dest,
+ bool sentToRemote,
char *completionTag)
{
+ Node *parsetree = pstmt->utilityStmt;
bool isTopLevel = (context == PROCESS_UTILITY_TOPLEVEL);
bool isCompleteQuery = (context <= PROCESS_UTILITY_QUERY);
bool needCleanup;
* relation and attribute manipulation
*/
case T_CreateSchemaStmt:
- #ifdef PGXC
- CreateSchemaCommand((CreateSchemaStmt *) parsetree,
- queryString, sentToRemote);
- #else
CreateSchemaCommand((CreateSchemaStmt *) parsetree,
- queryString);
- #endif
- queryString,
++ queryString, sentToRemote,
+ pstmt->stmt_location,
+ pstmt->stmt_len);
/*
* EventTriggerCollectSimpleCommand called by
queryString,
PROCESS_UTILITY_SUBCOMMAND,
params,
+ NULL,
None_Receiver,
+#ifdef PGXC
+ true,
+#endif
NULL);
}
queryString,
PROCESS_UTILITY_SUBCOMMAND,
params,
+ NULL,
None_Receiver,
+#ifdef PGXC
+ true,
+#endif /* PGXC */
NULL);
EventTriggerAlterTableStart(parsetree);
EventTriggerAlterTableRelid(relid);
break;
case T_CreateExtensionStmt:
- address = CreateExtension((CreateExtensionStmt *) parsetree);
+ address = CreateExtension(pstate, (CreateExtensionStmt *) parsetree);
+#ifdef PGXC
+ if (IS_PGXC_LOCAL_COORDINATOR)
+ ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, false);
+#endif
break;
case T_AlterExtensionStmt:
- address = ExecAlterExtensionStmt((AlterExtensionStmt *) parsetree);
+ address = ExecAlterExtensionStmt(pstate, (AlterExtensionStmt *) parsetree);
+#ifdef PGXC
+ if (IS_PGXC_LOCAL_COORDINATOR)
+ ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, false);
+#endif
break;
case T_AlterExtensionContentsStmt:
break;
case T_AlterEnumStmt: /* ALTER TYPE (enum) */
- address = AlterEnum((AlterEnumStmt *) parsetree, isTopLevel);
+ address = AlterEnum((AlterEnumStmt *) parsetree);
+#ifdef PGXC
+ /*
+ * In this case force autocommit, this transaction cannot be launched
+ * inside a transaction block.
+ */
+ if (IS_PGXC_LOCAL_COORDINATOR)
+ ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote,
+ true, EXEC_ON_ALL_NODES, false);
+#endif
break;
case T_ViewStmt: /* CREATE VIEW */
EventTriggerAlterTableStart(parsetree);
- address = DefineView((ViewStmt *) parsetree, queryString);
+ address = DefineView((ViewStmt *) parsetree, queryString,
+ pstmt->stmt_location, pstmt->stmt_len);
+#ifdef PGXC
+ if (IS_PGXC_LOCAL_COORDINATOR)
+ {
+ ViewStmt *stmt = (ViewStmt *) parsetree;
+
+ if (stmt->view->relpersistence != RELPERSISTENCE_TEMP)
+ ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, EXEC_ON_COORDS, false);
+ }
+#endif
EventTriggerCollectSimpleCommand(address, secondaryObject,
parsetree);
/* stashed internally */
break;
case T_CreateFunctionStmt: /* CREATE FUNCTION */
- address = CreateFunction((CreateFunctionStmt *) parsetree, queryString);
+ address = CreateFunction(pstate, (CreateFunctionStmt *) parsetree);
+#ifdef PGXC
+ if (IS_PGXC_LOCAL_COORDINATOR)
+ ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, false);
+#endif
break;
case T_AlterFunctionStmt: /* ALTER FUNCTION */
- address = AlterFunction((AlterFunctionStmt *) parsetree);
+ address = AlterFunction(pstate, (AlterFunctionStmt *) parsetree);
+#ifdef PGXC
+ if (IS_PGXC_LOCAL_COORDINATOR)
+ ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, false);
+#endif
break;
case T_RuleStmt: /* CREATE RULE */
break;
case T_CreateSeqStmt:
- address = DefineSequence((CreateSeqStmt *) parsetree);
+ address = DefineSequence(pstate, (CreateSeqStmt *) parsetree);
+#ifdef PGXC
+ if (IS_PGXC_LOCAL_COORDINATOR)
+ {
+ CreateSeqStmt *stmt = (CreateSeqStmt *) parsetree;
+
+ /* In case this query is related to a SERIAL execution, just bypass */
+ if (!stmt->is_serial)
+ {
+ bool is_temp = stmt->sequence->relpersistence == RELPERSISTENCE_TEMP;
+ ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, is_temp);
+ }
+ }
+#endif
break;
case T_AlterSeqStmt:
- address = AlterSequence((AlterSeqStmt *) parsetree);
+ address = AlterSequence(pstate, (AlterSeqStmt *) parsetree);
+#ifdef PGXC
+ if (IS_PGXC_LOCAL_COORDINATOR)
+ {
+ AlterSeqStmt *stmt = (AlterSeqStmt *) parsetree;
+
+ /* In case this query is related to a SERIAL execution, just bypass */
+ if (!stmt->is_serial)
+ {
+ bool is_temp;
+ RemoteQueryExecType exec_type;
+ Oid relid = RangeVarGetRelid(stmt->sequence, NoLock, true);
+
+ if (!OidIsValid(relid))
+ break;
+
+ exec_type = ExecUtilityFindNodes(OBJECT_SEQUENCE,
+ relid,
+ &is_temp);
+
+ ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, exec_type, is_temp);
+ }
+ }
+#endif
break;
case T_CreateTableAsStmt:
address = ExecCreateTableAs((CreateTableAsStmt *) parsetree,
- queryString, params, completionTag);
+ queryString, params, queryEnv,
+ completionTag);
+#ifdef PGXC
+ if ((IS_PGXC_COORDINATOR) && !IsConnFromCoord())
+ {
+ CreateTableAsStmt *stmt = (CreateTableAsStmt *) parsetree;
+
+ /*
+ * CTAS for normal tables should have been rewritten as a
+ * CREATE TABLE + SELECT INTO
+ */
+ Assert(stmt->relkind == OBJECT_MATVIEW);
+ if (stmt->into->rel->relpersistence != RELPERSISTENCE_TEMP)
+ ExecUtilityStmtOnNodes(queryString, NULL,
+ sentToRemote, false, EXEC_ON_COORDS, false);
+ }
+#endif
break;
case T_RefreshMatViewStmt:
break;
case T_AlterTSConfigurationStmt:
- address = AlterTSConfiguration((AlterTSConfigurationStmt *) parsetree);
+ AlterTSConfiguration((AlterTSConfigurationStmt *) parsetree);
+#ifdef PGXC
+ if (IS_PGXC_LOCAL_COORDINATOR)
+ ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, false);
+#endif
+
+ /*
+ * Commands are stashed in MakeConfigurationMapping and
+ * DropConfigurationMapping, which are called from
+ * AlterTSConfiguration
+ */
+ commandCollected = true;
break;
case T_AlterTableMoveAllStmt:
break;
case T_AlterDefaultPrivilegesStmt:
- ExecAlterDefaultPrivilegesStmt((AlterDefaultPrivilegesStmt *) parsetree);
+ ExecAlterDefaultPrivilegesStmt(pstate, (AlterDefaultPrivilegesStmt *) parsetree);
+
+#ifdef PGXC
+ if (IS_PGXC_LOCAL_COORDINATOR)
+ ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, false);
+#endif
EventTriggerCollectAlterDefPrivs((AlterDefaultPrivilegesStmt *) parsetree);
commandCollected = true;
break;
case T_CreateAmStmt:
address = CreateAccessMethod((CreateAmStmt *) parsetree);
+#ifdef PGXC
+ if (IS_PGXC_LOCAL_COORDINATOR)
+ ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, false);
+#endif
break;
+ case T_CreatePublicationStmt:
+ address = CreatePublication((CreatePublicationStmt *) parsetree);
+ break;
+
+ case T_AlterPublicationStmt:
+ AlterPublication((AlterPublicationStmt *) parsetree);
+
+ /*
+ * AlterPublication calls EventTriggerCollectSimpleCommand
+ * directly
+ */
+ commandCollected = true;
+ break;
+
+ case T_CreateSubscriptionStmt:
+ address = CreateSubscription((CreateSubscriptionStmt *) parsetree,
+ isTopLevel);
+ break;
+
+ case T_AlterSubscriptionStmt:
+ address = AlterSubscription((AlterSubscriptionStmt *) parsetree);
+ break;
+
+ case T_DropSubscriptionStmt:
+ DropSubscription((DropSubscriptionStmt *) parsetree, isTopLevel);
+ /* no commands stashed for DROP */
+ commandCollected = true;
+ break;
+
+ case T_CreateStatsStmt:
+ address = CreateStatistics((CreateStatsStmt *) parsetree);
+ break;
+
+ case T_AlterCollationStmt:
+ address = AlterCollation((AlterCollationStmt *) parsetree);
+ break;
+
default:
elog(ERROR, "unrecognized node type: %d",
(int) nodeTag(parsetree));
return lev;
}
- if (stmt->objtype == OBJECT_DATABASE && list_length(stmt->objname) == 1)
+
+#ifdef PGXC
+
+/*
+ * ExecUtilityWithMessage:
+ * Execute the query on remote nodes in a transaction block.
+ * If this fails on one of the nodes:
+ * - add a context message containing the names of the nodes on which
+ *   the query failed, and
+ * - rethrow the error with that message attached.
+ * If all nodes succeed, just return.
+ */
+ static void
+ExecUtilityWithMessage(const char *queryString, bool sentToRemote, bool is_temp)
+{
+ PG_TRY();
+ {
+ ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, is_temp);
+ }
+ PG_CATCH();
+ {
+
+ /*
+ * Some nodes failed. Add an error context line listing the nodes
+ * on which the query failed before rethrowing.
+ */
+ ExecNodes *coord_success_nodes = NULL;
+ ExecNodes *data_success_nodes = NULL;
+ char *msg_failed_nodes;
+
+ pgxc_all_success_nodes(&data_success_nodes, &coord_success_nodes, &msg_failed_nodes);
+ if (msg_failed_nodes)
+ errcontext("%s", msg_failed_nodes);
+ PG_RE_THROW();
+ }
+ PG_END_TRY();
+
+
+}
+
+/*
+ * ExecUtilityStmtOnNodes
+ *
+ * Execute a Utility statement on nodes, including Coordinators.
+ * If the DDL is received from a remote Coordinator,
+ * it is not possible to push down DDL to Datanodes
+ * as it is taken in charge by the remote Coordinator.
+ *
+ * queryString      - deparsed statement text shipped to the remote nodes
+ * nodes            - explicit target nodes, or NULL to use exec_type's default
+ * sentToRemote     - statement was already shipped; function is then a no-op
+ * force_autocommit - run the remote query in autocommit (no transaction block)
+ * exec_type        - which classes of nodes (Coordinators/Datanodes) to target
+ * is_temp          - statement involves a temporary object (not referenced in
+ *                    this function's visible body — TODO confirm intent)
+ */
+ static void
+ExecUtilityStmtOnNodes(const char *queryString, ExecNodes *nodes, bool sentToRemote,
+ bool force_autocommit, RemoteQueryExecType exec_type, bool is_temp)
+{
+ /* Return if query is launched on no nodes */
+ if (exec_type == EXEC_ON_NONE)
+ return;
+
+ /* Nothing to be done if this statement has been sent to the nodes */
+ if (sentToRemote)
+ return;
+
+ /* If no Datanodes defined, the query cannot be launched */
+ if (NumDataNodes == 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_UNDEFINED_OBJECT),
+ errmsg("No Datanode defined in cluster"),
+ errhint("You need to define at least 1 Datanode with "
+ "CREATE NODE.")));
+
+ /* Only a locally-connected Coordinator may push DDL to other nodes */
+ if (!IsConnFromCoord())
+ {
+ RemoteQuery *step = makeNode(RemoteQuery);
+ step->combine_type = COMBINE_TYPE_SAME;
+ step->exec_nodes = nodes;
+ step->sql_statement = pstrdup(queryString);
+ step->force_autocommit = force_autocommit;
+ step->exec_type = exec_type;
+ ExecRemoteUtility(step);
+ pfree(step->sql_statement);
+ pfree(step);
+ }
+}
+
+/*
+ * ExecUtilityFindNodes
+ *
+ * Determine the set of nodes on which to launch the query.
+ * This depends on the object type and on whether the object is temporary.
+ * Also returns, via *is_temp, a flag indicating whether the relation is
+ * temporary.
+ *
+ * If the object is a RULE, the object id passed in is that of the object
+ * to which the rule is applicable.
+ */
+ static RemoteQueryExecType
+ExecUtilityFindNodes(ObjectType object_type,
+ Oid object_id,
+ bool *is_temp)
+{
+ RemoteQueryExecType exec_type;
+
+ switch (object_type)
+ {
+ case OBJECT_SEQUENCE:
+ *is_temp = IsTempTable(object_id);
+ exec_type = EXEC_ON_ALL_NODES;
+ break;
+
+ case OBJECT_TABLE:
+ /* Do the check on relation kind */
+ exec_type = ExecUtilityFindNodesRelkind(object_id, is_temp);
+ break;
+
+ /*
+ * Views and rules, both permanent or temporary are created
+ * on Coordinators only.
+ */
+ case OBJECT_RULE:
+ case OBJECT_VIEW:
+ case OBJECT_MATVIEW:
+ /* Check if object is a temporary view */
+ if ((*is_temp = IsTempTable(object_id)))
+ exec_type = EXEC_ON_NONE;
+ else
+ exec_type = EXEC_ON_COORDS;
+ break;
+
+ case OBJECT_INDEX:
+ /* Check if given index uses temporary tables */
+ {
+ Relation rel;
+ bool is_matview;
+
+ rel = relation_open(object_id, NoLock);
+
+ *is_temp = (rel->rd_rel->relpersistence == RELPERSISTENCE_TEMP);
+ is_matview = (rel->rd_rel->relkind == RELKIND_MATVIEW);
+
+ relation_close(rel, NoLock);
+
+ /*
+ * Decision matrix: temp matview index -> nowhere;
+ * temp table index -> Datanodes; permanent matview
+ * index -> Coordinators; otherwise -> all nodes.
+ */
+ exec_type = EXEC_ON_NONE;
+ if (*is_temp)
+ {
+ if (!is_matview)
+ exec_type = EXEC_ON_DATANODES;
+ }
+ else
+ {
+ if (!is_matview)
+ exec_type = EXEC_ON_ALL_NODES;
+ else
+ exec_type = EXEC_ON_COORDS;
+ }
+ }
+ break;
+
+ default:
+ /* All other object types are assumed permanent and cluster-wide */
+ *is_temp = false;
+ exec_type = EXEC_ON_ALL_NODES;
+ break;
+ }
+
+ return exec_type;
+}
+
+/*
+ * ExecUtilityFindNodesRelkind
+ *
+ * Determine the node execution type and, via *is_temp, the temporary
+ * status of the given relation, based on its relkind.
+ */
+static RemoteQueryExecType
+ExecUtilityFindNodesRelkind(Oid relid, bool *is_temp)
+{
+ char relkind_str = get_rel_relkind(relid);
+ RemoteQueryExecType exec_type;
+
+ switch (relkind_str)
+ {
+ case RELKIND_SEQUENCE:
+ case RELKIND_RELATION:
+ /* Local-only temp objects run nowhere; shared temp on Datanodes */
+ if ((*is_temp = IsTempTable(relid)))
+ {
+ if (IsLocalTempTable(relid))
+ exec_type = EXEC_ON_NONE;
+ else
+ exec_type = EXEC_ON_DATANODES;
+ }
+ else
+ exec_type = EXEC_ON_ALL_NODES;
+ break;
+
+ case RELKIND_INDEX:
+ /* An index executes wherever its parent table does */
+ {
+ HeapTuple tuple;
+ Oid table_relid = InvalidOid;
+
+ tuple = SearchSysCache1(INDEXRELID, ObjectIdGetDatum(relid));
+ if (HeapTupleIsValid(tuple))
+ {
+ Form_pg_index index = (Form_pg_index) GETSTRUCT(tuple);
+ table_relid = index->indrelid;
+
+ /* Release system cache BEFORE looking at the parent table */
+ ReleaseSysCache(tuple);
+ return ExecUtilityFindNodesRelkind(table_relid, is_temp);
+ }
+ else
+ {
+ /* No pg_index entry found: nothing to execute remotely */
+ exec_type = EXEC_ON_NONE;
+ *is_temp = false;
+ }
+ }
+ break;
+
+ case RELKIND_VIEW:
+ if ((*is_temp = IsTempTable(relid)))
+ exec_type = EXEC_ON_NONE;
+ else
+ exec_type = EXEC_ON_COORDS;
+ break;
+
+ case RELKIND_MATVIEW:
+ /* Check if object is a temporary view */
+ if ((*is_temp = IsTempTable(relid)))
+ exec_type = EXEC_ON_NONE;
+ else
+ exec_type = EXEC_ON_COORDS;
+ break;
+
+ default:
+ *is_temp = false;
+ exec_type = EXEC_ON_ALL_NODES;
+ break;
+ }
+
+ return exec_type;
+}
+#endif
+
+#ifdef PGXC
+/*
+ * IsStmtAllowedInLockedMode
+ *
+ * Allow/disallow a utility command while the cluster is locked for backup.
+ * A statement is disallowed if it makes changes to catalogs that are
+ * backed up by pg_dump.  CREATE NODE is an exception: it must be allowed
+ * because a new node has to be created while the cluster is still
+ * locked for backup.  (queryString is currently unused here.)
+ */
+static bool
+IsStmtAllowedInLockedMode(Node *parsetree, const char *queryString)
+{
+#define ALLOW 1
+#define DISALLOW 0
+
+ switch (nodeTag(parsetree))
+ {
+ /* To allow creation of temp tables */
+ case T_CreateStmt: /* CREATE TABLE */
+ {
+ CreateStmt *stmt = (CreateStmt *) parsetree;
+ if (stmt->relation->relpersistence == RELPERSISTENCE_TEMP)
+ return ALLOW;
+ return DISALLOW;
+ }
+ break;
+
+ case T_ExecuteStmt: /*
+ * Prepared statements can only have
+ * SELECT, INSERT, UPDATE, DELETE,
+ * or VALUES statement, there is no
+ * point stopping EXECUTE.
+ */
+ case T_CreateNodeStmt: /*
+ * This has to be allowed so that the new node
+ * can be created, while the cluster is still
+ * locked for backup
+ */
+ case T_DropNodeStmt: /*
+ * This has to be allowed so that DROP NODE
+ * can be issued to drop a node that has crashed.
+ * Otherwise system would try to acquire a shared
+ * advisory lock on the crashed node.
+ */
+
+ case T_AlterNodeStmt: /*
+ * This has to be allowed so that ALTER NODE
+ * can be issued in case of a datanode or
+ * coordinator failover.
+ */
+ case T_TransactionStmt:
+ case T_PlannedStmt:
+ case T_ClosePortalStmt:
+ case T_FetchStmt:
+ case T_TruncateStmt:
+ case T_CopyStmt:
+ case T_PrepareStmt: /*
+ * Prepared statements can only have
+ * SELECT, INSERT, UPDATE, DELETE,
+ * or VALUES statement, there is no
+ * point stopping PREPARE.
+ */
+ case T_DeallocateStmt: /*
+ * If prepare is allowed the deallocate should
+ * be allowed also
+ */
+ case T_DoStmt:
+ case T_NotifyStmt:
+ case T_ListenStmt:
+ case T_UnlistenStmt:
+ case T_LoadStmt:
+ case T_ClusterStmt:
+ case T_VacuumStmt:
+ case T_ExplainStmt:
+ case T_VariableSetStmt:
+ case T_VariableShowStmt:
+ case T_DiscardStmt:
+ case T_LockStmt:
+ case T_ConstraintsSetStmt:
+ case T_CheckPointStmt:
+ case T_BarrierStmt:
+ case T_ReindexStmt:
+ case T_RemoteQuery:
+ case T_CleanConnStmt:
+#ifdef XCP
+ case T_PauseClusterStmt:
+#endif
+ return ALLOW;
+
+ default:
+ return DISALLOW;
+ }
+ /* Not reached for the cases above; keep compiler quiet */
+ return DISALLOW;
+}
+
+/*
+ * GetNodesForCommentUtility
+ * TODO Change to return the nodes to execute the utility on
+ *
+ * Return the node execution type for the object being commented on, and
+ * set *is_temp when that object is temporary.
+ * Note: This function uses portions of the code of CommentObject;
+ * even if this code is duplicated, this is done like this to facilitate
+ * merges with PostgreSQL head.
+ */
+static RemoteQueryExecType
+GetNodesForCommentUtility(CommentStmt *stmt, bool *is_temp)
+{
+ ObjectAddress address;
+ Relation relation;
+ RemoteQueryExecType exec_type = EXEC_ON_ALL_NODES; /* By default execute on all nodes */
+ Oid object_id;
+
- char *database = strVal(linitial(stmt->objname));
++ if (stmt->objtype == OBJECT_DATABASE)
+ {
- address = get_object_address(stmt->objtype, stmt->objname, stmt->objargs,
++ char *database = strVal((Value *) stmt->object);
+ if (!OidIsValid(get_database_oid(database, true)))
+ ereport(WARNING,
+ (errcode(ERRCODE_UNDEFINED_DATABASE),
+ errmsg("database \"%s\" does not exist", database)));
+ /* No clue, return the default one */
+ return exec_type;
+ }
+
- char *rulename = strVal(llast(stmt->objname));
++ address = get_object_address(stmt->objtype, stmt->object,
+ &relation, ShareUpdateExclusiveLock, false);
+ object_id = address.objectId;
+
+ /*
+ * If the object being commented is a rule, the nodes are decided by the
+ * object to which rule is applicable, so get that object's oid
+ */
+ if (stmt->objtype == OBJECT_RULE)
+ {
+ /*
+ * Use || here, not &&: with && a NULL relation would be
+ * dereferenced by OidIsValid(relation->rd_id) and crash.
+ */
+ if (!relation || !OidIsValid(relation->rd_id))
+ {
+ /* This should not happen, but prepare for the worst */
- objname, NIL,
++ char *rulename = strVal(llast(castNode(List, stmt->object)));
+ ereport(WARNING,
+ (errcode(ERRCODE_UNDEFINED_OBJECT),
+ errmsg("cannot find relation for rule \"%s\"", rulename)));
+ object_id = InvalidOid;
+ }
+ else
+ object_id = RelationGetRelid(relation);
+ }
+
+ if (relation != NULL)
+ relation_close(relation, NoLock);
+
+ /* Commented object may not have a valid object ID, so move to default */
+ if (OidIsValid(object_id))
+ exec_type = ExecUtilityFindNodes(stmt->objtype,
+ object_id,
+ is_temp);
+ return exec_type;
+}
+
+/*
+ * GetNodesForRulesUtility
+ * Get the nodes on which to execute this RULE-related utility statement.
+ * A rule is expanded on the Coordinator itself and does not need to
+ * exist on a Datanode. In fact, if it did exist on a Datanode, there is
+ * a possibility that it would be expanded a second time.
+ * Returns EXEC_ON_NONE when the relation cannot be resolved.
+ */
+static RemoteQueryExecType
+GetNodesForRulesUtility(RangeVar *relation, bool *is_temp)
+{
+ Oid relid = RangeVarGetRelid(relation, NoLock, true);
+ RemoteQueryExecType exec_type;
+
+ /* Skip if this Oid does not exist */
+ if (!OidIsValid(relid))
+ return EXEC_ON_NONE;
+
+ /*
+ * PGXCTODO: See if it's a temporary object, do we really need
+ * to care about temporary objects here? What about the
+ * temporary objects defined inside the rule?
+ */
+ exec_type = ExecUtilityFindNodes(OBJECT_RULE, relid, is_temp);
+ return exec_type;
+}
+
+/*
+ * DropStmtPreTreatment
+ * Pre-treat a DROP statement on the local Coordinator: determine the
+ * node execution type and temporary status of the objects being
+ * dropped, before the statement is shipped to remote nodes.
+ * Returns results through *is_temp and *exec_type; no-op on Datanodes
+ * and on Coordinators reached from another Coordinator.
+ */
+static void
+DropStmtPreTreatment(DropStmt *stmt, const char *queryString, bool sentToRemote,
+ bool *is_temp, RemoteQueryExecType *exec_type)
+{
+ bool res_is_temp = false;
+ RemoteQueryExecType res_exec_type = EXEC_ON_ALL_NODES;
+
+ /* Nothing to do if not local Coordinator */
+ if (IS_PGXC_DATANODE || IsConnFromCoord())
+ return;
+
+ switch (stmt->removeType)
+ {
+ case OBJECT_TABLE:
+ case OBJECT_SEQUENCE:
+ case OBJECT_VIEW:
+ case OBJECT_INDEX:
+ case OBJECT_MATVIEW:
+ {
+ /*
+ * Check the list of objects going to be dropped.
+ * XC does not allow yet to mix drop of temporary and
+ * non-temporary objects because this involves to rewrite
+ * query to process for tables.
+ */
+ ListCell *cell;
+ bool is_first = true;
+
+ foreach(cell, stmt->objects)
+ {
+ RangeVar *rel = makeRangeVarFromNameList((List *) lfirst(cell));
+ Oid relid;
+
+ /*
+ * Do not print result at all, error is thrown
+ * after if necessary
+ */
+ relid = RangeVarGetRelid(rel, NoLock, true);
+
+ /*
+ * In case this relation ID is incorrect throw
+ * a correct DROP error.
+ */
+ if (!OidIsValid(relid) && !stmt->missing_ok)
+ DropTableThrowErrorExternal(rel,
+ stmt->removeType,
+ stmt->missing_ok);
+
+ /* In case of DROP ... IF EXISTS bypass */
+ if (!OidIsValid(relid) && stmt->missing_ok)
+ continue;
+
+ if (is_first)
+ {
+ res_exec_type = ExecUtilityFindNodes(stmt->removeType,
+ relid,
+ &res_is_temp);
+ is_first = false;
+ }
+ else
+ {
+ /* All later objects must match the first one's profile */
+ RemoteQueryExecType exec_type_loc;
+ bool is_temp_loc;
+ exec_type_loc = ExecUtilityFindNodes(stmt->removeType,
+ relid,
+ &is_temp_loc);
+ if (exec_type_loc != res_exec_type ||
+ is_temp_loc != res_is_temp)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("DROP not supported for TEMP and non-TEMP objects"),
+ errdetail("You should separate TEMP and non-TEMP objects")));
+ }
+ }
+ }
+ break;
+
+ case OBJECT_RULE:
+ {
+ /*
+ * In the case of a rule we need to find the object on
+ * which the rule is dependent and define if this rule
+ * has a dependency with a temporary object or not.
+ */
+ List *objname = linitial(stmt->objects);
+ Relation relation = NULL;
+
+ get_object_address(OBJECT_RULE,
++ objname, /* XXX PG10MERGE: check if this is ok */
+ &relation,
+ AccessExclusiveLock,
+ stmt->missing_ok);
+
+ /* Do nothing if no relation */
+ if (relation && OidIsValid(relation->rd_id))
+ res_exec_type = ExecUtilityFindNodes(OBJECT_RULE,
+ relation->rd_id,
+ &res_is_temp);
+ else
+ res_exec_type = EXEC_ON_NONE;
+
+ /* Close relation if necessary */
+ if (relation)
+ relation_close(relation, NoLock);
+ }
+ break;
+
+ default:
+ res_is_temp = false;
+ res_exec_type = EXEC_ON_ALL_NODES;
+ break;
+ }
+
+ /* Save results */
+ *is_temp = res_is_temp;
+ *exec_type = res_exec_type;
+}
+#endif
* arrayfuncs.c
* Support functions for arrays.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* date.c
* implements DATE and TIME data types specified in SQL standard
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994-5, Regents of the University of California
*
*
* dbsize.c
* Database object size functions, and related inquiries
*
- * Copyright (c) 2002-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Copyright (c) 2002-2017, PostgreSQL Global Development Group
*
* IDENTIFICATION
* src/backend/utils/adt/dbsize.c
#include "access/htup_details.h"
#include "catalog/catalog.h"
#include "catalog/namespace.h"
+ #include "catalog/pg_authid.h"
+#include "catalog/pg_namespace.h"
#include "catalog/pg_tablespace.h"
#include "commands/dbcommands.h"
#include "commands/tablespace.h"
strcmp(direntry->d_name, "..") == 0)
continue;
- snprintf(pathname, MAXPGPATH, "pg_tblspc/%s/%s_%s/%u",
+#ifdef PGXC
+ /* Postgres-XC tablespaces include node name in path */
- snprintf(pathname, MAXPGPATH, "pg_tblspc/%s/%s/%u",
++ snprintf(pathname, sizeof(pathname), "pg_tblspc/%s/%s_%s/%u",
+ direntry->d_name, TABLESPACE_VERSION_DIRECTORY, PGXCNodeName, dbOid);
+#else
+ snprintf(pathname, sizeof(pathname), "pg_tblspc/%s/%s/%u",
direntry->d_name, TABLESPACE_VERSION_DIRECTORY, dbOid);
+#endif
totalsize += db_dir_size(pathname);
}
#include "utils/acl.h"
#include "utils/builtins.h"
#include "utils/timestamp.h"
+#ifdef PGXC
+#include "pgxc/pgxc.h"
+#endif
- #define atooid(x) ((Oid) strtoul((x), NULL, 10))
-
/*
* Common subroutine for num_nulls() and num_nonnulls().
* we do better?)
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
Datum
anyarray_in(PG_FUNCTION_ARGS)
{
+#ifdef XCP
+ /*
+ * XCP version of array_in() understands prefix describing element type
+ * so this function can simply delegate to it.
+ */
+ return array_in(fcinfo);
+#else
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
- errmsg("cannot accept a value of type anyarray")));
+ errmsg("cannot accept a value of type %s", "anyarray")));
PG_RETURN_VOID(); /* keep compiler quiet */
+#endif
}
/*
* plan --- consider improving this someday.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
*
* src/backend/utils/adt/ri_triggers.c
*
* Functions to convert stored expressions/querytrees back to
* source text
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
simple_quote_literal(buf, stmt->payload);
}
}
- Type type;
+#ifdef PGXC
+ else if (query->utilityStmt && IsA(query->utilityStmt, CreateStmt))
+ {
+ CreateStmt *stmt = (CreateStmt *) query->utilityStmt;
+ ListCell *column;
+ const char *delimiter = "";
+ RangeVar *relation = stmt->relation;
+ bool istemp = (relation->relpersistence == RELPERSISTENCE_TEMP);
+ bool isunlogged = (relation->relpersistence == RELPERSISTENCE_UNLOGGED);
+
+ appendStringInfo(buf, "CREATE %s %s %s TABLE %s ",
+ stmt->islocal ? "LOCAL" : "",
+ istemp ? "TEMP" : "",
+ isunlogged ? "UNLOGGED" : "",
+ stmt->if_not_exists ? "IF NOT EXISTS " : "");
+
+ if (!istemp && relation->schemaname && relation->schemaname[0])
+ appendStringInfo(buf, "%s.", quote_identifier(relation->schemaname));
+ appendStringInfo(buf, "%s", quote_identifier(relation->relname));
+
+ appendStringInfo(buf, "(");
+ foreach(column, stmt->tableElts)
+ {
+ Node *node = (Node *) lfirst(column);
+
+ appendStringInfo(buf, "%s", delimiter);
+ delimiter = ", ";
+
+ if (IsA(node, ColumnDef))
+ {
+ ColumnDef *coldef = (ColumnDef *) node;
+ TypeName *typename = coldef->typeName;
+#ifdef XCP
+ appendStringInfo(buf, "%s %s",
+ quote_identifier(coldef->colname),
+ format_type_with_typemod(typename->typeOid,
+ typename->typemod));
+#else
+
+ /* error out if we have no recourse at all */
+ if (!OidIsValid(typename->typeOid))
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("improper type oid: \"%u\"", typename->typeOid)));
+
+ /* get typename from the oid */
+ type = typeidType(typename->typeOid);
+
+ if (!HeapTupleIsValid(type))
+ ereport(ERROR,
+ (errcode(ERRCODE_UNDEFINED_OBJECT),
+ errmsg("type \"%u\" does not exist",
+ typename->typeOid)));
+ appendStringInfo(buf, "%s %s", quote_identifier(coldef->colname),
+ typeTypeName(type));
+ ReleaseSysCache(type);
+#endif
+ }
+ else
+ elog(ERROR, "Invalid table column definition.");
+ }
+ appendStringInfo(buf, ")");
+
+ /* Append storage parameters, like for instance WITH (OIDS) */
+ if (list_length(stmt->options) > 0)
+ {
+ Datum reloptions;
+ static char *validnsps[] = HEAP_RELOPT_NAMESPACES;
+
+ reloptions = transformRelOptions((Datum) 0, stmt->options, NULL, validnsps,
+ false, false);
+
+ if (reloptions)
+ {
+ Datum sep, txt;
+ /* Below is inspired from flatten_reloptions() */
+ sep = CStringGetTextDatum(", ");
+ txt = OidFunctionCall2(F_ARRAY_TO_TEXT, reloptions, sep);
+ appendStringInfo(buf, " WITH (%s)", TextDatumGetCString(txt));
+ }
+ }
+
+ /* add the on commit clauses for temporary tables */
+ switch (stmt->oncommit)
+ {
+ case ONCOMMIT_NOOP:
+ /* do nothing */
+ break;
+
+ case ONCOMMIT_PRESERVE_ROWS:
+ appendStringInfo(buf, " ON COMMIT PRESERVE ROWS");
+ break;
+
+ case ONCOMMIT_DELETE_ROWS:
+ appendStringInfo(buf, " ON COMMIT DELETE ROWS");
+ break;
+
+ case ONCOMMIT_DROP:
+ appendStringInfo(buf, " ON COMMIT DROP");
+ break;
+ }
+
+ if (stmt->distributeby)
+ {
+ /* add the distribution clause, if any */
+ switch (stmt->distributeby->disttype)
+ {
+ case DISTTYPE_REPLICATION:
+ appendStringInfo(buf, " DISTRIBUTE BY REPLICATION");
+ break;
+
+ case DISTTYPE_HASH:
+ appendStringInfo(buf, " DISTRIBUTE BY HASH(%s)", stmt->distributeby->colname);
+ break;
+
+ case DISTTYPE_ROUNDROBIN:
+ appendStringInfo(buf, " DISTRIBUTE BY ROUNDROBIN");
+ break;
+
+ case DISTTYPE_MODULO:
+ appendStringInfo(buf, " DISTRIBUTE BY MODULO(%s)",
+ quote_identifier(stmt->distributeby->colname));
+ break;
+
+ default:
+ ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("Invalid distribution type")));
+
+ }
+ }
+
+ if (stmt->subcluster)
+ {
+ ListCell *cell;
+
+ switch (stmt->subcluster->clustertype)
+ {
+ case SUBCLUSTER_NODE:
+ appendStringInfo(buf, " TO NODE (");
+
+ /* Add node members */
+ Assert(stmt->subcluster->members);
+ foreach(cell, stmt->subcluster->members)
+ {
+ appendStringInfo(buf, " %s",
+ quote_identifier(strVal(lfirst(cell))));
+ if (cell->next)
+ appendStringInfo(buf, ",");
+ }
+ appendStringInfo(buf, ")");
+ break;
+
+ case SUBCLUSTER_GROUP:
+ appendStringInfo(buf, " TO GROUP");
+
+ /* Add group members */
+ Assert(stmt->subcluster->members);
+ foreach(cell, stmt->subcluster->members)
+ {
+ appendStringInfo(buf, " %s",
+ quote_identifier(strVal(lfirst(cell))));
+ if (cell->next)
+ appendStringInfo(buf, ",");
+ }
+ break;
+
+ case SUBCLUSTER_NONE:
+ default:
+ /* Nothing to do */
+ break;
+ }
+ }
+ }
+#endif
else
{
/* Currently only NOTIFY utility commands can appear in rules */
* version.c
* Returns the PostgreSQL version string
*
- * Copyright (c) 1998-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Copyright (c) 1998-2017, PostgreSQL Global Development Group
*
* IDENTIFICATION
*
* problems can be overcome cheaply.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* lsyscache.c
* Convenience routines for common queries in the system catalog cache.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
return typid;
}
+#ifdef PGXC
+/*
+ * get_typename
+ * Get type name for given type ID
+ *
+ * Returns a palloc'd copy of pg_type.typname; raises an error if the
+ * type OID has no syscache entry.
+ */
+char *
+get_typename(Oid typid)
+{
+ HeapTuple tuple;
+ Form_pg_type typeForm;
+ char *result;
+
+ tuple = SearchSysCache1(TYPEOID, ObjectIdGetDatum(typid));
+
+ if (!HeapTupleIsValid(tuple))
+ elog(ERROR, "cache lookup failed for type %u", typid);
+
+ typeForm = (Form_pg_type) GETSTRUCT(tuple);
+ result = pstrdup(NameStr(typeForm->typname));
+ ReleaseSysCache(tuple);
+
+ return result;
+}
+
+/*
+ * get_pgxc_nodeoid
+ * Obtain PGXC Node Oid for given node name
+ * Return Invalid Oid if object does not exist
+ *
+ * Simple probe of the PGXCNODENAME syscache; no error on a miss.
+ */
+Oid
+get_pgxc_nodeoid(const char *nodename)
+{
+ return GetSysCacheOid1(PGXCNODENAME,
+ PointerGetDatum(nodename));
+}
+
+/*
+ * get_pgxc_nodename
+ * Get node name for given Oid
+ *
+ * Returns a palloc'd copy of pgxc_node.node_name; errors out if no
+ * such node exists.
+ */
+char *
+get_pgxc_nodename(Oid nodeid)
+{
+ HeapTuple tuple;
+ Form_pgxc_node nodeForm;
+ char *result;
+
+ tuple = SearchSysCache1(PGXCNODEOID, ObjectIdGetDatum(nodeid));
+
+ if (!HeapTupleIsValid(tuple))
+ elog(ERROR, "cache lookup failed for node %u", nodeid);
+
+ nodeForm = (Form_pgxc_node) GETSTRUCT(tuple);
+ result = pstrdup(NameStr(nodeForm->node_name));
+ ReleaseSysCache(tuple);
+
+ return result;
+}
+
+ /*
+ * get_pgxc_node_id
+ * Get node identifier for a given Oid
+ *
+ * Returns 0 when nodeid is InvalidOid; errors out for an unknown node.
+ */
+uint32
+get_pgxc_node_id(Oid nodeid)
+{
+ HeapTuple tuple;
+ Form_pgxc_node nodeForm;
+ uint32 result;
+
+ if (nodeid == InvalidOid)
+ return 0;
+
+ tuple = SearchSysCache1(PGXCNODEOID, ObjectIdGetDatum(nodeid));
+
+ if (!HeapTupleIsValid(tuple))
+ elog(ERROR, "cache lookup failed for node %u", nodeid);
+
+ nodeForm = (Form_pgxc_node) GETSTRUCT(tuple);
+ result = nodeForm->node_id;
+ ReleaseSysCache(tuple);
+
+ return result;
+}
+
+/*
+ * get_pgxc_nodetype
+ * Get node type for given Oid
+ *
+ * Returns the pgxc_node.node_type character; errors out if the node
+ * does not exist.
+ */
+char
+get_pgxc_nodetype(Oid nodeid)
+{
+ HeapTuple tuple;
+ Form_pgxc_node nodeForm;
+ char result;
+
+ tuple = SearchSysCache1(PGXCNODEOID, ObjectIdGetDatum(nodeid));
+
+ if (!HeapTupleIsValid(tuple))
+ elog(ERROR, "cache lookup failed for node %u", nodeid);
+
+ nodeForm = (Form_pgxc_node) GETSTRUCT(tuple);
+ result = nodeForm->node_type;
+ ReleaseSysCache(tuple);
+
+ return result;
+}
+
+/*
+ * get_pgxc_nodeport
+ * Get node port for given Oid
+ *
+ * Errors out if the node does not exist.
+ */
+int
+get_pgxc_nodeport(Oid nodeid)
+{
+ HeapTuple tuple;
+ Form_pgxc_node nodeForm;
+ int result;
+
+ tuple = SearchSysCache1(PGXCNODEOID, ObjectIdGetDatum(nodeid));
+
+ if (!HeapTupleIsValid(tuple))
+ elog(ERROR, "cache lookup failed for node %u", nodeid);
+
+ nodeForm = (Form_pgxc_node) GETSTRUCT(tuple);
+ result = nodeForm->node_port;
+ ReleaseSysCache(tuple);
+
+ return result;
+}
+
+/*
+ * get_pgxc_nodehost
+ * Get node host for given Oid
+ *
+ * Returns a palloc'd copy of pgxc_node.node_host; errors out if the
+ * node does not exist.
+ */
+char *
+get_pgxc_nodehost(Oid nodeid)
+{
+ HeapTuple tuple;
+ Form_pgxc_node nodeForm;
+ char *result;
+
+ tuple = SearchSysCache1(PGXCNODEOID, ObjectIdGetDatum(nodeid));
+
+ if (!HeapTupleIsValid(tuple))
+ elog(ERROR, "cache lookup failed for node %u", nodeid);
+
+ nodeForm = (Form_pgxc_node) GETSTRUCT(tuple);
+ result = pstrdup(NameStr(nodeForm->node_host));
+ ReleaseSysCache(tuple);
+
+ return result;
+}
+
+/*
+ * is_pgxc_nodepreferred
+ * Determine if node is a preferred one
+ *
+ * Returns the pgxc_node.nodeis_preferred flag; errors out if the node
+ * does not exist.
+ */
+bool
+is_pgxc_nodepreferred(Oid nodeid)
+{
+ HeapTuple tuple;
+ Form_pgxc_node nodeForm;
+ bool result;
+
+ tuple = SearchSysCache1(PGXCNODEOID, ObjectIdGetDatum(nodeid));
+
+ if (!HeapTupleIsValid(tuple))
+ elog(ERROR, "cache lookup failed for node %u", nodeid);
+
+ nodeForm = (Form_pgxc_node) GETSTRUCT(tuple);
+ result = nodeForm->nodeis_preferred;
+ ReleaseSysCache(tuple);
+
+ return result;
+}
+
+/*
+ * is_pgxc_nodeprimary
+ * Determine if node is a primary one
+ *
+ * Returns the pgxc_node.nodeis_primary flag; errors out if the node
+ * does not exist.
+ */
+bool
+is_pgxc_nodeprimary(Oid nodeid)
+{
+ HeapTuple tuple;
+ Form_pgxc_node nodeForm;
+ bool result;
+
+ tuple = SearchSysCache1(PGXCNODEOID, ObjectIdGetDatum(nodeid));
+
+ if (!HeapTupleIsValid(tuple))
+ elog(ERROR, "cache lookup failed for node %u", nodeid);
+
+ nodeForm = (Form_pgxc_node) GETSTRUCT(tuple);
+ result = nodeForm->nodeis_primary;
+ ReleaseSysCache(tuple);
+
+ return result;
+}
+
+/*
+ * get_pgxc_groupoid
+ * Obtain PGXC Group Oid for given group name
+ * Return Invalid Oid if group does not exist
+ *
+ * Simple probe of the PGXCGROUPNAME syscache; no error on a miss.
+ */
+Oid
+get_pgxc_groupoid(const char *groupname)
+{
+ return GetSysCacheOid1(PGXCGROUPNAME,
+ PointerGetDatum(groupname));
+}
+
+/*
+ * get_pgxc_groupmembers
+ * Obtain PGXC Group members for given group Oid
+ * Return number of members and their list
+ *
+ * Member list is returned as a palloc'd array
+ * (caller is responsible for freeing it).  Errors out if the group
+ * does not exist.
+ */
+int
+get_pgxc_groupmembers(Oid groupid, Oid **members)
+{
+ HeapTuple tuple;
+ Form_pgxc_group groupForm;
+ int nmembers;
+
+ tuple = SearchSysCache1(PGXCGROUPOID, ObjectIdGetDatum(groupid));
+
+ if (!HeapTupleIsValid(tuple))
+ elog(ERROR, "cache lookup failed for group %u", groupid);
+
+ groupForm = (Form_pgxc_group) GETSTRUCT(tuple);
+ nmembers = (int) groupForm->group_members.dim1;
+ *members = (Oid *) palloc(nmembers * sizeof(Oid));
+ memcpy(*members, groupForm->group_members.values, nmembers * sizeof(Oid));
+
+ ReleaseSysCache(tuple);
+ return nmembers;
+}
+
++/*
++ * get_pgxc_groupname
++ * Get group name for given group Oid
++ *
++ * Returns a palloc'd copy of pgxc_group.group_name; errors out if the
++ * group does not exist.
++ */
++char *
++get_pgxc_groupname(Oid groupid)
++{
++ HeapTuple tuple;
++ Form_pgxc_group groupForm;
++ char *result;
++
++ tuple = SearchSysCache1(PGXCGROUPOID,
++ ObjectIdGetDatum(groupid));
++
++ if (!HeapTupleIsValid(tuple))
++ elog(ERROR, "cache lookup failed for group %u", groupid);
++
++ groupForm = (Form_pgxc_group) GETSTRUCT(tuple);
++ result = pstrdup(NameStr(groupForm->group_name));
++ ReleaseSysCache(tuple);
++ return result;
++}
+/*
+ * get_pgxc_classnodes
+ * Obtain PGXC class Datanode list for given relation Oid
+ * Return number of Datanodes and their list
+ *
+ * Node list is returned as a palloc'd array
+ * (caller is responsible for freeing it).  Errors out if the relation
+ * has no pgxc_class entry.
+ */
+int
+get_pgxc_classnodes(Oid tableid, Oid **nodes)
+{
+ HeapTuple tuple;
+ Form_pgxc_class classForm;
+ int numnodes;
+
+ tuple = SearchSysCache1(PGXCCLASSRELID, ObjectIdGetDatum(tableid));
+
+ if (!HeapTupleIsValid(tuple))
+ elog(ERROR, "cache lookup failed for relation %u", tableid);
+
+ classForm = (Form_pgxc_class) GETSTRUCT(tuple);
+ numnodes = (int) classForm->nodeoids.dim1;
+ *nodes = (Oid *) palloc(numnodes * sizeof(Oid));
+ memcpy(*nodes, classForm->nodeoids.values, numnodes * sizeof(Oid));
+
+ ReleaseSysCache(tuple);
+ return numnodes;
+}
+#endif
+
/*
* get_typavgwidth
*
* be infrequent enough that more-detailed tracking is not worth the effort.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* commandTag: compile-time-constant tag for query, or NULL if empty query
*/
CachedPlanSource *
- CreateCachedPlan(Node *raw_parse_tree,
+ CreateCachedPlan(RawStmt *raw_parse_tree,
const char *query_string,
+#ifdef PGXC
+ const char *stmt_name,
+#endif
const char *commandTag)
{
CachedPlanSource *plansource;
switch (ChoosePortalStrategy(stmt_list))
{
+#ifdef XCP
+ case PORTAL_DISTRIBUTED:
+#endif
case PORTAL_ONE_SELECT:
case PORTAL_ONE_MOD_WITH:
- query = (Query *) linitial(stmt_list);
- Assert(IsA(query, Query));
+ query = linitial_node(Query, stmt_list);
return ExecCleanTypeFromTL(query->targetList, false);
case PORTAL_ONE_RETURNING:
* relcache.c
* POSTGRES relation descriptor cache code
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
*
* IDENTIFICATION
* syscache.c
* System cache management routines
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
*
* IDENTIFICATION
},
8
},
+#ifdef PGXC
+ {PgxcClassRelationId, /* PGXCCLASSRELID */
+ PgxcClassPgxcRelIdIndexId,
+ 1,
+ {
+ Anum_pgxc_class_pcrelid,
+ 0,
+ 0,
+ 0
+ },
+ 1024
+ },
+ {PgxcGroupRelationId, /* PGXCGROUPNAME */
+ PgxcGroupGroupNameIndexId,
+ 1,
+ {
+ Anum_pgxc_group_name,
+ 0,
+ 0,
+ 0
+ },
+ 256
+ },
+ {PgxcGroupRelationId, /* PGXCGROUPOID */
+ PgxcGroupOidIndexId,
+ 1,
+ {
+ ObjectIdAttributeNumber,
+ 0,
+ 0,
+ 0
+ },
+ 256
+ },
+ {PgxcNodeRelationId, /* PGXCNODENAME */
+ PgxcNodeNodeNameIndexId,
+ 1,
+ {
+ Anum_pgxc_node_name,
+ 0,
+ 0,
+ 0
+ },
+ 256
+ },
+ {PgxcNodeRelationId, /* PGXCNODEOID */
+ PgxcNodeOidIndexId,
+ 1,
+ {
+ ObjectIdAttributeNumber,
+ 0,
+ 0,
+ 0
+ },
+ 256
+ },
+ {PgxcNodeRelationId, /* PGXCNODEIDENTIFIER */
+ PgxcNodeNodeIdIndexId,
+ 1,
+ {
+ Anum_pgxc_node_id,
+ 0,
+ 0,
+ 0
+ },
+ 256
+ },
+#endif
+ {PartitionedRelationId, /* PARTRELID */
+ PartitionedRelidIndexId,
+ 1,
+ {
+ Anum_pg_partitioned_table_partrelid,
+ 0,
+ 0,
+ 0
+ },
+ 32
+ },
{ProcedureRelationId, /* PROCNAMEARGSNSP */
ProcedureNameArgsNspIndexId,
3,
#include "utils/guc.h"
#include "utils/memutils.h"
#include "utils/ps_status.h"
+#ifdef PGXC
+#include "pgxc/pgxc.h"
+#include "pgxc/execRemote.h"
+#endif
+ /* In this module, access gettext() via err_gettext() */
#undef _
#define _(x) err_gettext(x)
- static const char *err_gettext(const char *str) pg_attribute_format_arg(1);
- static void set_errdata_field(MemoryContextData *cxt, char **ptr, const char *str);
+#ifdef USE_MODULE_MSGIDS
+static void AtProcExit_MsgModule(int code, Datum arg);
+static bool pg_msgmodule_enable_disable(int32 pid, bool enable);
+#endif
/* Global variables */
ErrorContextCallback *error_context_stack = NULL;
static const char *get_errno_symbol(int errnum);
static const char *error_severity(int elevel);
static void append_with_tabs(StringInfo buf, const char *str);
-static bool is_log_level_output(int elevel, int log_min_level);
+static bool is_log_level_output(int elevel,
+#ifdef USE_MODULE_MSGIDS
+ int moduleid,
+ int fileid,
+ int msgid,
+#endif
+ int log_min_level);
- static void write_pipe_chunks(char *data, int len, int dest);
- static void write_csvlog(ErrorData *edata);
- static void setup_formatted_log_time(void);
- static void setup_formatted_start_time(void);
+#ifdef USE_MODULE_MSGIDS
+typedef struct MsgModuleCtlStruct
+{
+ bool mm_enabled;
+ bool mm_persistent;
+ char mm_flags[FLEXIBLE_ARRAY_MEMBER];
+} MsgModuleCtlStruct;
+
+#define StartOfBackendFlags \
+ ( \
+ PGXL_MSG_MAX_MODULES * \
+ PGXL_MSG_MAX_FILEIDS_PER_MODULE * \
+ PGXL_MSG_MAX_MSGIDS_PER_FILE \
+ )
+
+#define SizeOfMsgModuleCtlStruct \
+ ( \
+ offsetof(MsgModuleCtlStruct, mm_flags) + \
+ StartOfBackendFlags + \
+ MaxBackends \
+ )
+static MsgModuleCtlStruct *MsgModuleCtl;
+#endif
/*
* in_error_recursion_trouble --- are we at risk of infinite error recursion?
MemoryContext oldcontext;
if (!errstart(edata->elevel, edata->filename, edata->lineno,
+#ifdef USE_MODULE_MSGIDS
+ edata->moduleid,
+ edata->fileid, edata->msgid,
+#endif
edata->funcname, NULL))
- return;
+ return; /* error is not to be reported at all */
newedata = &errordata[errordata_stack_depth];
- oldcontext = MemoryContextSwitchTo(edata->assoc_context);
+ recursion_depth++;
+ oldcontext = MemoryContextSwitchTo(newedata->assoc_context);
- /* Copy the supplied fields to the error stack. */
- if (edata->sqlerrcode > 0)
+ /* Copy the supplied fields to the error stack entry. */
+ if (edata->sqlerrcode != 0)
newedata->sqlerrcode = edata->sqlerrcode;
if (edata->message)
newedata->message = pstrdup(edata->message);
* globals.c
* global variable declarations
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* miscinit.c
* miscellaneous initialization support stuff
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
#include "libpq/libpq.h"
#include "mb/pg_wchar.h"
#include "miscadmin.h"
+#ifdef XCP
+#include "pgxc/execRemote.h"
+#endif
+ #include "pgstat.h"
#include "postmaster/autovacuum.h"
#include "postmaster/postmaster.h"
#include "storage/fd.h"
#include "utils/builtins.h"
#include "utils/guc.h"
#include "utils/memutils.h"
+#ifdef XCP
+#include "utils/snapmgr.h"
+#endif
#include "utils/syscache.h"
+#include "utils/lsyscache.h"
+ #include "utils/varlena.h"
#define DIRECTORY_LOCK_FILE "postmaster.pid"
* postinit.c
* postgres initialization utilities
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* just document that the connection limit is approximate.
*/
if (dbform->datconnlimit >= 0 &&
+#ifdef XCP
+ IS_PGXC_COORDINATOR &&
+#endif
!am_superuser &&
- CountDBBackends(MyDatabaseId) > dbform->datconnlimit)
+ CountDBConnections(MyDatabaseId) > dbform->datconnlimit)
ereport(FATAL,
(errcode(ERRCODE_TOO_MANY_CONNECTIONS),
errmsg("too many connections for database \"%s\"",
before_shmem_exit(ShutdownPostgres, 0);
/* The autovacuum launcher is done here */
- if (IsAutoVacuumLauncherProcess())
+ if (IsAutoVacuumLauncherProcess() || IsClusterMonitorProcess())
+ {
+ /* report this backend in the PgBackendStatus array */
+ pgstat_bestart();
+
return;
+ }
/*
* Start a new transaction here before first access to db, and get a
* See src/backend/utils/misc/README for more information.
*
*
- * Copyright (c) 2000-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Copyright (c) 2000-2017, PostgreSQL Global Development Group
*
* IDENTIFICATION
#include "access/commit_ts.h"
#include "access/gin.h"
+#ifdef PGXC
+#include "access/gtm.h"
+#include "pgxc/pgxc.h"
+#endif
+ #include "access/rmgr.h"
#include "access/transam.h"
#include "access/twophase.h"
#include "access/xact.h"
#include "parser/parser.h"
#include "parser/scansup.h"
#include "pgstat.h"
+#ifdef PGXC
+#include "commands/tablecmds.h"
+#include "commands/trigger.h"
+#include "nodes/nodes.h"
+#include "pgxc/execRemote.h"
+#include "pgxc/locator.h"
+#include "pgxc/planner.h"
+#include "pgxc/poolmgr.h"
+#include "pgxc/nodemgr.h"
+#include "pgxc/xc_maintenance_mode.h"
+#include "storage/procarray.h"
+#endif
+#ifdef XCP
+#include "commands/sequence.h"
+#include "parser/parse_utilcmd.h"
+#include "pgxc/nodemgr.h"
+#include "pgxc/squeue.h"
+#include "utils/snapmgr.h"
+#endif
#include "postmaster/autovacuum.h"
- #include "postmaster/bgworker.h"
+ #include "postmaster/bgworker_internals.h"
#include "postmaster/bgwriter.h"
#include "postmaster/postmaster.h"
#include "postmaster/syslogger.h"
true,
NULL, NULL, NULL
},
+#ifdef PGXC
+ {
+ {"enable_fast_query_shipping", PGC_USERSET, QUERY_TUNING_METHOD,
+ gettext_noop("Enables the planner's use of fast query shipping to ship query directly to datanode."),
+ NULL
+ },
+ &enable_fast_query_shipping,
+ true,
+ NULL, NULL, NULL
+ },
+ {
+ {"loose_constraints", PGC_USERSET, COORDINATORS,
+ gettext_noop("Relax enforcing of constraints"),
+ gettext_noop("If enabled then constraints like foreign keys "
+ "are not enforced. It's the users responsibility "
+ "to maintain referential integrity at the application "
+ "level")
+ },
+ &loose_constraints,
+ false,
+ NULL, NULL, NULL
+ },
+ {
+ {"gtm_backup_barrier", PGC_SUSET, QUERY_TUNING_METHOD,
+ gettext_noop("Enables coordinator to report barrier id to GTM for backup."),
+ NULL
+ },
+ >m_backup_barrier,
+ false,
+ NULL, NULL, NULL
+ },
+ {
+ {"enable_datanode_row_triggers", PGC_POSTMASTER, DEVELOPER_OPTIONS,
+ gettext_noop("Enables datanode-only ROW triggers"),
+ NULL
+ },
+ &enable_datanode_row_triggers,
+ false,
+ NULL, NULL, NULL
+ },
+#endif
+ {
+ {"enable_gathermerge", PGC_USERSET, QUERY_TUNING_METHOD,
+ gettext_noop("Enables the planner's use of gather merge plans."),
+ NULL
+ },
+ &enable_gathermerge,
+ true,
+ NULL, NULL, NULL
+ },
+
{
{"geqo", PGC_USERSET, QUERY_TUNING_GEQO,
gettext_noop("Enables genetic query optimization."),
check_TSCurrentConfig, assign_TSCurrentConfig, NULL
},
+#ifdef PGXC
+ {
+ {"gtm_host", PGC_POSTMASTER, GTM,
+ gettext_noop("Host name or address of GTM"),
+ NULL
+ },
+ &GtmHost,
+ "localhost",
+ NULL, NULL, NULL
+ },
+
+ {
+ {"pgxc_node_name", PGC_POSTMASTER, GTM,
+ gettext_noop("The Coordinator or Datanode name."),
+ NULL,
+ GUC_NO_RESET_ALL | GUC_IS_NAME
+ },
+ &PGXCNodeName,
+ "",
+ NULL, NULL, NULL
+ },
+#endif
+#ifdef XCP
+ {
+ {"parentnode", PGC_BACKEND, CONN_AUTH,
+ gettext_noop("Sets the name of the parent data node"),
+ NULL
+ },
+ &parentPGXCNode,
+ NULL,
+ NULL, NULL, NULL
+ },
+#endif /* XCP */
{
- {"ssl_ciphers", PGC_POSTMASTER, CONN_AUTH_SECURITY,
+ {"ssl_ciphers", PGC_SIGHUP, CONN_AUTH_SECURITY,
gettext_noop("Sets the list of allowed SSL ciphers."),
NULL,
GUC_SUPERUSER_ONLY
#effective_io_concurrency = 1 # 1-1000; 0 disables prefetching
#max_worker_processes = 8 # (change requires restart)
- #max_parallel_workers_per_gather = 2 # taken from max_worker_processes
+ #max_parallel_workers_per_gather = 2 # taken from max_parallel_workers
+ #max_parallel_workers = 8 # maximum number of max_worker_processes that
+ # can be used in parallel queries
#old_snapshot_threshold = -1 # 1min-60d; -1 disables; 0 is immediate
- # (change requires restart)
- #backend_flush_after = 0 # 0 disables, default is 0
+ # (change requires restart)
+ #backend_flush_after = 0 # measured in pages, 0 disables
+# - Shared queues -
+
+#shared_queues = 64 # min 16
+#shared_queue_size = 64KB # min 16KB
#------------------------------------------------------------------------------
# WRITE AHEAD LOG
#cpu_tuple_cost = 0.01 # same scale as above
#cpu_index_tuple_cost = 0.005 # same scale as above
#cpu_operator_cost = 0.0025 # same scale as above
+#network_byte_cost = 0.001 # same scale as above
+#remote_query_cost = 100.0 # same scale as above
#parallel_tuple_cost = 0.1 # same scale as above
#parallel_setup_cost = 1000.0 # same scale as above
- #min_parallel_relation_size = 8MB
+ #min_parallel_table_scan_size = 8MB
+ #min_parallel_index_scan_size = 512kB
#effective_cache_size = 4GB
# - Genetic Query Optimizer -
return out;
}
+#ifdef PGXC
+#include "gen_alloc.h"
+
+void *current_memcontext(void);
+
+/* Return the backend's CurrentMemoryContext as an opaque pointer. */
+void *current_memcontext()
+{
+ return((void *)CurrentMemoryContext);
+}
+
+/* Allocate s bytes in the long-lived TopMemoryContext. */
+void *allocTopCxt(size_t s)
+{
+ return MemoryContextAlloc(TopMemoryContext, (Size)s);
+}
+
+/* Allocator callback table for users of gen_alloc.h. */
+Gen_Alloc genAlloc_class = {(void *)MemoryContextAlloc,
+ (void *)MemoryContextAllocZero,
+ (void *)repalloc,
+ (void *)pfree,
+ (void *)current_memcontext,
+ (void *)allocTopCxt};
+
+#endif
++
+ /*
+ * Make copy of string with all trailing newline characters removed.
+ *
+ * Returns a pnstrdup'd (palloc'd) copy; the input string is not modified.
+ */
+ char *
+ pchomp(const char *in)
+ {
+ size_t n;
+
+ n = strlen(in);
+ while (n > 0 && in[n - 1] == '\n')
+ n--;
+ return pnstrdup(in, n);
+ }
* doesn't actually run the executor for them.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
PrintFileLeakWarning(res);
FileClose(res);
}
- /* Clean up index scans too */
- ReleaseResources_hash();
+
+ /* Ditto for prepared statements */
+ while (ResourceArrayGetAny(&(owner->prepstmts), &foundres))
+ {
+ char *stmt = (char *) DatumGetPointer(foundres);
+
+ if (isCommit)
+ PrintPreparedStmtLeakWarning(stmt);
+ DropPreparedStatement(stmt, false);
+ }
+
}
/* Let add-on modules get a chance too */
* code we determine the number of tapes M on the basis of workMem: we want
* workMem/M to be large enough that we read a fair amount of data each time
* we preread from a tape, so as to maintain the locality of access described
- * above. Nonetheless, with large workMem we can have many tapes.
+ * above. Nonetheless, with large workMem we can have many tapes (but not
+ * too many -- see the comments in tuplesort_merge_order).
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
void (*readtup) (Tuplesortstate *state, SortTuple *stup,
int tapenum, unsigned int len);
- /*
- * Function to move a caller tuple. This is usually implemented as a
- * memmove() shim, but function may also perform additional fix-up of
- * caller tuple where needed. Batch memory support requires the movement
- * of caller tuples from one location in memory to another.
- */
- void (*movetup) (void *dest, void *src, unsigned int len);
-
+#ifdef PGXC
+ /*
+ * Function to read length of next stored tuple.
+ * Used as 'len' parameter for readtup function.
+ */
+ unsigned int (*getlen) (Tuplesortstate *state, int tapenum, bool eofOK);
+#endif
+
/*
* This array holds the tuples now in sort memory. If we are in state
* INITIAL, the tuples are in no particular order; if we are in state
#define COPYTUP(state,stup,tup) ((*(state)->copytup) (state, stup, tup))
#define WRITETUP(state,tape,stup) ((*(state)->writetup) (state, tape, stup))
#define READTUP(state,stup,tape,len) ((*(state)->readtup) (state, stup, tape, len))
- #define MOVETUP(dest,src,len) ((*(state)->movetup) (dest, src, len))
- #define LACKMEM(state) ((state)->availMem < 0 && !(state)->batchUsed)
+#ifdef PGXC
+#define GETLEN(state,tape,eofOK) ((*(state)->getlen) (state, tape, eofOK))
+#endif
+ #define LACKMEM(state) ((state)->availMem < 0 && !(state)->slabAllocatorUsed)
#define USEMEM(state,amt) ((state)->availMem -= (amt))
#define FREEMEM(state,amt) ((state)->availMem += (amt))
SortTuple *stup);
static void readtup_heap(Tuplesortstate *state, SortTuple *stup,
int tapenum, unsigned int len);
- static void movetup_heap(void *dest, void *src, unsigned int len);
+#ifdef PGXC
+static unsigned int getlen_datanode(Tuplesortstate *state, int tapenum,
+ bool eofOK);
+static void readtup_datanode(Tuplesortstate *state, SortTuple *stup,
+ int tapenum, unsigned int len);
+#endif
static int comparetup_cluster(const SortTuple *a, const SortTuple *b,
Tuplesortstate *state);
static void copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup);
state->copytup = copytup_heap;
state->writetup = writetup_heap;
state->readtup = readtup_heap;
- state->movetup = movetup_heap;
+#ifdef PGXC
+ state->getlen = getlen;
+#endif
state->tupDesc = tupDesc; /* assume we need not copy tupDesc */
state->abbrevNext = 10;
state->copytup = copytup_cluster;
state->writetup = writetup_cluster;
state->readtup = readtup_cluster;
- state->movetup = movetup_cluster;
+#ifdef PGXC
+ state->getlen = getlen;
+#endif
state->abbrevNext = 10;
state->indexInfo = BuildIndexInfo(indexRel);
state->copytup = copytup_index;
state->writetup = writetup_index;
state->readtup = readtup_index;
- state->movetup = movetup_index;
+#ifdef PGXC
+ state->getlen = getlen;
+#endif
state->abbrevNext = 10;
state->heapRel = heapRel;
state->copytup = copytup_index;
state->writetup = writetup_index;
state->readtup = readtup_index;
- state->movetup = movetup_index;
+#ifdef PGXC
+ state->getlen = getlen;
+#endif
state->heapRel = heapRel;
state->indexRel = indexRel;
state->copytup = copytup_datum;
state->writetup = writetup_datum;
state->readtup = readtup_datum;
- state->movetup = movetup_datum;
+#ifdef PGXC
+ state->getlen = getlen;
+#endif
state->abbrevNext = 10;
state->datumType = datumType;
return state;
}
- state->batchUsed = false;
+#ifdef PGXC
+/*
+ * tuplesort_begin_merge
+ *
+ * Tuples come from sources where they are already sorted, so this is much
+ * like sorting heap tuples except there is no need to load the sorter:
+ * the sorter starts directly in the final-merge state, and the caller-
+ * appropriate readtup and getlen callbacks are installed (here the
+ * Datanode-stream implementations).
+ *
+ * Usage pattern of the merge sorter:
+ * tuplesort_begin_merge
+ * while (tuple = tuplesort_gettuple())
+ * {
+ * // process
+ * }
+ * tuplesort_end_merge
+ */
+Tuplesortstate *
+tuplesort_begin_merge(TupleDesc tupDesc,
+ int nkeys, AttrNumber *attNums,
+ Oid *sortOperators, Oid *sortCollations, bool *nullsFirstFlags,
+ ResponseCombiner *combiner,
+ int workMem)
+{
+ Tuplesortstate *state = tuplesort_begin_common(workMem, false);
+ MemoryContext oldcontext;
+ int i;
+
+ oldcontext = MemoryContextSwitchTo(state->sortcontext);
+
+ AssertArg(nkeys > 0);
+ AssertArg(combiner);
+
+#ifdef TRACE_SORT
+ if (trace_sort)
+ elog(LOG,
+ "begin merge sort: nkeys = %d, workMem = %d", nkeys, workMem);
+#endif
+
+ state->nKeys = nkeys;
+
+ TRACE_POSTGRESQL_SORT_START(MERGE_SORT,
+ false, /* no unique check */
+ nkeys,
+ workMem,
+ false);
+
+ /* Datanode-specific callbacks; copytup/writetup are unused (NULL) */
+ state->combiner = combiner;
+ state->comparetup = comparetup_heap;
+ state->copytup = NULL;
+ state->writetup = NULL;
+ state->readtup = readtup_datanode;
+ state->getlen = getlen_datanode;
+
+ state->tuples = false;
- /*
- * logical tape in this case is a sorted stream
- */
- state->maxTapes = combiner->conn_count;
- state->tapeRange = combiner->conn_count;
-
- state->mergeactive = (bool *) palloc0(combiner->conn_count * sizeof(bool));
- state->mergenext = (int *) palloc0(combiner->conn_count * sizeof(int));
- state->mergelast = (int *) palloc0(combiner->conn_count * sizeof(int));
- state->mergeavailslots = (int *) palloc0(combiner->conn_count * sizeof(int));
- state->mergeavailmem = (int64 *) palloc0(combiner->conn_count * sizeof(int64));
-
- state->mergetuples = (char **) palloc0(combiner->conn_count * sizeof(char *));
- state->mergecurrent = (char **) palloc0(combiner->conn_count * sizeof(char *));
- state->mergetail = (char **) palloc0(combiner->conn_count * sizeof(char *));
- state->mergeoverflow = (char **) palloc0(combiner->conn_count * sizeof(char *));
-
+
+ state->tupDesc = tupDesc; /* assume we need not copy tupDesc */
+ state->sortKeys = (SortSupport) palloc0(nkeys * sizeof(SortSupportData));
+
+ for (i = 0; i < nkeys; i++)
+ {
+ SortSupport sortKey = state->sortKeys + i;
+
+ AssertArg(attNums[i] != 0);
+ AssertArg(sortOperators[i] != 0);
+
+ sortKey->ssup_cxt = CurrentMemoryContext;
+ sortKey->ssup_collation = sortCollations[i];
+ sortKey->ssup_nulls_first = nullsFirstFlags[i];
+ sortKey->ssup_attno = attNums[i];
+
+ PrepareSortSupportFromOrderingOp(sortOperators[i], sortKey);
+ }
+
- beginmerge(state, state->tuples);
+ state->tp_runs = (int *) palloc0(combiner->conn_count * sizeof(int));
+ state->tp_dummy = (int *) palloc0(combiner->conn_count * sizeof(int));
+ state->tp_tapenum = (int *) palloc0(combiner->conn_count * sizeof(int));
+ /* mark each stream (tape) has one run */
+ for (i = 0; i < combiner->conn_count; i++)
+ {
+ state->tp_runs[i] = 1;
+ state->tp_tapenum[i] = i;
+ }
++ beginmerge(state);
+ /* go straight to the final merge: each input stream is one sorted run */
+ state->status = TSS_FINALMERGE;
+
+ MemoryContextSwitchTo(oldcontext);
+
+ return state;
+}
+#endif
+
/*
* tuplesort_set_bound
*
}
/*
- * mergeprereadone - load tuples from one merge input tape
+ * mergereadnext - read next tuple from one merge input tape
*
- * Read tuples from the specified tape until it has used up its free memory
- * or array slots; but ensure that we have at least one tuple, if any are
- * to be had.
+ * Returns false at EOF, or if the tape's run is already exhausted; on
+ * success the tuple is stored into *stup and true is returned.
*/
- static void
- mergeprereadone(Tuplesortstate *state, int srcTape)
+ static bool
+ mergereadnext(Tuplesortstate *state, int srcTape, SortTuple *stup)
{
unsigned int tuplen;
- SortTuple stup;
- int tupIndex;
- int64 priorAvail,
- spaceUsed;
if (!state->mergeactive[srcTape])
- return; /* tape's run is already exhausted */
-
- /*
- * Manage per-tape availMem. Only actually matters when batch memory not
- * in use.
- */
- priorAvail = state->availMem;
- state->availMem = state->mergeavailmem[srcTape];
+ return false; /* tape's run is already exhausted */
- /*
- * When batch memory is used if final on-the-fly merge, only mergeoverflow
- * test is relevant; otherwise, only LACKMEM() test is relevant.
- */
- while ((state->mergeavailslots[srcTape] > 0 &&
- state->mergeoverflow[srcTape] == NULL && !LACKMEM(state)) ||
- state->mergenext[srcTape] == 0)
- {
- /* read next tuple, if any */
+ /* read next tuple, if any */
+#ifdef PGXC
+ if ((tuplen = GETLEN(state, srcTape, true)) == 0)
+#else
- if ((tuplen = getlen(state, srcTape, true)) == 0)
+ if ((tuplen = getlen(state, srcTape, true)) == 0)
+#endif
- {
- state->mergeactive[srcTape] = false;
- break;
- }
- READTUP(state, &stup, srcTape, tuplen);
- /* find a free slot in memtuples[] for it */
- tupIndex = state->mergefreelist;
- if (tupIndex)
- state->mergefreelist = state->memtuples[tupIndex].tupindex;
- else
- {
- tupIndex = state->mergefirstfree++;
- Assert(tupIndex < state->memtupsize);
- }
- state->mergeavailslots[srcTape]--;
- /* store tuple, append to list for its tape */
- stup.tupindex = 0;
- state->memtuples[tupIndex] = stup;
- if (state->mergelast[srcTape])
- state->memtuples[state->mergelast[srcTape]].tupindex = tupIndex;
- else
- state->mergenext[srcTape] = tupIndex;
- state->mergelast[srcTape] = tupIndex;
+ {
+ state->mergeactive[srcTape] = false;
+ return false;
}
- /* update per-tape and global availmem counts */
- spaceUsed = state->mergeavailmem[srcTape] - state->availMem;
- state->mergeavailmem[srcTape] = state->availMem;
- state->availMem = priorAvail - spaceUsed;
+ READTUP(state, stup, srcTape, tuplen);
+
+ return true;
}
/*
&stup->isnull1);
}
- static void
- movetup_heap(void *dest, void *src, unsigned int len)
- {
- memmove(dest, src, len);
- }
-
+#ifdef PGXC
+/*
+ * getlen_datanode - "getlen" callback for merging pre-sorted Datanode streams.
+ *
+ * Fetches the next tuple from the connection identified by tapenum into the
+ * combiner's result slot.  Returns 0 at end of stream (only if eofOK),
+ * otherwise a nonzero dummy length: the fetched tuple lives in the slot,
+ * not on a tape, so readtup_datanode does not use the length.
+ */
+static unsigned int
+getlen_datanode(Tuplesortstate *state, int tapenum, bool eofOK)
+{
+ ResponseCombiner *combiner = state->combiner;
+ TupleTableSlot *dstslot = combiner->ss.ps.ps_ResultTupleSlot;
+ TupleTableSlot *slot;
+
+ /* direct the combiner at the stream ("tape") we are reading */
+ combiner->current_conn = tapenum;
+ slot = FetchTuple(combiner);
+ if (TupIsNull(slot))
+ {
+ if (eofOK)
+ return 0;
+ else
+ elog(ERROR, "unexpected end of data");
+ }
+
+ /* make sure the tuple ends up in the well-known result slot */
+ if (slot != dstslot)
+ ExecCopySlot(dstslot, slot);
+
+ return 1;
+}
+
+/*
+ * readtup_datanode - "readtup" callback paired with getlen_datanode.
+ *
+ * The tuple was already fetched into the combiner's result slot by
+ * getlen_datanode, so tapenum and len are unused here; we just copy the
+ * slot contents into sort-owned memory and set up the first sort key.
+ */
+static void
+readtup_datanode(Tuplesortstate *state, SortTuple *stup,
+ int tapenum, unsigned int len)
+{
+ TupleTableSlot *slot = state->combiner->ss.ps.ps_ResultTupleSlot;
+ MinimalTuple tuple;
+ HeapTupleData htup;
+
+ Assert(!TupIsNull(slot));
+
+ /* copy the tuple into sort storage */
+ tuple = ExecCopySlotMinimalTuple(slot);
+ stup->tuple = (void *) tuple;
+ USEMEM(state, GetMemoryChunkSpace(tuple));
+ /* set up first-column key value */
+ htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET;
+ htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET);
+ stup->datum1 = heap_getattr(&htup,
+ state->sortKeys[0].ssup_attno,
+ state->tupDesc,
+ &stup->isnull1);
+}
+#endif /* PGXC */
+
/*
* Routines specialized for the CLUSTER case (HeapTuple data, with
* comparisons per a btree index definition)
* before switching to the other state or activating a different read pointer.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
bool truncated; /* tuplestore_trim has removed tuples? */
int64 availMem; /* remaining memory available, in bytes */
int64 allowedMem; /* total memory allowed, in bytes */
+ int64 tuples; /* number of tuples added */
BufFile *myfile; /* underlying file, or NULL if none */
MemoryContext context; /* memory context for holding tuples */
+#ifdef XCP
+ MemoryContext tmpcxt; /* memory context for holding temporary data */
+#endif
ResourceOwner resowner; /* resowner for holding temp files */
/*
int i;
ResourceOwner oldowner;
+ if (state->stat_name)
+ state->stat_write_count++;
+ state->tuples++;
switch (state->status)
{
* destroyed at the end of each transaction.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* transaction).
*
* These arrangements let us reset MyPgXact->xmin when there are no snapshots
- * referenced by this transaction. (One possible improvement would be to be
- * able to advance Xmin when the snapshot with the earliest Xmin is no longer
- * referenced. That's a bit harder though, it requires more locking, and
- * anyway it should be rather uncommon to keep temporary snapshots referenced
- * for too long.)
+ * referenced by this transaction, and advance it when the one with oldest
+ * Xmin is no longer referenced. For simplicity however, only registered
+ * snapshots not active snapshots participate in tracking which one is oldest;
+ * we don't try to change MyPgXact->xmin except when the active-snapshot
+ * stack is empty.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
pairingheap_add(&RegisteredSnapshots, &FirstXactSnapshot->ph_node);
}
else
- CurrentSnapshot = GetSnapshotData(&CurrentSnapshotData);
+ CurrentSnapshot = GetSnapshotData(&CurrentSnapshotData, false);
- /* Don't allow catalog snapshot to be older than xact snapshot. */
- CatalogSnapshotStale = true;
-
FirstSnapshotSet = true;
return CurrentSnapshot;
}
if (IsolationUsesXactSnapshot())
+ {
+#ifdef PGXC
+ /*
+ * Consider this test case taken from portals.sql
+ *
+ * CREATE TABLE cursor (a int, b int) distribute by replication;
+ * INSERT INTO cursor VALUES (10);
+ * BEGIN;
+ * SET TRANSACTION ISOLATION LEVEL SERIALIZABLE;
+ * DECLARE c1 NO SCROLL CURSOR FOR SELECT * FROM cursor FOR UPDATE;
+ * INSERT INTO cursor VALUES (2);
+ * FETCH ALL FROM c1;
+ * would result in
+ * ERROR: attempted to lock invisible tuple
+ * because FETCH would be sent as a select to the remote nodes
+ * with command id 0, whereas the command id would be 2
+ * in the current snapshot.
+ * (1 sent by the Coordinator due to DECLARE CURSOR, and
+ * 2 because of the INSERT inside the transaction.)
+ * The command id should therefore be updated in the
+ * current snapshot.
+ */
+ if (IsConnFromCoord() || IsConnFromDatanode())
+ SnapshotSetCommandId(GetCurrentCommandId(false));
+#endif
return CurrentSnapshot;
+ }
/* Don't allow catalog snapshot to be older than xact snapshot. */
- CatalogSnapshotStale = true;
+ InvalidateCatalogSnapshot();
- CurrentSnapshot = GetSnapshotData(&CurrentSnapshotData);
+ CurrentSnapshot = GetSnapshotData(&CurrentSnapshotData, false);
return CurrentSnapshot;
}
* scan a relation for which neither catcache nor snapshot invalidations
* are sent, we must refresh the snapshot every time.
*/
- if (!CatalogSnapshotStale && !RelationInvalidatesSnapshotsOnly(relid) &&
+ if (CatalogSnapshot &&
+ !RelationInvalidatesSnapshotsOnly(relid) &&
!RelationHasSysCache(relid))
- CatalogSnapshotStale = true;
+ InvalidateCatalogSnapshot();
- if (CatalogSnapshotStale)
+ if (CatalogSnapshot == NULL)
{
/* Get new snapshot. */
- CatalogSnapshot = GetSnapshotData(&CatalogSnapshotData);
+ CatalogSnapshot = GetSnapshotData(&CatalogSnapshotData, true);
/*
- * Mark new snapshost as valid. We must do this last, in case an
- * ERROR occurs inside GetSnapshotData().
+ * Make sure the catalog snapshot will be accounted for in decisions
+ * about advancing PGXACT->xmin. We could apply RegisterSnapshot, but
+ * that would result in making a physical copy, which is overkill; and
+ * it would also create a dependency on some resource owner, which we
+ * do not want for reasons explained at the head of this file. Instead
+ * just shove the CatalogSnapshot into the pairing heap manually. This
+ * has to be reversed in InvalidateCatalogSnapshot, of course.
+ *
+ * NB: it had better be impossible for this to throw error, since the
+ * CatalogSnapshot pointer is already valid.
*/
- CatalogSnapshotStale = false;
+ pairingheap_add(&RegisteredSnapshots, &CatalogSnapshot->ph_node);
}
return CatalogSnapshot;
*
* This code is released under the terms of the PostgreSQL License.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/bin/initdb/initdb.c
static void setup_config(void);
static void bootstrap_template1(void);
static void setup_auth(FILE *cmdfd);
- static void get_set_pwd(FILE *cmdfd);
+ static void get_su_pwd(void);
static void setup_depend(FILE *cmdfd);
static void setup_sysviews(FILE *cmdfd);
+#ifdef PGXC
+static void setup_nodeself(FILE *cmdfd);
+#endif
static void setup_description(FILE *cmdfd);
static void setup_collation(FILE *cmdfd);
static void setup_conversion(FILE *cmdfd);
" SET relacl = (SELECT array_agg(a.acl) FROM "
" (SELECT E'=r/\"$POSTGRES_SUPERUSERNAME\"' as acl "
" UNION SELECT unnest(pg_catalog.acldefault("
- " CASE WHEN relkind = 'S' THEN 's' ELSE 'r' END::\"char\",10::oid))"
+ " CASE WHEN relkind = " CppAsString2(RELKIND_SEQUENCE) " THEN 's' "
+ " ELSE 'r' END::\"char\"," CppAsString2(BOOTSTRAP_SUPERUSERID) "::oid))"
" ) as a) "
- " WHERE relkind IN ('r', 'v', 'm', 'S') AND relacl IS NULL;\n\n",
+ " WHERE relkind IN (" CppAsString2(RELKIND_RELATION) ", "
+ CppAsString2(RELKIND_VIEW) ", " CppAsString2(RELKIND_MATVIEW) ", "
+ CppAsString2(RELKIND_SEQUENCE) ")"
+ " AND relacl IS NULL;\n\n",
"GRANT USAGE ON SCHEMA pg_catalog TO PUBLIC;\n\n",
"GRANT CREATE, USAGE ON SCHEMA public TO PUBLIC;\n\n",
+#ifdef XCP
+ "GRANT USAGE ON SCHEMA storm_catalog TO PUBLIC;\n",
+#endif
"REVOKE ALL ON pg_largeobject FROM PUBLIC;\n\n",
"INSERT INTO pg_init_privs "
" (objoid, classoid, objsubid, initprivs, privtype)"
{"version", no_argument, NULL, 'V'},
{"debug", no_argument, NULL, 'd'},
{"show", no_argument, NULL, 's'},
- {"noclean", no_argument, NULL, 'n'},
- {"nosync", no_argument, NULL, 'N'},
+ {"noclean", no_argument, NULL, 'n'}, /* for backwards compatibility */
+ {"no-clean", no_argument, NULL, 'n'},
+ {"nosync", no_argument, NULL, 'N'}, /* for backwards compatibility */
+ {"no-sync", no_argument, NULL, 'N'},
{"sync-only", no_argument, NULL, 'S'},
- {"xlogdir", required_argument, NULL, 'X'},
+ {"waldir", required_argument, NULL, 'X'},
{"data-checksums", no_argument, NULL, 'k'},
+#ifdef PGXC
+ {"nodename", required_argument, NULL, 12},
+#endif
{NULL, 0, NULL, 0}
};
if (authwarning != NULL)
fprintf(stderr, "%s", authwarning);
- /* Get directory specification used to start this executable */
- strlcpy(bin_dir, argv[0], sizeof(bin_dir));
- get_parent_directory(bin_dir);
+ /*
+ * Build up a shell command to tell the user how to start the server
+ */
+ start_db_cmd = createPQExpBuffer();
+
+ /* Get directory specification used to start initdb ... */
+ strlcpy(pg_ctl_path, argv[0], sizeof(pg_ctl_path));
+ canonicalize_path(pg_ctl_path);
+ get_parent_directory(pg_ctl_path);
+ /* ... and tag on pg_ctl instead */
+ join_path_components(pg_ctl_path, pg_ctl_path, "pg_ctl");
+
+ /* path to pg_ctl, properly quoted */
+ appendShellString(start_db_cmd, pg_ctl_path);
+
+ /* add -D switch, with properly quoted data directory */
+ appendPQExpBufferStr(start_db_cmd, " -D ");
+ appendShellString(start_db_cmd, pgdata_native);
+
+ /* add suggested -l switch and "start" command */
+ /* translator: This is a placeholder in a shell command. */
+ appendPQExpBuffer(start_db_cmd, " -l %s start", _("logfile"));
- printf(_("You can now start the database server of the Postgres-XL coordinator using:\n\n"
- " %s%s%spostgres%s --coordinator -D %s%s%s\n"
+
+#ifdef PGXC
+ printf(_("\nSuccess.\n"));
+ {
+ char *pgxc_ctl_silent = getenv("PGXC_CTL_SILENT");
+ if (!pgxc_ctl_silent || !strlen(pgxc_ctl_silent))
+ {
- " %s%s%spg_ctl%s start -D %s%s%s -Z coordinator -l logfile\n\n"
++ printf(_("\nSuccess. You can now start the database server of the Postgres-XL coordinator using:\n\n"
++ " %s -Z coordinator\n\n"
+ "or\n"
- " %s%s%spostgres%s --datanode -D %s%s%s\n"
- "or \n"
- " %s%s%spg_ctl%s start -D %s%s%s -Z datanode -l logfile\n\n"),
- QUOTE_PATH, bin_dir, (strlen(bin_dir) > 0) ? DIR_SEP : "", QUOTE_PATH,
- QUOTE_PATH, pgdata_native, QUOTE_PATH,
- QUOTE_PATH, bin_dir, (strlen(bin_dir) > 0) ? DIR_SEP : "", QUOTE_PATH,
- QUOTE_PATH, pgdata_native, QUOTE_PATH,
- QUOTE_PATH, bin_dir, (strlen(bin_dir) > 0) ? DIR_SEP : "", QUOTE_PATH,
- QUOTE_PATH, pgdata_native, QUOTE_PATH,
- QUOTE_PATH, bin_dir, (strlen(bin_dir) > 0) ? DIR_SEP : "", QUOTE_PATH,
- QUOTE_PATH, pgdata_native, QUOTE_PATH);
+ " You can now start the database server of the Postgres-XL datanode using:\n\n"
++ " %s -Z datanode\n\n"),
++ start_db_cmd->data,
++ start_db_cmd->data);
+ }
+ }
+#else
printf(_("\nSuccess. You can now start the database server using:\n\n"
- " %s%s%spg_ctl%s -D %s%s%s -l logfile start\n\n"),
- QUOTE_PATH, bin_dir, (strlen(bin_dir) > 0) ? DIR_SEP : "", QUOTE_PATH,
- QUOTE_PATH, pgdata_native, QUOTE_PATH);
+ " %s\n\n"),
+ start_db_cmd->data);
+#endif
+ destroyPQExpBuffer(start_db_cmd);
+
return 0;
}
{
printf(_("%s is a utility to initialize, start, stop, or control a PostgreSQL server.\n\n"), progname);
printf(_("Usage:\n"));
- printf(_(" %s init[db] [-D DATADIR] [-s] [-o \"OPTIONS\"]\n"), progname);
- #ifdef PGXC
- printf(_(" %s start [-w] [-t SECS] [-Z NODE-TYPE] [-D DATADIR] [-s] [-l FILENAME] [-o \"OPTIONS\"]\n"), progname);
- printf(_(" %s restart [-w] [-t SECS] [-Z NODE-TYPE] [-D DATADIR] [-s] [-m SHUTDOWN-MODE]\n"
- " [-o \"OPTIONS\"]\n"), progname);
- #else
- printf(_(" %s start [-w] [-t SECS] [-D DATADIR] [-s] [-l FILENAME] [-o \"OPTIONS\"]\n"), progname);
- printf(_(" %s restart [-w] [-t SECS] [-D DATADIR] [-s] [-m SHUTDOWN-MODE]\n"
- " [-o \"OPTIONS\"]\n"), progname);
- #endif
- printf(_(" %s stop [-W] [-t SECS] [-D DATADIR] [-s] [-m SHUTDOWN-MODE]\n"), progname);
- printf(_(" %s reload [-D DATADIR] [-s]\n"), progname);
- printf(_(" %s status [-D DATADIR]\n"), progname);
- printf(_(" %s promote [-D DATADIR] [-s]\n"), progname);
- printf(_(" %s kill SIGNALNAME PID\n"), progname);
+ printf(_(" %s init[db] [-D DATADIR] [-s] [-o OPTIONS]\n"), progname);
- printf(_(" %s start [-D DATADIR] [-l FILENAME] [-W] [-t SECS] [-s]\n"
++ printf(_(" %s start [-D DATADIR] [-Z NODE-TYPE] [-l FILENAME] [-W] [-t SECS] [-s]\n"
+ " [-o OPTIONS] [-p PATH] [-c]\n"), progname);
+ printf(_(" %s stop [-D DATADIR] [-m SHUTDOWN-MODE] [-W] [-t SECS] [-s]\n"), progname);
- printf(_(" %s restart [-D DATADIR] [-m SHUTDOWN-MODE] [-W] [-t SECS] [-s]\n"
++ printf(_(" %s restart [-D DATADIR] [-Z NODE-TYPE] [-m SHUTDOWN-MODE] [-W] [-t SECS] [-s]\n"
+ " [-o OPTIONS] [-c]\n"), progname);
+ printf(_(" %s reload [-D DATADIR] [-s]\n"), progname);
+ printf(_(" %s status [-D DATADIR]\n"), progname);
+ printf(_(" %s promote [-D DATADIR] [-W] [-t SECS] [-s]\n"), progname);
+ printf(_(" %s kill SIGNALNAME PID\n"), progname);
#ifdef WIN32
- printf(_(" %s register [-N SERVICENAME] [-U USERNAME] [-P PASSWORD] [-D DATADIR]\n"
- " [-S START-TYPE] [-w] [-t SECS] [-o \"OPTIONS\"]\n"), progname);
+ printf(_(" %s register [-D DATADIR] [-N SERVICENAME] [-U USERNAME] [-P PASSWORD]\n"
+ " [-S START-TYPE] [-e SOURCE] [-W] [-t SECS] [-s] [-o OPTIONS]\n"), progname);
printf(_(" %s unregister [-N SERVICENAME]\n"), progname);
#endif
printf(_(" -s, --silent only print errors, no informational messages\n"));
printf(_(" -t, --timeout=SECS seconds to wait when using -w option\n"));
printf(_(" -V, --version output version information, then exit\n"));
- printf(_(" -w wait until operation completes\n"));
- printf(_(" -W do not wait until operation completes\n"));
- #ifdef PGXC
+ printf(_(" -Z NODE-TYPE can be \"coordinator\" or \"datanode\" (Postgres-XL)\n"));
- #endif
+ printf(_(" -w, --wait wait until operation completes (default)\n"));
+ printf(_(" -W, --no-wait do not wait until operation completes\n"));
printf(_(" -?, --help show this help, then exit\n"));
- printf(_("(The default is to wait for shutdown, but not for start or restart.)\n\n"));
printf(_("If the -D option is omitted, the environment variable PGDATA is used.\n"));
printf(_("\nOptions for start or restart:\n"));
/* process command-line options */
while (optind < argc)
{
- #ifdef PGXC
- while ((c = getopt_long(argc, argv, "cD:e:l:m:N:o:p:P:sS:t:U:wWZ:", long_options, &option_index)) != -1)
- #else
- while ((c = getopt_long(argc, argv, "cD:e:l:m:N:o:p:P:sS:t:U:wW", long_options, &option_index)) != -1)
- #endif
- while ((c = getopt_long(argc, argv, "cD:e:l:m:N:o:p:P:sS:t:U:wW",
++ while ((c = getopt_long(argc, argv, "cD:e:l:m:N:o:p:P:sS:t:U:wWZ:",
+ long_options, &option_index)) != -1)
{
switch (c)
{
* pg_dump is a utility for dumping out a postgres database
* into a script file.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* pg_dump will read the system catalogs in a database and dump out a
{"no-security-labels", no_argument, &dopt.no_security_labels, 1},
{"no-synchronized-snapshots", no_argument, &dopt.no_synchronized_snapshots, 1},
{"no-unlogged-table-data", no_argument, &dopt.no_unlogged_table_data, 1},
- #ifdef PGXC
+ {"no-subscriptions", no_argument, &dopt.no_subscriptions, 1},
+ {"no-sync", no_argument, NULL, 7},
-
+ {"include-nodes", no_argument, &include_nodes, 1},
- #endif
-
{NULL, 0, NULL, 0}
};
else
ExecuteSqlStatement(AH,
"SET TRANSACTION ISOLATION LEVEL "
- "REPEATABLE READ, READ ONLY");
+ "REPEATABLE READ"
+#ifndef XCP
+ ", READ ONLY"
+#endif
+ );
}
- else if (AH->remoteVersion >= 70400)
+ else
{
- /* note: comma was not accepted in SET TRANSACTION before 8.0 */
ExecuteSqlStatement(AH,
"SET TRANSACTION ISOLATION LEVEL "
- "SERIALIZABLE READ ONLY");
+ "SERIALIZABLE, READ ONLY");
}
- else
- ExecuteSqlStatement(AH,
- "SET TRANSACTION ISOLATION LEVEL SERIALIZABLE");
/*
* If user specified a snapshot to use, select that. In a parallel dump
* initdb time, see pg_init_privs).
*/
nsinfo->dobj.dump_contains = nsinfo->dobj.dump = DUMP_COMPONENT_ACL;
+ }
else if (strncmp(nsinfo->dobj.name, "pg_", 3) == 0 ||
+#ifdef XCP
+ strncmp(nsinfo->dobj.name, "storm_", 6) == 0 ||
+#endif
strcmp(nsinfo->dobj.name, "information_schema") == 0)
+ {
+ /* Other system schemas don't get dumped */
nsinfo->dobj.dump_contains = nsinfo->dobj.dump = DUMP_COMPONENT_NONE;
+ }
else
nsinfo->dobj.dump_contains = nsinfo->dobj.dump = DUMP_COMPONENT_ALL;
initacl_subquery->data,
initracl_subquery->data,
username_subquery,
+ fout->isPostgresXL
+ ? "(SELECT pclocatortype from pgxc_class v where v.pcrelid = c.oid) AS pgxclocatortype,"
+ "(SELECT pcattnum from pgxc_class v where v.pcrelid = c.oid) AS pgxcattnum,"
+ "(SELECT string_agg(node_name,',') AS pgxc_node_names from pgxc_node n where n.oid in (select unnest(nodeoids) from pgxc_class v where v.pcrelid=c.oid) ) , "
+ : "",
+ RELKIND_SEQUENCE,
attacl_subquery->data,
attracl_subquery->data,
attinitacl_subquery->data,
"d.refobjid AS owning_tab, "
"d.refobjsubid AS owning_col, "
"(SELECT spcname FROM pg_tablespace t WHERE t.oid = c.reltablespace) AS reltablespace, "
+#ifdef PGXC
+ "%s"
+#endif
"c.reloptions AS reloptions, "
"tc.reloptions AS toast_reloptions, "
- "NULL AS changed_acl "
+ "NULL AS changed_acl, "
+ "NULL AS partkeydef, "
+ "false AS ispartition, "
+ "NULL AS partbound "
"FROM pg_class c "
"LEFT JOIN pg_depend d ON "
"(c.relkind = '%c' AND "
int relpages; /* table's size in pages (from pg_class) */
bool interesting; /* true if need to collect more data */
+ bool dummy_view; /* view's real definition must be postponed */
bool postponed_def; /* matview must be postponed into post-data */
+ bool ispartition; /* is table a partition? */
+#ifdef PGXC
+ /* PGXC table locator Data */
+ char pgxclocatortype; /* Type of PGXC table locator */
+ int pgxcattnum; /* Number of the attribute the table is partitioned with */
+ char *pgxc_node_names; /* List of node names where this table is distributed */
+#endif
/*
* These fields are computed only if we decide the table is interesting
* (it's either a table to dump, or a direct parent of a dumpable table).
{"quote-all-identifiers", no_argument, "e_all_identifiers, 1},
{"role", required_argument, NULL, 3},
{"use-set-session-authorization", no_argument, &use_setsessauth, 1},
+ {"no-publications", no_argument, &no_publications, 1},
{"no-security-labels", no_argument, &no_security_labels, 1},
+ {"no-subscriptions", no_argument, &no_subscriptions, 1},
+ {"no-sync", no_argument, NULL, 4},
{"no-unlogged-table-data", no_argument, &no_unlogged_table_data, 1},
-
+ {"no-role-passwords", no_argument, &no_role_passwords, 1},
+#ifdef PGXC
+ {"dump-nodes", no_argument, &dump_nodes, 1},
+ {"include-nodes", no_argument, &include_nodes, 1},
+#endif
{NULL, 0, NULL, 0}
};
#include "storage/standbydefs.h"
#include "utils/relmapper.h"
- #define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup) \
+#ifdef XCP
+#include "pgxc/barrier.h"
+#endif
+
+ #define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask) \
{ name, desc, identify},
const RmgrDescData RmgrDescTable[RM_MAX_ID + 1] = {
}
/* For version match, only print psql banner on startup. */
else if (in_startup)
+#ifdef PGXC
+ printf("%s (PGXL %s, based on PG %s)\n", pset.progname, PGXC_VERSION, PG_VERSION);
+#else
printf("%s (%s)\n", pset.progname, PG_VERSION);
+#endif
if (pset.sversion / 100 > client_ver / 100)
- printf(_("WARNING: %s major version %d.%d, server major version %d.%d.\n"
+ printf(_("WARNING: %s major version %s, server major version %s.\n"
" Some psql features might not work.\n"),
- pset.progname, client_ver / 10000, (client_ver / 100) % 100,
- pset.sversion / 10000, (pset.sversion / 100) % 100);
+ pset.progname,
+ formatPGVersionNumber(client_ver, false,
+ cverbuf, sizeof(cverbuf)),
+ formatPGVersionNumber(pset.sversion, false,
+ sverbuf, sizeof(sverbuf)));
#ifdef WIN32
checkWin32Codepage();
#define THING_NO_CREATE (1 << 0) /* should not show up after CREATE */
#define THING_NO_DROP (1 << 1) /* should not show up after DROP */
- #define THING_NO_SHOW (THING_NO_CREATE | THING_NO_DROP)
+ #define THING_NO_ALTER (1 << 2) /* should not show up after ALTER */
+ #define THING_NO_SHOW (THING_NO_CREATE | THING_NO_DROP | THING_NO_ALTER)
static const pgsql_thing_t words_after_create[] = {
- {"ACCESS METHOD", NULL, NULL},
+ {"ACCESS METHOD", NULL, NULL, THING_NO_ALTER},
{"AGGREGATE", NULL, &Query_for_list_of_aggregates},
+	{"BARRIER", NULL, NULL},	/* Barrier name comes next, so skip it */
{"CAST", NULL, NULL}, /* Casts have complex structures for names, so
* skip it */
{"COLLATION", "SELECT pg_catalog.quote_ident(collname) FROM pg_catalog.pg_collation WHERE collencoding IN (-1, pg_catalog.pg_char_to_encoding(pg_catalog.getdatabaseencoding())) AND substring(pg_catalog.quote_ident(collname),1,%d)='%s'"},
{"DOMAIN", NULL, &Query_for_list_of_domains},
{"EVENT TRIGGER", NULL, NULL},
{"EXTENSION", Query_for_list_of_extensions},
- {"FOREIGN DATA WRAPPER", NULL, NULL},
- {"FOREIGN TABLE", NULL, NULL},
{"FUNCTION", NULL, &Query_for_list_of_functions},
{"GROUP", Query_for_list_of_roles},
- {"LANGUAGE", Query_for_list_of_languages},
{"INDEX", NULL, &Query_for_list_of_indexes},
+ {"LANGUAGE", Query_for_list_of_languages},
+ {"LARGE OBJECT", NULL, NULL, THING_NO_CREATE | THING_NO_DROP},
+ {"NODE", Query_for_list_of_available_nodenames},
+ {"NODE GROUP", Query_for_list_of_available_nodegroup_names},
{"MATERIALIZED VIEW", NULL, &Query_for_list_of_matviews},
{"OPERATOR", NULL, NULL}, /* Querying for this is probably not such a
* good idea. */
{"RULE", "SELECT pg_catalog.quote_ident(rulename) FROM pg_catalog.pg_rules WHERE substring(pg_catalog.quote_ident(rulename),1,%d)='%s'"},
{"SCHEMA", Query_for_list_of_schemas},
{"SEQUENCE", NULL, &Query_for_list_of_sequences},
+ {"SERVER", Query_for_list_of_servers},
+ {"STATISTICS", NULL, &Query_for_list_of_statistics},
+ {"SUBSCRIPTION", Query_for_list_of_subscriptions},
+ {"SYSTEM", NULL, NULL, THING_NO_CREATE | THING_NO_DROP},
{"TABLE", NULL, &Query_for_list_of_tables},
{"TABLESPACE", Query_for_list_of_tablespaces},
- {"TEMP", NULL, NULL, THING_NO_DROP}, /* for CREATE TEMP TABLE ... */
+ {"TEMP", NULL, NULL, THING_NO_DROP | THING_NO_ALTER}, /* for CREATE TEMP TABLE
+ * ... */
{"TEMPLATE", Query_for_list_of_ts_templates, NULL, THING_NO_SHOW},
+ {"TEMPORARY", NULL, NULL, THING_NO_DROP | THING_NO_ALTER}, /* for CREATE TEMPORARY
+ * TABLE ... */
{"TEXT SEARCH", NULL, NULL},
+ {"TRANSFORM", NULL, NULL},
+ {"TRIGGER", "SELECT pg_catalog.quote_ident(tgname) FROM pg_catalog.pg_trigger WHERE substring(pg_catalog.quote_ident(tgname),1,%d)='%s' AND NOT tgisinternal"},
{"TYPE", NULL, &Query_for_list_of_datatypes},
- {"UNIQUE", NULL, NULL, THING_NO_DROP}, /* for CREATE UNIQUE INDEX ... */
- {"UNLOGGED", NULL, NULL, THING_NO_DROP}, /* for CREATE UNLOGGED TABLE
- * ... */
+ {"UNIQUE", NULL, NULL, THING_NO_DROP | THING_NO_ALTER}, /* for CREATE UNIQUE
+ * INDEX ... */
+ {"UNLOGGED", NULL, NULL, THING_NO_DROP | THING_NO_ALTER}, /* for CREATE UNLOGGED
+ * TABLE ... */
{"USER", Query_for_list_of_roles},
- {"USER MAPPING FOR", NULL, NULL},
{"VIEW", NULL, &Query_for_list_of_views},
{NULL} /* end of list */
};
else
COMPLETE_WITH_FUNCTION_ARG(prev2_wd);
}
+
+	/* ALTER NODE <nodename> */
+	else if (Matches2("ALTER", "NODE"))
+		COMPLETE_WITH_QUERY(Query_for_list_of_available_nodenames);
+	/* ALTER NODE <nodename> - the only thing that can follow is WITH */
+	else if (Matches3("ALTER", "NODE", MatchAny))
+		COMPLETE_WITH_CONST("WITH");
+	/* ALTER NODE <nodename> WITH - open the option list */
+	else if (Matches4("ALTER", "NODE", MatchAny, "WITH"))
+		COMPLETE_WITH_CONST("(");
+	/* ALTER NODE <nodename> WITH ( - offer the node options */
+	else if (Matches5("ALTER", "NODE", MatchAny, "WITH", "("))
+		COMPLETE_WITH_LIST5("TYPE", "HOST", "PORT", "PRIMARY", "PREFERRED");
+
+ /* ALTER PUBLICATION <name> */
+ else if (Matches3("ALTER", "PUBLICATION", MatchAny))
+ {
+ COMPLETE_WITH_LIST5("ADD TABLE", "DROP TABLE", "OWNER TO", "RENAME TO", "SET");
+ }
+ /* ALTER PUBLICATION <name> SET */
+ else if (Matches4("ALTER", "PUBLICATION", MatchAny, "SET"))
+ {
+ COMPLETE_WITH_LIST2("(", "TABLE");
+ }
+ /* ALTER PUBLICATION <name> SET ( */
+ else if (HeadMatches3("ALTER", "PUBLICATION", MatchAny) && TailMatches2("SET", "("))
+ {
+ COMPLETE_WITH_CONST("publish");
+ }
+ /* ALTER SUBSCRIPTION <name> */
+ else if (Matches3("ALTER", "SUBSCRIPTION", MatchAny))
+ {
+ COMPLETE_WITH_LIST7("CONNECTION", "ENABLE", "DISABLE", "OWNER TO",
+ "RENAME TO", "REFRESH PUBLICATION", "SET");
+ }
+ /* ALTER SUBSCRIPTION <name> REFRESH PUBLICATION */
+ else if (HeadMatches3("ALTER", "SUBSCRIPTION", MatchAny) &&
+ TailMatches2("REFRESH", "PUBLICATION"))
+ {
+ COMPLETE_WITH_CONST("WITH (");
+ }
+ /* ALTER SUBSCRIPTION <name> REFRESH PUBLICATION WITH ( */
+ else if (HeadMatches3("ALTER", "SUBSCRIPTION", MatchAny) &&
+ TailMatches4("REFRESH", "PUBLICATION", "WITH", "("))
+ {
+ COMPLETE_WITH_CONST("copy_data");
+ }
+ /* ALTER SUBSCRIPTION <name> SET */
+ else if (Matches4("ALTER", "SUBSCRIPTION", MatchAny, "SET"))
+ {
+ COMPLETE_WITH_LIST2("(", "PUBLICATION");
+ }
+ /* ALTER SUBSCRIPTION <name> SET ( */
+ else if (HeadMatches3("ALTER", "SUBSCRIPTION", MatchAny) && TailMatches2("SET", "("))
+ {
+ COMPLETE_WITH_LIST2("slot_name", "synchronous_commit");
+ }
+ /* ALTER SUBSCRIPTION <name> SET PUBLICATION */
+ else if (HeadMatches3("ALTER", "SUBSCRIPTION", MatchAny) && TailMatches2("SET", "PUBLICATION"))
+ {
+ /* complete with nothing here as this refers to remote publications */
+ }
/* ALTER SCHEMA <name> */
else if (Matches3("ALTER", "SCHEMA", MatchAny))
COMPLETE_WITH_LIST2("OWNER TO", "RENAME TO");
else if (Matches3("DROP", "OWNED", "BY"))
COMPLETE_WITH_QUERY(Query_for_list_of_roles);
+ /* DROP NODE */
+ else if (Matches2("DROP", "NODE"))
+		COMPLETE_WITH_QUERY(Query_for_list_of_available_nodenames); /* Should verify that this completion is not confused with DROP NODE GROUP */
+
+ /* DROP NODE GROUP */
+ else if (Matches3("DROP", "NODE", "GROUP"))
+ COMPLETE_WITH_QUERY(Query_for_list_of_available_nodegroup_names);
+
+ /* EXECUTE DIRECT */
+ else if (Matches2("EXECUTE", "DIRECT"))
+ COMPLETE_WITH_CONST("ON");
+ else if (Matches3("EXECUTE", "DIRECT", "ON"))
+ COMPLETE_WITH_QUERY(Query_for_list_of_available_nodenames);
+
+ /* DROP TEXT SEARCH */
else if (Matches3("DROP", "TEXT", "SEARCH"))
COMPLETE_WITH_LIST4("CONFIGURATION", "DICTIONARY", "PARSER", "TEMPLATE");
Datum *index_values, bool *index_isnull);
extern OffsetNumber _hash_binsearch(Page page, uint32 hash_value);
extern OffsetNumber _hash_binsearch_last(Page page, uint32 hash_value);
+ extern BlockNumber _hash_get_oldblock_from_newbucket(Relation rel, Bucket new_bucket);
+ extern BlockNumber _hash_get_newblock_from_oldbucket(Relation rel, Bucket old_bucket);
+ extern Bucket _hash_get_newbucket_from_oldbucket(Relation rel, Bucket old_bucket,
+ uint32 lowmask, uint32 maxbucket);
+ extern void _hash_kill_items(IndexScanDesc scan);
/* hash.c */
- extern void hash_redo(XLogReaderState *record);
- extern void hash_desc(StringInfo buf, XLogReaderState *record);
- extern const char *hash_identify(uint8 info);
+ extern void hashbucketcleanup(Relation rel, Bucket cur_bucket,
+ Buffer bucket_buf, BlockNumber bucket_blkno,
+ BufferAccessStrategy bstrategy,
+ uint32 maxbucket, uint32 highmask, uint32 lowmask,
+ double *tuples_removed, double *num_index_tuples,
+ bool bucket_has_garbage,
+ IndexBulkDeleteCallback callback, void *callback_state);
+#ifdef PGXC
+extern Datum compute_hash(Oid type, Datum value, char locator);
+extern char *get_compute_hash_function(Oid type, char locator);
+#endif
+
#endif /* HASH_H */
* POSTGRES heap tuple definitions.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/access/htup.h
*/
/* symbol name, textual name, redo, desc, identify, startup, cleanup */
- PG_RMGR(RM_XLOG_ID, "XLOG", xlog_redo, xlog_desc, xlog_identify, NULL, NULL)
- PG_RMGR(RM_XACT_ID, "Transaction", xact_redo, xact_desc, xact_identify, NULL, NULL)
- PG_RMGR(RM_SMGR_ID, "Storage", smgr_redo, smgr_desc, smgr_identify, NULL, NULL)
- PG_RMGR(RM_CLOG_ID, "CLOG", clog_redo, clog_desc, clog_identify, NULL, NULL)
- PG_RMGR(RM_DBASE_ID, "Database", dbase_redo, dbase_desc, dbase_identify, NULL, NULL)
- PG_RMGR(RM_TBLSPC_ID, "Tablespace", tblspc_redo, tblspc_desc, tblspc_identify, NULL, NULL)
- PG_RMGR(RM_MULTIXACT_ID, "MultiXact", multixact_redo, multixact_desc, multixact_identify, NULL, NULL)
- PG_RMGR(RM_RELMAP_ID, "RelMap", relmap_redo, relmap_desc, relmap_identify, NULL, NULL)
- PG_RMGR(RM_STANDBY_ID, "Standby", standby_redo, standby_desc, standby_identify, NULL, NULL)
- PG_RMGR(RM_HEAP2_ID, "Heap2", heap2_redo, heap2_desc, heap2_identify, NULL, NULL)
- PG_RMGR(RM_HEAP_ID, "Heap", heap_redo, heap_desc, heap_identify, NULL, NULL)
- PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_identify, NULL, NULL)
- PG_RMGR(RM_HASH_ID, "Hash", hash_redo, hash_desc, hash_identify, NULL, NULL)
- PG_RMGR(RM_GIN_ID, "Gin", gin_redo, gin_desc, gin_identify, gin_xlog_startup, gin_xlog_cleanup)
- PG_RMGR(RM_GIST_ID, "Gist", gist_redo, gist_desc, gist_identify, gist_xlog_startup, gist_xlog_cleanup)
- PG_RMGR(RM_SEQ_ID, "Sequence", seq_redo, seq_desc, seq_identify, NULL, NULL)
- PG_RMGR(RM_SPGIST_ID, "SPGist", spg_redo, spg_desc, spg_identify, spg_xlog_startup, spg_xlog_cleanup)
- PG_RMGR(RM_BRIN_ID, "BRIN", brin_redo, brin_desc, brin_identify, NULL, NULL)
- PG_RMGR(RM_COMMIT_TS_ID, "CommitTs", commit_ts_redo, commit_ts_desc, commit_ts_identify, NULL, NULL)
- PG_RMGR(RM_REPLORIGIN_ID, "ReplicationOrigin", replorigin_redo, replorigin_desc, replorigin_identify, NULL, NULL)
+ PG_RMGR(RM_XLOG_ID, "XLOG", xlog_redo, xlog_desc, xlog_identify, NULL, NULL, NULL)
+ PG_RMGR(RM_XACT_ID, "Transaction", xact_redo, xact_desc, xact_identify, NULL, NULL, NULL)
+ PG_RMGR(RM_SMGR_ID, "Storage", smgr_redo, smgr_desc, smgr_identify, NULL, NULL, NULL)
+ PG_RMGR(RM_CLOG_ID, "CLOG", clog_redo, clog_desc, clog_identify, NULL, NULL, NULL)
+ PG_RMGR(RM_DBASE_ID, "Database", dbase_redo, dbase_desc, dbase_identify, NULL, NULL, NULL)
+ PG_RMGR(RM_TBLSPC_ID, "Tablespace", tblspc_redo, tblspc_desc, tblspc_identify, NULL, NULL, NULL)
+ PG_RMGR(RM_MULTIXACT_ID, "MultiXact", multixact_redo, multixact_desc, multixact_identify, NULL, NULL, NULL)
+ PG_RMGR(RM_RELMAP_ID, "RelMap", relmap_redo, relmap_desc, relmap_identify, NULL, NULL, NULL)
+ PG_RMGR(RM_STANDBY_ID, "Standby", standby_redo, standby_desc, standby_identify, NULL, NULL, NULL)
+ PG_RMGR(RM_HEAP2_ID, "Heap2", heap2_redo, heap2_desc, heap2_identify, NULL, NULL, heap_mask)
+ PG_RMGR(RM_HEAP_ID, "Heap", heap_redo, heap_desc, heap_identify, NULL, NULL, heap_mask)
+ PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_identify, NULL, NULL, btree_mask)
+ PG_RMGR(RM_HASH_ID, "Hash", hash_redo, hash_desc, hash_identify, NULL, NULL, hash_mask)
+ PG_RMGR(RM_GIN_ID, "Gin", gin_redo, gin_desc, gin_identify, gin_xlog_startup, gin_xlog_cleanup, gin_mask)
+ PG_RMGR(RM_GIST_ID, "Gist", gist_redo, gist_desc, gist_identify, gist_xlog_startup, gist_xlog_cleanup, gist_mask)
+ PG_RMGR(RM_SEQ_ID, "Sequence", seq_redo, seq_desc, seq_identify, NULL, NULL, seq_mask)
+ PG_RMGR(RM_SPGIST_ID, "SPGist", spg_redo, spg_desc, spg_identify, spg_xlog_startup, spg_xlog_cleanup, spg_mask)
+ PG_RMGR(RM_BRIN_ID, "BRIN", brin_redo, brin_desc, brin_identify, NULL, NULL, brin_mask)
+ PG_RMGR(RM_COMMIT_TS_ID, "CommitTs", commit_ts_redo, commit_ts_desc, commit_ts_identify, NULL, NULL, NULL)
+ PG_RMGR(RM_REPLORIGIN_ID, "ReplicationOrigin", replorigin_redo, replorigin_desc, replorigin_identify, NULL, NULL, NULL)
+#ifdef PGXC
- PG_RMGR(RM_BARRIER_ID, "Barrier", barrier_redo, barrier_desc, NULL, NULL, NULL)
++PG_RMGR(RM_BARRIER_ID, "Barrier", barrier_redo, barrier_desc, barrier_identify, NULL, NULL, NULL)
+#endif
- PG_RMGR(RM_GENERIC_ID, "Generic", generic_redo, generic_desc, generic_identify, NULL, NULL)
- PG_RMGR(RM_LOGICALMSG_ID, "LogicalMessage", logicalmsg_redo, logicalmsg_desc, logicalmsg_identify, NULL, NULL)
+ PG_RMGR(RM_GENERIC_ID, "Generic", generic_redo, generic_desc, generic_identify, NULL, NULL, generic_mask)
+ PG_RMGR(RM_LOGICALMSG_ID, "LogicalMessage", logicalmsg_redo, logicalmsg_desc, logicalmsg_identify, NULL, NULL, NULL)
* postgres transaction access method support code
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
* src/include/access/transam.h
*
* postgres transaction system definitions
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
* src/include/access/xact.h
*
RECOVERY_TARGET_XID,
RECOVERY_TARGET_TIME,
RECOVERY_TARGET_NAME,
+#ifdef PGXC
+ RECOVERY_TARGET_BARRIER,
+#endif
+ RECOVERY_TARGET_LSN,
RECOVERY_TARGET_IMMEDIATE
} RecoveryTargetType;
* include file for the bootstrapping code
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
* src/include/bootstrap/bootstrap.h
*
* prototypes for functions in backend/catalog/catalog.c
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/catalog/catalog.h
* Routines to support inter-object dependencies.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
* src/include/catalog/dependency.h
*
extern void performMultipleDeletions(const ObjectAddresses *objects,
DropBehavior behavior, int flags);
-
- extern void deleteWhatDependsOn(const ObjectAddress *object,
- bool showNotices);
-
+#ifdef PGXC
+extern void performRename(const ObjectAddress *object,
+ const char *oldname,
+ const char *newname);
+#endif
extern void recordDependencyOnExpr(const ObjectAddress *depender,
Node *expr, List *rtable,
DependencyType behavior);
* prototypes for functions in backend/catalog/heap.c
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
* src/include/catalog/heap.h
*
List *containing_rowtypes,
bool allow_system_table_mods);
+#ifdef PGXC
+/* Functions related to distribution data of relations */
+extern void AddRelationDistribution(Oid relid,
+ DistributeBy *distributeby,
+ PGXCSubCluster *subcluster,
+ List *parentOids,
+ TupleDesc descriptor);
+extern void GetRelationDistributionItems(Oid relid,
+ DistributeBy *distributeby,
+ TupleDesc descriptor,
+ char *locatortype,
+ int *hashalgorithm,
+ int *hashbuckets,
+ AttrNumber *attnum);
+extern Oid *GetRelationDistributionNodes(PGXCSubCluster *subcluster,
+ int *numnodes);
+extern Oid *BuildRelationDistributionNodes(List *nodes, int *numnodes);
+extern Oid *SortRelationDistributionNodes(Oid *nodeoids, int numnodes);
+#endif
+ /* pg_partitioned_table catalog manipulation functions */
+ extern void StorePartitionKey(Relation rel,
+ char strategy,
+ int16 partnatts,
+ AttrNumber *partattrs,
+ List *partexprs,
+ Oid *partopclass,
+ Oid *partcollation);
+ extern void RemovePartitionKeyByRelId(Oid relid);
+ extern void StorePartitionBound(Relation rel, Relation parent,
+ PartitionBoundSpec *bound);
#endif /* HEAP_H */
* on system catalogs
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
* src/include/catalog/indexing.h
*
* prototypes for functions in backend/catalog/namespace.c
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/catalog/namespace.h
* along with the relation's initial contents.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/catalog/pg_namespace.h
* definition of the system "procedure" relation (pg_proc)
* along with the relation's initial contents.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/catalog/pg_proc.h
DATA(insert OID = 6014 ( pg_show_replication_origin_status PGNSP PGUID 12 1 100 0 0 f f f f f t v r 0 0 2249 "" "{26,25,3220,3220}" "{o,o,o,o}" "{local_id, external_id, remote_lsn, local_lsn}" _null_ _null_ pg_show_replication_origin_status _null_ _null_ _null_ ));
DESCR("get progress for all replication origins");
+#ifdef USE_MODULE_MSGIDS
+DATA(insert OID = 6015 ( pg_msgmodule_set PGNSP PGUID 12 1 1 0 0 f f f f t t i s 4 0 16 "20 20 20 25" _null_ _null_ _null_ _null_ _null_ pg_msgmodule_set _null_ _null_ _null_ ));
+DESCR("set debugging level for module/file/msg");
+DATA(insert OID = 6016 ( pg_msgmodule_change PGNSP PGUID 12 1 1 0 0 f f f f t t i s 4 0 16 "20 20 20 20" _null_ _null_ _null_ _null_ _null_ pg_msgmodule_change _null_ _null_ _null_ ));
+DESCR("change debugging level for module/file/msg");
+DATA(insert OID = 6017 ( pg_msgmodule_enable PGNSP PGUID 12 1 1 0 0 f f f f t t i s 1 0 16 "20" _null_ _null_ _null_ _null_ _null_ pg_msgmodule_enable _null_ _null_ _null_ ));
+DESCR("pid to honour overriden log levels");
+DATA(insert OID = 6018 ( pg_msgmodule_disable PGNSP PGUID 12 1 1 0 0 f f f f t t i s 1 0 16 "20" _null_ _null_ _null_ _null_ _null_ pg_msgmodule_disable _null_ _null_ _null_ ));
+DESCR("pid to ignore overriden log levels");
+DATA(insert OID = 6019 ( pg_msgmodule_enable_all PGNSP PGUID 12 1 1 0 0 f f f f t t i s 1 0 16 "16" _null_ _null_ _null_ _null_ _null_ pg_msgmodule_enable_all _null_ _null_ _null_ ));
+DESCR("all current/future processes to honour overriden log levels");
+DATA(insert OID = 6020 ( pg_msgmodule_disable_all PGNSP PGUID 12 1 1 0 0 f f f f t t i s 0 0 16 "" _null_ _null_ _null_ _null_ _null_ pg_msgmodule_disable_all _null_ _null_ _null_ ));
+DESCR("all processes to ignore overriden log levels");
+#endif
+ /* publications */
+ DATA(insert OID = 6119 ( pg_get_publication_tables PGNSP PGUID 12 1 1000 0 0 f f t f t t s s 1 0 26 "25" "{25,26}" "{i,o}" "{pubname,relid}" _null_ _null_ pg_get_publication_tables _null_ _null_ _null_ ));
+ DESCR("get OIDs of tables in a publication");
/* rls */
DATA(insert OID = 3298 ( row_security_active PGNSP PGUID 12 1 0 0 0 f f f f t f s s 1 0 16 "26" _null_ _null_ _null_ _null_ _null_ row_security_active _null_ _null_ _null_ ));
* along with the relation's initial contents.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/catalog/pg_type.h
TimestampTz prepare_time; /* the time when the stmt was prepared */
} PreparedStatement;
+#ifdef PGXC
+typedef struct
+{
+ /* dynahash.c requires key to be first field */
+ char stmt_name[NAMEDATALEN];
+ int number_of_nodes; /* number of nodes where statement is active */
+ int dns_node_indices[0]; /* node ids where statement is active */
+} DatanodeStatement;
+#endif
/* Utility statements PREPARE, EXECUTE, DEALLOCATE, EXPLAIN EXECUTE */
- extern void PrepareQuery(PrepareStmt *stmt, const char *queryString);
+ extern void PrepareQuery(PrepareStmt *stmt, const char *queryString,
+ int stmt_location, int stmt_len);
extern void ExecuteQuery(ExecuteStmt *stmt, IntoClause *intoClause,
const char *queryString, ParamListInfo params,
DestReceiver *dest, char *completionTag);
#include "catalog/objectaddress.h"
#include "nodes/parsenodes.h"
- #ifdef PGXC
extern Oid CreateSchemaCommand(CreateSchemaStmt *parsetree,
- const char *queryString, bool is_top_level);
- #else
- extern Oid CreateSchemaCommand(CreateSchemaStmt *parsetree,
- const char *queryString);
- #endif
+ const char *queryString,
++ bool is_top_level,
+ int stmt_location, int stmt_len);
+
extern void RemoveSchemaById(Oid schemaOid);
extern ObjectAddress RenameSchema(const char *oldname, const char *newname);
* sequence.h
* prototypes for sequence.c.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/commands/sequence.h
#include "fmgr.h"
#include "lib/stringinfo.h"
#include "nodes/parsenodes.h"
+ #include "parser/parse_node.h"
#include "storage/relfilenode.h"
+#ifdef PGXC
+#include "utils/relcache.h"
+#include "gtm/gtm_c.h"
+#include "access/xact.h"
+#endif
- typedef struct FormData_pg_sequence
+ typedef struct FormData_pg_sequence_data
{
- NameData sequence_name;
int64 last_value;
- int64 start_value;
- int64 increment_by;
- int64 max_value;
- int64 min_value;
- int64 cache_value;
int64 log_cnt;
- bool is_cycled;
bool is_called;
- } FormData_pg_sequence;
+ } FormData_pg_sequence_data;
- typedef FormData_pg_sequence *Form_pg_sequence;
+ typedef FormData_pg_sequence_data *Form_pg_sequence_data;
/*
* Columns of a sequence relation
extern void seq_redo(XLogReaderState *rptr);
extern void seq_desc(StringInfo buf, XLogReaderState *rptr);
extern const char *seq_identify(uint8 info);
+ extern void seq_mask(char *pagedata, BlockNumber blkno);
+#ifdef XCP
+#define DEFAULT_CACHEVAL 1
+extern int SequenceRangeVal;
+#endif
+#ifdef PGXC
+/*
+ * List of actions that registered the callback.
+ * This is listed here and not in sequence.c because callback can also
+ * be registered in dependency.c and tablecmds.c as sequences can be dropped
+ * or renamed in cascade.
+ */
+typedef enum
+{
+ GTM_CREATE_SEQ,
+ GTM_DROP_SEQ
+} GTM_SequenceDropType;
+
+extern bool IsTempSequence(Oid relid);
+extern char *GetGlobalSeqName(Relation rel, const char *new_seqname, const char *new_schemaname);
+#endif
+
#endif /* SEQUENCE_H */
* header file for postgres vacuum cleaner and statistics analyzer
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/commands/vacuum.h
* variable.h
* Routines for handling specialized SET variables.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/commands/variable.h
* and related modules.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/executor/execdesc.h
EState *estate; /* executor's query-wide state */
PlanState *planstate; /* tree of per-plan-node state */
+#ifdef XCP
+ SharedQueue squeue; /* the shared memory queue to sent data to other
+ * nodes */
+ int myindex; /* -1 if locally executed subplan is producing
+ * data and distribute via squeue. Otherwise
+ * get local data from squeue */
+#endif
+ /* This field is set by ExecutorRun */
+ bool already_executed; /* true if previously executed */
/* This is always set NULL by the core system, but plugins can change it */
struct Instrumentation *totaltime; /* total time spent in ExecutorRun */
* support for the POSTGRES executor module
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/executor/executor.h
#define EXEC_FLAG_WITH_OIDS 0x0020 /* force OIDs in returned tuples */
#define EXEC_FLAG_WITHOUT_OIDS 0x0040 /* force no OIDs in returned tuples */
#define EXEC_FLAG_WITH_NO_DATA 0x0080 /* rel scannability doesn't matter */
+#ifdef XCP
+/* distributed executor may never execute the plan on this node */
+#define EXEC_FLAG_SUBPLAN 0x0100
+#endif
- /*
- * ExecEvalExpr was formerly a function containing a switch statement;
- * now it's just a macro invoking the function pointed to by an ExprState
- * node. Beware of double evaluation of the ExprState argument!
- */
- #define ExecEvalExpr(expr, econtext, isNull, isDone) \
- ((*(expr)->evalfunc) (expr, econtext, isNull, isDone))
-
-
/* Hook for plugins to get control in ExecutorStart() */
typedef void (*ExecutorStart_hook_type) (QueryDesc *queryDesc, int eflags);
extern PGDLLIMPORT ExecutorStart_hook_type ExecutorStart_hook;
* tuple table support stuff
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/executor/tuptable.h
* Over time, this has also become the preferred place for widely known
* resource-limitation stuff, such as work_mem and check_stack_depth().
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/miscadmin.h
* definitions for executor state nodes
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/nodes/execnodes.h
#include "utils/reltrigger.h"
#include "utils/sortsupport.h"
#include "utils/tuplestore.h"
+#include "pgxc/squeue.h"
#include "utils/tuplesort.h"
+ #include "nodes/tidbitmap.h"
+ #include "storage/condition_variable.h"
+
+
+ /* ----------------
+ * ExprState node
+ *
+ * ExprState is the top-level node for expression evaluation.
+ * It contains instructions (in ->steps) to evaluate the expression.
+ * ----------------
+ */
+ struct ExprState; /* forward references in this file */
+ struct ExprContext;
+ struct ExprEvalStep; /* avoid including execExpr.h everywhere */
+
+ typedef Datum (*ExprStateEvalFunc) (struct ExprState *expression,
+ struct ExprContext *econtext,
+ bool *isNull);
+
+ /* Bits in ExprState->flags (see also execExpr.h for private flag bits): */
+ /* expression is for use with ExecQual() */
+ #define EEO_FLAG_IS_QUAL (1 << 0)
+
+ typedef struct ExprState
+ {
+ Node tag;
+
+ uint8 flags; /* bitmask of EEO_FLAG_* bits, see above */
+
+ /*
+ * Storage for result value of a scalar expression, or for individual
+ * column results within expressions built by ExecBuildProjectionInfo().
+ */
+ bool resnull;
+ Datum resvalue;
+
+ /*
+ * If projecting a tuple result, this slot holds the result; else NULL.
+ */
+ TupleTableSlot *resultslot;
+
+ /*
+ * Instructions to compute expression's return value.
+ */
+ struct ExprEvalStep *steps;
+
+ /*
+ * Function that actually evaluates the expression. This can be set to
+ * different values depending on the complexity of the expression.
+ */
+ ExprStateEvalFunc evalfunc;
+
+ /* original expression tree, for debugging only */
+ Expr *expr;
+
+ /*
+ * XXX: following only needed during "compilation", could be thrown away.
+ */
+
+ int steps_len; /* number of steps currently */
+ int steps_alloc; /* allocated length of steps array */
+
+ Datum *innermost_caseval;
+ bool *innermost_casenull;
+
+ Datum *innermost_domainval;
+ bool *innermost_domainnull;
+ } ExprState;
/* ----------------
ResultRelInfo *es_result_relations; /* array of ResultRelInfos */
int es_num_result_relations; /* length of array */
ResultRelInfo *es_result_relation_info; /* currently active array elt */
+#ifdef PGXC
+#ifndef PGXC
+	/*
+	 * NOTE(review): this #ifndef PGXC can never be true inside the enclosing
+	 * #ifdef PGXC block, so es_result_remoterel is never compiled in.
+	 * Confirm whether the field was intentionally disabled during the merge
+	 * or whether the inner guard is a mistake.
+	 */
+	struct PlanState *es_result_remoterel; /* currently active remote rel */
+#endif
+#endif
+ /*
+ * Info about the target partitioned target table root(s) for
+ * update/delete queries. They required only to fire any per-statement
+ * triggers defined on the table. It exists separately from
+ * es_result_relations, because partitioned tables don't appear in the
+ * plan tree for the update/delete cases.
+ */
+ ResultRelInfo *es_root_result_relations; /* array of ResultRelInfos */
+ int es_num_root_result_relations; /* length of the array */
+
/* Stuff used for firing triggers: */
List *es_trig_target_relations; /* trigger-only ResultRelInfos */
TupleTableSlot *es_trig_tuple_slot; /* for trigger output tuples */
* Definitions for tagged nodes.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
* src/include/nodes/nodes.h
*
T_FromExpr,
T_OnConflictExpr,
T_IntoClause,
+#ifdef PGXC
+ T_DistributeBy,
+ T_PGXCSubCluster,
+#endif
+ T_NextValueExpr,
/*
* TAGS FOR EXPRESSION STATE NODES (execnodes.h)
T_PlaceHolderInfo,
T_MinMaxAggInfo,
T_PlannerParamItem,
+#ifdef XCP
+ T_RemoteSubPath,
+#endif
+ T_RollupData,
+ T_GroupingSetData,
+ T_StatisticExtInfo,
+
/*
* TAGS FOR MEMORY NODES (memnodes.h)
*/
/*
* nodes/{outfuncs.c,print.c}
*/
- extern char *nodeToString(const void *obj);
-
+#ifdef XCP
+extern void set_portable_output(bool value);
+#endif
struct Bitmapset; /* not to include bitmapset.h here */
struct StringInfoData; /* not to include stringinfo.h here */
+
extern void outNode(struct StringInfoData *str, const void *obj);
extern void outToken(struct StringInfoData *str, const char *s);
extern void outBitmapset(struct StringInfoData *str,
* Support for finding the values associated with Param nodes.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/nodes/params.h
* This is a byte (not character) offset in the original source text, to be
* used for positioning an error cursor when there is an error related to
* the node. Access to the original source text is needed to make use of
- * the location.
+ * the location. At the topmost (statement) level, we also provide a
+ * statement length, likewise measured in bytes, for convenience in
+ * identifying statement boundaries in multi-statement source strings.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
* src/include/nodes/parsenodes.h
*
#include "nodes/lockoptions.h"
#include "nodes/primnodes.h"
#include "nodes/value.h"
+#ifdef PGXC
+#include "access/tupdesc.h"
+#include "pgxc/locator.h"
+#endif
+ typedef enum OverridingKind
+ {
+ OVERRIDING_NOT_SET = 0,
+ OVERRIDING_USER_VALUE,
+ OVERRIDING_SYSTEM_VALUE
+ } OverridingKind;
+
/* Possible sources of a Query */
typedef enum QuerySource
{
RTE_SUBQUERY, /* subquery in FROM */
RTE_JOIN, /* join */
RTE_FUNCTION, /* function in FROM */
+ RTE_TABLEFUNC, /* TableFunc(.., column list) */
RTE_VALUES, /* VALUES (<exprlist>), (<exprlist>), ... */
- RTE_CTE /* common table expr (WITH list element) */
+#ifdef PGXC
- ,RTE_REMOTE_DUMMY /* RTEs created by remote plan reduction */
++ RTE_REMOTE_DUMMY, /* RTEs created by remote plan reduction */
+#endif /* PGXC */
+ RTE_CTE, /* common table expr (WITH list element) */
+ RTE_NAMEDTUPLESTORE /* tuplestore, e.g. for AFTER triggers */
} RTEKind;
typedef struct RangeTblEntry
RangeVar *sequence; /* the sequence to create */
List *options;
Oid ownerId; /* ID of owner, or InvalidOid for default */
+#ifdef PGXC
+ bool is_serial; /* Indicates if this sequence is part of SERIAL process */
+#endif
+ bool for_identity;
bool if_not_exists; /* just do nothing if it already exists? */
} CreateSeqStmt;
NodeTag type;
RangeVar *sequence; /* the sequence to alter */
List *options;
+ bool for_identity;
bool missing_ok; /* skip error if a role is missing? */
+#ifdef PGXC
+ bool is_serial; /* Indicates if this sequence is part of SERIAL process */
+#endif
} AlterSeqStmt;
/* ----------------------
bool missing_ok; /* for DROP - skip error if missing? */
} AlterTSConfigurationStmt;
+/* PGXC_BEGIN */
+/*
+ * EXECUTE DIRECT statement
+ */
+typedef struct ExecDirectStmt
+{
+ NodeTag type;
+ List *node_names;
+ char *query;
+} ExecDirectStmt;
+
+/*
+ * CLEAN CONNECTION statement
+ */
+typedef struct CleanConnStmt
+{
+ NodeTag type;
+ List *nodes; /* list of nodes dropped */
+ char *dbname; /* name of database to drop connections */
+ char *username; /* name of user whose connections are dropped */
+ bool is_coord; /* type of connections dropped */
+ bool is_force; /* option force */
+} CleanConnStmt;
+/* PGXC_END */
+ typedef struct CreatePublicationStmt
+ {
+ NodeTag type;
+ char *pubname; /* Name of the publication */
+ List *options; /* List of DefElem nodes */
+ List *tables; /* Optional list of tables to add */
+ bool for_all_tables; /* Special publication for all tables in db */
+ } CreatePublicationStmt;
+
+ typedef struct AlterPublicationStmt
+ {
+ NodeTag type;
+ char *pubname; /* Name of the publication */
+
+ /* parameters used for ALTER PUBLICATION ... WITH */
+ List *options; /* List of DefElem nodes */
+
+ /* parameters used for ALTER PUBLICATION ... ADD/DROP TABLE */
+ List *tables; /* List of tables to add/drop */
+ bool for_all_tables; /* Special publication for all tables in db */
+ DefElemAction tableAction; /* What action to perform with the tables */
+ } AlterPublicationStmt;
+
+ typedef struct CreateSubscriptionStmt
+ {
+ NodeTag type;
+ char *subname; /* Name of the subscription */
+ char *conninfo; /* Connection string to publisher */
+ List *publication; /* One or more publication to subscribe to */
+ List *options; /* List of DefElem nodes */
+ } CreateSubscriptionStmt;
+
+ typedef enum AlterSubscriptionType
+ {
+ ALTER_SUBSCRIPTION_OPTIONS,
+ ALTER_SUBSCRIPTION_CONNECTION,
+ ALTER_SUBSCRIPTION_PUBLICATION,
+ ALTER_SUBSCRIPTION_PUBLICATION_REFRESH,
+ ALTER_SUBSCRIPTION_REFRESH,
+ ALTER_SUBSCRIPTION_ENABLED
+ } AlterSubscriptionType;
+
+ typedef struct AlterSubscriptionStmt
+ {
+ NodeTag type;
+ AlterSubscriptionType kind; /* ALTER_SUBSCRIPTION_OPTIONS, etc */
+ char *subname; /* Name of the subscription */
+ char *conninfo; /* Connection string to publisher */
+ List *publication; /* One or more publication to subscribe to */
+ List *options; /* List of DefElem nodes */
+ } AlterSubscriptionStmt;
+
+ typedef struct DropSubscriptionStmt
+ {
+ NodeTag type;
+ char *subname; /* Name of the subscription */
+ bool missing_ok; /* Skip error if missing? */
+ DropBehavior behavior; /* RESTRICT or CASCADE behavior */
+ } DropSubscriptionStmt;
+
#endif /* PARSENODES_H */
* definitions for query plan nodes
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/nodes/plannodes.h
List *invalItems; /* other dependencies, as PlanInvalItems */
int nParamExec; /* number of PARAM_EXEC Params used */
+#ifdef XCP
+ int nParamRemote; /* number of params sent from the master node */
+
+ struct RemoteParam *remoteparams;/* parameter descriptors */
+
+ const char *pname; /* the portal name */
+
+ /* Parameters to filter out result rows */
+ char distributionType;
+ AttrNumber distributionKey;
+ List *distributionNodes;
+ List *distributionRestrict;
+#endif
+
+ Node *utilityStmt; /* non-null if this is utility stmt */
+
+ /* statement location in source string (copied from Query) */
+ int stmt_location; /* start location, or -1 if unknown */
+ int stmt_len; /* length in bytes; 0 means "rest of string" */
} PlannedStmt;
/* macro for fetching the Plan associated with a SubPlan node */
* and join trees.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
* src/include/nodes/primnodes.h
*
* Definitions for planner's internal data structures.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/nodes/relation.h
* prototypes for costsize.c and clausesel.c.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/optimizer/cost.h
extern bool enable_material;
extern bool enable_mergejoin;
extern bool enable_hashjoin;
- #ifdef PGXC
+extern bool enable_fast_query_shipping;
-
- #endif
+ extern bool enable_gathermerge;
extern int constraint_exclusion;
extern double clamp_row_est(double nrows);
RelOptInfo *baserel, ParamPathInfo *param_info);
extern void cost_functionscan(Path *path, PlannerInfo *root,
RelOptInfo *baserel, ParamPathInfo *param_info);
+ extern void cost_tableexprscan(Path *path, PlannerInfo *root,
+ RelOptInfo *baserel, ParamPathInfo *param_info);
extern void cost_valuesscan(Path *path, PlannerInfo *root,
RelOptInfo *baserel, ParamPathInfo *param_info);
+#ifdef PGXC
+extern void cost_remotequery(Path *path, PlannerInfo *root, RelOptInfo *baserel);
+#endif
+ extern void cost_tablefuncscan(Path *path, PlannerInfo *root,
+ RelOptInfo *baserel, ParamPathInfo *param_info);
extern void cost_ctescan(Path *path, PlannerInfo *root,
RelOptInfo *baserel, ParamPathInfo *param_info);
+ extern void cost_namedtuplestorescan(Path *path, PlannerInfo *root,
+ RelOptInfo *baserel, ParamPathInfo *param_info);
extern void cost_recursive_union(Path *runion, Path *nrterm, Path *rterm);
extern void cost_sort(Path *path, PlannerInfo *root,
List *pathkeys, Cost input_cost, double tuples, int width,
* prototypes for pathnode.c, relnode.c.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/optimizer/pathnode.h
extern GatherPath *create_gather_path(PlannerInfo *root,
RelOptInfo *rel, Path *subpath, PathTarget *target,
Relids required_outer, double *rows);
+ extern GatherMergePath *create_gather_merge_path(PlannerInfo *root,
+ RelOptInfo *rel,
+ Path *subpath,
+ PathTarget *target,
+ List *pathkeys,
+ Relids required_outer,
+ double *rows);
extern SubqueryScanPath *create_subqueryscan_path(PlannerInfo *root,
- RelOptInfo *rel, Path *subpath,
- List *pathkeys, Relids required_outer);
+ RelOptInfo *rel, Path *subpath, List *pathkeys,
+ Relids required_outer, Distribution *distribution);
extern Path *create_functionscan_path(PlannerInfo *root, RelOptInfo *rel,
List *pathkeys, Relids required_outer);
+ extern Path *create_tablexprscan_path(PlannerInfo *root, RelOptInfo *rel,
+ List *pathkeys, Relids required_outer);
extern Path *create_valuesscan_path(PlannerInfo *root, RelOptInfo *rel,
Relids required_outer);
+ extern Path *create_tablefuncscan_path(PlannerInfo *root, RelOptInfo *rel,
+ Relids required_outer);
extern Path *create_ctescan_path(PlannerInfo *root, RelOptInfo *rel,
Relids required_outer);
+ extern Path *create_namedtuplestorescan_path(PlannerInfo *root, RelOptInfo *rel,
+ Relids required_outer);
extern Path *create_worktablescan_path(PlannerInfo *root, RelOptInfo *rel,
Relids required_outer);
extern ForeignPath *create_foreignscan_path(PlannerInfo *root, RelOptInfo *rel,
* prototypes for various files in optimizer/plan
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/optimizer/planmain.h
* parse analysis for optimizable statements
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/parser/analyze.h
* by the PG_KEYWORD macro, which is not defined in this file; it can
* be defined by the caller for special purposes.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
* IDENTIFICATION
* src/include/parser/kwlist.h
PG_KEYWORD("delimiters", DELIMITERS, UNRESERVED_KEYWORD)
PG_KEYWORD("depends", DEPENDS, UNRESERVED_KEYWORD)
PG_KEYWORD("desc", DESC, RESERVED_KEYWORD)
+ PG_KEYWORD("detach", DETACH, UNRESERVED_KEYWORD)
PG_KEYWORD("dictionary", DICTIONARY, UNRESERVED_KEYWORD)
+PG_KEYWORD("direct", DIRECT, UNRESERVED_KEYWORD)
PG_KEYWORD("disable", DISABLE_P, UNRESERVED_KEYWORD)
PG_KEYWORD("discard", DISCARD, UNRESERVED_KEYWORD)
PG_KEYWORD("distinct", DISTINCT, RESERVED_KEYWORD)
PG_KEYWORD("national", NATIONAL, COL_NAME_KEYWORD)
PG_KEYWORD("natural", NATURAL, TYPE_FUNC_NAME_KEYWORD)
PG_KEYWORD("nchar", NCHAR, COL_NAME_KEYWORD)
+ PG_KEYWORD("new", NEW, UNRESERVED_KEYWORD)
PG_KEYWORD("next", NEXT, UNRESERVED_KEYWORD)
PG_KEYWORD("no", NO, UNRESERVED_KEYWORD)
+#ifdef PGXC
+PG_KEYWORD("node", NODE, UNRESERVED_KEYWORD)
+#endif
PG_KEYWORD("none", NONE, COL_NAME_KEYWORD)
PG_KEYWORD("not", NOT, RESERVED_KEYWORD)
PG_KEYWORD("nothing", NOTHING, UNRESERVED_KEYWORD)
PG_KEYWORD("procedural", PROCEDURAL, UNRESERVED_KEYWORD)
PG_KEYWORD("procedure", PROCEDURE, UNRESERVED_KEYWORD)
PG_KEYWORD("program", PROGRAM, UNRESERVED_KEYWORD)
+ PG_KEYWORD("publication", PUBLICATION, UNRESERVED_KEYWORD)
PG_KEYWORD("quote", QUOTE, UNRESERVED_KEYWORD)
+#ifdef PGXC
+PG_KEYWORD("randomly", RANDOMLY, UNRESERVED_KEYWORD)
+#endif
PG_KEYWORD("range", RANGE, UNRESERVED_KEYWORD)
PG_KEYWORD("read", READ, UNRESERVED_KEYWORD)
PG_KEYWORD("real", REAL, COL_NAME_KEYWORD)
* parse_agg.h
* handle aggregates and window functions in parser
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/parser/parse_agg.h
extern Oid LookupFuncName(List *funcname, int nargs, const Oid *argtypes,
bool noError);
- extern Oid LookupFuncNameTypeNames(List *funcname, List *argtypes,
- bool noError);
- extern Oid LookupAggNameTypeNames(List *aggname, List *argtypes,
- bool noError);
+ extern Oid LookupFuncWithArgs(ObjectWithArgs *func,
+ bool noError);
+ extern Oid LookupAggWithArgs(ObjectWithArgs *agg,
+ bool noError);
+
+ extern void check_srf_call_placement(ParseState *pstate, int location);
+extern void check_pg_get_expr_args(ParseState *pstate, Oid fnoid, List *args);
#endif /* PARSE_FUNC_H */
* parse analysis for utility commands
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
* src/include/parser/parse_utilcmd.h
*
extern void transformRuleStmt(RuleStmt *stmt, const char *queryString,
List **actions, Node **whereClause);
extern List *transformCreateSchemaStmt(CreateSchemaStmt *stmt);
+#ifdef PGXC
+extern bool CheckLocalIndexColumn (char loctype, char *partcolname, char *indexcolname);
+#endif
+ extern PartitionBoundSpec *transformPartitionBound(ParseState *pstate, Relation parent,
+ PartitionBoundSpec *spec);
#endif /* PARSE_UTILCMD_H */
#define MEMSET_LOOP_LIMIT 1024
/* Define to the address where bug reports for this package should be sent. */
/* Define to the full name of this package. */
-#define PACKAGE_NAME "PostgreSQL"
+#define PACKAGE_NAME "Postgres-XL"
/* Define to the full name and version of this package. */
- #define PACKAGE_STRING "Postgres-XL 9.6alpha1"
-#define PACKAGE_STRING "PostgreSQL 10beta1"
++#define PACKAGE_STRING "Postgres-XL 10alpha1"
/* Define to the version of this package. */
- #define PACKAGE_VERSION "9.6alpha1"
+ #define PACKAGE_VERSION "10beta1"
/* Define to the name of a signed 128-bit integer type. */
#undef PG_INT128_TYPE
#define PG_INT64_TYPE long long int
/* PostgreSQL version as a string */
- #define PG_VERSION "9.6beta4"
+ #define PG_VERSION "10beta1"
/* PostgreSQL version as a number */
- #define PG_VERSION_NUM 90600
+ #define PG_VERSION_NUM 100000
/* Define to the one symbol short name of this package. */
-#define PACKAGE_TARNAME "postgresql"
+#define PACKAGE_TARNAME "postgres-xl"
+
+/* Postgres-XC version as a string */
+#define PGXC_VERSION "1.1devel"
+
+/* Postgres-XC version as a number */
+#define PGXC_VERSION_NUM 10100
/* Define to the name of the default PostgreSQL service principal in Kerberos.
(--with-krb-srvnam=NAME) */
*
* Definitions for the PostgreSQL statistics collector daemon.
*
- * Copyright (c) 2001-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Copyright (c) 2001-2017, PostgreSQL Global Development Group
*
* src/include/pgstat.h
* ----------
* Wait Classes
* ----------
*/
- typedef enum WaitClass
+ #define PG_WAIT_LWLOCK 0x01000000U
+ #define PG_WAIT_LOCK 0x03000000U
+ #define PG_WAIT_BUFFER_PIN 0x04000000U
+ #define PG_WAIT_ACTIVITY 0x05000000U
+ #define PG_WAIT_CLIENT 0x06000000U
+ #define PG_WAIT_EXTENSION 0x07000000U
+ #define PG_WAIT_IPC 0x08000000U
+ #define PG_WAIT_TIMEOUT 0x09000000U
+ #define PG_WAIT_IO 0x0A000000U
+
+ /* ----------
+ * Wait Events - Activity
+ *
+ * Use this category when a process is waiting because it has no work to do,
+ * unless the "Client" or "Timeout" category describes the situation better.
+ * Typically, this should only be used for background processes.
+ * ----------
+ */
+ typedef enum
{
- WAIT_UNDEFINED,
- WAIT_LWLOCK_NAMED,
- WAIT_LWLOCK_TRANCHE,
- WAIT_LOCK,
- WAIT_BUFFER_PIN
- } WaitClass;
+ WAIT_EVENT_ARCHIVER_MAIN = PG_WAIT_ACTIVITY,
+ WAIT_EVENT_AUTOVACUUM_MAIN,
+ WAIT_EVENT_BGWRITER_HIBERNATE,
+ WAIT_EVENT_BGWRITER_MAIN,
+ WAIT_EVENT_CHECKPOINTER_MAIN,
+ WAIT_EVENT_PGSTAT_MAIN,
+ WAIT_EVENT_RECOVERY_WAL_ALL,
+ WAIT_EVENT_RECOVERY_WAL_STREAM,
+ WAIT_EVENT_SYSLOGGER_MAIN,
+ WAIT_EVENT_WAL_RECEIVER_MAIN,
+ WAIT_EVENT_WAL_SENDER_MAIN,
+ WAIT_EVENT_WAL_WRITER_MAIN,
+ WAIT_EVENT_LOGICAL_LAUNCHER_MAIN,
- WAIT_EVENT_LOGICAL_APPLY_MAIN
++ WAIT_EVENT_LOGICAL_APPLY_MAIN,
++ WAIT_EVENT_CLUSTER_MONITOR_MAIN
+ } WaitEventActivity;
+ /* ----------
+ * Wait Events - Client
+ *
+ * Use this category when a process is waiting to send data to or receive data
+ * from the frontend process to which it is connected. This is never used for
+ * a background process, which has no client connection.
+ * ----------
+ */
+ typedef enum
+ {
+ WAIT_EVENT_CLIENT_READ = PG_WAIT_CLIENT,
+ WAIT_EVENT_CLIENT_WRITE,
+ WAIT_EVENT_SSL_OPEN_SERVER,
+ WAIT_EVENT_WAL_RECEIVER_WAIT_START,
+ WAIT_EVENT_LIBPQWALRECEIVER,
+ WAIT_EVENT_WAL_SENDER_WAIT_WAL,
+ WAIT_EVENT_WAL_SENDER_WRITE_DATA
+ } WaitEventClient;
+
+ /* ----------
+ * Wait Events - IPC
+ *
+ * Use this category when a process cannot complete the work it is doing because
+ * it is waiting for a notification from another process.
+ * ----------
+ */
+ typedef enum
+ {
+ WAIT_EVENT_BGWORKER_SHUTDOWN = PG_WAIT_IPC,
+ WAIT_EVENT_BGWORKER_STARTUP,
+ WAIT_EVENT_BTREE_PAGE,
+ WAIT_EVENT_EXECUTE_GATHER,
+ WAIT_EVENT_MQ_INTERNAL,
+ WAIT_EVENT_MQ_PUT_MESSAGE,
+ WAIT_EVENT_MQ_RECEIVE,
+ WAIT_EVENT_MQ_SEND,
+ WAIT_EVENT_PARALLEL_FINISH,
+ WAIT_EVENT_PARALLEL_BITMAP_SCAN,
+ WAIT_EVENT_PROCARRAY_GROUP_UPDATE,
+ WAIT_EVENT_SAFE_SNAPSHOT,
+ WAIT_EVENT_SYNC_REP,
+ WAIT_EVENT_LOGICAL_SYNC_DATA,
+ WAIT_EVENT_LOGICAL_SYNC_STATE_CHANGE
+ } WaitEventIPC;
+
+ /* ----------
+ * Wait Events - Timeout
+ *
+ * Use this category when a process is waiting for a timeout to expire.
+ * ----------
+ */
+ typedef enum
+ {
+ WAIT_EVENT_BASE_BACKUP_THROTTLE = PG_WAIT_TIMEOUT,
+ WAIT_EVENT_PG_SLEEP,
+ WAIT_EVENT_RECOVERY_APPLY_DELAY
+ } WaitEventTimeout;
+
+ /* ----------
+ * Wait Events - IO
+ *
+ * Use this category when a process is waiting for a IO.
+ * ----------
+ */
+ typedef enum
+ {
+ WAIT_EVENT_BUFFILE_READ = PG_WAIT_IO,
+ WAIT_EVENT_BUFFILE_WRITE,
+ WAIT_EVENT_CONTROL_FILE_READ,
+ WAIT_EVENT_CONTROL_FILE_SYNC,
+ WAIT_EVENT_CONTROL_FILE_SYNC_UPDATE,
+ WAIT_EVENT_CONTROL_FILE_WRITE,
+ WAIT_EVENT_CONTROL_FILE_WRITE_UPDATE,
+ WAIT_EVENT_COPY_FILE_READ,
+ WAIT_EVENT_COPY_FILE_WRITE,
+ WAIT_EVENT_DATA_FILE_EXTEND,
+ WAIT_EVENT_DATA_FILE_FLUSH,
+ WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC,
+ WAIT_EVENT_DATA_FILE_PREFETCH,
+ WAIT_EVENT_DATA_FILE_READ,
+ WAIT_EVENT_DATA_FILE_SYNC,
+ WAIT_EVENT_DATA_FILE_TRUNCATE,
+ WAIT_EVENT_DATA_FILE_WRITE,
+ WAIT_EVENT_DSM_FILL_ZERO_WRITE,
+ WAIT_EVENT_LOCK_FILE_ADDTODATADIR_READ,
+ WAIT_EVENT_LOCK_FILE_ADDTODATADIR_SYNC,
+ WAIT_EVENT_LOCK_FILE_ADDTODATADIR_WRITE,
+ WAIT_EVENT_LOCK_FILE_CREATE_READ,
+ WAIT_EVENT_LOCK_FILE_CREATE_SYNC,
+ WAIT_EVENT_LOCK_FILE_CREATE_WRITE,
+ WAIT_EVENT_LOCK_FILE_RECHECKDATADIR_READ,
+ WAIT_EVENT_LOGICAL_REWRITE_CHECKPOINT_SYNC,
+ WAIT_EVENT_LOGICAL_REWRITE_MAPPING_SYNC,
+ WAIT_EVENT_LOGICAL_REWRITE_MAPPING_WRITE,
+ WAIT_EVENT_LOGICAL_REWRITE_SYNC,
+ WAIT_EVENT_LOGICAL_REWRITE_TRUNCATE,
+ WAIT_EVENT_LOGICAL_REWRITE_WRITE,
+ WAIT_EVENT_RELATION_MAP_READ,
+ WAIT_EVENT_RELATION_MAP_SYNC,
+ WAIT_EVENT_RELATION_MAP_WRITE,
+ WAIT_EVENT_REORDER_BUFFER_READ,
+ WAIT_EVENT_REORDER_BUFFER_WRITE,
+ WAIT_EVENT_REORDER_LOGICAL_MAPPING_READ,
+ WAIT_EVENT_REPLICATION_SLOT_READ,
+ WAIT_EVENT_REPLICATION_SLOT_RESTORE_SYNC,
+ WAIT_EVENT_REPLICATION_SLOT_SYNC,
+ WAIT_EVENT_REPLICATION_SLOT_WRITE,
+ WAIT_EVENT_SLRU_FLUSH_SYNC,
+ WAIT_EVENT_SLRU_READ,
+ WAIT_EVENT_SLRU_SYNC,
+ WAIT_EVENT_SLRU_WRITE,
+ WAIT_EVENT_SNAPBUILD_READ,
+ WAIT_EVENT_SNAPBUILD_SYNC,
+ WAIT_EVENT_SNAPBUILD_WRITE,
+ WAIT_EVENT_TIMELINE_HISTORY_FILE_SYNC,
+ WAIT_EVENT_TIMELINE_HISTORY_FILE_WRITE,
+ WAIT_EVENT_TIMELINE_HISTORY_READ,
+ WAIT_EVENT_TIMELINE_HISTORY_SYNC,
+ WAIT_EVENT_TIMELINE_HISTORY_WRITE,
+ WAIT_EVENT_TWOPHASE_FILE_READ,
+ WAIT_EVENT_TWOPHASE_FILE_SYNC,
+ WAIT_EVENT_TWOPHASE_FILE_WRITE,
+ WAIT_EVENT_WALSENDER_TIMELINE_HISTORY_READ,
+ WAIT_EVENT_WAL_BOOTSTRAP_SYNC,
+ WAIT_EVENT_WAL_BOOTSTRAP_WRITE,
+ WAIT_EVENT_WAL_COPY_READ,
+ WAIT_EVENT_WAL_COPY_SYNC,
+ WAIT_EVENT_WAL_COPY_WRITE,
+ WAIT_EVENT_WAL_INIT_SYNC,
+ WAIT_EVENT_WAL_INIT_WRITE,
+ WAIT_EVENT_WAL_READ,
+ WAIT_EVENT_WAL_SYNC_METHOD_ASSIGN,
+ WAIT_EVENT_WAL_WRITE
+ } WaitEventIO;
/* ----------
* Command type for progress reporting purposes
/* port/pgmkdirp.c */
extern int pg_mkdir_p(char *path, int omode);
+#ifndef PGSIGFUNC
+#define PGSIGFUNC
/* port/pqsignal.c */
typedef void (*pqsigfunc) (int signo);
+#endif
+
extern pqsigfunc pqsignal(int signo, pqsigfunc func);
+ #ifndef WIN32
+ extern pqsigfunc pqsignal_no_restart(int signo, pqsigfunc func);
+ #else
+ #define pqsignal_no_restart(signo, func) pqsignal(signo, func)
+ #endif
/* port/quotes.c */
extern char *escape_single_quotes_ascii(const char *src);
* Client-side code should include postgres_fe.h instead.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1995, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
* src/include/postgres.h
*
* header file for integrated autovacuum daemon
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
* src/include/postmaster/autovacuum.h
*
#ifndef AUTOVACUUM_H
#define AUTOVACUUM_H
+ #include "storage/block.h"
+
+ /*
+ * Other processes can request specific work from autovacuum, identified by
+ * AutoVacuumWorkItem elements.
+ */
+ typedef enum
+ {
+ AVW_BRINSummarizeRange
+ } AutoVacuumWorkItemType;
+
+
+#ifdef PGXC /* PGXC_DATANODE */
+#define IsAutoVacuumAnalyzeWorker() (IsAutoVacuumWorkerProcess() && !(MyProc->vacuumFlags & PROC_IN_VACUUM))
+#endif
+
/* GUC variables */
extern bool autovacuum_start_daemon;
extern int autovacuum_max_workers;
* POSTGRES backend id communication definitions
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/storage/backendid.h
* Lightweight lock manager
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/storage/lwlock.h
LWTRANCHE_BUFFER_MAPPING,
LWTRANCHE_LOCK_MANAGER,
LWTRANCHE_PREDICATE_LOCK_MANAGER,
+ LWTRANCHE_SHARED_QUEUES,
+ LWTRANCHE_PARALLEL_QUERY_DSA,
+ LWTRANCHE_TBM,
LWTRANCHE_FIRST_USER_DEFINED
} BuiltinTrancheIds;
* per-process shared memory data structures
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/storage/proc.h
BackendId backendId; /* This backend's backend ID (if assigned) */
Oid databaseId; /* OID of database this backend is using */
Oid roleId; /* OID of role using this backend */
+#ifdef XCP
+ Oid coordId; /* Oid of originating coordinator */
+ int coordPid; /* Pid of the originating session */
+ BackendId firstBackendId; /* Backend ID of the first backend of
+ * the distributed session */
+#endif
+ bool isBackgroundWorker; /* true if background worker. */
+
/*
* While in hot standby mode, shows that a conflict signal has been sent
* for the current transaction. Set/cleared while holding ProcArrayLock,
* Background writer, checkpointer and WAL writer run during normal operation.
* Startup process and WAL receiver also consume 2 slots, but WAL writer is
* launched only after startup has exited, so we only need 4 slots.
+ *
+ * PGXC needs another slot for the pool manager process
*/
+#ifdef PGXC
+#define NUM_AUXILIARY_PROCS 5
+#else
#define NUM_AUXILIARY_PROCS 4
+#endif
-
/* configurable options */
extern int DeadlockTimeout;
extern int StatementTimeout;
* POSTGRES process array definitions.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
* src/include/storage/procarray.h
*
#include "utils/relcache.h"
#include "utils/snapshot.h"
+#ifdef XCP
+extern int GlobalSnapshotSource;
+
+typedef enum GlobalSnapshotSourceType
+{
+ GLOBAL_SNAPSHOT_SOURCE_GTM,
+ GLOBAL_SNAPSHOT_SOURCE_COORDINATOR
+} GlobalSnapshotSourceType;
+#endif
+ /*
+ * These are to implement PROCARRAY_FLAGS_XXX
+ *
+ * Note: These flags are cloned from PROC_XXX flags in src/include/storage/proc.h
+ * to avoid forcing to include proc.h when including procarray.h. So if you modify
+ * PROC_XXX flags, you need to modify these flags.
+ */
+ #define PROCARRAY_VACUUM_FLAG 0x02 /* currently running
+ * lazy vacuum */
+ #define PROCARRAY_ANALYZE_FLAG 0x04 /* currently running
+ * analyze */
+ #define PROCARRAY_LOGICAL_DECODING_FLAG 0x10 /* currently doing
+ * logical decoding
+ * outside xact */
+
+ #define PROCARRAY_SLOTS_XMIN 0x20 /* replication slot
+ * xmin, catalog_xmin */
+ /*
+ * Only flags in PROCARRAY_PROC_FLAGS_MASK are considered when matching
+ * PGXACT->vacuumFlags. Other flags are used for different purposes and
+ * have no corresponding PROC flag equivalent.
+ */
+ #define PROCARRAY_PROC_FLAGS_MASK (PROCARRAY_VACUUM_FLAG | \
+ PROCARRAY_ANALYZE_FLAG | \
+ PROCARRAY_LOGICAL_DECODING_FLAG)
+
+ /* Use the following flags as an input "flags" to GetOldestXmin function */
+ /* Consider all backends except for logical decoding ones which manage xmin separately */
+ #define PROCARRAY_FLAGS_DEFAULT PROCARRAY_LOGICAL_DECODING_FLAG
+ /* Ignore vacuum backends */
+ #define PROCARRAY_FLAGS_VACUUM PROCARRAY_FLAGS_DEFAULT | PROCARRAY_VACUUM_FLAG
+ /* Ignore analyze backends */
+ #define PROCARRAY_FLAGS_ANALYZE PROCARRAY_FLAGS_DEFAULT | PROCARRAY_ANALYZE_FLAG
+ /* Ignore both vacuum and analyze backends */
+ #define PROCARRAY_FLAGS_VACUUM_ANALYZE PROCARRAY_FLAGS_DEFAULT | PROCARRAY_VACUUM_FLAG | PROCARRAY_ANALYZE_FLAG
+
extern Size ProcArrayShmemSize(void);
extern void CreateSharedProcArray(void);
extern void ProcArrayAdd(PGPROC *proc);
extern bool TransactionIdIsInProgress(TransactionId xid);
extern bool TransactionIdIsActive(TransactionId xid);
- extern TransactionId GetOldestXmin(Relation rel, bool ignoreVacuum);
- extern TransactionId GetOldestXminInternal(Relation rel, bool ignoreVacuum,
+ extern TransactionId GetOldestXmin(Relation rel, int flags);
++extern TransactionId GetOldestXminInternal(Relation rel, int flags,
+ bool computeLocal, TransactionId lastGlobalXmin);
extern TransactionId GetOldestActiveTransactionId(void);
- extern TransactionId GetOldestSafeDecodingTransactionId(void);
+ extern TransactionId GetOldestSafeDecodingTransactionId(bool catalogOnly);
extern VirtualTransactionId *GetVirtualXIDsDelayingChkpt(int *nvxids);
extern bool HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids);
* Routines for interprocess signalling
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/storage/procsignal.h
* storage manager switch public interface declarations.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/storage/smgr.h
* calls in portal and cursor manipulations.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/tcop/dest.h
* prototypes for pquery.c.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/tcop/pquery.h
#include "nodes/plannodes.h"
#include "storage/procsignal.h"
#include "utils/guc.h"
+ #include "utils/queryenvironment.h"
+/* needed because of 'struct timeval' and 'struct rusage' */
+#include <sys/time.h>
+#include <sys/resource.h>
/* Required daylight between max_stack_depth and the kernel limit, in bytes */
#define STACK_DEPTH_SLOP (512 * 1024L)
extern int log_statement;
extern List *pg_parse_query(const char *query_string);
- extern List *pg_analyze_and_rewrite(Node *parsetree, const char *query_string,
- Oid *paramTypes, int numParams);
- extern List *pg_analyze_and_rewrite_params(Node *parsetree,
+extern List *pg_parse_query_get_source(const char *query_string, List **queries);
+ extern List *pg_analyze_and_rewrite(RawStmt *parsetree,
+ const char *query_string,
+ Oid *paramTypes, int numParams,
+ QueryEnvironment *queryEnv);
+ extern List *pg_analyze_and_rewrite_params(RawStmt *parsetree,
const char *query_string,
ParserSetupHook parserSetup,
- void *parserSetupArg);
+ void *parserSetupArg,
+ QueryEnvironment *queryEnv);
extern PlannedStmt *pg_plan_query(Query *querytree, int cursorOptions,
ParamListInfo boundParams);
extern List *pg_plan_queries(List *querytrees, int cursorOptions,
} ProcessUtilityContext;
/* Hook for plugins to get control in ProcessUtility() */
- typedef void (*ProcessUtility_hook_type) (Node *parsetree,
+ typedef void (*ProcessUtility_hook_type) (PlannedStmt *pstmt,
const char *queryString, ProcessUtilityContext context,
ParamListInfo params,
- DestReceiver *dest, char *completionTag);
+ QueryEnvironment *queryEnv,
- #ifdef PGXC
+ DestReceiver *dest,
- #endif /* PGXC */
+ bool sentToRemote,
+ char *completionTag);
extern PGDLLIMPORT ProcessUtility_hook_type ProcessUtility_hook;
- extern void ProcessUtility(Node *parsetree, const char *queryString,
+ extern void ProcessUtility(PlannedStmt *pstmt, const char *queryString,
ProcessUtilityContext context, ParamListInfo params,
- DestReceiver *dest, char *completionTag);
+ QueryEnvironment *queryEnv,
- #ifdef PGXC
+ DestReceiver *dest,
- #endif /* PGXC */
+ bool sentToRemote,
- extern void standard_ProcessUtility(Node *parsetree, const char *queryString,
+ char *completionTag);
+ extern void standard_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
ProcessUtilityContext context, ParamListInfo params,
- DestReceiver *dest, char *completionTag);
+ QueryEnvironment *queryEnv,
- #ifdef PGXC
+ DestReceiver *dest,
- #endif /* PGXC */
+ bool sentToRemote,
+ char *completionTag);
extern bool UtilityReturnsTuples(Node *parsetree);
extern LogStmtLevel GetCommandLogLevel(Node *parsetree);
- extern bool CommandIsReadOnly(Node *parsetree);
+ extern bool CommandIsReadOnly(PlannedStmt *pstmt);
+#ifdef PGXC
+extern bool pgxc_lock_for_utility_stmt(Node *parsetree);
+#endif
+
#endif /* UTILITY_H */
* Declarations for operations on built-in types.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/utils/builtins.h
#define BUILTINS_H
#include "fmgr.h"
- #include "lib/stringinfo.h"
+#include "nodes/parsenodes.h"
+#ifdef PGXC
+#include "lib/stringinfo.h"
+#endif
- #include "utils/sortsupport.h"
- /*
- * Defined in adt/
- */
-
- /* acl.c */
- extern Datum has_any_column_privilege_name_name(PG_FUNCTION_ARGS);
- extern Datum has_any_column_privilege_name_id(PG_FUNCTION_ARGS);
- extern Datum has_any_column_privilege_id_name(PG_FUNCTION_ARGS);
- extern Datum has_any_column_privilege_id_id(PG_FUNCTION_ARGS);
- extern Datum has_any_column_privilege_name(PG_FUNCTION_ARGS);
- extern Datum has_any_column_privilege_id(PG_FUNCTION_ARGS);
- extern Datum has_column_privilege_name_name_name(PG_FUNCTION_ARGS);
- extern Datum has_column_privilege_name_name_attnum(PG_FUNCTION_ARGS);
- extern Datum has_column_privilege_name_id_name(PG_FUNCTION_ARGS);
- extern Datum has_column_privilege_name_id_attnum(PG_FUNCTION_ARGS);
- extern Datum has_column_privilege_id_name_name(PG_FUNCTION_ARGS);
- extern Datum has_column_privilege_id_name_attnum(PG_FUNCTION_ARGS);
- extern Datum has_column_privilege_id_id_name(PG_FUNCTION_ARGS);
- extern Datum has_column_privilege_id_id_attnum(PG_FUNCTION_ARGS);
- extern Datum has_column_privilege_name_name(PG_FUNCTION_ARGS);
- extern Datum has_column_privilege_name_attnum(PG_FUNCTION_ARGS);
- extern Datum has_column_privilege_id_name(PG_FUNCTION_ARGS);
- extern Datum has_column_privilege_id_attnum(PG_FUNCTION_ARGS);
- extern Datum has_table_privilege_name_name(PG_FUNCTION_ARGS);
- extern Datum has_table_privilege_name_id(PG_FUNCTION_ARGS);
- extern Datum has_table_privilege_id_name(PG_FUNCTION_ARGS);
- extern Datum has_table_privilege_id_id(PG_FUNCTION_ARGS);
- extern Datum has_table_privilege_name(PG_FUNCTION_ARGS);
- extern Datum has_table_privilege_id(PG_FUNCTION_ARGS);
- extern Datum has_sequence_privilege_name_name(PG_FUNCTION_ARGS);
- extern Datum has_sequence_privilege_name_id(PG_FUNCTION_ARGS);
- extern Datum has_sequence_privilege_id_name(PG_FUNCTION_ARGS);
- extern Datum has_sequence_privilege_id_id(PG_FUNCTION_ARGS);
- extern Datum has_sequence_privilege_name(PG_FUNCTION_ARGS);
- extern Datum has_sequence_privilege_id(PG_FUNCTION_ARGS);
- extern Datum has_database_privilege_name_name(PG_FUNCTION_ARGS);
- extern Datum has_database_privilege_name_id(PG_FUNCTION_ARGS);
- extern Datum has_database_privilege_id_name(PG_FUNCTION_ARGS);
- extern Datum has_database_privilege_id_id(PG_FUNCTION_ARGS);
- extern Datum has_database_privilege_name(PG_FUNCTION_ARGS);
- extern Datum has_database_privilege_id(PG_FUNCTION_ARGS);
- extern Datum has_foreign_data_wrapper_privilege_name_name(PG_FUNCTION_ARGS);
- extern Datum has_foreign_data_wrapper_privilege_name_id(PG_FUNCTION_ARGS);
- extern Datum has_foreign_data_wrapper_privilege_id_name(PG_FUNCTION_ARGS);
- extern Datum has_foreign_data_wrapper_privilege_id_id(PG_FUNCTION_ARGS);
- extern Datum has_foreign_data_wrapper_privilege_name(PG_FUNCTION_ARGS);
- extern Datum has_foreign_data_wrapper_privilege_id(PG_FUNCTION_ARGS);
- extern Datum has_function_privilege_name_name(PG_FUNCTION_ARGS);
- extern Datum has_function_privilege_name_id(PG_FUNCTION_ARGS);
- extern Datum has_function_privilege_id_name(PG_FUNCTION_ARGS);
- extern Datum has_function_privilege_id_id(PG_FUNCTION_ARGS);
- extern Datum has_function_privilege_name(PG_FUNCTION_ARGS);
- extern Datum has_function_privilege_id(PG_FUNCTION_ARGS);
- extern Datum has_language_privilege_name_name(PG_FUNCTION_ARGS);
- extern Datum has_language_privilege_name_id(PG_FUNCTION_ARGS);
- extern Datum has_language_privilege_id_name(PG_FUNCTION_ARGS);
- extern Datum has_language_privilege_id_id(PG_FUNCTION_ARGS);
- extern Datum has_language_privilege_name(PG_FUNCTION_ARGS);
- extern Datum has_language_privilege_id(PG_FUNCTION_ARGS);
- extern Datum has_schema_privilege_name_name(PG_FUNCTION_ARGS);
- extern Datum has_schema_privilege_name_id(PG_FUNCTION_ARGS);
- extern Datum has_schema_privilege_id_name(PG_FUNCTION_ARGS);
- extern Datum has_schema_privilege_id_id(PG_FUNCTION_ARGS);
- extern Datum has_schema_privilege_name(PG_FUNCTION_ARGS);
- extern Datum has_schema_privilege_id(PG_FUNCTION_ARGS);
- extern Datum has_server_privilege_name_name(PG_FUNCTION_ARGS);
- extern Datum has_server_privilege_name_id(PG_FUNCTION_ARGS);
- extern Datum has_server_privilege_id_name(PG_FUNCTION_ARGS);
- extern Datum has_server_privilege_id_id(PG_FUNCTION_ARGS);
- extern Datum has_server_privilege_name(PG_FUNCTION_ARGS);
- extern Datum has_server_privilege_id(PG_FUNCTION_ARGS);
- extern Datum has_tablespace_privilege_name_name(PG_FUNCTION_ARGS);
- extern Datum has_tablespace_privilege_name_id(PG_FUNCTION_ARGS);
- extern Datum has_tablespace_privilege_id_name(PG_FUNCTION_ARGS);
- extern Datum has_tablespace_privilege_id_id(PG_FUNCTION_ARGS);
- extern Datum has_tablespace_privilege_name(PG_FUNCTION_ARGS);
- extern Datum has_tablespace_privilege_id(PG_FUNCTION_ARGS);
- extern Datum has_type_privilege_name_name(PG_FUNCTION_ARGS);
- extern Datum has_type_privilege_name_id(PG_FUNCTION_ARGS);
- extern Datum has_type_privilege_id_name(PG_FUNCTION_ARGS);
- extern Datum has_type_privilege_id_id(PG_FUNCTION_ARGS);
- extern Datum has_type_privilege_name(PG_FUNCTION_ARGS);
- extern Datum has_type_privilege_id(PG_FUNCTION_ARGS);
- extern Datum pg_has_role_name_name(PG_FUNCTION_ARGS);
- extern Datum pg_has_role_name_id(PG_FUNCTION_ARGS);
- extern Datum pg_has_role_id_name(PG_FUNCTION_ARGS);
- extern Datum pg_has_role_id_id(PG_FUNCTION_ARGS);
- extern Datum pg_has_role_name(PG_FUNCTION_ARGS);
- extern Datum pg_has_role_id(PG_FUNCTION_ARGS);
-
- /* amutils.c */
- extern Datum pg_indexam_has_property(PG_FUNCTION_ARGS);
- extern Datum pg_index_has_property(PG_FUNCTION_ARGS);
- extern Datum pg_index_column_has_property(PG_FUNCTION_ARGS);
+ #include "nodes/nodes.h"
+ #include "utils/fmgrprotos.h"
-
/* bool.c */
- extern Datum boolin(PG_FUNCTION_ARGS);
- extern Datum boolout(PG_FUNCTION_ARGS);
- extern Datum boolrecv(PG_FUNCTION_ARGS);
- extern Datum boolsend(PG_FUNCTION_ARGS);
- extern Datum booltext(PG_FUNCTION_ARGS);
- extern Datum booleq(PG_FUNCTION_ARGS);
- extern Datum boolne(PG_FUNCTION_ARGS);
- extern Datum boollt(PG_FUNCTION_ARGS);
- extern Datum boolgt(PG_FUNCTION_ARGS);
- extern Datum boolle(PG_FUNCTION_ARGS);
- extern Datum boolge(PG_FUNCTION_ARGS);
- extern Datum booland_statefunc(PG_FUNCTION_ARGS);
- extern Datum boolor_statefunc(PG_FUNCTION_ARGS);
- extern Datum bool_accum(PG_FUNCTION_ARGS);
- extern Datum bool_accum_inv(PG_FUNCTION_ARGS);
- extern Datum bool_alltrue(PG_FUNCTION_ARGS);
- extern Datum bool_anytrue(PG_FUNCTION_ARGS);
extern bool parse_bool(const char *value, bool *result);
extern bool parse_bool_with_len(const char *value, size_t len, bool *result);
extern int float4_cmp_internal(float4 a, float4 b);
extern int float8_cmp_internal(float8 a, float8 b);
- extern Datum float4in(PG_FUNCTION_ARGS);
- extern Datum float4out(PG_FUNCTION_ARGS);
- extern Datum float4recv(PG_FUNCTION_ARGS);
- extern Datum float4send(PG_FUNCTION_ARGS);
- extern Datum float8in(PG_FUNCTION_ARGS);
- extern Datum float8out(PG_FUNCTION_ARGS);
- extern Datum float8recv(PG_FUNCTION_ARGS);
- extern Datum float8send(PG_FUNCTION_ARGS);
- extern Datum float4abs(PG_FUNCTION_ARGS);
- extern Datum float4um(PG_FUNCTION_ARGS);
- extern Datum float4up(PG_FUNCTION_ARGS);
- extern Datum float4larger(PG_FUNCTION_ARGS);
- extern Datum float4smaller(PG_FUNCTION_ARGS);
- extern Datum float8abs(PG_FUNCTION_ARGS);
- extern Datum float8um(PG_FUNCTION_ARGS);
- extern Datum float8up(PG_FUNCTION_ARGS);
- extern Datum float8larger(PG_FUNCTION_ARGS);
- extern Datum float8smaller(PG_FUNCTION_ARGS);
- extern Datum float4pl(PG_FUNCTION_ARGS);
- extern Datum float4mi(PG_FUNCTION_ARGS);
- extern Datum float4mul(PG_FUNCTION_ARGS);
- extern Datum float4div(PG_FUNCTION_ARGS);
- extern Datum float8pl(PG_FUNCTION_ARGS);
- extern Datum float8mi(PG_FUNCTION_ARGS);
- extern Datum float8mul(PG_FUNCTION_ARGS);
- extern Datum float8div(PG_FUNCTION_ARGS);
- extern Datum float4eq(PG_FUNCTION_ARGS);
- extern Datum float4ne(PG_FUNCTION_ARGS);
- extern Datum float4lt(PG_FUNCTION_ARGS);
- extern Datum float4le(PG_FUNCTION_ARGS);
- extern Datum float4gt(PG_FUNCTION_ARGS);
- extern Datum float4ge(PG_FUNCTION_ARGS);
- extern Datum float8eq(PG_FUNCTION_ARGS);
- extern Datum float8ne(PG_FUNCTION_ARGS);
- extern Datum float8lt(PG_FUNCTION_ARGS);
- extern Datum float8le(PG_FUNCTION_ARGS);
- extern Datum float8gt(PG_FUNCTION_ARGS);
- extern Datum float8ge(PG_FUNCTION_ARGS);
- extern Datum ftod(PG_FUNCTION_ARGS);
- extern Datum i4tod(PG_FUNCTION_ARGS);
- extern Datum i2tod(PG_FUNCTION_ARGS);
- extern Datum dtof(PG_FUNCTION_ARGS);
- extern Datum dtoi4(PG_FUNCTION_ARGS);
- extern Datum dtoi2(PG_FUNCTION_ARGS);
- extern Datum i4tof(PG_FUNCTION_ARGS);
- extern Datum i2tof(PG_FUNCTION_ARGS);
- extern Datum ftoi4(PG_FUNCTION_ARGS);
- extern Datum ftoi2(PG_FUNCTION_ARGS);
- extern Datum dround(PG_FUNCTION_ARGS);
- extern Datum dceil(PG_FUNCTION_ARGS);
- extern Datum dfloor(PG_FUNCTION_ARGS);
- extern Datum dsign(PG_FUNCTION_ARGS);
- extern Datum dtrunc(PG_FUNCTION_ARGS);
- extern Datum dsqrt(PG_FUNCTION_ARGS);
- extern Datum dcbrt(PG_FUNCTION_ARGS);
- extern Datum dpow(PG_FUNCTION_ARGS);
- extern Datum dexp(PG_FUNCTION_ARGS);
- extern Datum dlog1(PG_FUNCTION_ARGS);
- extern Datum dlog10(PG_FUNCTION_ARGS);
- extern Datum dacos(PG_FUNCTION_ARGS);
- extern Datum dasin(PG_FUNCTION_ARGS);
- extern Datum datan(PG_FUNCTION_ARGS);
- extern Datum datan2(PG_FUNCTION_ARGS);
- extern Datum dcos(PG_FUNCTION_ARGS);
- extern Datum dcot(PG_FUNCTION_ARGS);
- extern Datum dsin(PG_FUNCTION_ARGS);
- extern Datum dtan(PG_FUNCTION_ARGS);
- extern Datum dacosd(PG_FUNCTION_ARGS);
- extern Datum dasind(PG_FUNCTION_ARGS);
- extern Datum datand(PG_FUNCTION_ARGS);
- extern Datum datan2d(PG_FUNCTION_ARGS);
- extern Datum dcosd(PG_FUNCTION_ARGS);
- extern Datum dcotd(PG_FUNCTION_ARGS);
- extern Datum dsind(PG_FUNCTION_ARGS);
- extern Datum dtand(PG_FUNCTION_ARGS);
- extern Datum degrees(PG_FUNCTION_ARGS);
- extern Datum dpi(PG_FUNCTION_ARGS);
- extern Datum radians(PG_FUNCTION_ARGS);
- extern Datum drandom(PG_FUNCTION_ARGS);
- extern Datum setseed(PG_FUNCTION_ARGS);
- extern Datum float8_combine(PG_FUNCTION_ARGS);
- extern Datum float8_accum(PG_FUNCTION_ARGS);
- extern Datum float4_accum(PG_FUNCTION_ARGS);
- extern Datum float8_avg(PG_FUNCTION_ARGS);
- extern Datum float8_var_pop(PG_FUNCTION_ARGS);
- extern Datum float8_var_samp(PG_FUNCTION_ARGS);
- extern Datum float8_stddev_pop(PG_FUNCTION_ARGS);
- extern Datum float8_stddev_samp(PG_FUNCTION_ARGS);
- extern Datum float8_regr_accum(PG_FUNCTION_ARGS);
- extern Datum float8_regr_combine(PG_FUNCTION_ARGS);
- extern Datum float8_regr_sxx(PG_FUNCTION_ARGS);
- extern Datum float8_regr_syy(PG_FUNCTION_ARGS);
- extern Datum float8_regr_sxy(PG_FUNCTION_ARGS);
- extern Datum float8_regr_avgx(PG_FUNCTION_ARGS);
- extern Datum float8_regr_avgy(PG_FUNCTION_ARGS);
- extern Datum float8_covar_pop(PG_FUNCTION_ARGS);
- extern Datum float8_covar_samp(PG_FUNCTION_ARGS);
- extern Datum float8_corr(PG_FUNCTION_ARGS);
- extern Datum float8_regr_r2(PG_FUNCTION_ARGS);
- extern Datum float8_regr_slope(PG_FUNCTION_ARGS);
- extern Datum float8_regr_intercept(PG_FUNCTION_ARGS);
- extern Datum float48pl(PG_FUNCTION_ARGS);
- extern Datum float48mi(PG_FUNCTION_ARGS);
- extern Datum float48mul(PG_FUNCTION_ARGS);
- extern Datum float48div(PG_FUNCTION_ARGS);
- extern Datum float84pl(PG_FUNCTION_ARGS);
- extern Datum float84mi(PG_FUNCTION_ARGS);
- extern Datum float84mul(PG_FUNCTION_ARGS);
- extern Datum float84div(PG_FUNCTION_ARGS);
- extern Datum float48eq(PG_FUNCTION_ARGS);
- extern Datum float48ne(PG_FUNCTION_ARGS);
- extern Datum float48lt(PG_FUNCTION_ARGS);
- extern Datum float48le(PG_FUNCTION_ARGS);
- extern Datum float48gt(PG_FUNCTION_ARGS);
- extern Datum float48ge(PG_FUNCTION_ARGS);
- extern Datum float84eq(PG_FUNCTION_ARGS);
- extern Datum float84ne(PG_FUNCTION_ARGS);
- extern Datum float84lt(PG_FUNCTION_ARGS);
- extern Datum float84le(PG_FUNCTION_ARGS);
- extern Datum float84gt(PG_FUNCTION_ARGS);
- extern Datum float84ge(PG_FUNCTION_ARGS);
- extern Datum width_bucket_float8(PG_FUNCTION_ARGS);
-
- /* dbsize.c */
- extern Datum pg_tablespace_size_oid(PG_FUNCTION_ARGS);
- extern Datum pg_tablespace_size_name(PG_FUNCTION_ARGS);
- extern Datum pg_database_size_oid(PG_FUNCTION_ARGS);
- extern Datum pg_database_size_name(PG_FUNCTION_ARGS);
- extern Datum pg_relation_size(PG_FUNCTION_ARGS);
- extern Datum pg_total_relation_size(PG_FUNCTION_ARGS);
- extern Datum pg_size_pretty(PG_FUNCTION_ARGS);
- extern Datum pg_size_pretty_numeric(PG_FUNCTION_ARGS);
- extern Datum pg_size_bytes(PG_FUNCTION_ARGS);
- extern Datum pg_table_size(PG_FUNCTION_ARGS);
- extern Datum pg_indexes_size(PG_FUNCTION_ARGS);
- extern Datum pg_relation_filenode(PG_FUNCTION_ARGS);
- extern Datum pg_filenode_relation(PG_FUNCTION_ARGS);
- extern Datum pg_relation_filepath(PG_FUNCTION_ARGS);
-
- /* genfile.c */
- extern Datum pg_stat_file(PG_FUNCTION_ARGS);
- extern Datum pg_stat_file_1arg(PG_FUNCTION_ARGS);
- extern Datum pg_read_file(PG_FUNCTION_ARGS);
- extern Datum pg_read_file_off_len(PG_FUNCTION_ARGS);
- extern Datum pg_read_file_all(PG_FUNCTION_ARGS);
- extern Datum pg_read_binary_file(PG_FUNCTION_ARGS);
- extern Datum pg_read_binary_file_off_len(PG_FUNCTION_ARGS);
- extern Datum pg_read_binary_file_all(PG_FUNCTION_ARGS);
- extern Datum pg_ls_dir(PG_FUNCTION_ARGS);
- extern Datum pg_ls_dir_1arg(PG_FUNCTION_ARGS);
-
- /* misc.c */
- extern Datum pg_num_nulls(PG_FUNCTION_ARGS);
- extern Datum pg_num_nonnulls(PG_FUNCTION_ARGS);
- extern Datum current_database(PG_FUNCTION_ARGS);
- extern Datum current_query(PG_FUNCTION_ARGS);
- extern Datum pg_cancel_backend(PG_FUNCTION_ARGS);
- extern Datum pg_terminate_backend(PG_FUNCTION_ARGS);
- extern Datum pg_reload_conf(PG_FUNCTION_ARGS);
- extern Datum pg_tablespace_databases(PG_FUNCTION_ARGS);
- extern Datum pg_tablespace_location(PG_FUNCTION_ARGS);
- extern Datum pg_rotate_logfile(PG_FUNCTION_ARGS);
- extern Datum pg_sleep(PG_FUNCTION_ARGS);
- extern Datum pg_get_keywords(PG_FUNCTION_ARGS);
- extern Datum pg_typeof(PG_FUNCTION_ARGS);
- extern Datum pg_collation_for(PG_FUNCTION_ARGS);
- extern Datum pg_relation_is_updatable(PG_FUNCTION_ARGS);
- extern Datum pg_column_is_updatable(PG_FUNCTION_ARGS);
- extern Datum parse_ident(PG_FUNCTION_ARGS);
-
/* oid.c */
- extern Datum oidin(PG_FUNCTION_ARGS);
- extern Datum oidout(PG_FUNCTION_ARGS);
- extern Datum oidrecv(PG_FUNCTION_ARGS);
- extern Datum oidsend(PG_FUNCTION_ARGS);
- extern Datum oideq(PG_FUNCTION_ARGS);
- extern Datum oidne(PG_FUNCTION_ARGS);
- extern Datum oidlt(PG_FUNCTION_ARGS);
- extern Datum oidle(PG_FUNCTION_ARGS);
- extern Datum oidge(PG_FUNCTION_ARGS);
- extern Datum oidgt(PG_FUNCTION_ARGS);
- extern Datum oidlarger(PG_FUNCTION_ARGS);
- extern Datum oidsmaller(PG_FUNCTION_ARGS);
- extern Datum oidvectorin(PG_FUNCTION_ARGS);
- extern Datum oidvectorout(PG_FUNCTION_ARGS);
- extern Datum oidvectorrecv(PG_FUNCTION_ARGS);
- extern Datum oidvectorsend(PG_FUNCTION_ARGS);
- extern Datum oidvectoreq(PG_FUNCTION_ARGS);
- extern Datum oidvectorne(PG_FUNCTION_ARGS);
- extern Datum oidvectorlt(PG_FUNCTION_ARGS);
- extern Datum oidvectorle(PG_FUNCTION_ARGS);
- extern Datum oidvectorge(PG_FUNCTION_ARGS);
- extern Datum oidvectorgt(PG_FUNCTION_ARGS);
extern oidvector *buildoidvector(const Oid *oids, int n);
extern Oid oidparse(Node *node);
-
- /* orderedsetaggs.c */
- extern Datum ordered_set_transition(PG_FUNCTION_ARGS);
- extern Datum ordered_set_transition_multi(PG_FUNCTION_ARGS);
- extern Datum percentile_disc_final(PG_FUNCTION_ARGS);
- extern Datum percentile_cont_float8_final(PG_FUNCTION_ARGS);
- extern Datum percentile_cont_interval_final(PG_FUNCTION_ARGS);
- extern Datum percentile_disc_multi_final(PG_FUNCTION_ARGS);
- extern Datum percentile_cont_float8_multi_final(PG_FUNCTION_ARGS);
- extern Datum percentile_cont_interval_multi_final(PG_FUNCTION_ARGS);
- extern Datum mode_final(PG_FUNCTION_ARGS);
- extern Datum hypothetical_rank_final(PG_FUNCTION_ARGS);
- extern Datum hypothetical_percent_rank_final(PG_FUNCTION_ARGS);
- extern Datum hypothetical_cume_dist_final(PG_FUNCTION_ARGS);
- extern Datum hypothetical_dense_rank_final(PG_FUNCTION_ARGS);
-
- /* pseudotypes.c */
- extern Datum cstring_in(PG_FUNCTION_ARGS);
- extern Datum cstring_out(PG_FUNCTION_ARGS);
- extern Datum cstring_recv(PG_FUNCTION_ARGS);
- extern Datum cstring_send(PG_FUNCTION_ARGS);
- extern Datum any_in(PG_FUNCTION_ARGS);
- extern Datum any_out(PG_FUNCTION_ARGS);
- extern Datum anyarray_in(PG_FUNCTION_ARGS);
- extern Datum anyarray_out(PG_FUNCTION_ARGS);
- extern Datum anyarray_recv(PG_FUNCTION_ARGS);
- extern Datum anyarray_send(PG_FUNCTION_ARGS);
- extern Datum anynonarray_in(PG_FUNCTION_ARGS);
- extern Datum anynonarray_out(PG_FUNCTION_ARGS);
- extern Datum anyenum_in(PG_FUNCTION_ARGS);
- extern Datum anyenum_out(PG_FUNCTION_ARGS);
- extern Datum anyrange_in(PG_FUNCTION_ARGS);
- extern Datum anyrange_out(PG_FUNCTION_ARGS);
- extern Datum void_in(PG_FUNCTION_ARGS);
- extern Datum void_out(PG_FUNCTION_ARGS);
- extern Datum void_recv(PG_FUNCTION_ARGS);
- extern Datum void_send(PG_FUNCTION_ARGS);
+#ifdef PGXC
+extern Datum pgxc_node_str (PG_FUNCTION_ARGS);
+extern Datum pgxc_lock_for_backup (PG_FUNCTION_ARGS);
+#endif
- extern Datum trigger_in(PG_FUNCTION_ARGS);
- extern Datum trigger_out(PG_FUNCTION_ARGS);
- extern Datum event_trigger_in(PG_FUNCTION_ARGS);
- extern Datum event_trigger_out(PG_FUNCTION_ARGS);
- extern Datum language_handler_in(PG_FUNCTION_ARGS);
- extern Datum language_handler_out(PG_FUNCTION_ARGS);
- extern Datum fdw_handler_in(PG_FUNCTION_ARGS);
- extern Datum fdw_handler_out(PG_FUNCTION_ARGS);
- extern Datum index_am_handler_in(PG_FUNCTION_ARGS);
- extern Datum index_am_handler_out(PG_FUNCTION_ARGS);
- extern Datum tsm_handler_in(PG_FUNCTION_ARGS);
- extern Datum tsm_handler_out(PG_FUNCTION_ARGS);
- extern Datum internal_in(PG_FUNCTION_ARGS);
- extern Datum internal_out(PG_FUNCTION_ARGS);
- extern Datum opaque_in(PG_FUNCTION_ARGS);
- extern Datum opaque_out(PG_FUNCTION_ARGS);
- extern Datum anyelement_in(PG_FUNCTION_ARGS);
- extern Datum anyelement_out(PG_FUNCTION_ARGS);
- extern Datum shell_in(PG_FUNCTION_ARGS);
- extern Datum shell_out(PG_FUNCTION_ARGS);
- extern Datum pg_node_tree_in(PG_FUNCTION_ARGS);
- extern Datum pg_node_tree_out(PG_FUNCTION_ARGS);
- extern Datum pg_node_tree_recv(PG_FUNCTION_ARGS);
- extern Datum pg_node_tree_send(PG_FUNCTION_ARGS);
- extern Datum pg_ddl_command_in(PG_FUNCTION_ARGS);
- extern Datum pg_ddl_command_out(PG_FUNCTION_ARGS);
- extern Datum pg_ddl_command_recv(PG_FUNCTION_ARGS);
- extern Datum pg_ddl_command_send(PG_FUNCTION_ARGS);
+ extern int oid_cmp(const void *p1, const void *p2);
/* regexp.c */
- extern Datum nameregexeq(PG_FUNCTION_ARGS);
- extern Datum nameregexne(PG_FUNCTION_ARGS);
- extern Datum textregexeq(PG_FUNCTION_ARGS);
- extern Datum textregexne(PG_FUNCTION_ARGS);
- extern Datum nameicregexeq(PG_FUNCTION_ARGS);
- extern Datum nameicregexne(PG_FUNCTION_ARGS);
- extern Datum texticregexeq(PG_FUNCTION_ARGS);
- extern Datum texticregexne(PG_FUNCTION_ARGS);
- extern Datum textregexsubstr(PG_FUNCTION_ARGS);
- extern Datum textregexreplace_noopt(PG_FUNCTION_ARGS);
- extern Datum textregexreplace(PG_FUNCTION_ARGS);
- extern Datum similar_escape(PG_FUNCTION_ARGS);
- extern Datum regexp_matches(PG_FUNCTION_ARGS);
- extern Datum regexp_matches_no_flags(PG_FUNCTION_ARGS);
- extern Datum regexp_split_to_table(PG_FUNCTION_ARGS);
- extern Datum regexp_split_to_table_no_flags(PG_FUNCTION_ARGS);
- extern Datum regexp_split_to_array(PG_FUNCTION_ARGS);
- extern Datum regexp_split_to_array_no_flags(PG_FUNCTION_ARGS);
extern char *regexp_fixed_prefix(text *text_re, bool case_insensitive,
Oid collation, bool *exact);
- /* regproc.c */
- extern Datum regprocin(PG_FUNCTION_ARGS);
- extern Datum regprocout(PG_FUNCTION_ARGS);
- extern Datum to_regproc(PG_FUNCTION_ARGS);
- extern Datum to_regprocedure(PG_FUNCTION_ARGS);
- extern Datum regprocrecv(PG_FUNCTION_ARGS);
- extern Datum regprocsend(PG_FUNCTION_ARGS);
- extern Datum regprocedurein(PG_FUNCTION_ARGS);
- extern Datum regprocedureout(PG_FUNCTION_ARGS);
- extern Datum regprocedurerecv(PG_FUNCTION_ARGS);
- extern Datum regproceduresend(PG_FUNCTION_ARGS);
- extern Datum regoperin(PG_FUNCTION_ARGS);
- extern Datum regoperout(PG_FUNCTION_ARGS);
- extern Datum regoperrecv(PG_FUNCTION_ARGS);
- extern Datum regopersend(PG_FUNCTION_ARGS);
- extern Datum to_regoper(PG_FUNCTION_ARGS);
- extern Datum to_regoperator(PG_FUNCTION_ARGS);
- extern Datum regoperatorin(PG_FUNCTION_ARGS);
- extern Datum regoperatorout(PG_FUNCTION_ARGS);
- extern Datum regoperatorrecv(PG_FUNCTION_ARGS);
- extern Datum regoperatorsend(PG_FUNCTION_ARGS);
- extern Datum regclassin(PG_FUNCTION_ARGS);
- extern Datum regclassout(PG_FUNCTION_ARGS);
- extern Datum regclassrecv(PG_FUNCTION_ARGS);
- extern Datum regclasssend(PG_FUNCTION_ARGS);
- extern Datum to_regclass(PG_FUNCTION_ARGS);
- extern Datum regtypein(PG_FUNCTION_ARGS);
- extern Datum regtypeout(PG_FUNCTION_ARGS);
- extern Datum regtyperecv(PG_FUNCTION_ARGS);
- extern Datum regtypesend(PG_FUNCTION_ARGS);
- extern Datum to_regtype(PG_FUNCTION_ARGS);
- extern Datum regrolein(PG_FUNCTION_ARGS);
- extern Datum regroleout(PG_FUNCTION_ARGS);
- extern Datum regrolerecv(PG_FUNCTION_ARGS);
- extern Datum regrolesend(PG_FUNCTION_ARGS);
- extern Datum to_regrole(PG_FUNCTION_ARGS);
- extern Datum regnamespacein(PG_FUNCTION_ARGS);
- extern Datum regnamespaceout(PG_FUNCTION_ARGS);
- extern Datum regnamespacerecv(PG_FUNCTION_ARGS);
- extern Datum regnamespacesend(PG_FUNCTION_ARGS);
- extern Datum to_regnamespace(PG_FUNCTION_ARGS);
- extern Datum regconfigin(PG_FUNCTION_ARGS);
- extern Datum regconfigout(PG_FUNCTION_ARGS);
- extern Datum regconfigrecv(PG_FUNCTION_ARGS);
- extern Datum regconfigsend(PG_FUNCTION_ARGS);
- extern Datum regdictionaryin(PG_FUNCTION_ARGS);
- extern Datum regdictionaryout(PG_FUNCTION_ARGS);
- extern Datum regdictionaryrecv(PG_FUNCTION_ARGS);
- extern Datum regdictionarysend(PG_FUNCTION_ARGS);
- extern Datum text_regclass(PG_FUNCTION_ARGS);
- extern List *stringToQualifiedNameList(const char *string);
- extern char *format_procedure(Oid procedure_oid);
- extern char *format_procedure_qualified(Oid procedure_oid);
- extern void format_procedure_parts(Oid operator_oid, List **objnames,
- List **objargs);
- extern char *format_operator(Oid operator_oid);
- extern char *format_operator_qualified(Oid operator_oid);
- extern void format_operator_parts(Oid operator_oid, List **objnames,
- List **objargs);
-
- /* rowtypes.c */
- extern Datum record_in(PG_FUNCTION_ARGS);
- extern Datum record_out(PG_FUNCTION_ARGS);
- extern Datum record_recv(PG_FUNCTION_ARGS);
- extern Datum record_send(PG_FUNCTION_ARGS);
- extern Datum record_eq(PG_FUNCTION_ARGS);
- extern Datum record_ne(PG_FUNCTION_ARGS);
- extern Datum record_lt(PG_FUNCTION_ARGS);
- extern Datum record_gt(PG_FUNCTION_ARGS);
- extern Datum record_le(PG_FUNCTION_ARGS);
- extern Datum record_ge(PG_FUNCTION_ARGS);
- extern Datum btrecordcmp(PG_FUNCTION_ARGS);
- extern Datum record_image_eq(PG_FUNCTION_ARGS);
- extern Datum record_image_ne(PG_FUNCTION_ARGS);
- extern Datum record_image_lt(PG_FUNCTION_ARGS);
- extern Datum record_image_gt(PG_FUNCTION_ARGS);
- extern Datum record_image_le(PG_FUNCTION_ARGS);
- extern Datum record_image_ge(PG_FUNCTION_ARGS);
- extern Datum btrecordimagecmp(PG_FUNCTION_ARGS);
-
/* ruleutils.c */
extern bool quote_all_identifiers;
- extern Datum pg_get_ruledef(PG_FUNCTION_ARGS);
- extern Datum pg_get_ruledef_ext(PG_FUNCTION_ARGS);
- extern Datum pg_get_viewdef(PG_FUNCTION_ARGS);
- extern Datum pg_get_viewdef_ext(PG_FUNCTION_ARGS);
- extern Datum pg_get_viewdef_wrap(PG_FUNCTION_ARGS);
- extern Datum pg_get_viewdef_name(PG_FUNCTION_ARGS);
- extern Datum pg_get_viewdef_name_ext(PG_FUNCTION_ARGS);
- extern Datum pg_get_indexdef(PG_FUNCTION_ARGS);
- extern Datum pg_get_indexdef_ext(PG_FUNCTION_ARGS);
- extern Datum pg_get_triggerdef(PG_FUNCTION_ARGS);
- extern Datum pg_get_triggerdef_ext(PG_FUNCTION_ARGS);
- extern Datum pg_get_constraintdef(PG_FUNCTION_ARGS);
- extern Datum pg_get_constraintdef_ext(PG_FUNCTION_ARGS);
- extern Datum pg_get_expr(PG_FUNCTION_ARGS);
- extern Datum pg_get_expr_ext(PG_FUNCTION_ARGS);
- extern Datum pg_get_userbyid(PG_FUNCTION_ARGS);
- extern Datum pg_get_serial_sequence(PG_FUNCTION_ARGS);
- extern Datum pg_get_functiondef(PG_FUNCTION_ARGS);
- extern Datum pg_get_function_arguments(PG_FUNCTION_ARGS);
- extern Datum pg_get_function_identity_arguments(PG_FUNCTION_ARGS);
- extern Datum pg_get_function_result(PG_FUNCTION_ARGS);
- extern Datum pg_get_function_arg_default(PG_FUNCTION_ARGS);
+#ifdef PGXC
+extern void get_query_def_from_valuesList(Query *query, StringInfo buf);
+extern void deparse_query(Query *query, StringInfo buf, List *parentnamespace,
+ bool finalise_aggs, bool sortgroup_colno);
+#endif
+#ifdef PGXC
+extern List *deparse_context_for_plan(Node *plan, List *ancestors,
+ List *rtable);
+#endif
extern const char *quote_identifier(const char *ident);
extern char *quote_qualified_identifier(const char *qualifier,
const char *ident);
extern int32 type_maximum_size(Oid type_oid, int32 typemod);
/* quote.c */
- extern Datum quote_ident(PG_FUNCTION_ARGS);
- extern Datum quote_literal(PG_FUNCTION_ARGS);
extern char *quote_literal_cstr(const char *rawstr);
- extern Datum quote_nullable(PG_FUNCTION_ARGS);
-
- /* guc.c */
- extern Datum show_config_by_name(PG_FUNCTION_ARGS);
- extern Datum show_config_by_name_missing_ok(PG_FUNCTION_ARGS);
- extern Datum set_config_by_name(PG_FUNCTION_ARGS);
- extern Datum show_all_settings(PG_FUNCTION_ARGS);
- extern Datum show_all_file_settings(PG_FUNCTION_ARGS);
-
- /* pg_config.c */
- extern Datum pg_config(PG_FUNCTION_ARGS);
-
- /* pg_controldata.c */
- extern Datum pg_control_checkpoint(PG_FUNCTION_ARGS);
- extern Datum pg_control_system(PG_FUNCTION_ARGS);
- extern Datum pg_control_init(PG_FUNCTION_ARGS);
- extern Datum pg_control_recovery(PG_FUNCTION_ARGS);
-
- /* rls.c */
- extern Datum row_security_active(PG_FUNCTION_ARGS);
- extern Datum row_security_active_name(PG_FUNCTION_ARGS);
-
- /* lockfuncs.c */
- extern Datum pg_lock_status(PG_FUNCTION_ARGS);
- extern Datum pg_blocking_pids(PG_FUNCTION_ARGS);
- extern Datum pg_advisory_lock_int8(PG_FUNCTION_ARGS);
- extern Datum pg_advisory_xact_lock_int8(PG_FUNCTION_ARGS);
- extern Datum pg_advisory_lock_shared_int8(PG_FUNCTION_ARGS);
- extern Datum pg_advisory_xact_lock_shared_int8(PG_FUNCTION_ARGS);
- extern Datum pg_try_advisory_lock_int8(PG_FUNCTION_ARGS);
- extern Datum pg_try_advisory_xact_lock_int8(PG_FUNCTION_ARGS);
- extern Datum pg_try_advisory_lock_shared_int8(PG_FUNCTION_ARGS);
- extern Datum pg_try_advisory_xact_lock_shared_int8(PG_FUNCTION_ARGS);
- extern Datum pg_advisory_unlock_int8(PG_FUNCTION_ARGS);
- extern Datum pg_advisory_unlock_shared_int8(PG_FUNCTION_ARGS);
- extern Datum pg_advisory_lock_int4(PG_FUNCTION_ARGS);
- extern Datum pg_advisory_xact_lock_int4(PG_FUNCTION_ARGS);
- extern Datum pg_advisory_lock_shared_int4(PG_FUNCTION_ARGS);
- extern Datum pg_advisory_xact_lock_shared_int4(PG_FUNCTION_ARGS);
- extern Datum pg_try_advisory_lock_int4(PG_FUNCTION_ARGS);
- extern Datum pg_try_advisory_xact_lock_int4(PG_FUNCTION_ARGS);
- extern Datum pg_try_advisory_lock_shared_int4(PG_FUNCTION_ARGS);
- extern Datum pg_try_advisory_xact_lock_shared_int4(PG_FUNCTION_ARGS);
- extern Datum pg_advisory_unlock_int4(PG_FUNCTION_ARGS);
- extern Datum pg_advisory_unlock_shared_int4(PG_FUNCTION_ARGS);
- extern Datum pg_advisory_unlock_all(PG_FUNCTION_ARGS);
-
- /* txid.c */
- extern Datum txid_snapshot_in(PG_FUNCTION_ARGS);
- extern Datum txid_snapshot_out(PG_FUNCTION_ARGS);
- extern Datum txid_snapshot_recv(PG_FUNCTION_ARGS);
- extern Datum txid_snapshot_send(PG_FUNCTION_ARGS);
- extern Datum txid_current(PG_FUNCTION_ARGS);
- extern Datum txid_current_snapshot(PG_FUNCTION_ARGS);
- extern Datum txid_snapshot_xmin(PG_FUNCTION_ARGS);
- extern Datum txid_snapshot_xmax(PG_FUNCTION_ARGS);
- extern Datum txid_snapshot_xip(PG_FUNCTION_ARGS);
- extern Datum txid_visible_in_snapshot(PG_FUNCTION_ARGS);
-
- /* uuid.c */
- extern Datum uuid_in(PG_FUNCTION_ARGS);
- extern Datum uuid_out(PG_FUNCTION_ARGS);
- extern Datum uuid_send(PG_FUNCTION_ARGS);
- extern Datum uuid_recv(PG_FUNCTION_ARGS);
- extern Datum uuid_lt(PG_FUNCTION_ARGS);
- extern Datum uuid_le(PG_FUNCTION_ARGS);
- extern Datum uuid_eq(PG_FUNCTION_ARGS);
- extern Datum uuid_ge(PG_FUNCTION_ARGS);
- extern Datum uuid_gt(PG_FUNCTION_ARGS);
- extern Datum uuid_ne(PG_FUNCTION_ARGS);
- extern Datum uuid_cmp(PG_FUNCTION_ARGS);
- extern Datum uuid_sortsupport(PG_FUNCTION_ARGS);
- extern Datum uuid_hash(PG_FUNCTION_ARGS);
-
- /* windowfuncs.c */
- extern Datum window_row_number(PG_FUNCTION_ARGS);
- extern Datum window_rank(PG_FUNCTION_ARGS);
- extern Datum window_dense_rank(PG_FUNCTION_ARGS);
- extern Datum window_percent_rank(PG_FUNCTION_ARGS);
- extern Datum window_cume_dist(PG_FUNCTION_ARGS);
- extern Datum window_ntile(PG_FUNCTION_ARGS);
- extern Datum window_lag(PG_FUNCTION_ARGS);
- extern Datum window_lag_with_offset(PG_FUNCTION_ARGS);
- extern Datum window_lag_with_offset_and_default(PG_FUNCTION_ARGS);
- extern Datum window_lead(PG_FUNCTION_ARGS);
- extern Datum window_lead_with_offset(PG_FUNCTION_ARGS);
- extern Datum window_lead_with_offset_and_default(PG_FUNCTION_ARGS);
- extern Datum window_first_value(PG_FUNCTION_ARGS);
- extern Datum window_last_value(PG_FUNCTION_ARGS);
- extern Datum window_nth_value(PG_FUNCTION_ARGS);
-
- /* access/spgist/spgquadtreeproc.c */
- extern Datum spg_quad_config(PG_FUNCTION_ARGS);
- extern Datum spg_quad_choose(PG_FUNCTION_ARGS);
- extern Datum spg_quad_picksplit(PG_FUNCTION_ARGS);
- extern Datum spg_quad_inner_consistent(PG_FUNCTION_ARGS);
- extern Datum spg_quad_leaf_consistent(PG_FUNCTION_ARGS);
-
- /* access/spgist/spgkdtreeproc.c */
- extern Datum spg_kd_config(PG_FUNCTION_ARGS);
- extern Datum spg_kd_choose(PG_FUNCTION_ARGS);
- extern Datum spg_kd_picksplit(PG_FUNCTION_ARGS);
- extern Datum spg_kd_inner_consistent(PG_FUNCTION_ARGS);
-
- /* access/spgist/spgtextproc.c */
- extern Datum spg_text_config(PG_FUNCTION_ARGS);
- extern Datum spg_text_choose(PG_FUNCTION_ARGS);
- extern Datum spg_text_picksplit(PG_FUNCTION_ARGS);
- extern Datum spg_text_inner_consistent(PG_FUNCTION_ARGS);
- extern Datum spg_text_leaf_consistent(PG_FUNCTION_ARGS);
-
- /* access/gin/ginarrayproc.c */
- extern Datum ginarrayextract(PG_FUNCTION_ARGS);
- extern Datum ginarrayextract_2args(PG_FUNCTION_ARGS);
- extern Datum ginqueryarrayextract(PG_FUNCTION_ARGS);
- extern Datum ginarrayconsistent(PG_FUNCTION_ARGS);
- extern Datum ginarraytriconsistent(PG_FUNCTION_ARGS);
-
- /* access/tablesample/bernoulli.c */
- extern Datum tsm_bernoulli_handler(PG_FUNCTION_ARGS);
-
- /* access/tablesample/system.c */
- extern Datum tsm_system_handler(PG_FUNCTION_ARGS);
-
- /* access/transam/twophase.c */
- extern Datum pg_prepared_xact(PG_FUNCTION_ARGS);
-
- /* access/transam/multixact.c */
- extern Datum pg_get_multixact_members(PG_FUNCTION_ARGS);
-
- /* access/transam/committs.c */
- extern Datum pg_xact_commit_timestamp(PG_FUNCTION_ARGS);
- extern Datum pg_last_committed_xact(PG_FUNCTION_ARGS);
-
- /* catalogs/dependency.c */
- extern Datum pg_describe_object(PG_FUNCTION_ARGS);
- extern Datum pg_identify_object(PG_FUNCTION_ARGS);
- extern Datum pg_identify_object_as_address(PG_FUNCTION_ARGS);
-
- /* catalog/objectaddress.c */
- extern Datum pg_get_object_address(PG_FUNCTION_ARGS);
-
- /* commands/constraint.c */
- extern Datum unique_key_recheck(PG_FUNCTION_ARGS);
-
- /* commands/event_trigger.c */
- extern Datum pg_event_trigger_dropped_objects(PG_FUNCTION_ARGS);
- extern Datum pg_event_trigger_table_rewrite_oid(PG_FUNCTION_ARGS);
- extern Datum pg_event_trigger_table_rewrite_reason(PG_FUNCTION_ARGS);
- extern Datum pg_event_trigger_ddl_commands(PG_FUNCTION_ARGS);
-
- /* commands/extension.c */
- extern Datum pg_available_extensions(PG_FUNCTION_ARGS);
- extern Datum pg_available_extension_versions(PG_FUNCTION_ARGS);
- extern Datum pg_extension_update_paths(PG_FUNCTION_ARGS);
- extern Datum pg_extension_config_dump(PG_FUNCTION_ARGS);
-
- /* commands/prepare.c */
- extern Datum pg_prepared_statement(PG_FUNCTION_ARGS);
-
- /* utils/mmgr/portalmem.c */
- extern Datum pg_cursor(PG_FUNCTION_ARGS);
+#ifdef PGXC
+/* backend/pgxc/pool/poolutils.c */
+extern Datum pgxc_pool_check(PG_FUNCTION_ARGS);
+extern Datum pgxc_pool_reload(PG_FUNCTION_ARGS);
+
+/* backend/access/transam/transam.c */
+extern Datum pgxc_is_committed(PG_FUNCTION_ARGS);
+extern Datum pgxc_is_inprogress(PG_FUNCTION_ARGS);
+#endif
+extern Datum pg_msgmodule_set(PG_FUNCTION_ARGS);
+extern Datum pg_msgmodule_change(PG_FUNCTION_ARGS);
+extern Datum pg_msgmodule_enable(PG_FUNCTION_ARGS);
+extern Datum pg_msgmodule_disable(PG_FUNCTION_ARGS);
+extern Datum pg_msgmodule_enable_all(PG_FUNCTION_ARGS);
+extern Datum pg_msgmodule_disable_all(PG_FUNCTION_ARGS);
#endif /* BUILTINS_H */
* External declarations pertaining to backend/utils/misc/guc.c and
* backend/utils/misc/guc-file.l
*
- * Copyright (c) 2000-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Copyright (c) 2000-2017, PostgreSQL Global Development Group
*
* src/include/utils/guc.h
extern bool log_statement_stats;
extern bool log_btree_build_stats;
+#ifdef XCP
+extern bool log_gtm_stats;
+extern bool log_remotesubplan_stats;
+#endif
+
extern PGDLLIMPORT bool check_function_bodies;
extern bool default_with_oids;
- extern bool SQL_inheritance;
extern int log_min_error_statement;
extern int log_min_messages;
* lsyscache.h
* Convenience routines for common queries in the system catalog cache.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/utils/lsyscache.h
extern bool type_is_collatable(Oid typid);
extern Oid getBaseType(Oid typid);
extern Oid getBaseTypeAndTypmod(Oid typid, int32 *typmod);
+#ifdef PGXC
+extern char *get_typename(Oid typid);
+extern char *get_pgxc_nodename(Oid nodeoid);
+extern Oid get_pgxc_nodeoid(const char *nodename);
+extern uint32 get_pgxc_node_id(Oid nodeid);
+extern char get_pgxc_nodetype(Oid nodeid);
+extern int get_pgxc_nodeport(Oid nodeid);
+extern char *get_pgxc_nodehost(Oid nodeid);
+extern bool is_pgxc_nodepreferred(Oid nodeid);
+extern bool is_pgxc_nodeprimary(Oid nodeid);
+extern Oid get_pgxc_groupoid(const char *groupname);
+extern int get_pgxc_groupmembers(Oid groupid, Oid **members);
+extern int get_pgxc_classnodes(Oid tableid, Oid **nodes);
+extern char *get_pgxc_groupname(Oid groupid);
+#endif
extern int32 get_typavgwidth(Oid typid, int32 typmod);
extern int32 get_attavgwidth(Oid relid, AttrNumber attnum);
- extern bool get_attstatsslot(HeapTuple statstuple,
- Oid atttype, int32 atttypmod,
- int reqkind, Oid reqop,
- Oid *actualop,
- Datum **values, int *nvalues,
- float4 **numbers, int *nnumbers);
- extern void free_attstatsslot(Oid atttype,
- Datum *values, int nvalues,
- float4 *numbers, int nnumbers);
+ extern bool get_attstatsslot(AttStatsSlot *sslot, HeapTuple statstuple,
+ int reqkind, Oid reqop, int flags);
+ extern void free_attstatsslot(AttStatsSlot *sslot);
extern char *get_namespace_name(Oid nspid);
+#ifdef XCP
+extern Oid get_namespaceid(const char *nspname);
+extern char *get_typ_name(Oid typid);
+extern Oid get_typ_namespace(Oid typid);
+extern Oid get_typname_typid(const char *typname, Oid typnamespace);
+extern Oid get_funcid(const char *funcname, oidvector *argtypes, Oid funcnsp);
+extern Oid get_opnamespace(Oid opno);
+extern Oid get_operid(const char *oprname, Oid oprleft, Oid oprright, Oid oprnsp);
+#endif
extern char *get_namespace_name_or_temp(Oid nspid);
extern Oid get_range_subtype(Oid rangeOid);
*
* See plancache.c for comments.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/utils/plancache.h
extern void InitPlanCache(void);
extern void ResetPlanCache(void);
- extern CachedPlanSource *CreateCachedPlan(Node *raw_parse_tree,
+ extern CachedPlanSource *CreateCachedPlan(struct RawStmt *raw_parse_tree,
const char *query_string,
+#ifdef PGXC
+ const char *stmt_name,
+#endif
const char *commandTag);
- extern CachedPlanSource *CreateOneShotCachedPlan(Node *raw_parse_tree,
+ extern CachedPlanSource *CreateOneShotCachedPlan(struct RawStmt *raw_parse_tree,
const char *query_string,
const char *commandTag);
extern void CompleteCachedPlan(CachedPlanSource *plansource,
extern CachedPlan *GetCachedPlan(CachedPlanSource *plansource,
ParamListInfo boundParams,
- bool useResOwner);
+ bool useResOwner,
+ QueryEnvironment *queryEnv);
extern void ReleaseCachedPlan(CachedPlan *plan, bool useResOwner);
+#ifdef XCP
+extern void SetRemoteSubplan(CachedPlanSource *plansource,
+ const char *plan_string);
+#endif
#endif /* PLANCACHE_H */
* to look like NO SCROLL cursors.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/utils/portal.h
const char *commandTag,
List *stmts,
CachedPlan *cplan);
- extern Node *PortalListGetPrimaryStmt(List *stmts);
+ extern PlannedStmt *PortalGetPrimaryStmt(Portal portal);
extern void PortalCreateHoldStore(Portal portal);
extern void PortalHashTableDeleteAll(void);
+#ifdef XCP
+extern void PortalCreateProducerStore(Portal portal);
+extern List *getProducingPortals(void);
+extern void addProducingPortal(Portal portal);
+extern void removeProducingPortal(Portal portal);
+extern bool portalIsProducing(Portal portal);
+#endif
extern bool ThereAreNoReadyPortals(void);
#endif /* PORTAL_H */
* POSTGRES relation descriptor (a/k/a relcache entry) definitions.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
* src/include/utils/rel.h
*
#include "access/xlog.h"
#include "catalog/pg_class.h"
#include "catalog/pg_index.h"
+ #include "catalog/pg_publication.h"
#include "fmgr.h"
#include "nodes/bitmapset.h"
+#include "pgxc/locator.h"
#include "rewrite/prs2lock.h"
#include "storage/block.h"
#include "storage/relfilenode.h"
* snapshot.h
* POSTGRES snapshot definition
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
* src/include/utils/snapshot.h
*
* See also lsyscache.h, which provides convenience routines for
* common cache-lookup operations.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
* src/include/utils/syscache.h
*
OPEROID,
OPFAMILYAMNAMENSP,
OPFAMILYOID,
+#ifdef PGXC
+ PGXCCLASSRELID,
+ PGXCGROUPNAME,
+ PGXCGROUPOID,
+ PGXCNODENAME,
+ PGXCNODEOID,
+ PGXCNODEIDENTIFIER,
+#endif
+ PARTRELID,
PROCNAMEARGSNSP,
PROCOID,
RANGETYPE,
#define INTERVAL_PRECISION(t) ((t) & INTERVAL_PRECISION_MASK)
#define INTERVAL_RANGE(t) (((t) >> 16) & INTERVAL_RANGE_MASK)
- #ifdef HAVE_INT64_TIMESTAMP
#define TimestampTzPlusMilliseconds(tz,ms) ((tz) + ((ms) * (int64) 1000))
- #else
- #define TimestampTzPlusMilliseconds(tz,ms) ((tz) + ((ms) / 1000.0))
- #endif
+#ifdef PGXC
+#define InvalidGlobalTimestamp ((TimestampTz) 0)
+#define GlobalTimestampIsValid(timestamp) (((TimestampTz) (timestamp)) != InvalidGlobalTimestamp)
+#endif
/* Set at postmaster start */
extern TimestampTz PgStartTime;
* amounts are sorted using temporary files and a standard external sort
* algorithm.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/utils/tuplesort.h
* Also, we have changed the API to return tuples in TupleTableSlots,
* so that there is a check to prevent attempted access to system columns.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/utils/tuplestore.h
#include "utils/rel.h"
#include "utils/snapmgr.h"
#include "utils/typcache.h"
+#ifdef XCP
+#include "pgxc/pgxc.h"
+#endif
+ #include "plpgsql.h"
+
typedef struct
{
Sort Key: (generate_series(1, 3)) DESC
InitPlan 1 (returns $0)
-> Limit
- -> Index Only Scan Backward using tenk1_unique2 on tenk1
- Index Cond: (unique2 IS NOT NULL)
+ -> Remote Subquery Scan on all
+ -> Limit
+ -> Index Only Scan Backward using tenk1_unique2 on tenk1
+ Index Cond: (unique2 IS NOT NULL)
- -> Result
- (9 rows)
+ -> ProjectSet
+ -> Result
-(8 rows)
+(10 rows)
select max(unique2), generate_series(1,3) as g from tenk1 order by g desc;
max | g
---
--- Cleanup resources
---
+ set client_min_messages to warning; -- suppress cascade notices
DROP FOREIGN DATA WRAPPER alt_fdw2 CASCADE;
+ERROR: foreign-data wrapper "alt_fdw2" does not exist
DROP FOREIGN DATA WRAPPER alt_fdw3 CASCADE;
+ERROR: foreign-data wrapper "alt_fdw3" does not exist
DROP LANGUAGE alt_lang2 CASCADE;
DROP LANGUAGE alt_lang3 CASCADE;
- DROP LANGUAGE alt_lang4 CASCADE;
- ERROR: language "alt_lang4" does not exist
DROP SCHEMA alt_nsp1 CASCADE;
- NOTICE: drop cascades to 26 other objects
- DETAIL: drop cascades to function alt_func3(integer)
- drop cascades to function alt_agg3(integer)
- drop cascades to function alt_func4(integer)
- drop cascades to function alt_func2(integer)
- drop cascades to function alt_agg4(integer)
- drop cascades to function alt_agg2(integer)
- drop cascades to conversion alt_conv3
- drop cascades to conversion alt_conv4
- drop cascades to conversion alt_conv2
- drop cascades to operator @+@(integer,integer)
- drop cascades to operator @-@(integer,integer)
- drop cascades to operator family alt_opf3 for access method hash
- drop cascades to operator family alt_opc1 for access method hash
- drop cascades to operator family alt_opc2 for access method hash
- drop cascades to operator family alt_opf4 for access method hash
- drop cascades to operator family alt_opf2 for access method hash
- drop cascades to text search dictionary alt_ts_dict3
- drop cascades to text search dictionary alt_ts_dict4
- drop cascades to text search dictionary alt_ts_dict2
- drop cascades to text search configuration alt_ts_conf3
- drop cascades to text search configuration alt_ts_conf4
- drop cascades to text search configuration alt_ts_conf2
- drop cascades to text search template alt_ts_temp3
- drop cascades to text search template alt_ts_temp2
- drop cascades to text search parser alt_ts_prs3
- drop cascades to text search parser alt_ts_prs2
DROP SCHEMA alt_nsp2 CASCADE;
- NOTICE: drop cascades to 9 other objects
- DETAIL: drop cascades to function alt_nsp2.alt_func2(integer)
- drop cascades to function alt_nsp2.alt_agg2(integer)
- drop cascades to conversion alt_conv2
- drop cascades to operator alt_nsp2.@-@(integer,integer)
- drop cascades to operator family alt_nsp2.alt_opf2 for access method hash
- drop cascades to text search dictionary alt_ts_dict2
- drop cascades to text search configuration alt_ts_conf2
- drop cascades to text search template alt_ts_temp2
- drop cascades to text search parser alt_ts_prs2
DROP USER regress_alter_user1;
DROP USER regress_alter_user2;
DROP USER regress_alter_user3;
ALTER TABLE tmp3 ADD CONSTRAINT IDENTITY check (b = boo(b)) NOT VALID;
NOTICE: merging constraint "identity" with inherited definition
ALTER TABLE tmp3 VALIDATE CONSTRAINT identity;
-NOTICE: boo: 16
-NOTICE: boo: 20
+ -- A NO INHERIT constraint should not be looked for in children during VALIDATE CONSTRAINT
+ create table parent_noinh_convalid (a int);
+ create table child_noinh_convalid () inherits (parent_noinh_convalid);
+ insert into parent_noinh_convalid values (1);
+ insert into child_noinh_convalid values (1);
+ alter table parent_noinh_convalid add constraint check_a_is_2 check (a = 2) no inherit not valid;
+ -- fail, because of the row in parent
+ alter table parent_noinh_convalid validate constraint check_a_is_2;
+ ERROR: check constraint "check_a_is_2" is violated by some row
+ delete from only parent_noinh_convalid;
+ -- ok (parent itself contains no violating rows)
+ alter table parent_noinh_convalid validate constraint check_a_is_2;
+ select convalidated from pg_constraint where conrelid = 'parent_noinh_convalid'::regclass and conname = 'check_a_is_2';
+ convalidated
+ --------------
+ t
+ (1 row)
+
+ -- cleanup
+ drop table parent_noinh_convalid, child_noinh_convalid;
-- Try (and fail) to create constraint from tmp5(a) to tmp4(a) - unique constraint on
-- tmp4 is a,b
ALTER TABLE tmp5 add constraint tmpconstr foreign key(a) references tmp4(a) match full;
-- As does this...
ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest2, ftest1)
references pktable(ptest1, ptest2);
-ERROR: foreign key constraint "fktable_ftest2_fkey" cannot be implemented
-DETAIL: Key columns "ftest2" and "ptest1" are of incompatible types: inet and integer.
+ERROR: Hash/Modulo distribution column does not refer to hash/modulo distribution column in referenced table.
+ DROP TABLE FKTABLE;
+ DROP TABLE PKTABLE;
+ -- Test that ALTER CONSTRAINT updates trigger deferrability properly
+ CREATE TEMP TABLE PKTABLE (ptest1 int primary key);
+ CREATE TEMP TABLE FKTABLE (ftest1 int);
+ ALTER TABLE FKTABLE ADD CONSTRAINT fknd FOREIGN KEY(ftest1) REFERENCES pktable
+ ON DELETE CASCADE ON UPDATE NO ACTION NOT DEFERRABLE;
+ ALTER TABLE FKTABLE ADD CONSTRAINT fkdd FOREIGN KEY(ftest1) REFERENCES pktable
+ ON DELETE CASCADE ON UPDATE NO ACTION DEFERRABLE INITIALLY DEFERRED;
+ ALTER TABLE FKTABLE ADD CONSTRAINT fkdi FOREIGN KEY(ftest1) REFERENCES pktable
+ ON DELETE CASCADE ON UPDATE NO ACTION DEFERRABLE INITIALLY IMMEDIATE;
+ ALTER TABLE FKTABLE ADD CONSTRAINT fknd2 FOREIGN KEY(ftest1) REFERENCES pktable
+ ON DELETE CASCADE ON UPDATE NO ACTION DEFERRABLE INITIALLY DEFERRED;
+ ALTER TABLE FKTABLE ALTER CONSTRAINT fknd2 NOT DEFERRABLE;
+ ALTER TABLE FKTABLE ADD CONSTRAINT fkdd2 FOREIGN KEY(ftest1) REFERENCES pktable
+ ON DELETE CASCADE ON UPDATE NO ACTION NOT DEFERRABLE;
+ ALTER TABLE FKTABLE ALTER CONSTRAINT fkdd2 DEFERRABLE INITIALLY DEFERRED;
+ ALTER TABLE FKTABLE ADD CONSTRAINT fkdi2 FOREIGN KEY(ftest1) REFERENCES pktable
+ ON DELETE CASCADE ON UPDATE NO ACTION NOT DEFERRABLE;
+ ALTER TABLE FKTABLE ALTER CONSTRAINT fkdi2 DEFERRABLE INITIALLY IMMEDIATE;
+ SELECT conname, tgfoid::regproc, tgtype, tgdeferrable, tginitdeferred
+ FROM pg_trigger JOIN pg_constraint con ON con.oid = tgconstraint
+ WHERE tgrelid = 'pktable'::regclass
+ ORDER BY 1,2,3;
+ conname | tgfoid | tgtype | tgdeferrable | tginitdeferred
+ ---------+------------------------+--------+--------------+----------------
+ fkdd | "RI_FKey_cascade_del" | 9 | f | f
+ fkdd | "RI_FKey_noaction_upd" | 17 | t | t
+ fkdd2 | "RI_FKey_cascade_del" | 9 | f | f
+ fkdd2 | "RI_FKey_noaction_upd" | 17 | t | t
+ fkdi | "RI_FKey_cascade_del" | 9 | f | f
+ fkdi | "RI_FKey_noaction_upd" | 17 | t | f
+ fkdi2 | "RI_FKey_cascade_del" | 9 | f | f
+ fkdi2 | "RI_FKey_noaction_upd" | 17 | t | f
+ fknd | "RI_FKey_cascade_del" | 9 | f | f
+ fknd | "RI_FKey_noaction_upd" | 17 | f | f
+ fknd2 | "RI_FKey_cascade_del" | 9 | f | f
+ fknd2 | "RI_FKey_noaction_upd" | 17 | f | f
+ (12 rows)
+
+ SELECT conname, tgfoid::regproc, tgtype, tgdeferrable, tginitdeferred
+ FROM pg_trigger JOIN pg_constraint con ON con.oid = tgconstraint
+ WHERE tgrelid = 'fktable'::regclass
+ ORDER BY 1,2,3;
+ conname | tgfoid | tgtype | tgdeferrable | tginitdeferred
+ ---------+---------------------+--------+--------------+----------------
+ fkdd | "RI_FKey_check_ins" | 5 | t | t
+ fkdd | "RI_FKey_check_upd" | 17 | t | t
+ fkdd2 | "RI_FKey_check_ins" | 5 | t | t
+ fkdd2 | "RI_FKey_check_upd" | 17 | t | t
+ fkdi | "RI_FKey_check_ins" | 5 | t | f
+ fkdi | "RI_FKey_check_upd" | 17 | t | f
+ fkdi2 | "RI_FKey_check_ins" | 5 | t | f
+ fkdi2 | "RI_FKey_check_upd" | 17 | t | f
+ fknd | "RI_FKey_check_ins" | 5 | f | f
+ fknd | "RI_FKey_check_upd" | 17 | f | f
+ fknd2 | "RI_FKey_check_ins" | 5 | f | f
+ fknd2 | "RI_FKey_check_upd" | 17 | f | f
+ (12 rows)
+
-- temp tables should go away by themselves, need not drop them.
-- test check constraint adding
create table atacc1 ( test int );
(1 row)
+ -- nulls later in the bitmap
+ SELECT -1 != ALL(ARRAY(SELECT NULLIF(g.i, 900) FROM generate_series(1,1000) g(i)));
+ ?column?
+ ----------
+
+ (1 row)
+
-- test indexes on arrays
-create temp table arr_tbl (f1 int[] unique);
+-- PGXCTODO: related to feature request 3520520, this distribution type is changed
+-- to replication. As integer arrays are no available distribution types, this table
+-- should use roundrobin distribution if nothing is specified but roundrobin
+-- distribution cannot be safely used to check constraints on remote nodes.
+-- When global constraints are supported, this replication distribution should be removed.
+create temp table arr_tbl (f1 int[] unique) distribute by replication;
insert into arr_tbl values ('{1,2,3}');
insert into arr_tbl values ('{1,2}');
-- failure expected:
end $$ language plpgsql immutable;
alter table check_con_tbl add check (check_con_function(check_con_tbl.*));
\d+ check_con_tbl
- Table "public.check_con_tbl"
- Column | Type | Modifiers | Storage | Stats target | Description
- --------+---------+-----------+---------+--------------+-------------
- f1 | integer | | plain | |
+ Table "public.check_con_tbl"
+ Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+---------+--------------+-------------
+ f1 | integer | | | | plain | |
Check constraints:
"check_con_tbl_check" CHECK (check_con_function(check_con_tbl.*))
+Distribute By: HASH(f1)
+Location Nodes: ALL DATANODES
copy check_con_tbl from stdin;
-NOTICE: input = {"f1":1}
-NOTICE: input = {"f1":null}
copy check_con_tbl from stdin;
-NOTICE: input = {"f1":0}
ERROR: new row for relation "check_con_tbl" violates check constraint "check_con_tbl_check"
DETAIL: Failing row contains (0).
-CONTEXT: COPY check_con_tbl, line 1: "0"
select * from check_con_tbl;
f1
----
(2 rows)
+ -- test with RLS enabled.
+ CREATE ROLE regress_rls_copy_user;
+ CREATE ROLE regress_rls_copy_user_colperms;
+ CREATE TABLE rls_t1 (a int, b int, c int);
+ COPY rls_t1 (a, b, c) from stdin;
+ CREATE POLICY p1 ON rls_t1 FOR SELECT USING (a % 2 = 0);
+ ALTER TABLE rls_t1 ENABLE ROW LEVEL SECURITY;
+ ALTER TABLE rls_t1 FORCE ROW LEVEL SECURITY;
+ GRANT SELECT ON TABLE rls_t1 TO regress_rls_copy_user;
+ GRANT SELECT (a, b) ON TABLE rls_t1 TO regress_rls_copy_user_colperms;
+ -- all columns
+ COPY rls_t1 TO stdout;
+ 1 4 1
+ 2 3 2
+ 3 2 3
+ 4 1 4
+ COPY rls_t1 (a, b, c) TO stdout;
+ 1 4 1
+ 2 3 2
+ 3 2 3
+ 4 1 4
+ -- subset of columns
+ COPY rls_t1 (a) TO stdout;
+ 1
+ 2
+ 3
+ 4
+ COPY rls_t1 (a, b) TO stdout;
+ 1 4
+ 2 3
+ 3 2
+ 4 1
+ -- column reordering
+ COPY rls_t1 (b, a) TO stdout;
+ 4 1
+ 3 2
+ 2 3
+ 1 4
+ SET SESSION AUTHORIZATION regress_rls_copy_user;
+ -- all columns
+ COPY rls_t1 TO stdout;
+ 2 3 2
+ 4 1 4
+ COPY rls_t1 (a, b, c) TO stdout;
+ 2 3 2
+ 4 1 4
+ -- subset of columns
+ COPY rls_t1 (a) TO stdout;
+ 2
+ 4
+ COPY rls_t1 (a, b) TO stdout;
+ 2 3
+ 4 1
+ -- column reordering
+ COPY rls_t1 (b, a) TO stdout;
+ 3 2
+ 1 4
+ RESET SESSION AUTHORIZATION;
+ SET SESSION AUTHORIZATION regress_rls_copy_user_colperms;
+ -- attempt all columns (should fail)
+ COPY rls_t1 TO stdout;
+ ERROR: permission denied for relation rls_t1
+ COPY rls_t1 (a, b, c) TO stdout;
+ ERROR: permission denied for relation rls_t1
+ -- try to copy column with no privileges (should fail)
+ COPY rls_t1 (c) TO stdout;
+ ERROR: permission denied for relation rls_t1
+ -- subset of columns (should succeed)
+ COPY rls_t1 (a) TO stdout;
+ 2
+ 4
+ COPY rls_t1 (a, b) TO stdout;
+ 2 3
+ 4 1
+ RESET SESSION AUTHORIZATION;
+ -- test with INSTEAD OF INSERT trigger on a view
+ CREATE TABLE instead_of_insert_tbl(id serial, name text);
+ CREATE VIEW instead_of_insert_tbl_view AS SELECT ''::text AS str;
+ COPY instead_of_insert_tbl_view FROM stdin; -- fail
+ ERROR: cannot copy to view "instead_of_insert_tbl_view"
+ HINT: To enable copying to a view, provide an INSTEAD OF INSERT trigger.
+ CREATE FUNCTION fun_instead_of_insert_tbl() RETURNS trigger AS $$
+ BEGIN
+ INSERT INTO instead_of_insert_tbl (name) VALUES (NEW.str);
+ RETURN NULL;
+ END;
+ $$ LANGUAGE plpgsql;
+ CREATE TRIGGER trig_instead_of_insert_tbl_view
+ INSTEAD OF INSERT ON instead_of_insert_tbl_view
+ FOR EACH ROW EXECUTE PROCEDURE fun_instead_of_insert_tbl();
+ COPY instead_of_insert_tbl_view FROM stdin;
+ SELECT * FROM instead_of_insert_tbl;
+ id | name
+ ----+-------
+ 1 | test1
+ (1 row)
+
+ -- clean up
DROP TABLE forcetest;
DROP TABLE vistest;
+ERROR: table "vistest" does not exist
DROP FUNCTION truncate_in_subxact();
+ERROR: function truncate_in_subxact() does not exist
DROP TABLE x, y;
+ DROP TABLE rls_t1 CASCADE;
+ DROP ROLE regress_rls_copy_user;
+ DROP ROLE regress_rls_copy_user_colperms;
DROP FUNCTION fn_x_before();
DROP FUNCTION fn_x_after();
+ DROP TABLE instead_of_insert_tbl;
+ DROP VIEW instead_of_insert_tbl_view;
+ DROP FUNCTION fun_instead_of_insert_tbl();
-- maintenance_work_mem setting and fillfactor:
SET maintenance_work_mem = '1MB';
CREATE INDEX hash_tuplesort_idx ON tenk1 USING hash (stringu1 name_ops) WITH (fillfactor = 10);
- WARNING: hash indexes are not WAL-logged and their use is discouraged
EXPLAIN (COSTS OFF)
SELECT count(*) FROM tenk1 WHERE stringu1 = 'TVAAAA';
- QUERY PLAN
--------------------------------------------------------
- Aggregate
- -> Bitmap Heap Scan on tenk1
- Recheck Cond: (stringu1 = 'TVAAAA'::name)
- -> Bitmap Index Scan on hash_tuplesort_idx
- Index Cond: (stringu1 = 'TVAAAA'::name)
-(5 rows)
+ QUERY PLAN
+-------------------------------------------------------------------
+ Finalize Aggregate
+ -> Remote Subquery Scan on all (datanode_1,datanode_2)
+ -> Partial Aggregate
+ -> Bitmap Heap Scan on tenk1
+ Recheck Cond: (stringu1 = 'TVAAAA'::name)
+ -> Bitmap Index Scan on hash_tuplesort_idx
+ Index Cond: (stringu1 = 'TVAAAA'::name)
+(7 rows)
SELECT count(*) FROM tenk1 WHERE stringu1 = 'TVAAAA';
count
DELETE FROM concur_heap WHERE f1 = 'b';
VACUUM FULL concur_heap;
\d concur_heap
- Table "public.concur_heap"
- Column | Type | Modifiers
- --------+------+-----------
- f1 | text |
- f2 | text |
+ Table "public.concur_heap"
+ Column | Type | Collation | Nullable | Default
+ --------+------+-----------+----------+---------
+ f1 | text | | |
+ f2 | text | | |
Indexes:
- "concur_index2" UNIQUE, btree (f1)
- "concur_index3" UNIQUE, btree (f2) INVALID
- "concur_heap_expr_idx" btree ((f2 || f1))
- "concur_index1" btree (f2, f1)
- "concur_index4" btree (f2) WHERE f1 = 'a'::text
- "concur_index5" btree (f2) WHERE f1 = 'x'::text
"std_index" btree (f2)
REINDEX TABLE concur_heap;
\d concur_heap
- Table "public.concur_heap"
- Column | Type | Modifiers
- --------+------+-----------
- f1 | text |
- f2 | text |
+ Table "public.concur_heap"
+ Column | Type | Collation | Nullable | Default
+ --------+------+-----------+----------+---------
+ f1 | text | | |
+ f2 | text | | |
Indexes:
- "concur_index2" UNIQUE, btree (f1)
- "concur_index3" UNIQUE, btree (f2)
- "concur_heap_expr_idx" btree ((f2 || f1))
- "concur_index1" btree (f2, f1)
- "concur_index4" btree (f2) WHERE f1 = 'a'::text
- "concur_index5" btree (f2) WHERE f1 = 'x'::text
"std_index" btree (f2)
--
ROLLBACK;
-- successes
DROP INDEX CONCURRENTLY IF EXISTS "concur_index3";
+NOTICE: index "concur_index3" does not exist, skipping
+ERROR: DROP INDEX CONCURRENTLY cannot run inside a transaction block
DROP INDEX CONCURRENTLY "concur_index4";
+ERROR: index "concur_index4" does not exist
DROP INDEX CONCURRENTLY "concur_index5";
+ERROR: index "concur_index5" does not exist
DROP INDEX CONCURRENTLY "concur_index1";
+ERROR: index "concur_index1" does not exist
DROP INDEX CONCURRENTLY "concur_heap_expr_idx";
+ERROR: index "concur_heap_expr_idx" does not exist
\d concur_heap
- Table "public.concur_heap"
- Column | Type | Modifiers
- --------+------+-----------
- f1 | text |
- f2 | text |
+ Table "public.concur_heap"
+ Column | Type | Collation | Nullable | Default
+ --------+------+-----------+----------+---------
+ f1 | text | | |
+ f2 | text | | |
Indexes:
"std_index" btree (f2)
--
explain (costs off)
select * from tenk1 where (thousand, tenthous) in ((1,1001), (null,null));
- QUERY PLAN
-------------------------------------------------------
- Index Scan using tenk1_thous_tenthous on tenk1
- Index Cond: ((thousand = 1) AND (tenthous = 1001))
-(2 rows)
+ QUERY PLAN
+------------------------------------------------------------
+ Remote Fast Query Execution
+ Node/s: datanode_1, datanode_2
+ -> Index Scan using tenk1_thous_tenthous on tenk1
+ Index Cond: ((thousand = 1) AND (tenthous = 1001))
+(4 rows)
+ --
+ -- Check matching of boolean index columns to WHERE conditions and sort keys
+ --
+ create temp table boolindex (b bool, i int, unique(b, i), junk float);
+ explain (costs off)
+ select * from boolindex order by b, i limit 10;
+ QUERY PLAN
+ -------------------------------------------------------
+ Limit
+ -> Index Scan using boolindex_b_i_key on boolindex
+ (2 rows)
+
+ explain (costs off)
+ select * from boolindex where b order by i limit 10;
+ QUERY PLAN
+ -------------------------------------------------------
+ Limit
+ -> Index Scan using boolindex_b_i_key on boolindex
+ Index Cond: (b = true)
+ Filter: b
+ (4 rows)
+
+ explain (costs off)
+ select * from boolindex where b = true order by i desc limit 10;
+ QUERY PLAN
+ ----------------------------------------------------------------
+ Limit
+ -> Index Scan Backward using boolindex_b_i_key on boolindex
+ Index Cond: (b = true)
+ Filter: b
+ (4 rows)
+
+ explain (costs off)
+ select * from boolindex where not b order by i limit 10;
+ QUERY PLAN
+ -------------------------------------------------------
+ Limit
+ -> Index Scan using boolindex_b_i_key on boolindex
+ Index Cond: (b = false)
+ Filter: (NOT b)
+ (4 rows)
+
--
-- REINDEX (VERBOSE)
--
(2 rows)
DROP TABLE inhg;
-CREATE TABLE inhg (x text, LIKE inhx INCLUDING INDEXES, y text); /* copies indexes */
+ CREATE TABLE test_like_id_1 (a int GENERATED ALWAYS AS IDENTITY, b text);
+ \d test_like_id_1
+ Table "public.test_like_id_1"
+ Column | Type | Collation | Nullable | Default
+ --------+---------+-----------+----------+------------------------------
+ a | integer | | not null | generated always as identity
+ b | text | | |
+
+ INSERT INTO test_like_id_1 (b) VALUES ('b1');
+ SELECT * FROM test_like_id_1;
+ a | b
+ ---+----
+ 1 | b1
+ (1 row)
+
+ CREATE TABLE test_like_id_2 (LIKE test_like_id_1);
+ \d test_like_id_2
+ Table "public.test_like_id_2"
+ Column | Type | Collation | Nullable | Default
+ --------+---------+-----------+----------+---------
+ a | integer | | not null |
+ b | text | | |
+
+ INSERT INTO test_like_id_2 (b) VALUES ('b2');
+ ERROR: null value in column "a" violates not-null constraint
+ DETAIL: Failing row contains (null, b2).
+ SELECT * FROM test_like_id_2; -- identity was not copied
+ a | b
+ ---+---
+ (0 rows)
+
+ CREATE TABLE test_like_id_3 (LIKE test_like_id_1 INCLUDING IDENTITY);
+ \d test_like_id_3
+ Table "public.test_like_id_3"
+ Column | Type | Collation | Nullable | Default
+ --------+---------+-----------+----------+------------------------------
+ a | integer | | not null | generated always as identity
+ b | text | | |
+
+ INSERT INTO test_like_id_3 (b) VALUES ('b3');
+ SELECT * FROM test_like_id_3; -- identity was copied and applied
+ a | b
+ ---+----
+ 1 | b3
+ (1 row)
+
+ DROP TABLE test_like_id_1, test_like_id_2, test_like_id_3;
+CREATE TABLE inhg (x text, LIKE inhx INCLUDING INDEXES, y text) DISTRIBUTE BY REPLICATION; /* copies indexes */
INSERT INTO inhg VALUES (5, 10);
INSERT INTO inhg VALUES (20, 10); -- should fail
ERROR: duplicate key value violates unique constraint "inhg_pkey"
union all
select ff + 4 as x from ec1) as ss2
where ss1.x = ec1.f1 and ss1.x = ss2.x and ec1.ff = 42::int8;
- QUERY PLAN
----------------------------------------------------------------------
+ QUERY PLAN
+-----------------------------------------------------------------------------
Nested Loop
- -> Nested Loop
- -> Index Scan using ec1_pkey on ec1
- Index Cond: (ff = '42'::bigint)
+ Join Filter: (x = x)
+ -> Remote Subquery Scan on all (datanode_1,datanode_2)
-> Append
- -> Index Scan using ec1_expr2 on ec1 ec1_1
- Index Cond: (((ff + 2) + 1) = ec1.f1)
- -> Index Scan using ec1_expr3 on ec1 ec1_2
- Index Cond: (((ff + 3) + 1) = ec1.f1)
- -> Index Scan using ec1_expr4 on ec1 ec1_3
- Index Cond: ((ff + 4) = ec1.f1)
- -> Append
- -> Index Scan using ec1_expr2 on ec1 ec1_4
- Index Cond: (((ff + 2) + 1) = (((ec1_1.ff + 2) + 1)))
- -> Index Scan using ec1_expr3 on ec1 ec1_5
- Index Cond: (((ff + 3) + 1) = (((ec1_1.ff + 2) + 1)))
- -> Index Scan using ec1_expr4 on ec1 ec1_6
- Index Cond: ((ff + 4) = (((ec1_1.ff + 2) + 1)))
-(18 rows)
-
--- let's try that as a mergejoin
-set enable_mergejoin = on;
-set enable_nestloop = off;
-explain (costs off)
- select * from ec1,
- (select ff + 1 as x from
- (select ff + 2 as ff from ec1
- union all
- select ff + 3 as ff from ec1) ss0
- union all
- select ff + 4 as x from ec1) as ss1,
- (select ff + 1 as x from
- (select ff + 2 as ff from ec1
- union all
- select ff + 3 as ff from ec1) ss0
- union all
- select ff + 4 as x from ec1) as ss2
- where ss1.x = ec1.f1 and ss1.x = ss2.x and ec1.ff = 42::int8;
- QUERY PLAN
------------------------------------------------------------------
- Merge Join
- Merge Cond: ((((ec1_4.ff + 2) + 1)) = (((ec1_1.ff + 2) + 1)))
- -> Merge Append
- Sort Key: (((ec1_4.ff + 2) + 1))
- -> Index Scan using ec1_expr2 on ec1 ec1_4
- -> Index Scan using ec1_expr3 on ec1 ec1_5
- -> Index Scan using ec1_expr4 on ec1 ec1_6
+ -> Seq Scan on ec1 ec1_4
+ -> Seq Scan on ec1 ec1_5
+ -> Seq Scan on ec1 ec1_6
-> Materialize
- -> Merge Join
- Merge Cond: ((((ec1_1.ff + 2) + 1)) = ec1.f1)
- -> Merge Append
- Sort Key: (((ec1_1.ff + 2) + 1))
- -> Index Scan using ec1_expr2 on ec1 ec1_1
- -> Index Scan using ec1_expr3 on ec1 ec1_2
- -> Index Scan using ec1_expr4 on ec1 ec1_3
- -> Sort
- Sort Key: ec1.f1 USING <
+ -> Nested Loop
+ Join Filter: (x = ec1.f1)
+ -> Remote Subquery Scan on all (datanode_1)
-> Index Scan using ec1_pkey on ec1
Index Cond: (ff = '42'::bigint)
+ -> Materialize
+ -> Remote Subquery Scan on all (datanode_1,datanode_2)
+ -> Append
+ -> Seq Scan on ec1 ec1_1
+ -> Seq Scan on ec1 ec1_2
+ -> Seq Scan on ec1 ec1_3
(19 rows)
- -- excluding as XL does not support complex queries
- -- with 'union all'
+-- let's try that as a mergejoin
+set enable_mergejoin = on;
+set enable_nestloop = off;
-- check partially indexed scan
set enable_nestloop = on;
set enable_mergejoin = off;
create event trigger regress_event_trigger2 on ddl_command_start
when tag in ('create table', 'CREATE FUNCTION')
execute procedure test_event_trigger();
+ERROR: EVENT TRIGGER not yet supported in Postgres-XL
-- OK
comment on event trigger regress_event_trigger is 'test comment';
- -- should fail, event triggers are not schema objects
- comment on event trigger wrong.regress_event_trigger is 'test comment';
- ERROR: event trigger name cannot be qualified
+ERROR: event trigger "regress_event_trigger" does not exist
-- drop as non-superuser should fail
create role regress_evt_user;
set role regress_evt_user;
DROP FOREIGN DATA WRAPPER IF EXISTS nonexistent;
NOTICE: foreign-data wrapper "nonexistent" does not exist, skipping
\dew+
- List of foreign-data wrappers
- Name | Owner | Handler | Validator | Access privileges | FDW Options | Description
-------------+---------------------------+---------+--------------------------+-------------------+------------------------------+-------------
- dummy | regress_foreign_data_user | - | - | | | useless
- foo | regress_test_role_super | - | - | | (b '3', c '4', a '2', d '5') |
- postgresql | regress_foreign_data_user | - | postgresql_fdw_validator | | |
-(3 rows)
+ List of foreign-data wrappers
+ Name | Owner | Handler | Validator | Access privileges | FDW Options | Description
+------+-------+---------+-----------+-------------------+-------------+-------------
+(0 rows)
DROP ROLE regress_test_role_super; -- ERROR
-ERROR: role "regress_test_role_super" cannot be dropped because some objects depend on it
-DETAIL: owner of foreign-data wrapper foo
SET ROLE regress_test_role_super;
+ERROR: role "regress_test_role_super" does not exist
DROP FOREIGN DATA WRAPPER foo;
+ERROR: foreign-data wrapper "foo" does not exist
RESET ROLE;
DROP ROLE regress_test_role_super;
+ERROR: role "regress_test_role_super" does not exist
\dew+
- List of foreign-data wrappers
- Name | Owner | Handler | Validator | Access privileges | FDW Options | Description
-------------+---------------------------+---------+--------------------------+-------------------+-------------+-------------
- dummy | regress_foreign_data_user | - | - | | | useless
- postgresql | regress_foreign_data_user | - | postgresql_fdw_validator | | |
-(2 rows)
+ List of foreign-data wrappers
+ Name | Owner | Handler | Validator | Access privileges | FDW Options | Description
+------+-------+---------+-----------+-------------------+-------------+-------------
+(0 rows)
CREATE FOREIGN DATA WRAPPER foo;
+ERROR: Postgres-XL does not support FOREIGN DATA WRAPPER yet
+DETAIL: The feature is not currently supported
CREATE SERVER s1 FOREIGN DATA WRAPPER foo;
+ERROR: Postgres-XL does not support SERVER yet
+DETAIL: The feature is not currently supported
COMMENT ON SERVER s1 IS 'foreign server';
+ERROR: server "s1" does not exist
CREATE USER MAPPING FOR current_user SERVER s1;
+ERROR: Postgres-XL does not support USER MAPPING yet
+DETAIL: The feature is not currently supported
+ CREATE USER MAPPING FOR current_user SERVER s1; -- ERROR
+ ERROR: user mapping for "regress_foreign_data_user" already exists for server s1
+ CREATE USER MAPPING IF NOT EXISTS FOR current_user SERVER s1; -- NOTICE
+ NOTICE: user mapping for "regress_foreign_data_user" already exists for server s1, skipping
\dew+
- List of foreign-data wrappers
- Name | Owner | Handler | Validator | Access privileges | FDW Options | Description
-------------+---------------------------+---------+--------------------------+-------------------+-------------+-------------
- dummy | regress_foreign_data_user | - | - | | | useless
- foo | regress_foreign_data_user | - | - | | |
- postgresql | regress_foreign_data_user | - | postgresql_fdw_validator | | |
-(3 rows)
+ List of foreign-data wrappers
+ Name | Owner | Handler | Validator | Access privileges | FDW Options | Description
+------+-------+---------+-----------+-------------------+-------------+-------------
+(0 rows)
\des+
- List of foreign servers
- Name | Owner | Foreign-data wrapper | Access privileges | Type | Version | FDW Options | Description
-------+---------------------------+----------------------+-------------------+------+---------+-------------+----------------
- s1 | regress_foreign_data_user | foo | | | | | foreign server
-(1 row)
+ List of foreign servers
+ Name | Owner | Foreign-data wrapper | Access privileges | Type | Version | FDW Options | Description
+------+-------+----------------------+-------------------+------+---------+-------------+-------------
+(0 rows)
\deu+
- List of user mappings
- Server | User name | FDW Options
---------+---------------------------+-------------
- s1 | regress_foreign_data_user |
-(1 row)
+ List of user mappings
+ Server | User name | FDW Options
+--------+-----------+-------------
+(0 rows)
DROP FOREIGN DATA WRAPPER foo; -- ERROR
-ERROR: cannot drop foreign-data wrapper foo because other objects depend on it
-DETAIL: server s1 depends on foreign-data wrapper foo
-user mapping for regress_foreign_data_user on server s1 depends on server s1
-HINT: Use DROP ... CASCADE to drop the dependent objects too.
+ERROR: foreign-data wrapper "foo" does not exist
SET ROLE regress_test_role;
DROP FOREIGN DATA WRAPPER foo CASCADE; -- ERROR
-ERROR: must be owner of foreign-data wrapper foo
+ERROR: foreign-data wrapper "foo" does not exist
RESET ROLE;
DROP FOREIGN DATA WRAPPER foo CASCADE;
-NOTICE: drop cascades to 2 other objects
-DETAIL: drop cascades to server s1
-drop cascades to user mapping for regress_foreign_data_user on server s1
+ERROR: foreign-data wrapper "foo" does not exist
\dew+
- List of foreign-data wrappers
- Name | Owner | Handler | Validator | Access privileges | FDW Options | Description
-------------+---------------------------+---------+--------------------------+-------------------+-------------+-------------
- dummy | regress_foreign_data_user | - | - | | | useless
- postgresql | regress_foreign_data_user | - | postgresql_fdw_validator | | |
-(2 rows)
+ List of foreign-data wrappers
+ Name | Owner | Handler | Validator | Access privileges | FDW Options | Description
+------+-------+---------+-----------+-------------------+-------------+-------------
+(0 rows)
\des+
List of foreign servers
ERROR: permission denied to alter foreign-data wrapper "foo"
HINT: Must be superuser to alter a foreign-data wrapper.
DROP FOREIGN DATA WRAPPER foo; -- ERROR
-ERROR: must be owner of foreign-data wrapper foo
+ERROR: foreign-data wrapper "foo" does not exist
GRANT USAGE ON FOREIGN DATA WRAPPER postgresql TO regress_test_role; -- WARNING
-WARNING: no privileges were granted for "postgresql"
+ERROR: foreign-data wrapper "postgresql" does not exist
GRANT USAGE ON FOREIGN DATA WRAPPER foo TO regress_test_role;
+ERROR: foreign-data wrapper "foo" does not exist
CREATE SERVER s9 FOREIGN DATA WRAPPER postgresql;
+ERROR: Postgres-XL does not support SERVER yet
+DETAIL: The feature is not currently supported
ALTER SERVER s6 VERSION '0.5'; -- ERROR
-ERROR: must be owner of foreign server s6
+ERROR: server "s6" does not exist
DROP SERVER s6; -- ERROR
-ERROR: must be owner of foreign server s6
+ERROR: server "s6" does not exist
GRANT USAGE ON FOREIGN SERVER s6 TO regress_test_role; -- ERROR
-ERROR: permission denied for foreign server s6
+ERROR: server "s6" does not exist
GRANT USAGE ON FOREIGN SERVER s9 TO regress_test_role;
+ERROR: server "s9" does not exist
CREATE USER MAPPING FOR public SERVER s6; -- ERROR
-ERROR: must be owner of foreign server s6
+ERROR: Postgres-XL does not support USER MAPPING yet
+DETAIL: The feature is not currently supported
CREATE USER MAPPING FOR public SERVER s9;
+ERROR: Postgres-XL does not support USER MAPPING yet
+DETAIL: The feature is not currently supported
ALTER USER MAPPING FOR regress_test_role SERVER s6 OPTIONS (gotcha 'true'); -- ERROR
-ERROR: must be owner of foreign server s6
+ERROR: server "s6" does not exist
DROP USER MAPPING FOR regress_test_role SERVER s6; -- ERROR
-ERROR: must be owner of foreign server s6
+ERROR: server "s6" does not exist
RESET ROLE;
REVOKE USAGE ON FOREIGN DATA WRAPPER foo FROM regress_unprivileged_role; -- ERROR
-ERROR: dependent privileges exist
-HINT: Use CASCADE to revoke them too.
+ERROR: foreign-data wrapper "foo" does not exist
REVOKE USAGE ON FOREIGN DATA WRAPPER foo FROM regress_unprivileged_role CASCADE;
+ERROR: foreign-data wrapper "foo" does not exist
SET ROLE regress_unprivileged_role;
GRANT USAGE ON FOREIGN DATA WRAPPER foo TO regress_test_role; -- ERROR
-ERROR: permission denied for foreign-data wrapper foo
+ERROR: foreign-data wrapper "foo" does not exist
CREATE SERVER s10 FOREIGN DATA WRAPPER foo; -- ERROR
-ERROR: permission denied for foreign-data wrapper foo
+ERROR: Postgres-XL does not support SERVER yet
+DETAIL: The feature is not currently supported
ALTER SERVER s9 VERSION '1.1';
+ERROR: server "s9" does not exist
GRANT USAGE ON FOREIGN SERVER s9 TO regress_test_role;
+ERROR: server "s9" does not exist
CREATE USER MAPPING FOR current_user SERVER s9;
+ERROR: Postgres-XL does not support USER MAPPING yet
+DETAIL: The feature is not currently supported
DROP SERVER s9 CASCADE;
-NOTICE: drop cascades to 2 other objects
-DETAIL: drop cascades to user mapping for public on server s9
-drop cascades to user mapping for regress_unprivileged_role on server s9
+ERROR: server "s9" does not exist
RESET ROLE;
CREATE SERVER s9 FOREIGN DATA WRAPPER foo;
+ERROR: Postgres-XL does not support SERVER yet
+DETAIL: The feature is not currently supported
GRANT USAGE ON FOREIGN SERVER s9 TO regress_unprivileged_role;
+ERROR: server "s9" does not exist
SET ROLE regress_unprivileged_role;
ALTER SERVER s9 VERSION '1.2'; -- ERROR
-ERROR: must be owner of foreign server s9
+ERROR: server "s9" does not exist
GRANT USAGE ON FOREIGN SERVER s9 TO regress_test_role; -- WARNING
-WARNING: no privileges were granted for "s9"
+ERROR: server "s9" does not exist
CREATE USER MAPPING FOR current_user SERVER s9;
+ERROR: Postgres-XL does not support USER MAPPING yet
+DETAIL: The feature is not currently supported
DROP SERVER s9 CASCADE; -- ERROR
+ERROR: server "s9" does not exist
+ ERROR: must be owner of foreign server s9
+ -- Check visibility of user mapping data
+ SET ROLE regress_test_role;
+ CREATE SERVER s10 FOREIGN DATA WRAPPER foo;
+ CREATE USER MAPPING FOR public SERVER s10 OPTIONS (user 'secret');
+ GRANT USAGE ON FOREIGN SERVER s10 TO regress_unprivileged_role;
+ -- owner of server can see option fields
+ \deu+
+ List of user mappings
+ Server | User name | FDW Options
+ --------+---------------------------+-------------------
+ s10 | public | ("user" 'secret')
+ s4 | regress_foreign_data_user |
+ s5 | regress_test_role | (modified '1')
+ s6 | regress_test_role |
+ s8 | public |
+ s8 | regress_foreign_data_user |
+ s9 | regress_unprivileged_role |
+ t1 | public | (modified '1')
+ (8 rows)
+
+ RESET ROLE;
+ -- superuser can see option fields
+ \deu+
+ List of user mappings
+ Server | User name | FDW Options
+ --------+---------------------------+---------------------
+ s10 | public | ("user" 'secret')
+ s4 | regress_foreign_data_user |
+ s5 | regress_test_role | (modified '1')
+ s6 | regress_test_role |
+ s8 | public |
+ s8 | regress_foreign_data_user | (password 'public')
+ s9 | regress_unprivileged_role |
+ t1 | public | (modified '1')
+ (8 rows)
+
+ -- unprivileged user cannot see option fields
+ SET ROLE regress_unprivileged_role;
+ \deu+
+ List of user mappings
+ Server | User name | FDW Options
+ --------+---------------------------+-------------
+ s10 | public |
+ s4 | regress_foreign_data_user |
+ s5 | regress_test_role |
+ s6 | regress_test_role |
+ s8 | public |
+ s8 | regress_foreign_data_user |
+ s9 | regress_unprivileged_role |
+ t1 | public |
+ (8 rows)
+
RESET ROLE;
+ DROP SERVER s10 CASCADE;
+ NOTICE: drop cascades to user mapping for public on server s10
-- Triggers
CREATE FUNCTION dummy_trigger() RETURNS TRIGGER AS $$
BEGIN
ON foreign_schema.foreign_table_1
FOR EACH STATEMENT
EXECUTE PROCEDURE dummy_trigger();
+ERROR: Postgres-XL does not support TRIGGER yet
+DETAIL: The feature is not currently supported
+ CREATE TRIGGER trigtest_after_stmt_tt AFTER INSERT OR UPDATE OR DELETE -- ERROR
+ ON foreign_schema.foreign_table_1
+ REFERENCING NEW TABLE AS new_table
+ FOR EACH STATEMENT
+ EXECUTE PROCEDURE dummy_trigger();
+ ERROR: "foreign_table_1" is a foreign table
+ DETAIL: Triggers on foreign tables cannot have transition tables.
CREATE TRIGGER trigtest_before_row BEFORE INSERT OR UPDATE OR DELETE
ON foreign_schema.foreign_table_1
FOR EACH ROW
);
CREATE FOREIGN TABLE ft2 () INHERITS (pt1)
SERVER s0 OPTIONS (delimiter ',', quote '"', "be quoted" 'value');
+ERROR: server "s0" does not exist
\d+ pt1
- Table "public.pt1"
- Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
---------+---------+-----------+----------+---------+----------+--------------+-------------
- c1 | integer | | not null | | plain | |
- c2 | text | | | | extended | |
- c3 | date | | | | plain | |
-Child tables: ft2
+ Table "public.pt1"
+ Column | Type | Modifiers | Storage | Stats target | Description
+--------+---------+-----------+----------+--------------+-------------
+ c1 | integer | not null | plain | |
+ c2 | text | | extended | |
+ c3 | date | | plain | |
+Distribute By: HASH(c1)
+Location Nodes: ALL DATANODES
\d+ ft2
- Foreign table "public.ft2"
- Column | Type | Collation | Nullable | Default | FDW Options | Storage | Stats target | Description
---------+---------+-----------+----------+---------+-------------+----------+--------------+-------------
- c1 | integer | | not null | | | plain | |
- c2 | text | | | | | extended | |
- c3 | date | | | | | plain | |
-Server: s0
-FDW Options: (delimiter ',', quote '"', "be quoted" 'value')
-Inherits: pt1
-
DROP FOREIGN TABLE ft2;
+ERROR: foreign table "ft2" does not exist
\d+ pt1
- Table "public.pt1"
- Column | Type | Modifiers | Storage | Stats target | Description
- --------+---------+-----------+----------+--------------+-------------
- c1 | integer | not null | plain | |
- c2 | text | | extended | |
- c3 | date | | plain | |
+ Table "public.pt1"
+ Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+----------+--------------+-------------
+ c1 | integer | | not null | | plain | |
+ c2 | text | | | | extended | |
+ c3 | date | | | | plain | |
+Distribute By: HASH(c1)
+Location Nodes: ALL DATANODES
CREATE FOREIGN TABLE ft2 (
c1 integer NOT NULL,
c2 text,
c3 date
) SERVER s0 OPTIONS (delimiter ',', quote '"', "be quoted" 'value');
+ERROR: server "s0" does not exist
\d+ ft2
- Foreign table "public.ft2"
- Column | Type | Collation | Nullable | Default | FDW Options | Storage | Stats target | Description
---------+---------+-----------+----------+---------+-------------+----------+--------------+-------------
- c1 | integer | | not null | | | plain | |
- c2 | text | | | | | extended | |
- c3 | date | | | | | plain | |
-Server: s0
-FDW Options: (delimiter ',', quote '"', "be quoted" 'value')
-
ALTER FOREIGN TABLE ft2 INHERIT pt1;
+ERROR: relation "ft2" does not exist
\d+ pt1
- Table "public.pt1"
- Column | Type | Modifiers | Storage | Stats target | Description
- --------+---------+-----------+----------+--------------+-------------
- c1 | integer | not null | plain | |
- c2 | text | | extended | |
- c3 | date | | plain | |
+ Table "public.pt1"
+ Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+----------+--------------+-------------
+ c1 | integer | | not null | | plain | |
+ c2 | text | | | | extended | |
+ c3 | date | | | | plain | |
-Child tables: ft2
+Distribute By: HASH(c1)
+Location Nodes: ALL DATANODES
\d+ ft2
+ Foreign table "public.ft2"
+ Column | Type | Collation | Nullable | Default | FDW Options | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+-------------+----------+--------------+-------------
+ c1 | integer | | not null | | | plain | |
+ c2 | text | | | | | extended | |
+ c3 | date | | | | | plain | |
+ Server: s0
+ FDW Options: (delimiter ',', quote '"', "be quoted" 'value')
+ Inherits: pt1
+
CREATE TABLE ct3() INHERITS(ft2);
+ERROR: relation "ft2" does not exist
CREATE FOREIGN TABLE ft3 (
c1 integer NOT NULL,
c2 text,
c2 text,
c3 date
) SERVER s0 OPTIONS (delimiter ',', quote '"', "be quoted" 'value');
+ERROR: server "s0" does not exist
-- child must have parent's INHERIT constraints
ALTER FOREIGN TABLE ft2 INHERIT pt1; -- ERROR
-ERROR: child table is missing constraint "pt1chk2"
+ERROR: relation "ft2" does not exist
ALTER FOREIGN TABLE ft2 ADD CONSTRAINT pt1chk2 CHECK (c2 <> '');
+ERROR: relation "ft2" does not exist
ALTER FOREIGN TABLE ft2 INHERIT pt1;
+ERROR: relation "ft2" does not exist
-- child does not inherit NO INHERIT constraints
\d+ pt1
- Table "public.pt1"
- Column | Type | Modifiers | Storage | Stats target | Description
- --------+---------+-----------+----------+--------------+-------------
- c1 | integer | not null | plain | 10000 |
- c2 | text | | extended | |
- c3 | date | | plain | |
+ Table "public.pt1"
+ Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+----------+--------------+-------------
+ c1 | integer | | not null | | plain | 10000 |
+ c2 | text | | | | extended | |
+ c3 | date | | | | plain | |
Check constraints:
"pt1chk1" CHECK (c1 > 0) NO INHERIT
"pt1chk2" CHECK (c2 <> ''::text)
INSERT INTO pt1 VALUES (1, 'pt1'::text, '1994-01-01'::date);
ALTER TABLE pt1 ADD CONSTRAINT pt1chk3 CHECK (c2 <> '') NOT VALID;
\d+ pt1
- Table "public.pt1"
- Column | Type | Modifiers | Storage | Stats target | Description
- --------+---------+-----------+----------+--------------+-------------
- c1 | integer | not null | plain | 10000 |
- c2 | text | | extended | |
- c3 | date | | plain | |
+ Table "public.pt1"
+ Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+----------+--------------+-------------
+ c1 | integer | | not null | | plain | 10000 |
+ c2 | text | | | | extended | |
+ c3 | date | | | | plain | |
Check constraints:
"pt1chk3" CHECK (c2 <> ''::text) NOT VALID
-Child tables: ft2
+Distribute By: HASH(c1)
+Location Nodes: ALL DATANODES
\d+ ft2
- Foreign table "public.ft2"
- Column | Type | Collation | Nullable | Default | FDW Options | Storage | Stats target | Description
---------+---------+-----------+----------+---------+-------------+----------+--------------+-------------
- c1 | integer | | not null | | | plain | |
- c2 | text | | | | | extended | |
- c3 | date | | | | | plain | |
-Check constraints:
- "pt1chk2" CHECK (c2 <> ''::text)
- "pt1chk3" CHECK (c2 <> ''::text) NOT VALID
-Server: s0
-FDW Options: (delimiter ',', quote '"', "be quoted" 'value')
-Inherits: pt1
-
-- VALIDATE CONSTRAINT need do nothing on foreign tables
ALTER TABLE pt1 VALIDATE CONSTRAINT pt1chk3;
\d+ pt1
- Table "public.pt1"
- Column | Type | Modifiers | Storage | Stats target | Description
- --------+---------+-----------+----------+--------------+-------------
- c1 | integer | not null | plain | 10000 |
- c2 | text | | extended | |
- c3 | date | | plain | |
+ Table "public.pt1"
+ Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+----------+--------------+-------------
+ c1 | integer | | not null | | plain | 10000 |
+ c2 | text | | | | extended | |
+ c3 | date | | | | plain | |
Check constraints:
"pt1chk3" CHECK (c2 <> ''::text)
-Child tables: ft2
+Distribute By: HASH(c1)
+Location Nodes: ALL DATANODES
\d+ ft2
- Foreign table "public.ft2"
- Column | Type | Collation | Nullable | Default | FDW Options | Storage | Stats target | Description
---------+---------+-----------+----------+---------+-------------+----------+--------------+-------------
- c1 | integer | | not null | | | plain | |
- c2 | text | | | | | extended | |
- c3 | date | | | | | plain | |
-Check constraints:
- "pt1chk2" CHECK (c2 <> ''::text)
- "pt1chk3" CHECK (c2 <> ''::text)
-Server: s0
-FDW Options: (delimiter ',', quote '"', "be quoted" 'value')
-Inherits: pt1
-
-- OID system column
ALTER TABLE pt1 SET WITH OIDS;
\d+ pt1
- Table "public.pt1"
- Column | Type | Modifiers | Storage | Stats target | Description
- --------+---------+-----------+----------+--------------+-------------
- c1 | integer | not null | plain | 10000 |
- c2 | text | | extended | |
- c3 | date | | plain | |
+ Table "public.pt1"
+ Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+----------+--------------+-------------
+ c1 | integer | | not null | | plain | 10000 |
+ c2 | text | | | | extended | |
+ c3 | date | | | | plain | |
Check constraints:
"pt1chk3" CHECK (c2 <> ''::text)
-Child tables: ft2
Has OIDs: yes
+Distribute By: HASH(c1)
+Location Nodes: ALL DATANODES
\d+ ft2
- Foreign table "public.ft2"
- Column | Type | Collation | Nullable | Default | FDW Options | Storage | Stats target | Description
---------+---------+-----------+----------+---------+-------------+----------+--------------+-------------
- c1 | integer | | not null | | | plain | |
- c2 | text | | | | | extended | |
- c3 | date | | | | | plain | |
-Check constraints:
- "pt1chk2" CHECK (c2 <> ''::text)
- "pt1chk3" CHECK (c2 <> ''::text)
-Server: s0
-FDW Options: (delimiter ',', quote '"', "be quoted" 'value')
-Inherits: pt1
-Has OIDs: yes
-
ALTER TABLE ft2 SET WITHOUT OIDS; -- ERROR
-ERROR: cannot drop inherited column "oid"
+ERROR: relation "ft2" does not exist
ALTER TABLE pt1 SET WITHOUT OIDS;
\d+ pt1
- Table "public.pt1"
- Column | Type | Modifiers | Storage | Stats target | Description
- --------+---------+-----------+----------+--------------+-------------
- c1 | integer | not null | plain | 10000 |
- c2 | text | | extended | |
- c3 | date | | plain | |
+ Table "public.pt1"
+ Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+----------+--------------+-------------
+ c1 | integer | | not null | | plain | 10000 |
+ c2 | text | | | | extended | |
+ c3 | date | | | | plain | |
Check constraints:
"pt1chk3" CHECK (c2 <> ''::text)
-Child tables: ft2
+Distribute By: HASH(c1)
+Location Nodes: ALL DATANODES
\d+ ft2
- Foreign table "public.ft2"
- Column | Type | Collation | Nullable | Default | FDW Options | Storage | Stats target | Description
---------+---------+-----------+----------+---------+-------------+----------+--------------+-------------
- c1 | integer | | not null | | | plain | |
- c2 | text | | | | | extended | |
- c3 | date | | | | | plain | |
-Check constraints:
- "pt1chk2" CHECK (c2 <> ''::text)
- "pt1chk3" CHECK (c2 <> ''::text)
-Server: s0
-FDW Options: (delimiter ',', quote '"', "be quoted" 'value')
-Inherits: pt1
-
-- changes name of an attribute recursively
ALTER TABLE pt1 RENAME COLUMN c1 TO f1;
ALTER TABLE pt1 RENAME COLUMN c2 TO f2;
-- changes name of a constraint recursively
ALTER TABLE pt1 RENAME CONSTRAINT pt1chk3 TO f2_check;
\d+ pt1
- Table "public.pt1"
- Column | Type | Modifiers | Storage | Stats target | Description
- --------+---------+-----------+----------+--------------+-------------
- f1 | integer | not null | plain | 10000 |
- f2 | text | | extended | |
- f3 | date | | plain | |
+ Table "public.pt1"
+ Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+----------+--------------+-------------
+ f1 | integer | | not null | | plain | 10000 |
+ f2 | text | | | | extended | |
+ f3 | date | | | | plain | |
Check constraints:
"f2_check" CHECK (f2 <> ''::text)
-Child tables: ft2
+Distribute By: HASH(f1)
+Location Nodes: ALL DATANODES
\d+ ft2
- Foreign table "public.ft2"
- Column | Type | Collation | Nullable | Default | FDW Options | Storage | Stats target | Description
---------+---------+-----------+----------+---------+-------------+----------+--------------+-------------
- f1 | integer | | not null | | | plain | |
- f2 | text | | | | | extended | |
- f3 | date | | | | | plain | |
-Check constraints:
- "f2_check" CHECK (f2 <> ''::text)
- "pt1chk2" CHECK (f2 <> ''::text)
-Server: s0
-FDW Options: (delimiter ',', quote '"', "be quoted" 'value')
-Inherits: pt1
-
-- TRUNCATE doesn't work on foreign tables, either directly or recursively
TRUNCATE ft2; -- ERROR
-ERROR: "ft2" is not a table
+ERROR: relation "ft2" does not exist
TRUNCATE pt1; -- ERROR
-ERROR: "ft2" is not a table
DROP TABLE pt1 CASCADE;
-NOTICE: drop cascades to foreign table ft2
-- IMPORT FOREIGN SCHEMA
IMPORT FOREIGN SCHEMA s1 FROM SERVER s9 INTO public; -- ERROR
-ERROR: foreign-data wrapper "foo" has no handler
+ERROR: server "s9" does not exist
IMPORT FOREIGN SCHEMA s1 LIMIT TO (t1) FROM SERVER s9 INTO public; --ERROR
-ERROR: foreign-data wrapper "foo" has no handler
+ERROR: server "s9" does not exist
IMPORT FOREIGN SCHEMA s1 EXCEPT (t1) FROM SERVER s9 INTO public; -- ERROR
-ERROR: foreign-data wrapper "foo" has no handler
+ERROR: server "s9" does not exist
IMPORT FOREIGN SCHEMA s1 EXCEPT (t1, t2) FROM SERVER s9 INTO public
OPTIONS (option1 'value1', option2 'value2'); -- ERROR
-ERROR: foreign-data wrapper "foo" has no handler
+ERROR: server "s9" does not exist
-- DROP FOREIGN TABLE
DROP FOREIGN TABLE no_table; -- ERROR
ERROR: foreign table "no_table" does not exist
-- REASSIGN OWNED/DROP OWNED of foreign objects
REASSIGN OWNED BY regress_test_role TO regress_test_role2;
DROP OWNED BY regress_test_role2;
-ERROR: cannot drop desired object(s) because other objects depend on them
-DETAIL: user mapping for regress_test_role on server s5 depends on server s5
-HINT: Use DROP ... CASCADE to drop the dependent objects too.
DROP OWNED BY regress_test_role2 CASCADE;
+ NOTICE: drop cascades to user mapping for regress_test_role on server s5
+ -- Foreign partition DDL stuff
+ CREATE TABLE pt2 (
+ c1 integer NOT NULL,
+ c2 text,
+ c3 date
+ ) PARTITION BY LIST (c1);
+ CREATE FOREIGN TABLE pt2_1 PARTITION OF pt2 FOR VALUES IN (1)
+ SERVER s0 OPTIONS (delimiter ',', quote '"', "be quoted" 'value');
+ \d+ pt2
+ Table "public.pt2"
+ Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+----------+--------------+-------------
+ c1 | integer | | not null | | plain | |
+ c2 | text | | | | extended | |
+ c3 | date | | | | plain | |
+ Partition key: LIST (c1)
+ Partitions: pt2_1 FOR VALUES IN (1)
+
+ \d+ pt2_1
+ Foreign table "public.pt2_1"
+ Column | Type | Collation | Nullable | Default | FDW Options | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+-------------+----------+--------------+-------------
+ c1 | integer | | not null | | | plain | |
+ c2 | text | | | | | extended | |
+ c3 | date | | | | | plain | |
+ Partition of: pt2 FOR VALUES IN (1)
+ Partition constraint: ((c1 IS NOT NULL) AND (c1 = ANY (ARRAY[1])))
+ Server: s0
+ FDW Options: (delimiter ',', quote '"', "be quoted" 'value')
+
+ -- partition cannot have additional columns
+ DROP FOREIGN TABLE pt2_1;
+ CREATE FOREIGN TABLE pt2_1 (
+ c1 integer NOT NULL,
+ c2 text,
+ c3 date,
+ c4 char
+ ) SERVER s0 OPTIONS (delimiter ',', quote '"', "be quoted" 'value');
+ \d+ pt2_1
+ Foreign table "public.pt2_1"
+ Column | Type | Collation | Nullable | Default | FDW Options | Storage | Stats target | Description
+ --------+--------------+-----------+----------+---------+-------------+----------+--------------+-------------
+ c1 | integer | | not null | | | plain | |
+ c2 | text | | | | | extended | |
+ c3 | date | | | | | plain | |
+ c4 | character(1) | | | | | extended | |
+ Server: s0
+ FDW Options: (delimiter ',', quote '"', "be quoted" 'value')
+
+ ALTER TABLE pt2 ATTACH PARTITION pt2_1 FOR VALUES IN (1); -- ERROR
+ ERROR: table "pt2_1" contains column "c4" not found in parent "pt2"
+ DETAIL: New partition should contain only the columns present in parent.
+ DROP FOREIGN TABLE pt2_1;
+ \d+ pt2
+ Table "public.pt2"
+ Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+----------+--------------+-------------
+ c1 | integer | | not null | | plain | |
+ c2 | text | | | | extended | |
+ c3 | date | | | | plain | |
+ Partition key: LIST (c1)
+
+ CREATE FOREIGN TABLE pt2_1 (
+ c1 integer NOT NULL,
+ c2 text,
+ c3 date
+ ) SERVER s0 OPTIONS (delimiter ',', quote '"', "be quoted" 'value');
+ \d+ pt2_1
+ Foreign table "public.pt2_1"
+ Column | Type | Collation | Nullable | Default | FDW Options | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+-------------+----------+--------------+-------------
+ c1 | integer | | not null | | | plain | |
+ c2 | text | | | | | extended | |
+ c3 | date | | | | | plain | |
+ Server: s0
+ FDW Options: (delimiter ',', quote '"', "be quoted" 'value')
+
+ -- no attach partition validation occurs for foreign tables
+ ALTER TABLE pt2 ATTACH PARTITION pt2_1 FOR VALUES IN (1);
+ \d+ pt2
+ Table "public.pt2"
+ Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+----------+--------------+-------------
+ c1 | integer | | not null | | plain | |
+ c2 | text | | | | extended | |
+ c3 | date | | | | plain | |
+ Partition key: LIST (c1)
+ Partitions: pt2_1 FOR VALUES IN (1)
+
+ \d+ pt2_1
+ Foreign table "public.pt2_1"
+ Column | Type | Collation | Nullable | Default | FDW Options | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+-------------+----------+--------------+-------------
+ c1 | integer | | not null | | | plain | |
+ c2 | text | | | | | extended | |
+ c3 | date | | | | | plain | |
+ Partition of: pt2 FOR VALUES IN (1)
+ Partition constraint: ((c1 IS NOT NULL) AND (c1 = ANY (ARRAY[1])))
+ Server: s0
+ FDW Options: (delimiter ',', quote '"', "be quoted" 'value')
+
+ -- cannot add column to a partition
+ ALTER TABLE pt2_1 ADD c4 char;
+ ERROR: cannot add column to a partition
+ -- ok to have a partition's own constraints though
+ ALTER TABLE pt2_1 ALTER c3 SET NOT NULL;
+ ALTER TABLE pt2_1 ADD CONSTRAINT p21chk CHECK (c2 <> '');
+ \d+ pt2
+ Table "public.pt2"
+ Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+----------+--------------+-------------
+ c1 | integer | | not null | | plain | |
+ c2 | text | | | | extended | |
+ c3 | date | | | | plain | |
+ Partition key: LIST (c1)
+ Partitions: pt2_1 FOR VALUES IN (1)
+
+ \d+ pt2_1
+ Foreign table "public.pt2_1"
+ Column | Type | Collation | Nullable | Default | FDW Options | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+-------------+----------+--------------+-------------
+ c1 | integer | | not null | | | plain | |
+ c2 | text | | | | | extended | |
+ c3 | date | | not null | | | plain | |
+ Partition of: pt2 FOR VALUES IN (1)
+ Partition constraint: ((c1 IS NOT NULL) AND (c1 = ANY (ARRAY[1])))
+ Check constraints:
+ "p21chk" CHECK (c2 <> ''::text)
+ Server: s0
+ FDW Options: (delimiter ',', quote '"', "be quoted" 'value')
+
+ -- cannot drop inherited NOT NULL constraint from a partition
+ ALTER TABLE pt2_1 ALTER c1 DROP NOT NULL;
+ ERROR: column "c1" is marked NOT NULL in parent table
+ -- partition must have parent's constraints
+ ALTER TABLE pt2 DETACH PARTITION pt2_1;
+ ALTER TABLE pt2 ALTER c2 SET NOT NULL;
+ \d+ pt2
+ Table "public.pt2"
+ Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+----------+--------------+-------------
+ c1 | integer | | not null | | plain | |
+ c2 | text | | not null | | extended | |
+ c3 | date | | | | plain | |
+ Partition key: LIST (c1)
+
+ \d+ pt2_1
+ Foreign table "public.pt2_1"
+ Column | Type | Collation | Nullable | Default | FDW Options | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+-------------+----------+--------------+-------------
+ c1 | integer | | not null | | | plain | |
+ c2 | text | | | | | extended | |
+ c3 | date | | not null | | | plain | |
+ Check constraints:
+ "p21chk" CHECK (c2 <> ''::text)
+ Server: s0
+ FDW Options: (delimiter ',', quote '"', "be quoted" 'value')
+
+ ALTER TABLE pt2 ATTACH PARTITION pt2_1 FOR VALUES IN (1); -- ERROR
+ ERROR: column "c2" in child table must be marked NOT NULL
+ ALTER FOREIGN TABLE pt2_1 ALTER c2 SET NOT NULL;
+ ALTER TABLE pt2 ATTACH PARTITION pt2_1 FOR VALUES IN (1);
+ ALTER TABLE pt2 DETACH PARTITION pt2_1;
+ ALTER TABLE pt2 ADD CONSTRAINT pt2chk1 CHECK (c1 > 0);
+ \d+ pt2
+ Table "public.pt2"
+ Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+----------+--------------+-------------
+ c1 | integer | | not null | | plain | |
+ c2 | text | | not null | | extended | |
+ c3 | date | | | | plain | |
+ Partition key: LIST (c1)
+ Check constraints:
+ "pt2chk1" CHECK (c1 > 0)
+
+ \d+ pt2_1
+ Foreign table "public.pt2_1"
+ Column | Type | Collation | Nullable | Default | FDW Options | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+-------------+----------+--------------+-------------
+ c1 | integer | | not null | | | plain | |
+ c2 | text | | not null | | | extended | |
+ c3 | date | | not null | | | plain | |
+ Check constraints:
+ "p21chk" CHECK (c2 <> ''::text)
+ Server: s0
+ FDW Options: (delimiter ',', quote '"', "be quoted" 'value')
+
+ ALTER TABLE pt2 ATTACH PARTITION pt2_1 FOR VALUES IN (1); -- ERROR
+ ERROR: child table is missing constraint "pt2chk1"
+ ALTER FOREIGN TABLE pt2_1 ADD CONSTRAINT pt2chk1 CHECK (c1 > 0);
+ ALTER TABLE pt2 ATTACH PARTITION pt2_1 FOR VALUES IN (1);
+ -- TRUNCATE doesn't work on foreign tables, either directly or recursively
+ TRUNCATE pt2_1; -- ERROR
+ ERROR: "pt2_1" is not a table
+ TRUNCATE pt2; -- ERROR
+ ERROR: "pt2_1" is not a table
+ DROP FOREIGN TABLE pt2_1;
+ DROP TABLE pt2;
-- Cleanup
DROP SCHEMA foreign_schema CASCADE;
DROP ROLE regress_test_role; -- ERROR
-- Test interaction of foreign-key optimization with rules (bug #14219)
--
create temp table t1 (a integer primary key, b text);
-create temp table t2 (a integer primary key, b integer references t1);
+create temp table t2 (a integer, b integer references t1) distribute by hash (b);
create rule r1 as on delete to t1 do delete from t2 where t2.b = old.a;
explain (costs off) delete from t1 where a = 1;
+ERROR: could not plan this distributed delete
+DETAIL: correlated or complex DELETE is currently not supported in Postgres-XL.
+delete from t1 where a = 1;
+ERROR: could not plan this distributed delete
+DETAIL: correlated or complex DELETE is currently not supported in Postgres-XL.
+drop rule r1 on t1;
+explain (costs off, nodes off) delete from t1 where a = 1;
QUERY PLAN
--------------------------------------------
- Delete on t2
- -> Nested Loop
+ Remote Fast Query Execution
+ -> Delete on t1
-> Index Scan using t1_pkey on t1
Index Cond: (a = 1)
- -> Seq Scan on t2
- Filter: (b = 1)
-
- Delete on t1
- -> Index Scan using t1_pkey on t1
- Index Cond: (a = 1)
-(10 rows)
+(4 rows)
delete from t1 where a = 1;
+ --
+ -- Test deferred FK check on a tuple deleted by a rolled-back subtransaction
+ --
+ create table pktable2(f1 int primary key);
+ create table fktable2(f1 int references pktable2 deferrable initially deferred);
+ insert into pktable2 values(1);
+ begin;
+ insert into fktable2 values(1);
+ savepoint x;
+ delete from fktable2;
+ rollback to x;
+ commit;
+ begin;
+ insert into fktable2 values(2);
+ savepoint x;
+ delete from fktable2;
+ rollback to x;
+ commit; -- fail
+ ERROR: insert or update on table "fktable2" violates foreign key constraint "fktable2_f1_fkey"
+ DETAIL: Key (f1)=(2) is not present in table "pktable2".
+ --
+ -- Test that we prevent dropping FK constraint with pending trigger events
+ --
+ begin;
+ insert into fktable2 values(2);
+ alter table fktable2 drop constraint fktable2_f1_fkey;
+ ERROR: cannot ALTER TABLE "fktable2" because it has pending trigger events
+ commit;
+ begin;
+ delete from pktable2 where f1 = 1;
+ alter table fktable2 drop constraint fktable2_f1_fkey;
+ ERROR: cannot ALTER TABLE "pktable2" because it has pending trigger events
+ commit;
+ drop table pktable2, fktable2;
ERROR: aggregate functions are not allowed in FROM clause of their own query level
LINE 3: lateral (select a, b, sum(v.x) from gstest_data(v.x) ...
^
- -- min max optimisation should still work with GROUP BY ()
+ -- min max optimization should still work with GROUP BY ()
explain (costs off)
select min(unique1) from tenk1 GROUP BY ();
- QUERY PLAN
-------------------------------------------------------------
+ QUERY PLAN
+------------------------------------------------------------------------
Result
InitPlan 1 (returns $0)
-> Limit
substring
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(two-compressed,1,12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012
+ ("one-toasted,one-null",1,,12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123
(two-toasted,1,12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345
("one-compressed,one-null",1,,12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890
- ("one-toasted,one-null",1,,12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123
(4 rows)
- -- modification without modifying asigned value
+ -- modification without modifying assigned value
UPDATE toasttest SET cnt = cnt +1, f1 = f1 RETURNING substring(toasttest::text, 1, 200);
substring
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
substring
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(two-compressed,4,-1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901
+ ("one-toasted,one-null",4,,12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123
(two-toasted,4,-1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234
("one-compressed,one-null",4,,12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890
- ("one-toasted,one-null",4,,12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123
(4 rows)
- -- check we didn't screw with main/toast tuple visiblity
+ -- check we didn't screw with main/toast tuple visibility
VACUUM FREEZE toasttest;
SELECT substring(toasttest::text, 1, 200) FROM toasttest;
substring
substring
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(two-compressed,5,-1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901
+ ("one-toasted,one-null",5,,12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123
(two-toasted,5,-1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234
("one-compressed,one-null",5,,12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890
- ("one-toasted,one-null",5,,12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123
(4 rows)
- -- modification without modifying asigned value
+ -- modification without modifying assigned value
UPDATE toasttest SET cnt = cnt +1, f1 = f1 RETURNING substring(toasttest::text, 1, 200);
substring
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
substring
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(two-compressed,8,--123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890
- (two-toasted,8,--123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123
- ("one-compressed,one-null",8,,12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890
("one-toasted,one-null",8,,12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123
("one-toasted,one-null, via indirect",0,1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890
+ (two-toasted,8,--123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123
+ ("one-compressed,one-null",8,,12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890
(5 rows)
- -- check we didn't screw with main/toast tuple visiblity
+ -- check we didn't screw with main/toast tuple visibility
VACUUM FREEZE toasttest;
SELECT substring(toasttest::text, 1, 200) FROM toasttest;
substring
SET enable_seqscan TO on;
DROP INDEX inet_idx2;
+ -- check that spgist index works correctly
+ CREATE INDEX inet_idx3 ON inet_tbl using spgist (i);
+ SET enable_seqscan TO off;
+ SELECT * FROM inet_tbl WHERE i << '192.168.1.0/24'::cidr ORDER BY i;
+ c | i
+ ----------------+------------------
+ 192.168.1.0/24 | 192.168.1.0/25
+ 192.168.1.0/24 | 192.168.1.255/25
+ 192.168.1.0/26 | 192.168.1.226
+ (3 rows)
+
+ SELECT * FROM inet_tbl WHERE i <<= '192.168.1.0/24'::cidr ORDER BY i;
+ c | i
+ ----------------+------------------
+ 192.168.1.0/24 | 192.168.1.0/24
+ 192.168.1.0/24 | 192.168.1.226/24
+ 192.168.1.0/24 | 192.168.1.255/24
+ 192.168.1.0/24 | 192.168.1.0/25
+ 192.168.1.0/24 | 192.168.1.255/25
+ 192.168.1.0/26 | 192.168.1.226
+ (6 rows)
+
+ SELECT * FROM inet_tbl WHERE i && '192.168.1.0/24'::cidr ORDER BY i;
+ c | i
+ ----------------+------------------
+ 192.168.1.0/24 | 192.168.1.0/24
+ 192.168.1.0/24 | 192.168.1.226/24
+ 192.168.1.0/24 | 192.168.1.255/24
+ 192.168.1.0/24 | 192.168.1.0/25
+ 192.168.1.0/24 | 192.168.1.255/25
+ 192.168.1.0/26 | 192.168.1.226
+ (6 rows)
+
+ SELECT * FROM inet_tbl WHERE i >>= '192.168.1.0/24'::cidr ORDER BY i;
+ c | i
+ ----------------+------------------
+ 192.168.1.0/24 | 192.168.1.0/24
+ 192.168.1.0/24 | 192.168.1.226/24
+ 192.168.1.0/24 | 192.168.1.255/24
+ (3 rows)
+
+ SELECT * FROM inet_tbl WHERE i >> '192.168.1.0/24'::cidr ORDER BY i;
+ c | i
+ ---+---
+ (0 rows)
+
+ SELECT * FROM inet_tbl WHERE i < '192.168.1.0/24'::cidr ORDER BY i;
+ c | i
+ -------------+-------------
+ 10.0.0.0/8 | 9.1.2.3/8
+ 10.0.0.0/32 | 10.1.2.3/8
+ 10.0.0.0/8 | 10.1.2.3/8
+ 10.0.0.0/8 | 10.1.2.3/8
+ 10.1.0.0/16 | 10.1.2.3/16
+ 10.1.2.0/24 | 10.1.2.3/24
+ 10.1.2.3/32 | 10.1.2.3
+ 10.0.0.0/8 | 11.1.2.3/8
+ (8 rows)
+
+ SELECT * FROM inet_tbl WHERE i <= '192.168.1.0/24'::cidr ORDER BY i;
+ c | i
+ ----------------+----------------
+ 10.0.0.0/8 | 9.1.2.3/8
+ 10.0.0.0/8 | 10.1.2.3/8
+ 10.0.0.0/32 | 10.1.2.3/8
+ 10.0.0.0/8 | 10.1.2.3/8
+ 10.1.0.0/16 | 10.1.2.3/16
+ 10.1.2.0/24 | 10.1.2.3/24
+ 10.1.2.3/32 | 10.1.2.3
+ 10.0.0.0/8 | 11.1.2.3/8
+ 192.168.1.0/24 | 192.168.1.0/24
+ (9 rows)
+
+ SELECT * FROM inet_tbl WHERE i = '192.168.1.0/24'::cidr ORDER BY i;
+ c | i
+ ----------------+----------------
+ 192.168.1.0/24 | 192.168.1.0/24
+ (1 row)
+
+ SELECT * FROM inet_tbl WHERE i >= '192.168.1.0/24'::cidr ORDER BY i;
+ c | i
+ --------------------+------------------
+ 192.168.1.0/24 | 192.168.1.0/24
+ 192.168.1.0/24 | 192.168.1.226/24
+ 192.168.1.0/24 | 192.168.1.255/24
+ 192.168.1.0/24 | 192.168.1.0/25
+ 192.168.1.0/24 | 192.168.1.255/25
+ 192.168.1.0/26 | 192.168.1.226
+ ::ffff:1.2.3.4/128 | ::4.3.2.1/24
+ 10:23::f1/128 | 10:23::f1/64
+ 10:23::8000/113 | 10:23::ffff
+ (9 rows)
+
+ SELECT * FROM inet_tbl WHERE i > '192.168.1.0/24'::cidr ORDER BY i;
+ c | i
+ --------------------+------------------
+ 192.168.1.0/24 | 192.168.1.226/24
+ 192.168.1.0/24 | 192.168.1.255/24
+ 192.168.1.0/24 | 192.168.1.0/25
+ 192.168.1.0/24 | 192.168.1.255/25
+ 192.168.1.0/26 | 192.168.1.226
+ ::ffff:1.2.3.4/128 | ::4.3.2.1/24
+ 10:23::f1/128 | 10:23::f1/64
+ 10:23::8000/113 | 10:23::ffff
+ (8 rows)
+
+ SELECT * FROM inet_tbl WHERE i <> '192.168.1.0/24'::cidr ORDER BY i;
+ c | i
+ --------------------+------------------
+ 10.0.0.0/8 | 9.1.2.3/8
+ 10.0.0.0/8 | 10.1.2.3/8
+ 10.0.0.0/32 | 10.1.2.3/8
+ 10.0.0.0/8 | 10.1.2.3/8
+ 10.1.0.0/16 | 10.1.2.3/16
+ 10.1.2.0/24 | 10.1.2.3/24
+ 10.1.2.3/32 | 10.1.2.3
+ 10.0.0.0/8 | 11.1.2.3/8
+ 192.168.1.0/24 | 192.168.1.226/24
+ 192.168.1.0/24 | 192.168.1.255/24
+ 192.168.1.0/24 | 192.168.1.0/25
+ 192.168.1.0/24 | 192.168.1.255/25
+ 192.168.1.0/26 | 192.168.1.226
+ ::ffff:1.2.3.4/128 | ::4.3.2.1/24
+ 10:23::f1/128 | 10:23::f1/64
+ 10:23::8000/113 | 10:23::ffff
+ (16 rows)
+
+ -- test index-only scans
+ EXPLAIN (COSTS OFF)
+ SELECT i FROM inet_tbl WHERE i << '192.168.1.0/24'::cidr ORDER BY i;
+ QUERY PLAN
+ ---------------------------------------------------
+ Sort
+ Sort Key: i
+ -> Index Only Scan using inet_idx3 on inet_tbl
+ Index Cond: (i << '192.168.1.0/24'::inet)
+ (4 rows)
+
+ SELECT i FROM inet_tbl WHERE i << '192.168.1.0/24'::cidr ORDER BY i;
+ i
+ ------------------
+ 192.168.1.0/25
+ 192.168.1.255/25
+ 192.168.1.226
+ (3 rows)
+
+ SET enable_seqscan TO on;
+ DROP INDEX inet_idx3;
-- simple tests of inet boolean and arithmetic operators
-SELECT i, ~i AS "~i" FROM inet_tbl;
+SELECT i, ~i AS "~i" FROM inet_tbl ORDER BY i;
i | ~i
------------------+--------------------------------------------
- 192.168.1.226/24 | 63.87.254.29/24
- 192.168.1.226 | 63.87.254.29
- 192.168.1.0/24 | 63.87.254.255/24
- 192.168.1.0/25 | 63.87.254.255/25
- 192.168.1.255/24 | 63.87.254.0/24
- 192.168.1.255/25 | 63.87.254.0/25
+ 9.1.2.3/8 | 246.254.253.252/8
10.1.2.3/8 | 245.254.253.252/8
10.1.2.3/8 | 245.254.253.252/8
- 10.1.2.3 | 245.254.253.252
- 10.1.2.3/24 | 245.254.253.252/24
- 10.1.2.3/16 | 245.254.253.252/16
10.1.2.3/8 | 245.254.253.252/8
+ 10.1.2.3/16 | 245.254.253.252/16
+ 10.1.2.3/24 | 245.254.253.252/24
+ 10.1.2.3 | 245.254.253.252
11.1.2.3/8 | 244.254.253.252/8
- 9.1.2.3/8 | 246.254.253.252/8
+ 192.168.1.0/24 | 63.87.254.255/24
+ 192.168.1.226/24 | 63.87.254.29/24
+ 192.168.1.255/24 | 63.87.254.0/24
+ 192.168.1.0/25 | 63.87.254.255/25
+ 192.168.1.255/25 | 63.87.254.0/25
+ 192.168.1.226 | 63.87.254.29
+ ::4.3.2.1/24 | ffff:ffff:ffff:ffff:ffff:ffff:fbfc:fdfe/24
10:23::f1/64 | ffef:ffdc:ffff:ffff:ffff:ffff:ffff:ff0e/64
10:23::ffff | ffef:ffdc:ffff:ffff:ffff:ffff:ffff:0
- ::4.3.2.1/24 | ffff:ffff:ffff:ffff:ffff:ffff:fbfc:fdfe/24
(17 rows)
-SELECT i, c, i & c AS "and" FROM inet_tbl;
+SELECT i, c, i & c AS "and" FROM inet_tbl ORDER BY i, c;
i | c | and
------------------+--------------------+----------------
- 192.168.1.226/24 | 192.168.1.0/24 | 192.168.1.0/24
- 192.168.1.226 | 192.168.1.0/26 | 192.168.1.0
- 192.168.1.0/24 | 192.168.1.0/24 | 192.168.1.0/24
- 192.168.1.0/25 | 192.168.1.0/24 | 192.168.1.0/25
- 192.168.1.255/24 | 192.168.1.0/24 | 192.168.1.0/24
- 192.168.1.255/25 | 192.168.1.0/24 | 192.168.1.0/25
+ 9.1.2.3/8 | 10.0.0.0/8 | 8.0.0.0/8
+ 10.1.2.3/8 | 10.0.0.0/8 | 10.0.0.0/8
10.1.2.3/8 | 10.0.0.0/8 | 10.0.0.0/8
10.1.2.3/8 | 10.0.0.0/32 | 10.0.0.0
- 10.1.2.3 | 10.1.2.3/32 | 10.1.2.3
- 10.1.2.3/24 | 10.1.2.0/24 | 10.1.2.0/24
10.1.2.3/16 | 10.1.0.0/16 | 10.1.0.0/16
- 10.1.2.3/8 | 10.0.0.0/8 | 10.0.0.0/8
+ 10.1.2.3/24 | 10.1.2.0/24 | 10.1.2.0/24
+ 10.1.2.3 | 10.1.2.3/32 | 10.1.2.3
11.1.2.3/8 | 10.0.0.0/8 | 10.0.0.0/8
- 9.1.2.3/8 | 10.0.0.0/8 | 8.0.0.0/8
+ 192.168.1.0/24 | 192.168.1.0/24 | 192.168.1.0/24
+ 192.168.1.226/24 | 192.168.1.0/24 | 192.168.1.0/24
+ 192.168.1.255/24 | 192.168.1.0/24 | 192.168.1.0/24
+ 192.168.1.0/25 | 192.168.1.0/24 | 192.168.1.0/25
+ 192.168.1.255/25 | 192.168.1.0/24 | 192.168.1.0/25
+ 192.168.1.226 | 192.168.1.0/26 | 192.168.1.0
+ ::4.3.2.1/24 | ::ffff:1.2.3.4/128 | ::0.2.2.0
10:23::f1/64 | 10:23::f1/128 | 10:23::f1
10:23::ffff | 10:23::8000/113 | 10:23::8000
- ::4.3.2.1/24 | ::ffff:1.2.3.4/128 | ::0.2.2.0
(17 rows)
-SELECT i, c, i | c AS "or" FROM inet_tbl;
+SELECT i, c, i | c AS "or" FROM inet_tbl ORDER BY i, c;
i | c | or
------------------+--------------------+------------------
- 192.168.1.226/24 | 192.168.1.0/24 | 192.168.1.226/24
- 192.168.1.226 | 192.168.1.0/26 | 192.168.1.226
- 192.168.1.0/24 | 192.168.1.0/24 | 192.168.1.0/24
- 192.168.1.0/25 | 192.168.1.0/24 | 192.168.1.0/25
- 192.168.1.255/24 | 192.168.1.0/24 | 192.168.1.255/24
- 192.168.1.255/25 | 192.168.1.0/24 | 192.168.1.255/25
+ 9.1.2.3/8 | 10.0.0.0/8 | 11.1.2.3/8
+ 10.1.2.3/8 | 10.0.0.0/8 | 10.1.2.3/8
10.1.2.3/8 | 10.0.0.0/8 | 10.1.2.3/8
10.1.2.3/8 | 10.0.0.0/32 | 10.1.2.3
- 10.1.2.3 | 10.1.2.3/32 | 10.1.2.3
- 10.1.2.3/24 | 10.1.2.0/24 | 10.1.2.3/24
10.1.2.3/16 | 10.1.0.0/16 | 10.1.2.3/16
- 10.1.2.3/8 | 10.0.0.0/8 | 10.1.2.3/8
+ 10.1.2.3/24 | 10.1.2.0/24 | 10.1.2.3/24
+ 10.1.2.3 | 10.1.2.3/32 | 10.1.2.3
11.1.2.3/8 | 10.0.0.0/8 | 11.1.2.3/8
- 9.1.2.3/8 | 10.0.0.0/8 | 11.1.2.3/8
+ 192.168.1.0/24 | 192.168.1.0/24 | 192.168.1.0/24
+ 192.168.1.226/24 | 192.168.1.0/24 | 192.168.1.226/24
+ 192.168.1.255/24 | 192.168.1.0/24 | 192.168.1.255/24
+ 192.168.1.0/25 | 192.168.1.0/24 | 192.168.1.0/25
+ 192.168.1.255/25 | 192.168.1.0/24 | 192.168.1.255/25
+ 192.168.1.226 | 192.168.1.0/26 | 192.168.1.226
+ ::4.3.2.1/24 | ::ffff:1.2.3.4/128 | ::ffff:5.3.3.5
10:23::f1/64 | 10:23::f1/128 | 10:23::f1
10:23::ffff | 10:23::8000/113 | 10:23::ffff
- ::4.3.2.1/24 | ::ffff:1.2.3.4/128 | ::ffff:5.3.3.5
(17 rows)
-SELECT i, i + 500 AS "i+500" FROM inet_tbl;
+SELECT i, i + 500 AS "i+500" FROM inet_tbl ORDER BY i;
i | i+500
------------------+------------------
- 192.168.1.226/24 | 192.168.3.214/24
- 192.168.1.226 | 192.168.3.214
- 192.168.1.0/24 | 192.168.2.244/24
- 192.168.1.0/25 | 192.168.2.244/25
- 192.168.1.255/24 | 192.168.3.243/24
- 192.168.1.255/25 | 192.168.3.243/25
+ 9.1.2.3/8 | 9.1.3.247/8
10.1.2.3/8 | 10.1.3.247/8
10.1.2.3/8 | 10.1.3.247/8
- 10.1.2.3 | 10.1.3.247
- 10.1.2.3/24 | 10.1.3.247/24
- 10.1.2.3/16 | 10.1.3.247/16
10.1.2.3/8 | 10.1.3.247/8
+ 10.1.2.3/16 | 10.1.3.247/16
+ 10.1.2.3/24 | 10.1.3.247/24
+ 10.1.2.3 | 10.1.3.247
11.1.2.3/8 | 11.1.3.247/8
- 9.1.2.3/8 | 9.1.3.247/8
+ 192.168.1.0/24 | 192.168.2.244/24
+ 192.168.1.226/24 | 192.168.3.214/24
+ 192.168.1.255/24 | 192.168.3.243/24
+ 192.168.1.0/25 | 192.168.2.244/25
+ 192.168.1.255/25 | 192.168.3.243/25
+ 192.168.1.226 | 192.168.3.214
+ ::4.3.2.1/24 | ::4.3.3.245/24
10:23::f1/64 | 10:23::2e5/64
10:23::ffff | 10:23::1:1f3
- ::4.3.2.1/24 | ::4.3.3.245/24
(17 rows)
-SELECT i, i - 500 AS "i-500" FROM inet_tbl;
+SELECT i, i - 500 AS "i-500" FROM inet_tbl ORDER BY i;
i | i-500
------------------+----------------------------------------
- 192.168.1.226/24 | 192.167.255.238/24
- 192.168.1.226 | 192.167.255.238
- 192.168.1.0/24 | 192.167.255.12/24
- 192.168.1.0/25 | 192.167.255.12/25
- 192.168.1.255/24 | 192.168.0.11/24
- 192.168.1.255/25 | 192.168.0.11/25
+ 9.1.2.3/8 | 9.1.0.15/8
10.1.2.3/8 | 10.1.0.15/8
10.1.2.3/8 | 10.1.0.15/8
- 10.1.2.3 | 10.1.0.15
- 10.1.2.3/24 | 10.1.0.15/24
- 10.1.2.3/16 | 10.1.0.15/16
10.1.2.3/8 | 10.1.0.15/8
+ 10.1.2.3/16 | 10.1.0.15/16
+ 10.1.2.3/24 | 10.1.0.15/24
+ 10.1.2.3 | 10.1.0.15
11.1.2.3/8 | 11.1.0.15/8
- 9.1.2.3/8 | 9.1.0.15/8
+ 192.168.1.0/24 | 192.167.255.12/24
+ 192.168.1.226/24 | 192.167.255.238/24
+ 192.168.1.255/24 | 192.168.0.11/24
+ 192.168.1.0/25 | 192.167.255.12/25
+ 192.168.1.255/25 | 192.168.0.11/25
+ 192.168.1.226 | 192.167.255.238
+ ::4.3.2.1/24 | ::4.3.0.13/24
10:23::f1/64 | 10:22:ffff:ffff:ffff:ffff:ffff:fefd/64
10:23::ffff | 10:23::fe0b
- ::4.3.2.1/24 | ::4.3.0.13/24
(17 rows)
-SELECT i, c, i - c AS "minus" FROM inet_tbl;
+SELECT i, c, i - c AS "minus" FROM inet_tbl ORDER BY i, c;
i | c | minus
------------------+--------------------+------------------
- 192.168.1.226/24 | 192.168.1.0/24 | 226
- 192.168.1.226 | 192.168.1.0/26 | 226
- 192.168.1.0/24 | 192.168.1.0/24 | 0
- 192.168.1.0/25 | 192.168.1.0/24 | 0
- 192.168.1.255/24 | 192.168.1.0/24 | 255
- 192.168.1.255/25 | 192.168.1.0/24 | 255
+ 9.1.2.3/8 | 10.0.0.0/8 | -16711165
+ 10.1.2.3/8 | 10.0.0.0/8 | 66051
10.1.2.3/8 | 10.0.0.0/8 | 66051
10.1.2.3/8 | 10.0.0.0/32 | 66051
- 10.1.2.3 | 10.1.2.3/32 | 0
- 10.1.2.3/24 | 10.1.2.0/24 | 3
10.1.2.3/16 | 10.1.0.0/16 | 515
- 10.1.2.3/8 | 10.0.0.0/8 | 66051
+ 10.1.2.3/24 | 10.1.2.0/24 | 3
+ 10.1.2.3 | 10.1.2.3/32 | 0
11.1.2.3/8 | 10.0.0.0/8 | 16843267
- 9.1.2.3/8 | 10.0.0.0/8 | -16711165
+ 192.168.1.0/24 | 192.168.1.0/24 | 0
+ 192.168.1.226/24 | 192.168.1.0/24 | 226
+ 192.168.1.255/24 | 192.168.1.0/24 | 255
+ 192.168.1.0/25 | 192.168.1.0/24 | 0
+ 192.168.1.255/25 | 192.168.1.0/24 | 255
+ 192.168.1.226 | 192.168.1.0/26 | 226
+ ::4.3.2.1/24 | ::ffff:1.2.3.4/128 | -281470631346435
10:23::f1/64 | 10:23::f1/128 | 0
10:23::ffff | 10:23::8000/113 | 32767
- ::4.3.2.1/24 | ::ffff:1.2.3.4/128 | -281470631346435
(17 rows)
SELECT '127.0.0.1'::inet + 257;
ERROR: cannot rename inherited column "aa"
ALTER TABLE inhts RENAME d TO dd;
\d+ inhts
- Table "public.inhts"
- Column | Type | Modifiers | Storage | Stats target | Description
- --------+---------+-----------+---------+--------------+-------------
- aa | integer | | plain | |
- b | integer | | plain | |
- c | integer | | plain | |
- dd | integer | | plain | |
+ Table "public.inhts"
+ Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+---------+--------------+-------------
+ aa | integer | | | | plain | |
+ b | integer | | | | plain | |
+ c | integer | | | | plain | |
+ dd | integer | | | | plain | |
Inherits: inht1,
inhs1
+Distribute By: HASH(aa)
+Location Nodes: ALL DATANODES
DROP TABLE inhts;
-- Test for renaming in diamond inheritance
NOTICE: merging multiple inherited definitions of column "b"
ALTER TABLE inht1 RENAME aa TO aaa;
\d+ inht4
- Table "public.inht4"
- Column | Type | Modifiers | Storage | Stats target | Description
- --------+---------+-----------+---------+--------------+-------------
- aaa | integer | | plain | |
- b | integer | | plain | |
- x | integer | | plain | |
- y | integer | | plain | |
- z | integer | | plain | |
+ Table "public.inht4"
+ Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+---------+--------------+-------------
+ aaa | integer | | | | plain | |
+ b | integer | | | | plain | |
+ x | integer | | | | plain | |
+ y | integer | | | | plain | |
+ z | integer | | | | plain | |
Inherits: inht2,
inht3
+Distribute By: HASH(aaa)
+Location Nodes: ALL DATANODES
CREATE TABLE inhts (d int) INHERITS (inht2, inhs1);
NOTICE: merging multiple inherited definitions of column "b"
ALTER TABLE inht1 RENAME b TO bb; -- to be failed
ERROR: cannot rename inherited column "b"
\d+ inhts
- Table "public.inhts"
- Column | Type | Modifiers | Storage | Stats target | Description
- --------+---------+-----------+---------+--------------+-------------
- aaaa | integer | | plain | |
- b | integer | | plain | |
- x | integer | | plain | |
- c | integer | | plain | |
- d | integer | | plain | |
+ Table "public.inhts"
+ Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+---------+--------------+-------------
+ aaaa | integer | | | | plain | |
+ b | integer | | | | plain | |
+ x | integer | | | | plain | |
+ c | integer | | | | plain | |
+ d | integer | | | | plain | |
Inherits: inht2,
inhs1
+Distribute By: HASH(aaaa)
+Location Nodes: ALL DATANODES
WITH RECURSIVE r AS (
SELECT 'inht1'::regclass AS inhrelid
ALTER TABLE ONLY test_constraints DROP CONSTRAINT test_constraints_val1_val2_key;
\d+ test_constraints
- Table "public.test_constraints"
- Column | Type | Modifiers | Storage | Stats target | Description
- --------+-------------------+-----------+----------+--------------+-------------
- id | integer | | plain | |
- val1 | character varying | | extended | |
- val2 | integer | | plain | |
+ Table "public.test_constraints"
+ Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
+ --------+-------------------+-----------+----------+---------+----------+--------------+-------------
+ id | integer | | | | plain | |
+ val1 | character varying | | | | extended | |
+ val2 | integer | | | | plain | |
Child tables: test_constraints_inh
+Distribute By: HASH(val1)
+Location Nodes: ALL DATANODES
\d+ test_constraints_inh
- Table "public.test_constraints_inh"
- Column | Type | Modifiers | Storage | Stats target | Description
- --------+-------------------+-----------+----------+--------------+-------------
- id | integer | | plain | |
- val1 | character varying | | extended | |
- val2 | integer | | plain | |
+ Table "public.test_constraints_inh"
+ Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
+ --------+-------------------+-----------+----------+---------+----------+--------------+-------------
+ id | integer | | | | plain | |
+ val1 | character varying | | | | extended | |
+ val2 | integer | | | | plain | |
Inherits: test_constraints
+Distribute By: HASH(val1)
+Location Nodes: ALL DATANODES
DROP TABLE test_constraints_inh;
DROP TABLE test_constraints;
ALTER TABLE test_ex_constraints DROP CONSTRAINT test_ex_constraints_c_excl;
\d+ test_ex_constraints
- Table "public.test_ex_constraints"
- Column | Type | Modifiers | Storage | Stats target | Description
- --------+--------+-----------+---------+--------------+-------------
- c | circle | | plain | |
+ Table "public.test_ex_constraints"
+ Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
+ --------+--------+-----------+----------+---------+---------+--------------+-------------
+ c | circle | | | | plain | |
Child tables: test_ex_constraints_inh
+Distribute By: ROUND ROBIN
+Location Nodes: ALL DATANODES
\d+ test_ex_constraints_inh
- Table "public.test_ex_constraints_inh"
- Column | Type | Modifiers | Storage | Stats target | Description
- --------+--------+-----------+---------+--------------+-------------
- c | circle | | plain | |
+ Table "public.test_ex_constraints_inh"
+ Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
+ --------+--------+-----------+----------+---------+---------+--------------+-------------
+ c | circle | | | | plain | |
Inherits: test_ex_constraints
+Distribute By: ROUND ROBIN
+Location Nodes: ALL DATANODES
DROP TABLE test_ex_constraints_inh;
DROP TABLE test_ex_constraints;
"test_primary_constraints_pkey" PRIMARY KEY, btree (id)
Referenced by:
TABLE "test_foreign_constraints" CONSTRAINT "test_foreign_constraints_id1_fkey" FOREIGN KEY (id1) REFERENCES test_primary_constraints(id)
+Distribute By: HASH(id)
+Location Nodes: ALL DATANODES
\d+ test_foreign_constraints
- Table "public.test_foreign_constraints"
- Column | Type | Modifiers | Storage | Stats target | Description
- --------+---------+-----------+---------+--------------+-------------
- id1 | integer | | plain | |
+ Table "public.test_foreign_constraints"
+ Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+---------+--------------+-------------
+ id1 | integer | | | | plain | |
Foreign-key constraints:
"test_foreign_constraints_id1_fkey" FOREIGN KEY (id1) REFERENCES test_primary_constraints(id)
Child tables: test_foreign_constraints_inh
ALTER TABLE test_foreign_constraints DROP CONSTRAINT test_foreign_constraints_id1_fkey;
\d+ test_foreign_constraints
- Table "public.test_foreign_constraints"
- Column | Type | Modifiers | Storage | Stats target | Description
- --------+---------+-----------+---------+--------------+-------------
- id1 | integer | | plain | |
+ Table "public.test_foreign_constraints"
+ Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+---------+--------------+-------------
+ id1 | integer | | | | plain | |
Child tables: test_foreign_constraints_inh
+Distribute By: HASH(id1)
+Location Nodes: ALL DATANODES
\d+ test_foreign_constraints_inh
- Table "public.test_foreign_constraints_inh"
- Column | Type | Modifiers | Storage | Stats target | Description
- --------+---------+-----------+---------+--------------+-------------
- id1 | integer | | plain | |
+ Table "public.test_foreign_constraints_inh"
+ Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+---------+--------------+-------------
+ id1 | integer | | | | plain | |
Inherits: test_foreign_constraints
+Distribute By: HASH(id1)
+Location Nodes: ALL DATANODES
DROP TABLE test_foreign_constraints_inh;
DROP TABLE test_foreign_constraints;
LINE 1: ...xx1 using lateral (select * from int4_tbl where f1 = x1) ss;
^
HINT: There is an entry for table "xx1", but it cannot be referenced from this part of the query.
+-- demonstrate problem with extremely slow join
+CREATE TABLE testr (a int, b int) DISTRIBUTE BY REPLICATION;
+INSERT INTO testr SELECT generate_series(1, 10000), generate_series(5001, 15000);
+CREATE TABLE testh (a int, b int);
+INSERT INTO testh SELECT generate_series(1, 10000), generate_series(8001, 18000);
+set enable_mergejoin TO false;
+set enable_hashjoin TO false;
+EXPLAIN (VERBOSE, COSTS OFF) SELECT count(*) FROM testr WHERE NOT EXISTS (SELECT * FROM testh WHERE testr.b = testh.b);
+ QUERY PLAN
+-----------------------------------------------------------------------------------
+ Finalize Aggregate
+ Output: count(*)
+ -> Remote Subquery Scan on all (datanode_1,datanode_2)
+ Output: PARTIAL count(*)
+ -> Partial Aggregate
+ Output: PARTIAL count(*)
+ -> Nested Loop Anti Join
+ Join Filter: (testr.b = testh.b)
+ -> Remote Subquery Scan on all (datanode_1)
+ Output: testr.b
+ Distribute results by H: b
+ -> Seq Scan on public.testr
+ Output: testr.b
+ -> Materialize
+ Output: testh.b
+ -> Remote Subquery Scan on all (datanode_1,datanode_2)
+ Output: testh.b
+ Distribute results by H: b
+ -> Seq Scan on public.testh
+ Output: testh.b
+(20 rows)
+
+SELECT count(*) FROM testr WHERE NOT EXISTS (SELECT * FROM testh WHERE testr.b = testh.b);
+ count
+-------
+ 3000
+(1 row)
+
+
+ --
+ -- test planner's ability to mark joins as unique
+ --
+ create table j1 (id int primary key);
+ create table j2 (id int primary key);
+ create table j3 (id int);
+ insert into j1 values(1),(2),(3);
+ insert into j2 values(1),(2),(3);
+ insert into j3 values(1),(1);
+ analyze j1;
+ analyze j2;
+ analyze j3;
+ -- ensure join is properly marked as unique
+ explain (verbose, costs off)
+ select * from j1 inner join j2 on j1.id = j2.id;
+ QUERY PLAN
+ -----------------------------------
+ Hash Join
+ Output: j1.id, j2.id
+ Inner Unique: true
+ Hash Cond: (j1.id = j2.id)
+ -> Seq Scan on public.j1
+ Output: j1.id
+ -> Hash
+ Output: j2.id
+ -> Seq Scan on public.j2
+ Output: j2.id
+ (10 rows)
+
+ -- ensure join is not unique when not an equi-join
+ explain (verbose, costs off)
+ select * from j1 inner join j2 on j1.id > j2.id;
+ QUERY PLAN
+ -----------------------------------
+ Nested Loop
+ Output: j1.id, j2.id
+ Join Filter: (j1.id > j2.id)
+ -> Seq Scan on public.j1
+ Output: j1.id
+ -> Materialize
+ Output: j2.id
+ -> Seq Scan on public.j2
+ Output: j2.id
+ (9 rows)
+
+ -- ensure non-unique rel is not chosen as inner
+ explain (verbose, costs off)
+ select * from j1 inner join j3 on j1.id = j3.id;
+ QUERY PLAN
+ -----------------------------------
+ Hash Join
+ Output: j1.id, j3.id
+ Inner Unique: true
+ Hash Cond: (j3.id = j1.id)
+ -> Seq Scan on public.j3
+ Output: j3.id
+ -> Hash
+ Output: j1.id
+ -> Seq Scan on public.j1
+ Output: j1.id
+ (10 rows)
+
+ -- ensure left join is marked as unique
+ explain (verbose, costs off)
+ select * from j1 left join j2 on j1.id = j2.id;
+ QUERY PLAN
+ -----------------------------------
+ Hash Left Join
+ Output: j1.id, j2.id
+ Inner Unique: true
+ Hash Cond: (j1.id = j2.id)
+ -> Seq Scan on public.j1
+ Output: j1.id
+ -> Hash
+ Output: j2.id
+ -> Seq Scan on public.j2
+ Output: j2.id
+ (10 rows)
+
+ -- ensure right join is marked as unique
+ explain (verbose, costs off)
+ select * from j1 right join j2 on j1.id = j2.id;
+ QUERY PLAN
+ -----------------------------------
+ Hash Left Join
+ Output: j1.id, j2.id
+ Inner Unique: true
+ Hash Cond: (j2.id = j1.id)
+ -> Seq Scan on public.j2
+ Output: j2.id
+ -> Hash
+ Output: j1.id
+ -> Seq Scan on public.j1
+ Output: j1.id
+ (10 rows)
+
+ -- ensure full join is marked as unique
+ explain (verbose, costs off)
+ select * from j1 full join j2 on j1.id = j2.id;
+ QUERY PLAN
+ -----------------------------------
+ Hash Full Join
+ Output: j1.id, j2.id
+ Inner Unique: true
+ Hash Cond: (j1.id = j2.id)
+ -> Seq Scan on public.j1
+ Output: j1.id
+ -> Hash
+ Output: j2.id
+ -> Seq Scan on public.j2
+ Output: j2.id
+ (10 rows)
+
+ -- a clauseless (cross) join can't be unique
+ explain (verbose, costs off)
+ select * from j1 cross join j2;
+ QUERY PLAN
+ -----------------------------------
+ Nested Loop
+ Output: j1.id, j2.id
+ -> Seq Scan on public.j1
+ Output: j1.id
+ -> Materialize
+ Output: j2.id
+ -> Seq Scan on public.j2
+ Output: j2.id
+ (8 rows)
+
+ -- ensure a natural join is marked as unique
+ explain (verbose, costs off)
+ select * from j1 natural join j2;
+ QUERY PLAN
+ -----------------------------------
+ Hash Join
+ Output: j1.id
+ Inner Unique: true
+ Hash Cond: (j1.id = j2.id)
+ -> Seq Scan on public.j1
+ Output: j1.id
+ -> Hash
+ Output: j2.id
+ -> Seq Scan on public.j2
+ Output: j2.id
+ (10 rows)
+
+ -- ensure a distinct clause allows the inner to become unique
+ explain (verbose, costs off)
+ select * from j1
+ inner join (select distinct id from j3) j3 on j1.id = j3.id;
+ QUERY PLAN
+ -----------------------------------------------
+ Nested Loop
+ Output: j1.id, j3.id
+ Inner Unique: true
+ Join Filter: (j1.id = j3.id)
+ -> Seq Scan on public.j1
+ Output: j1.id
+ -> Materialize
+ Output: j3.id
+ -> Unique
+ Output: j3.id
+ -> Sort
+ Output: j3.id
+ Sort Key: j3.id
+ -> Seq Scan on public.j3
+ Output: j3.id
+ (15 rows)
+
+ -- ensure group by clause allows the inner to become unique
+ explain (verbose, costs off)
+ select * from j1
+ inner join (select id from j3 group by id) j3 on j1.id = j3.id;
+ QUERY PLAN
+ -----------------------------------------------
+ Nested Loop
+ Output: j1.id, j3.id
+ Inner Unique: true
+ Join Filter: (j1.id = j3.id)
+ -> Seq Scan on public.j1
+ Output: j1.id
+ -> Materialize
+ Output: j3.id
+ -> Group
+ Output: j3.id
+ Group Key: j3.id
+ -> Sort
+ Output: j3.id
+ Sort Key: j3.id
+ -> Seq Scan on public.j3
+ Output: j3.id
+ (16 rows)
+
+ drop table j1;
+ drop table j2;
+ drop table j3;
+ -- test more complex permutations of unique joins
+ create table j1 (id1 int, id2 int, primary key(id1,id2));
+ create table j2 (id1 int, id2 int, primary key(id1,id2));
+ create table j3 (id1 int, id2 int, primary key(id1,id2));
+ insert into j1 values(1,1),(1,2);
+ insert into j2 values(1,1);
+ insert into j3 values(1,1);
+ analyze j1;
+ analyze j2;
+ analyze j3;
+ -- ensure there's no unique join when not all columns which are part of the
+ -- unique index are seen in the join clause
+ explain (verbose, costs off)
+ select * from j1
+ inner join j2 on j1.id1 = j2.id1;
+ QUERY PLAN
+ ------------------------------------------
+ Nested Loop
+ Output: j1.id1, j1.id2, j2.id1, j2.id2
+ Join Filter: (j1.id1 = j2.id1)
+ -> Seq Scan on public.j2
+ Output: j2.id1, j2.id2
+ -> Seq Scan on public.j1
+ Output: j1.id1, j1.id2
+ (7 rows)
+
+ -- ensure proper unique detection with multiple join quals
+ explain (verbose, costs off)
+ select * from j1
+ inner join j2 on j1.id1 = j2.id1 and j1.id2 = j2.id2;
+ QUERY PLAN
+ ----------------------------------------------------------
+ Nested Loop
+ Output: j1.id1, j1.id2, j2.id1, j2.id2
+ Inner Unique: true
+ Join Filter: ((j1.id1 = j2.id1) AND (j1.id2 = j2.id2))
+ -> Seq Scan on public.j1
+ Output: j1.id1, j1.id2
+ -> Materialize
+ Output: j2.id1, j2.id2
+ -> Seq Scan on public.j2
+ Output: j2.id1, j2.id2
+ (10 rows)
+
+ -- ensure we don't detect the join to be unique when quals are not part of the
+ -- join condition
+ explain (verbose, costs off)
+ select * from j1
+ inner join j2 on j1.id1 = j2.id1 where j1.id2 = 1;
+ QUERY PLAN
+ ------------------------------------------
+ Nested Loop
+ Output: j1.id1, j1.id2, j2.id1, j2.id2
+ Join Filter: (j1.id1 = j2.id1)
+ -> Seq Scan on public.j1
+ Output: j1.id1, j1.id2
+ Filter: (j1.id2 = 1)
+ -> Seq Scan on public.j2
+ Output: j2.id1, j2.id2
+ (8 rows)
+
+ -- as above, but for left joins.
+ explain (verbose, costs off)
+ select * from j1
+ left join j2 on j1.id1 = j2.id1 where j1.id2 = 1;
+ QUERY PLAN
+ ------------------------------------------
+ Nested Loop Left Join
+ Output: j1.id1, j1.id2, j2.id1, j2.id2
+ Join Filter: (j1.id1 = j2.id1)
+ -> Seq Scan on public.j1
+ Output: j1.id1, j1.id2
+ Filter: (j1.id2 = 1)
+ -> Seq Scan on public.j2
+ Output: j2.id1, j2.id2
+ (8 rows)
+
+ -- validate logic in merge joins which skips mark and restore.
+ -- it should only do this if all quals which were used to detect the unique
+ -- are present as join quals, and not plain quals.
+ set enable_nestloop to 0;
+ set enable_hashjoin to 0;
+ set enable_sort to 0;
+ -- create an index that will be preferred over the PK to perform the join
+ create index j1_id1_idx on j1 (id1) where id1 % 1000 = 1;
+ explain (costs off) select * from j1 j1
+ inner join j1 j2 on j1.id1 = j2.id1 and j1.id2 = j2.id2
+ where j1.id1 % 1000 = 1 and j2.id1 % 1000 = 1;
+ QUERY PLAN
+ --------------------------------------------
+ Merge Join
+ Merge Cond: (j1.id1 = j2.id1)
+ Join Filter: (j1.id2 = j2.id2)
+ -> Index Scan using j1_id1_idx on j1
+ -> Index Scan using j1_id1_idx on j1 j2
+ (5 rows)
+
+ select * from j1 j1
+ inner join j1 j2 on j1.id1 = j2.id1 and j1.id2 = j2.id2
+ where j1.id1 % 1000 = 1 and j2.id1 % 1000 = 1;
+ id1 | id2 | id1 | id2
+ -----+-----+-----+-----
+ 1 | 1 | 1 | 1
+ 1 | 2 | 1 | 2
+ (2 rows)
+
+ reset enable_nestloop;
+ reset enable_hashjoin;
+ reset enable_sort;
+ drop table j1;
+ drop table j2;
+ drop table j3;
+ -- check that semijoin inner is not seen as unique for a portion of the outerrel
+ explain (verbose, costs off)
+ select t1.unique1, t2.hundred
+ from onek t1, tenk1 t2
+ where exists (select 1 from tenk1 t3
+ where t3.thousand = t1.unique1 and t3.tenthous = t2.hundred)
+ and t1.unique1 < 1;
+ QUERY PLAN
+ ---------------------------------------------------------------------------------
+ Nested Loop
+ Output: t1.unique1, t2.hundred
+ -> Hash Join
+ Output: t1.unique1, t3.tenthous
+ Hash Cond: (t3.thousand = t1.unique1)
+ -> HashAggregate
+ Output: t3.thousand, t3.tenthous
+ Group Key: t3.thousand, t3.tenthous
+ -> Index Only Scan using tenk1_thous_tenthous on public.tenk1 t3
+ Output: t3.thousand, t3.tenthous
+ -> Hash
+ Output: t1.unique1
+ -> Index Only Scan using onek_unique1 on public.onek t1
+ Output: t1.unique1
+ Index Cond: (t1.unique1 < 1)
+ -> Index Only Scan using tenk1_hundred on public.tenk1 t2
+ Output: t2.hundred
+ Index Cond: (t2.hundred = t3.tenthous)
+ (18 rows)
+
+ -- ... unless it actually is unique
+ create table j3 as select unique1, tenthous from onek;
+ vacuum analyze j3;
+ create unique index on j3(unique1, tenthous);
+ explain (verbose, costs off)
+ select t1.unique1, t2.hundred
+ from onek t1, tenk1 t2
+ where exists (select 1 from j3
+ where j3.unique1 = t1.unique1 and j3.tenthous = t2.hundred)
+ and t1.unique1 < 1;
+ QUERY PLAN
+ ------------------------------------------------------------------------
+ Nested Loop
+ Output: t1.unique1, t2.hundred
+ -> Nested Loop
+ Output: t1.unique1, j3.tenthous
+ -> Index Only Scan using onek_unique1 on public.onek t1
+ Output: t1.unique1
+ Index Cond: (t1.unique1 < 1)
+ -> Index Only Scan using j3_unique1_tenthous_idx on public.j3
+ Output: j3.unique1, j3.tenthous
+ Index Cond: (j3.unique1 = t1.unique1)
+ -> Index Only Scan using tenk1_hundred on public.tenk1 t2
+ Output: t2.hundred
+ Index Cond: (t2.hundred = j3.tenthous)
+ (13 rows)
+
+ drop table j3;
-$12,345,678,901,234,567.00
(1 row)
+ SELECT (-12345678901234567)::numeric::money;
+ money
+ -----------------------------
+ -$12,345,678,901,234,567.00
+ (1 row)
+
+ -- Cast from money
+ SELECT '12345678901234567'::money::numeric;
+ numeric
+ ----------------------
+ 12345678901234567.00
+ (1 row)
+
+ SELECT '-12345678901234567'::money::numeric;
+ numeric
+ -----------------------
+ -12345678901234567.00
+ (1 row)
+
+INSERT INTO money_data VALUES ('$223.459');
+INSERT INTO money_data VALUES ('$323.459');
+INSERT INTO money_data VALUES ('$423.459');
+INSERT INTO money_data VALUES ('$523.459');
+SELECT sum(m) FROM money_data;
+ sum
+-----------
+ $1,617.30
+(1 row)
+
+CREATE TABLE money_data2 (a int, m money);
+INSERT INTO money_data2 VALUES (1, '$123.459');
+INSERT INTO money_data2 VALUES (2, '$223.459');
+INSERT INTO money_data2 VALUES (3, '$323.459');
+INSERT INTO money_data2 VALUES (4, '$423.459');
+INSERT INTO money_data2 VALUES (5, '$523.459');
+SELECT sum(m) FROM money_data2;
+ sum
+-----------
+ $1,617.30
+(1 row)
+
+DROP TABLE money_data2;
-- event trigger
('policy', '{addr_nsp, gentable, genpol}', '{}'),
('transform', '{int}', '{sql}'),
- ('access method', '{btree}', '{}')
+ ('access method', '{btree}', '{}'),
+ ('publication', '{addr_pub}', '{}'),
+ ('publication relation', '{addr_nsp, gentable}', '{addr_pub}'),
+ ('subscription', '{addr_sub}', '{}'),
+ ('statistics object', '{addr_nsp, gentable_stat}', '{}')
)
- SELECT (pg_identify_object(addr1.classid, addr1.objid, addr1.subobjid)).*,
+ SELECT (pg_identify_object(addr1.classid, addr1.objid, addr1.objsubid)).*,
-- test roundtrip through pg_identify_object_as_address
- ROW(pg_identify_object(addr1.classid, addr1.objid, addr1.subobjid)) =
- ROW(pg_identify_object(addr2.classid, addr2.objid, addr2.subobjid))
+ ROW(pg_identify_object(addr1.classid, addr1.objid, addr1.objsubid)) =
+ ROW(pg_identify_object(addr2.classid, addr2.objid, addr2.objsubid))
FROM objects, pg_get_object_address(type, name, args) addr1,
- pg_identify_object_as_address(classid, objid, subobjid) ioa(typ,nms,args),
+ pg_identify_object_as_address(classid, objid, objsubid) ioa(typ,nms,args),
pg_get_object_address(typ, nms, ioa.args) as addr2
- ORDER BY addr1.classid, addr1.objid, addr1.objsubid;
- type | schema | name | identity | ?column?
----------------------------+------------+-------------------+----------------------------------------------------------------------+----------
- default acl | | | for role regress_addr_user in schema public on tables | t
- default acl | | | for role regress_addr_user on tables | t
- type | pg_catalog | _int4 | integer[] | t
- type | addr_nsp | gencomptype | addr_nsp.gencomptype | t
- type | addr_nsp | genenum | addr_nsp.genenum | t
- type | addr_nsp | gendomain | addr_nsp.gendomain | t
- function | pg_catalog | | pg_catalog.pg_identify_object(pg_catalog.oid,pg_catalog.oid,integer) | t
- aggregate | addr_nsp | | addr_nsp.genaggr(integer) | t
- sequence | addr_nsp | gentable_a_seq | addr_nsp.gentable_a_seq | t
- table | addr_nsp | gentable | addr_nsp.gentable | t
- table column | addr_nsp | gentable | addr_nsp.gentable.b | t
- index | addr_nsp | gentable_pkey | addr_nsp.gentable_pkey | t
- view | addr_nsp | genview | addr_nsp.genview | t
- materialized view | addr_nsp | genmatview | addr_nsp.genmatview | t
- foreign table | addr_nsp | genftable | addr_nsp.genftable | t
- foreign table column | addr_nsp | genftable | addr_nsp.genftable.a | t
- role | | regress_addr_user | regress_addr_user | t
- server | | addr_fserv | addr_fserv | t
- user mapping | | | regress_addr_user on server integer | t
- foreign-data wrapper | | addr_fdw | addr_fdw | t
- access method | | btree | btree | t
- operator of access method | | | operator 1 (integer, integer) of pg_catalog.integer_ops USING btree | t
- function of access method | | | function 2 (integer, integer) of pg_catalog.integer_ops USING btree | t
- default value | | | for addr_nsp.gentable.b | t
- cast | | | (bigint AS integer) | t
- table constraint | addr_nsp | | a_chk on addr_nsp.gentable | t
- domain constraint | addr_nsp | | domconstr on addr_nsp.gendomain | t
- conversion | pg_catalog | ascii_to_mic | pg_catalog.ascii_to_mic | t
- language | | plpgsql | plpgsql | t
- schema | | addr_nsp | addr_nsp | t
- operator class | pg_catalog | int4_ops | pg_catalog.int4_ops USING btree | t
- operator | pg_catalog | | pg_catalog.+(integer,integer) | t
- rule | | | "_RETURN" on addr_nsp.genview | t
- trigger | | | t on addr_nsp.gentable | t
- operator family | pg_catalog | integer_ops | pg_catalog.integer_ops USING btree | t
- policy | | | genpol on addr_nsp.gentable | t
- statistics object | addr_nsp | gentable_stat | addr_nsp.gentable_stat | t
- collation | pg_catalog | "default" | pg_catalog."default" | t
- transform | | | for integer on language sql | t
- text search dictionary | addr_nsp | addr_ts_dict | addr_nsp.addr_ts_dict | t
- text search parser | addr_nsp | addr_ts_prs | addr_nsp.addr_ts_prs | t
- text search configuration | addr_nsp | addr_ts_conf | addr_nsp.addr_ts_conf | t
- text search template | addr_nsp | addr_ts_temp | addr_nsp.addr_ts_temp | t
- subscription | | addr_sub | addr_sub | t
- publication | | addr_pub | addr_pub | t
- publication relation | | | gentable in publication addr_pub | t
-(46 rows)
-
+ ORDER BY addr1.classid, addr1.objid, addr1.subobjid;
+ERROR: relation "addr_nsp.genftable" does not exist
---
--- Cleanup resources
---
SET client_min_messages TO 'warning';
DROP FOREIGN DATA WRAPPER addr_fdw CASCADE;
+ERROR: foreign-data wrapper "addr_fdw" does not exist
+ DROP PUBLICATION addr_pub;
+ DROP SUBSCRIPTION addr_sub;
DROP SCHEMA addr_nsp CASCADE;
DROP OWNED BY regress_addr_user;
DROP USER regress_addr_user;
get diagnostics rc = row_count;
raise notice '% %', found, rc;
return query execute 'values(10),(20)';
- get diagnostics rc = row_count;
- raise notice '% %', found, rc;
+ -- just for fun, let's use array elements as targets
+ get diagnostics rca[1] = row_count;
+ raise notice '% %', found, rca[1];
return query execute 'select * from (values(10),(20)) f(a) where false';
- get diagnostics rc = row_count;
- raise notice '% %', found, rc;
+ get diagnostics rca[2] = row_count;
+ raise notice '% %', found, rca[2];
end;
$$ language plpgsql;
-select * from rttest();
+select * from rttest() order by 1;
NOTICE: t 2
NOTICE: f 0
NOTICE: t 2
rollback;
-- Commit table creation
COMMIT PREPARED 'regress-one';
+ERROR: prepared transaction with identifier "regress-one" does not exist
\d pxtest2
+ Table "public.pxtest2"
+ Column | Type | Collation | Nullable | Default
+ --------+---------+-----------+----------+---------
+ a | integer | | |
+
SELECT * FROM pxtest2;
- a
----
- 1
- 3
-(2 rows)
-
+ERROR: relation "pxtest2" does not exist
+LINE 1: SELECT * FROM pxtest2;
+ ^
-- There should be one prepared transaction
-SELECT gid FROM pg_prepared_xacts;
+SELECT gid FROM pg_prepared_xacts ORDER BY 1;
gid
-------------
regress-two
DROP TABLE atestc;
DROP TABLE atestp1;
DROP TABLE atestp2;
- SELECT lo_unlink(oid) FROM pg_largeobject_metadata;
+ SELECT lo_unlink(oid) FROM pg_largeobject_metadata WHERE oid >= 1000 AND oid < 3000 ORDER BY oid;
lo_unlink
-----------
- 1
- 1
- 1
- 1
- 1
-(5 rows)
+(0 rows)
DROP GROUP regress_group1;
DROP GROUP regress_group2;
4 | 44 | 1 | regress_rls_bob | my first manga
6 | 22 | 1 | regress_rls_carol | great science fiction
8 | 44 | 1 | regress_rls_carol | great manga
- (4 rows)
+ 9 | 22 | 1 | regress_rls_dave | awesome science fiction
+ (5 rows)
SELECT * FROM document NATURAL JOIN category WHERE f_leak(dtitle) ORDER BY did;
-NOTICE: f_leak => my first novel
-NOTICE: f_leak => my first manga
-NOTICE: f_leak => great science fiction
-NOTICE: f_leak => great manga
-NOTICE: f_leak => awesome science fiction
- cid | did | dlevel | dauthor | dtitle | cname
------+-----+--------+-------------------+-------------------------+-----------------
- 11 | 1 | 1 | regress_rls_bob | my first novel | novel
- 44 | 4 | 1 | regress_rls_bob | my first manga | manga
- 22 | 6 | 1 | regress_rls_carol | great science fiction | science fiction
- 44 | 8 | 1 | regress_rls_carol | great manga | manga
- 22 | 9 | 1 | regress_rls_dave | awesome science fiction | science fiction
-(5 rows)
+ cid | did | dlevel | dauthor | dtitle | cname
+-----+-----+--------+-------------------+-----------------------+-----------------
+ 11 | 1 | 1 | regress_rls_bob | my first novel | novel
+ 44 | 4 | 1 | regress_rls_bob | my first manga | manga
+ 22 | 6 | 1 | regress_rls_carol | great science fiction | science fiction
+ 44 | 8 | 1 | regress_rls_carol | great manga | manga
+(4 rows)
-- try a sampled version
SELECT * FROM document TABLESAMPLE BERNOULLI(50) REPEATABLE(0)
6 | 22 | 1 | regress_rls_carol | great science fiction
7 | 33 | 2 | regress_rls_carol | great technology book
8 | 44 | 1 | regress_rls_carol | great manga
- (8 rows)
+ 9 | 22 | 1 | regress_rls_dave | awesome science fiction
+ 10 | 33 | 2 | regress_rls_dave | awesome technology book
+ (10 rows)
SELECT * FROM document NATURAL JOIN category WHERE f_leak(dtitle) ORDER BY did;
-NOTICE: f_leak => my first novel
-NOTICE: f_leak => my second novel
-NOTICE: f_leak => my science fiction
-NOTICE: f_leak => my first manga
-NOTICE: f_leak => my second manga
-NOTICE: f_leak => great science fiction
-NOTICE: f_leak => great technology book
-NOTICE: f_leak => great manga
-NOTICE: f_leak => awesome science fiction
-NOTICE: f_leak => awesome technology book
- cid | did | dlevel | dauthor | dtitle | cname
------+-----+--------+-------------------+-------------------------+-----------------
- 11 | 1 | 1 | regress_rls_bob | my first novel | novel
- 11 | 2 | 2 | regress_rls_bob | my second novel | novel
- 22 | 3 | 2 | regress_rls_bob | my science fiction | science fiction
- 44 | 4 | 1 | regress_rls_bob | my first manga | manga
- 44 | 5 | 2 | regress_rls_bob | my second manga | manga
- 22 | 6 | 1 | regress_rls_carol | great science fiction | science fiction
- 33 | 7 | 2 | regress_rls_carol | great technology book | technology
- 44 | 8 | 1 | regress_rls_carol | great manga | manga
- 22 | 9 | 1 | regress_rls_dave | awesome science fiction | science fiction
- 33 | 10 | 2 | regress_rls_dave | awesome technology book | technology
-(10 rows)
+ cid | did | dlevel | dauthor | dtitle | cname
+-----+-----+--------+-------------------+-----------------------+-----------------
+ 11 | 1 | 1 | regress_rls_bob | my first novel | novel
+ 11 | 2 | 2 | regress_rls_bob | my second novel | novel
+ 22 | 3 | 2 | regress_rls_bob | my science fiction | science fiction
+ 44 | 4 | 1 | regress_rls_bob | my first manga | manga
+ 44 | 5 | 2 | regress_rls_bob | my second manga | manga
+ 22 | 6 | 1 | regress_rls_carol | great science fiction | science fiction
+ 33 | 7 | 2 | regress_rls_carol | great technology book | technology
+ 44 | 8 | 1 | regress_rls_carol | great manga | manga
+(8 rows)
-- try a sampled version
SELECT * FROM document TABLESAMPLE BERNOULLI(50) REPEATABLE(0)
WHERE f_leak(dtitle) ORDER BY did;
-NOTICE: f_leak => my first manga
-NOTICE: f_leak => my second manga
-NOTICE: f_leak => great science fiction
-NOTICE: f_leak => great manga
-NOTICE: f_leak => awesome science fiction
- did | cid | dlevel | dauthor | dtitle
------+-----+--------+-------------------+-------------------------
- 4 | 44 | 1 | regress_rls_bob | my first manga
- 5 | 44 | 2 | regress_rls_bob | my second manga
+ did | cid | dlevel | dauthor | dtitle
+-----+-----+--------+-------------------+-----------------------
6 | 22 | 1 | regress_rls_carol | great science fiction
8 | 44 | 1 | regress_rls_carol | great manga
- 9 | 22 | 1 | regress_rls_dave | awesome science fiction
-(5 rows)
+(2 rows)
EXPLAIN (COSTS OFF) SELECT * FROM document WHERE f_leak(dtitle);
- QUERY PLAN
-----------------------------------------------------
- Seq Scan on document
- Filter: ((dlevel <= $0) AND f_leak(dtitle))
- InitPlan 1 (returns $0)
- -> Index Scan using uaccount_pkey on uaccount
- Index Cond: (pguser = CURRENT_USER)
-(5 rows)
-
-EXPLAIN (COSTS OFF) SELECT * FROM document NATURAL JOIN category WHERE f_leak(dtitle);
- QUERY PLAN
------------------------------------------------------------
- Hash Join
- Hash Cond: (category.cid = document.cid)
- InitPlan 1 (returns $0)
- -> Index Scan using uaccount_pkey on uaccount
- Index Cond: (pguser = CURRENT_USER)
- -> Seq Scan on category
- -> Hash
- -> Seq Scan on document
- Filter: ((dlevel <= $0) AND f_leak(dtitle))
+ QUERY PLAN
+-------------------------------------------------------------------------
+ Remote Subquery Scan on all (datanode_1,datanode_2)
+ -> Subquery Scan on document
+ Filter: f_leak(document.dtitle)
+ -> Seq Scan on document document_1
+ Filter: (dlevel <= $0)
+ InitPlan 1 (returns $0)
+ -> Remote Subquery Scan on all (datanode_1,datanode_2)
+ -> Index Scan using uaccount_pkey on uaccount
+ Index Cond: (pguser = "current_user"())
(9 rows)
--- viewpoint from regress_rls_dave
-SET SESSION AUTHORIZATION regress_rls_dave;
-SELECT * FROM document WHERE f_leak(dtitle) ORDER BY did;
-NOTICE: f_leak => my first novel
-NOTICE: f_leak => my second novel
-NOTICE: f_leak => my science fiction
-NOTICE: f_leak => great science fiction
-NOTICE: f_leak => great technology book
-NOTICE: f_leak => awesome science fiction
-NOTICE: f_leak => awesome technology book
- did | cid | dlevel | dauthor | dtitle
------+-----+--------+-------------------+-------------------------
- 1 | 11 | 1 | regress_rls_bob | my first novel
- 2 | 11 | 2 | regress_rls_bob | my second novel
- 3 | 22 | 2 | regress_rls_bob | my science fiction
- 6 | 22 | 1 | regress_rls_carol | great science fiction
- 7 | 33 | 2 | regress_rls_carol | great technology book
- 9 | 22 | 1 | regress_rls_dave | awesome science fiction
- 10 | 33 | 2 | regress_rls_dave | awesome technology book
-(7 rows)
-
-SELECT * FROM document NATURAL JOIN category WHERE f_leak(dtitle) ORDER BY did;
-NOTICE: f_leak => my first novel
-NOTICE: f_leak => my second novel
-NOTICE: f_leak => my science fiction
-NOTICE: f_leak => great science fiction
-NOTICE: f_leak => great technology book
-NOTICE: f_leak => awesome science fiction
-NOTICE: f_leak => awesome technology book
- cid | did | dlevel | dauthor | dtitle | cname
------+-----+--------+-------------------+-------------------------+-----------------
- 11 | 1 | 1 | regress_rls_bob | my first novel | novel
- 11 | 2 | 2 | regress_rls_bob | my second novel | novel
- 22 | 3 | 2 | regress_rls_bob | my science fiction | science fiction
- 22 | 6 | 1 | regress_rls_carol | great science fiction | science fiction
- 33 | 7 | 2 | regress_rls_carol | great technology book | technology
- 22 | 9 | 1 | regress_rls_dave | awesome science fiction | science fiction
- 33 | 10 | 2 | regress_rls_dave | awesome technology book | technology
-(7 rows)
-
-EXPLAIN (COSTS OFF) SELECT * FROM document WHERE f_leak(dtitle);
- QUERY PLAN
-----------------------------------------------------------------------------------------------
- Seq Scan on document
- Filter: ((cid <> 44) AND (cid <> 44) AND (cid < 50) AND (dlevel <= $0) AND f_leak(dtitle))
- InitPlan 1 (returns $0)
- -> Index Scan using uaccount_pkey on uaccount
- Index Cond: (pguser = CURRENT_USER)
-(5 rows)
-
EXPLAIN (COSTS OFF) SELECT * FROM document NATURAL JOIN category WHERE f_leak(dtitle);
- QUERY PLAN
-----------------------------------------------------------------------------------------------------------
- Hash Join
- Hash Cond: (category.cid = document.cid)
- InitPlan 1 (returns $0)
- -> Index Scan using uaccount_pkey on uaccount
- Index Cond: (pguser = CURRENT_USER)
- -> Seq Scan on category
- -> Hash
- -> Seq Scan on document
- Filter: ((cid <> 44) AND (cid <> 44) AND (cid < 50) AND (dlevel <= $0) AND f_leak(dtitle))
-(9 rows)
+ QUERY PLAN
+-------------------------------------------------------------------------------------
+ Remote Subquery Scan on all (datanode_1,datanode_2)
+ -> Hash Join
+ Hash Cond: (category.cid = document.cid)
+ -> Seq Scan on category
+ -> Hash
+ -> Subquery Scan on document
+ Filter: f_leak(document.dtitle)
+ -> Seq Scan on document document_1
+ Filter: (dlevel <= $0)
+ InitPlan 1 (returns $0)
+ -> Remote Subquery Scan on all (datanode_1,datanode_2)
+ -> Index Scan using uaccount_pkey on uaccount
+ Index Cond: (pguser = "current_user"())
+(13 rows)
+ -- 44 would technically fail for both p2r and p1r, but we should get an error
+ -- back from p1r for this because it sorts first
+ INSERT INTO document VALUES (100, 44, 1, 'regress_rls_dave', 'testing sorting of policies'); -- fail
+ ERROR: new row violates row-level security policy "p1r" for table "document"
+ -- Just to see a p2r error
+ INSERT INTO document VALUES (100, 55, 1, 'regress_rls_dave', 'testing sorting of policies'); -- fail
+ ERROR: new row violates row-level security policy "p2r" for table "document"
-- only owner can change policies
ALTER POLICY p1 ON document USING (true); --fail
ERROR: must be owner of relation document
ALTER TABLE category ENABLE ROW LEVEL SECURITY;
-- cannot delete PK referenced by invisible FK
SET SESSION AUTHORIZATION regress_rls_bob;
- SELECT * FROM document d FULL OUTER JOIN category c on d.cid = c.cid;
+ SELECT * FROM document d FULL OUTER JOIN category c on d.cid = c.cid ORDER BY d.did, c.cid;
did | cid | dlevel | dauthor | dtitle | cid | cname
-----+-----+--------+-----------------+--------------------+-----+------------
- 1 | 11 | 1 | regress_rls_bob | my first novel | 11 | novel
+ 4 | 44 | 1 | regress_rls_bob | my first manga | |
+ 5 | 44 | 2 | regress_rls_bob | my second manga | |
2 | 11 | 2 | regress_rls_bob | my second novel | 11 | novel
+ 1 | 11 | 1 | regress_rls_bob | my first novel | 11 | novel
+ | | | | | 33 | technology
3 | 22 | 2 | regress_rls_bob | my science fiction | |
+ 4 | 44 | 1 | regress_rls_bob | my first manga | |
+ 5 | 44 | 2 | regress_rls_bob | my second manga | |
+ | | | | | 33 | technology
(6 rows)
DELETE FROM category WHERE cid = 33; -- fails with FK violation
DETAIL: Key is still referenced from table "document".
-- can insert FK referencing invisible PK
SET SESSION AUTHORIZATION regress_rls_carol;
- SELECT * FROM document d FULL OUTER JOIN category c on d.cid = c.cid;
+ SELECT * FROM document d FULL OUTER JOIN category c on d.cid = c.cid ORDER BY d.did, c.cid;
did | cid | dlevel | dauthor | dtitle | cid | cname
-----+-----+--------+-------------------+-----------------------+-----+-----------------
+ 8 | 44 | 1 | regress_rls_carol | great manga | 44 | manga
6 | 22 | 1 | regress_rls_carol | great science fiction | 22 | science fiction
7 | 33 | 2 | regress_rls_carol | great technology book | |
+ 8 | 44 | 1 | regress_rls_carol | great manga | 44 | manga
(3 rows)
- INSERT INTO document VALUES (10, 33, 1, current_user, 'hoge');
+ INSERT INTO document VALUES (11, 33, 1, current_user, 'hoge');
-- UNIQUE or PRIMARY KEY constraint violation DOES reveal presence of row
SET SESSION AUTHORIZATION regress_rls_bob;
INSERT INTO document VALUES (8, 44, 1, 'regress_rls_bob', 'my third manga'); -- Must fail with unique violation, revealing presence of did we can't see
RESET SESSION AUTHORIZATION;
SET row_security TO ON;
SELECT * FROM document;
- did | cid | dlevel | dauthor | dtitle
- -----+-----+--------+-------------------+-----------------------
+ did | cid | dlevel | dauthor | dtitle
+ -----+-----+--------+-------------------+-------------------------
1 | 11 | 1 | regress_rls_bob | my first novel
2 | 11 | 2 | regress_rls_bob | my second novel
- 3 | 22 | 2 | regress_rls_bob | my science fiction
- 4 | 44 | 1 | regress_rls_bob | my first manga
5 | 44 | 2 | regress_rls_bob | my second manga
6 | 22 | 1 | regress_rls_carol | great science fiction
- 7 | 33 | 2 | regress_rls_carol | great technology book
8 | 44 | 1 | regress_rls_carol | great manga
- 9 | 22 | 1 | regress_rls_dave | awesome science fiction
- 10 | 33 | 2 | regress_rls_dave | awesome technology book
- 11 | 33 | 1 | regress_rls_carol | hoge
-(11 rows)
+ 3 | 22 | 2 | regress_rls_bob | my science fiction
+ 4 | 44 | 1 | regress_rls_bob | my first manga
+ 7 | 33 | 2 | regress_rls_carol | great technology book
+ 10 | 33 | 1 | regress_rls_carol | hoge
+(9 rows)
SELECT * FROM category;
cid | cname
RESET SESSION AUTHORIZATION;
SET row_security TO OFF;
SELECT * FROM document;
- did | cid | dlevel | dauthor | dtitle
- -----+-----+--------+-------------------+-----------------------
+ did | cid | dlevel | dauthor | dtitle
+ -----+-----+--------+-------------------+-------------------------
1 | 11 | 1 | regress_rls_bob | my first novel
2 | 11 | 2 | regress_rls_bob | my second novel
- 3 | 22 | 2 | regress_rls_bob | my science fiction
- 4 | 44 | 1 | regress_rls_bob | my first manga
5 | 44 | 2 | regress_rls_bob | my second manga
6 | 22 | 1 | regress_rls_carol | great science fiction
- 7 | 33 | 2 | regress_rls_carol | great technology book
8 | 44 | 1 | regress_rls_carol | great manga
- 9 | 22 | 1 | regress_rls_dave | awesome science fiction
- 10 | 33 | 2 | regress_rls_dave | awesome technology book
- 11 | 33 | 1 | regress_rls_carol | hoge
-(11 rows)
+ 3 | 22 | 2 | regress_rls_bob | my science fiction
+ 4 | 44 | 1 | regress_rls_bob | my first manga
+ 7 | 33 | 2 | regress_rls_carol | great technology book
+ 10 | 33 | 1 | regress_rls_carol | hoge
+(9 rows)
SELECT * FROM category;
cid | cname
SET SESSION AUTHORIZATION regress_rls_exempt_user;
SET row_security TO OFF;
SELECT * FROM document;
- did | cid | dlevel | dauthor | dtitle
- -----+-----+--------+-------------------+-----------------------
+ did | cid | dlevel | dauthor | dtitle
+ -----+-----+--------+-------------------+-------------------------
1 | 11 | 1 | regress_rls_bob | my first novel
2 | 11 | 2 | regress_rls_bob | my second novel
- 3 | 22 | 2 | regress_rls_bob | my science fiction
- 4 | 44 | 1 | regress_rls_bob | my first manga
5 | 44 | 2 | regress_rls_bob | my second manga
6 | 22 | 1 | regress_rls_carol | great science fiction
- 7 | 33 | 2 | regress_rls_carol | great technology book
8 | 44 | 1 | regress_rls_carol | great manga
- 9 | 22 | 1 | regress_rls_dave | awesome science fiction
- 10 | 33 | 2 | regress_rls_dave | awesome technology book
- 11 | 33 | 1 | regress_rls_carol | hoge
-(11 rows)
+ 3 | 22 | 2 | regress_rls_bob | my science fiction
+ 4 | 44 | 1 | regress_rls_bob | my first manga
+ 7 | 33 | 2 | regress_rls_carol | great technology book
+ 10 | 33 | 1 | regress_rls_carol | hoge
+(9 rows)
SELECT * FROM category;
cid | cname
SET SESSION AUTHORIZATION regress_rls_alice;
SET row_security TO ON;
SELECT * FROM document;
- did | cid | dlevel | dauthor | dtitle
- -----+-----+--------+-------------------+-----------------------
+ did | cid | dlevel | dauthor | dtitle
+ -----+-----+--------+-------------------+-------------------------
1 | 11 | 1 | regress_rls_bob | my first novel
2 | 11 | 2 | regress_rls_bob | my second novel
- 3 | 22 | 2 | regress_rls_bob | my science fiction
- 4 | 44 | 1 | regress_rls_bob | my first manga
5 | 44 | 2 | regress_rls_bob | my second manga
6 | 22 | 1 | regress_rls_carol | great science fiction
- 7 | 33 | 2 | regress_rls_carol | great technology book
8 | 44 | 1 | regress_rls_carol | great manga
- 9 | 22 | 1 | regress_rls_dave | awesome science fiction
- 10 | 33 | 2 | regress_rls_dave | awesome technology book
- 11 | 33 | 1 | regress_rls_carol | hoge
-(11 rows)
+ 3 | 22 | 2 | regress_rls_bob | my science fiction
+ 4 | 44 | 1 | regress_rls_bob | my first manga
+ 7 | 33 | 2 | regress_rls_carol | great technology book
+ 10 | 33 | 1 | regress_rls_carol | hoge
+(9 rows)
SELECT * FROM category;
cid | cname
SET SESSION AUTHORIZATION regress_rls_alice;
SET row_security TO OFF;
SELECT * FROM document;
- did | cid | dlevel | dauthor | dtitle
- -----+-----+--------+-------------------+-----------------------
+ did | cid | dlevel | dauthor | dtitle
+ -----+-----+--------+-------------------+-------------------------
1 | 11 | 1 | regress_rls_bob | my first novel
2 | 11 | 2 | regress_rls_bob | my second novel
- 3 | 22 | 2 | regress_rls_bob | my science fiction
- 4 | 44 | 1 | regress_rls_bob | my first manga
5 | 44 | 2 | regress_rls_bob | my second manga
6 | 22 | 1 | regress_rls_carol | great science fiction
- 7 | 33 | 2 | regress_rls_carol | great technology book
8 | 44 | 1 | regress_rls_carol | great manga
- 9 | 22 | 1 | regress_rls_dave | awesome science fiction
- 10 | 33 | 2 | regress_rls_dave | awesome technology book
- 11 | 33 | 1 | regress_rls_carol | hoge
-(11 rows)
+ 3 | 22 | 2 | regress_rls_bob | my science fiction
+ 4 | 44 | 1 | regress_rls_bob | my first manga
+ 7 | 33 | 2 | regress_rls_carol | great technology book
+ 10 | 33 | 1 | regress_rls_carol | hoge
+(9 rows)
SELECT * FROM category;
cid | cname
(2 rows)
EXPLAIN (COSTS OFF) SELECT * FROM rls_view;
- QUERY PLAN
------------------------------------------
- Seq Scan on z1
- Filter: (((a % 2) = 0) AND f_leak(b))
-(2 rows)
-
+ QUERY PLAN
+-----------------------------------------------------
+ Remote Subquery Scan on all (datanode_1,datanode_2)
+ -> Subquery Scan on z1
+ Filter: f_leak(z1.b)
+ -> Seq Scan on z1 z1_1
+ Filter: ((a % 2) = 0)
+(5 rows)
-
-- Query as role that is not owner of table but is owner of view.
-- Should return records based on view owner policies.
SET SESSION AUTHORIZATION regress_rls_bob;
shoename char(10), -- primary key
sh_avail integer, -- available # of pairs
slcolor char(10), -- preferred shoelace color
- slminlen float, -- miminum shoelace length
+ slminlen float, -- minimum shoelace length
slmaxlen float, -- maximum shoelace length
slunit char(8) -- length unit
-);
+) distribute by roundrobin;
CREATE TABLE shoelace_data (
sl_name char(10), -- primary key
sl_avail integer, -- available # of pairs
(12 rows)
create rule r3 as on delete to rules_src do notify rules_src_deletion;
+ERROR: Rule may not use NOTIFY, it is not yet supported
\d+ rules_src
- Table "public.rules_src"
- Column | Type | Modifiers | Storage | Stats target | Description
- --------+---------+-----------+---------+--------------+-------------
- f1 | integer | | plain | |
- f2 | integer | | plain | |
+ Table "public.rules_src"
+ Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+---------+--------------+-------------
+ f1 | integer | | | | plain | |
+ f2 | integer | | | | plain | |
Rules:
r1 AS
ON UPDATE TO rules_src DO INSERT INTO rules_log (f1, f2, tag) VALUES (old.f1,old.f2,'old'::text), (new.f1,new.f2,'new'::text)
line_tbl|f
log_table|f
lseg_tbl|f
-main_table|f
+main_table|t
+ mlparted|f
+ mlparted1|f
+ mlparted11|f
+ mlparted12|f
+ mlparted2|f
+ mlparted3|f
+ mlparted4|f
money_data|f
num_data|f
num_exp_add|t
pg_ts_template|t
pg_type|t
pg_user_mapping|t
+pgxc_class|t
+pgxc_group|t
+pgxc_node|t
point_tbl|t
polygon_tbl|t
+ quad_box_tbl|t
quad_point_tbl|t
radix_text_tbl|t
ramp|f
494 | 11 | 0 | 2 | 4 | 14 | 4 | 94 | 94 | 494 | 494 | 8 | 9 | ATAAAA | LAAAAA | VVVVxx
(1 row)
+ -- actually run the query with an analyze to use the partial index
+ explain (costs off, analyze on, timing off, summary off)
+ select * from onek2 where unique2 = 11 and stringu1 = 'ATAAAA';
+ QUERY PLAN
+ -----------------------------------------------------------------
+ Index Scan using onek2_u2_prtl on onek2 (actual rows=1 loops=1)
+ Index Cond: (unique2 = 11)
+ Filter: (stringu1 = 'ATAAAA'::name)
+ (3 rows)
+
explain (costs off)
select unique2 from onek2 where unique2 = 11 and stringu1 = 'ATAAAA';
- QUERY PLAN
------------------------------------------
- Index Scan using onek2_u2_prtl on onek2
- Index Cond: (unique2 = 11)
- Filter: (stringu1 = 'ATAAAA'::name)
-(3 rows)
+ QUERY PLAN
+-----------------------------------------------
+ Remote Fast Query Execution
+ Node/s: datanode_1, datanode_2
+ -> Index Scan using onek2_u2_prtl on onek2
+ Index Cond: (unique2 = 11)
+ Filter: (stringu1 = 'ATAAAA'::name)
+(5 rows)
select unique2 from onek2 where unique2 = 11 and stringu1 = 'ATAAAA';
unique2
explain (costs off)
select sum(parallel_restricted(unique1)) from tenk1
group by(parallel_restricted(unique1));
+ QUERY PLAN
+-----------------------------------------------------------
+ HashAggregate
+ Group Key: parallel_restricted(unique1)
+ -> Remote Subquery Scan on all (datanode_1,datanode_2)
+ -> Index Only Scan using tenk1_unique1 on tenk1
+(4 rows)
+
+ QUERY PLAN
+ -------------------------------------------------------------------
+ HashAggregate
+ Group Key: parallel_restricted(unique1)
+ -> Gather
+ Workers Planned: 4
+ -> Parallel Index Only Scan using tenk1_unique1 on tenk1
+ (5 rows)
+
+ -- test parallel plans for queries containing un-correlated subplans.
+ alter table tenk2 set (parallel_workers = 0);
+ explain (costs off)
+ select count(*) from tenk1 where (two, four) not in
+ (select hundred, thousand from tenk2 where thousand > 100);
+ QUERY PLAN
+ ------------------------------------------------------
+ Finalize Aggregate
+ -> Gather
+ Workers Planned: 4
+ -> Partial Aggregate
+ -> Parallel Seq Scan on tenk1
+ Filter: (NOT (hashed SubPlan 1))
+ SubPlan 1
+ -> Seq Scan on tenk2
+ Filter: (thousand > 100)
+ (9 rows)
+
+ select count(*) from tenk1 where (two, four) not in
+ (select hundred, thousand from tenk2 where thousand > 100);
+ count
+ -------
+ 10000
+ (1 row)
+
+ -- this is not parallel-safe due to use of random() within SubLink's testexpr:
+ explain (costs off)
+ select * from tenk1 where (unique1 + random())::integer not in
+ (select ten from tenk2);
+ QUERY PLAN
+ ------------------------------------
+ Seq Scan on tenk1
+ Filter: (NOT (hashed SubPlan 1))
+ SubPlan 1
+ -> Seq Scan on tenk2
+ (4 rows)
+
+ alter table tenk2 reset (parallel_workers);
+ -- test parallel index scans.
+ set enable_seqscan to off;
+ set enable_bitmapscan to off;
+ explain (costs off)
+ select count((unique1)) from tenk1 where hundred > 1;
+ QUERY PLAN
+ --------------------------------------------------------------------
+ Finalize Aggregate
+ -> Gather
+ Workers Planned: 4
+ -> Partial Aggregate
+ -> Parallel Index Scan using tenk1_hundred on tenk1
+ Index Cond: (hundred > 1)
+ (6 rows)
+
+ select count((unique1)) from tenk1 where hundred > 1;
+ count
+ -------
+ 9800
+ (1 row)
+
+ -- test parallel index-only scans.
+ explain (costs off)
+ select count(*) from tenk1 where thousand > 95;
+ QUERY PLAN
+ --------------------------------------------------------------------------------
+ Finalize Aggregate
+ -> Gather
+ Workers Planned: 4
+ -> Partial Aggregate
+ -> Parallel Index Only Scan using tenk1_thous_tenthous on tenk1
+ Index Cond: (thousand > 95)
+ (6 rows)
+
+ select count(*) from tenk1 where thousand > 95;
+ count
+ -------
+ 9040
+ (1 row)
+
+ reset enable_seqscan;
+ reset enable_bitmapscan;
+ -- test parallel bitmap heap scan.
+ set enable_seqscan to off;
+ set enable_indexscan to off;
+ set enable_hashjoin to off;
+ set enable_mergejoin to off;
+ set enable_material to off;
+ -- test prefetching, if the platform allows it
+ DO $$
+ BEGIN
+ SET effective_io_concurrency = 50;
+ EXCEPTION WHEN invalid_parameter_value THEN
+ END $$;
+ set work_mem='64kB'; --set small work mem to force lossy pages
+ explain (costs off)
+ select count(*) from tenk1, tenk2 where tenk1.hundred > 1 and tenk2.thousand=0;
+ QUERY PLAN
+ ------------------------------------------------------------
+ Aggregate
+ -> Nested Loop
+ -> Seq Scan on tenk2
+ Filter: (thousand = 0)
+ -> Gather
+ Workers Planned: 4
+ -> Parallel Bitmap Heap Scan on tenk1
+ Recheck Cond: (hundred > 1)
+ -> Bitmap Index Scan on tenk1_hundred
+ Index Cond: (hundred > 1)
+ (10 rows)
+
+ select count(*) from tenk1, tenk2 where tenk1.hundred > 1 and tenk2.thousand=0;
+ count
+ -------
+ 98000
+ (1 row)
+
+ create table bmscantest (a int, t text);
+ insert into bmscantest select r, 'fooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo' FROM generate_series(1,100000) r;
+ create index i_bmtest ON bmscantest(a);
+ select count(*) from bmscantest where a>1;
+ count
+ -------
+ 99999
+ (1 row)
+
+ reset enable_seqscan;
+ reset enable_indexscan;
+ reset enable_hashjoin;
+ reset enable_mergejoin;
+ reset enable_material;
+ reset effective_io_concurrency;
+ reset work_mem;
+ drop table bmscantest;
+ -- test parallel merge join path.
+ set enable_hashjoin to off;
+ set enable_nestloop to off;
+ explain (costs off)
+ select count(*) from tenk1, tenk2 where tenk1.unique1 = tenk2.unique1;
+ QUERY PLAN
+ -------------------------------------------------------------------------------
+ Finalize Aggregate
+ -> Gather
+ Workers Planned: 4
+ -> Partial Aggregate
+ -> Merge Join
+ Merge Cond: (tenk1.unique1 = tenk2.unique1)
+ -> Parallel Index Only Scan using tenk1_unique1 on tenk1
+ -> Index Only Scan using tenk2_unique1 on tenk2
+ (8 rows)
+
+ select count(*) from tenk1, tenk2 where tenk1.unique1 = tenk2.unique1;
+ count
+ -------
+ 10000
+ (1 row)
+
+ reset enable_hashjoin;
+ reset enable_nestloop;
+ --test gather merge
+ set enable_hashagg to off;
+ explain (costs off)
+ select string4, count((unique2)) from tenk1 group by string4 order by string4;
+ QUERY PLAN
+ ----------------------------------------------------
+ Finalize GroupAggregate
+ Group Key: string4
+ -> Gather Merge
+ Workers Planned: 4
+ -> Partial GroupAggregate
+ Group Key: string4
+ -> Sort
+ Sort Key: string4
+ -> Parallel Seq Scan on tenk1
+ (9 rows)
+
+ select string4, count((unique2)) from tenk1 group by string4 order by string4;
+ string4 | count
+ ---------+-------
+ AAAAxx | 2500
+ HHHHxx | 2500
+ OOOOxx | 2500
+ VVVVxx | 2500
+ (4 rows)
+
+ reset enable_hashagg;
set force_parallel_mode=1;
explain (costs off)
select stringu1::int2 from tenk1 where unique1 = 1;
- QUERY PLAN
------------------------------------------------
- Gather
- Workers Planned: 1
- Single Copy: true
- -> Index Scan using tenk1_unique1 on tenk1
- Index Cond: (unique1 = 1)
-(5 rows)
+ QUERY PLAN
+-----------------------------------------------------
+ Remote Fast Query Execution
+ Node/s: datanode_1
+ -> Gather
+ Workers Planned: 1
+ Single Copy: true
+ -> Index Scan using tenk1_unique1 on tenk1
+ Index Cond: (unique1 = 1)
+(7 rows)
+
+do $$begin
+ -- Provoke error, possibly in worker. If this error happens to occur in
+ -- the worker, there will be a CONTEXT line which must be hidden.
+ perform stringu1::int2 from tenk1 where unique1 = 1;
+ exception
+ when others then
+ raise 'SQLERRM: %', sqlerrm;
+end$$;
+ERROR: Internal subtransactions not supported in Postgres-XL
+CONTEXT: PL/pgSQL function inline_code_block line 1 during statement block entry
+
+ -- to increase the parallel query test coverage
+ EXPLAIN (analyze, timing off, summary off, costs off) SELECT * FROM tenk1;
+ QUERY PLAN
+ -------------------------------------------------------------
+ Gather (actual rows=10000 loops=1)
+ Workers Planned: 4
+ Workers Launched: 4
+ -> Parallel Seq Scan on tenk1 (actual rows=2000 loops=5)
+ (4 rows)
+
+ -- provoke error in worker
+ select stringu1::int2 from tenk1 where unique1 = 1;
+ ERROR: invalid input syntax for integer: "BAAAAA"
+ CONTEXT: parallel worker
rollback;
Whitlock Creek | [(-121.74683,37.91276),(-121.733107,37)] | Oakland
Whitlock Creek | [(-121.74683,37.91276),(-121.733107,37)] | Oakland
Willimet Way | [(-122.0964,37.517),(-122.0949,37.493)] | Oakland
- Wisconsin St | [(-122.1994,37.017),(-122.1975,37.998),(-122.1971,37.994)] | Oakland
Wisconsin St | [(-122.1994,37.017),(-122.1975,37.998),(-122.1971,37.994)] | Berkeley
+ Wisconsin St | [(-122.1994,37.017),(-122.1975,37.998),(-122.1971,37.994)] | Oakland
Wp Railroad | [(-122.254,37.902),(-122.2506,37.891)] | Berkeley
- 100th Ave | [(-122.1657,37.429),(-122.1647,37.432)] | Oakland
- 107th Ave | [(-122.1555,37.403),(-122.1531,37.41)] | Oakland
- 14th St | [(-122.299,37.147),(-122.3,37.148)] | Lafayette
- 19th Ave | [(-122.2366,37.897),(-122.2359,37.905)] | Berkeley
- 1st St | [(-121.75508,37.89294),(-121.753581,37.90031)] | Oakland
- 5th St | [(-122.278,37),(-122.2792,37.005),(-122.2803,37.009)] | Lafayette
- 5th St | [(-122.296,37.615),(-122.2953,37.598)] | Berkeley
- 82nd Ave | [(-122.1695,37.596),(-122.1681,37.603)] | Berkeley
- 85th Ave | [(-122.1877,37.466),(-122.186,37.476)] | Oakland
- 89th Ave | [(-122.1822,37.459),(-122.1803,37.471)] | Oakland
- 98th Ave | [(-122.1568,37.498),(-122.1558,37.502)] | Oakland
- 98th Ave | [(-122.1693,37.438),(-122.1682,37.444)] | Oakland
- 98th Ave | [(-122.2001,37.258),(-122.1974,37.27)] | Lafayette
(333 rows)
- SELECT name, #thepath FROM iexit ORDER BY 1, 2;
- name | ?column?
- ------+----------
- (0 rows)
-SELECT name, #thepath FROM iexit ORDER BY name COLLATE "C", 2;
- name | ?column?
-------------------------------------+----------
- I- 580 | 2
- I- 580 | 2
- I- 580 | 2
- I- 580 | 2
- I- 580 | 2
- I- 580 | 2
- I- 580 | 2
- I- 580 | 2
- I- 580 | 2
- I- 580 | 2
- I- 580 | 2
- I- 580 | 3
- I- 580 | 3
- I- 580 | 3
- I- 580 | 3
- I- 580 | 3
- I- 580 | 3
- I- 580 | 3
- I- 580 | 3
- I- 580 | 3
- I- 580 | 3
- I- 580 | 3
- I- 580 | 3
- I- 580 | 3
- I- 580 | 3
- I- 580 | 3
- I- 580 | 3
- I- 580 | 3
- I- 580 | 3
- I- 580 | 4
- I- 580 | 4
- I- 580 | 4
- I- 580 | 4
- I- 580 | 5
- I- 580 | 5
- I- 580 | 5
- I- 580 | 5
- I- 580 | 5
- I- 580 | 6
- I- 580 | 6
- I- 580 | 6
- I- 580 | 6
- I- 580 | 6
- I- 580 | 6
- I- 580 | 6
- I- 580 | 6
- I- 580 | 6
- I- 580 | 6
- I- 580 | 6
- I- 580 | 6
- I- 580 | 6
- I- 580 | 6
- I- 580 | 6
- I- 580 | 6
- I- 580 | 6
- I- 580 | 6
- I- 580 | 6
- I- 580 | 6
- I- 580 | 6
- I- 580 | 6
- I- 580 | 6
- I- 580 | 7
- I- 580 | 7
- I- 580 | 7
- I- 580 | 7
- I- 580 | 7
- I- 580 | 7
- I- 580 | 7
- I- 580 | 8
- I- 580 | 8
- I- 580 | 8
- I- 580 | 8
- I- 580 | 8
- I- 580 | 8
- I- 580 | 8
- I- 580 | 8
- I- 580 | 8
- I- 580 | 9
- I- 580 | 9
- I- 580 | 9
- I- 580 | 9
- I- 580 | 9
- I- 580 | 12
- I- 580 | 12
- I- 580 | 12
- I- 580 | 12
- I- 580 | 12
- I- 580 | 12
- I- 580 | 12
- I- 580 | 12
- I- 580 | 12
- I- 580 | 12
- I- 580 | 13
- I- 580 | 13
- I- 580 | 13
- I- 580 | 13
- I- 580 | 13
- I- 580 | 13
- I- 580 | 14
- I- 580 | 14
- I- 580 | 14
- I- 580 | 14
- I- 580 | 14
- I- 580 | 14
- I- 580 | 14
- I- 580 | 14
- I- 580 | 18
- I- 580 | 18
- I- 580 | 18
- I- 580 | 18
- I- 580 | 18
- I- 580 | 18
- I- 580 | 21
- I- 580 | 21
- I- 580 | 21
- I- 580 | 21
- I- 580 | 21
- I- 580 | 21
- I- 580 | 21
- I- 580 | 21
- I- 580 | 21
- I- 580 | 21
- I- 580 | 22
- I- 580 | 22
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 5
- I- 580 Ramp | 5
- I- 580 Ramp | 5
- I- 580 Ramp | 5
- I- 580 Ramp | 5
- I- 580 Ramp | 5
- I- 580 Ramp | 5
- I- 580 Ramp | 5
- I- 580 Ramp | 5
- I- 580 Ramp | 5
- I- 580 Ramp | 5
- I- 580 Ramp | 5
- I- 580 Ramp | 5
- I- 580 Ramp | 5
- I- 580 Ramp | 6
- I- 580 Ramp | 6
- I- 580 Ramp | 6
- I- 580 Ramp | 7
- I- 580 Ramp | 8
- I- 580 Ramp | 8
- I- 580 Ramp | 8
- I- 580 Ramp | 8
- I- 580 Ramp | 8
- I- 580 Ramp | 8
- I- 580/I-680 Ramp | 2
- I- 580/I-680 Ramp | 2
- I- 580/I-680 Ramp | 2
- I- 580/I-680 Ramp | 2
- I- 580/I-680 Ramp | 2
- I- 580/I-680 Ramp | 2
- I- 580/I-680 Ramp | 4
- I- 580/I-680 Ramp | 4
- I- 580/I-680 Ramp | 4
- I- 580/I-680 Ramp | 4
- I- 580/I-680 Ramp | 5
- I- 580/I-680 Ramp | 6
- I- 580/I-680 Ramp | 6
- I- 580/I-680 Ramp | 6
- I- 680 | 2
- I- 680 | 2
- I- 680 | 2
- I- 680 | 2
- I- 680 | 2
- I- 680 | 2
- I- 680 | 2
- I- 680 | 3
- I- 680 | 3
- I- 680 | 3
- I- 680 | 4
- I- 680 | 4
- I- 680 | 4
- I- 680 | 5
- I- 680 | 5
- I- 680 | 5
- I- 680 | 7
- I- 680 | 7
- I- 680 | 7
- I- 680 | 7
- I- 680 | 8
- I- 680 | 8
- I- 680 | 8
- I- 680 | 8
- I- 680 | 10
- I- 680 | 10
- I- 680 | 10
- I- 680 | 10
- I- 680 | 10
- I- 680 | 10
- I- 680 | 10
- I- 680 | 16
- I- 680 | 16
- I- 680 | 16
- I- 680 | 16
- I- 680 | 16
- I- 680 | 16
- I- 680 | 16
- I- 680 | 16
- I- 680 Ramp | 2
- I- 680 Ramp | 2
- I- 680 Ramp | 2
- I- 680 Ramp | 2
- I- 680 Ramp | 2
- I- 680 Ramp | 2
- I- 680 Ramp | 2
- I- 680 Ramp | 2
- I- 680 Ramp | 2
- I- 680 Ramp | 2
- I- 680 Ramp | 2
- I- 680 Ramp | 2
- I- 680 Ramp | 2
- I- 680 Ramp | 2
- I- 680 Ramp | 2
- I- 680 Ramp | 3
- I- 680 Ramp | 3
- I- 680 Ramp | 3
- I- 680 Ramp | 3
- I- 680 Ramp | 3
- I- 680 Ramp | 3
- I- 680 Ramp | 3
- I- 680 Ramp | 3
- I- 680 Ramp | 3
- I- 680 Ramp | 3
- I- 680 Ramp | 3
- I- 680 Ramp | 3
- I- 680 Ramp | 3
- I- 680 Ramp | 3
- I- 680 Ramp | 3
- I- 680 Ramp | 3
- I- 680 Ramp | 3
- I- 680 Ramp | 3
- I- 680 Ramp | 3
- I- 680 Ramp | 3
- I- 680 Ramp | 3
- I- 680 Ramp | 3
- I- 680 Ramp | 3
- I- 680 Ramp | 3
- I- 680 Ramp | 3
- I- 680 Ramp | 3
- I- 680 Ramp | 4
- I- 680 Ramp | 4
- I- 680 Ramp | 4
- I- 680 Ramp | 5
- I- 680 Ramp | 5
- I- 680 Ramp | 5
- I- 680 Ramp | 5
- I- 680 Ramp | 5
- I- 680 Ramp | 5
- I- 680 Ramp | 6
- I- 680 Ramp | 6
- I- 680 Ramp | 6
- I- 680 Ramp | 6
- I- 680 Ramp | 7
- I- 680 Ramp | 7
- I- 680 Ramp | 7
- I- 680 Ramp | 7
- I- 680 Ramp | 8
- I- 680 Ramp | 8
- I- 680 Ramp | 8
- I- 680 Ramp | 8
- I- 80 | 2
- I- 80 | 2
- I- 80 | 2
- I- 80 | 2
- I- 80 | 2
- I- 80 | 2
- I- 80 | 2
- I- 80 | 2
- I- 80 | 2
- I- 80 | 2
- I- 80 | 2
- I- 80 | 2
- I- 80 | 2
- I- 80 | 2
- I- 80 | 3
- I- 80 | 3
- I- 80 | 3
- I- 80 | 4
- I- 80 | 4
- I- 80 | 4
- I- 80 | 4
- I- 80 | 4
- I- 80 | 5
- I- 80 | 5
- I- 80 | 5
- I- 80 | 5
- I- 80 | 5
- I- 80 | 5
- I- 80 | 5
- I- 80 | 5
- I- 80 | 5
- I- 80 | 11
- I- 80 | 11
- I- 80 | 11
- I- 80 | 11
- I- 80 Ramp | 2
- I- 80 Ramp | 2
- I- 80 Ramp | 2
- I- 80 Ramp | 2
- I- 80 Ramp | 2
- I- 80 Ramp | 2
- I- 80 Ramp | 2
- I- 80 Ramp | 2
- I- 80 Ramp | 2
- I- 80 Ramp | 2
- I- 80 Ramp | 2
- I- 80 Ramp | 2
- I- 80 Ramp | 2
- I- 80 Ramp | 2
- I- 80 Ramp | 2
- I- 80 Ramp | 2
- I- 80 Ramp | 2
- I- 80 Ramp | 2
- I- 80 Ramp | 2
- I- 80 Ramp | 3
- I- 80 Ramp | 3
- I- 80 Ramp | 3
- I- 80 Ramp | 3
- I- 80 Ramp | 3
- I- 80 Ramp | 3
- I- 80 Ramp | 3
- I- 80 Ramp | 3
- I- 80 Ramp | 3
- I- 80 Ramp | 4
- I- 80 Ramp | 4
- I- 80 Ramp | 4
- I- 80 Ramp | 4
- I- 80 Ramp | 5
- I- 80 Ramp | 5
- I- 80 Ramp | 5
- I- 80 Ramp | 5
- I- 80 Ramp | 5
- I- 80 Ramp | 5
- I- 80 Ramp | 5
- I- 80 Ramp | 7
- I- 80 Ramp | 7
- I- 80 Ramp | 7
- I- 80 Ramp | 7
- I- 880 | 2
- I- 880 | 2
- I- 880 | 2
- I- 880 | 2
- I- 880 | 2
- I- 880 | 5
- I- 880 | 5
- I- 880 | 5
- I- 880 | 5
- I- 880 | 5
- I- 880 | 5
- I- 880 | 6
- I- 880 | 6
- I- 880 | 6
- I- 880 | 6
- I- 880 | 6
- I- 880 | 6
- I- 880 | 6
- I- 880 | 6
- I- 880 | 6
- I- 880 | 6
- I- 880 | 6
- I- 880 | 6
- I- 880 | 6
- I- 880 | 6
- I- 880 | 7
- I- 880 | 7
- I- 880 | 7
- I- 880 | 7
- I- 880 | 7
- I- 880 | 7
- I- 880 | 7
- I- 880 | 9
- I- 880 | 9
- I- 880 | 9
- I- 880 | 9
- I- 880 | 9
- I- 880 | 9
- I- 880 | 9
- I- 880 | 10
- I- 880 | 10
- I- 880 | 10
- I- 880 | 10
- I- 880 | 10
- I- 880 | 10
- I- 880 | 10
- I- 880 | 10
- I- 880 | 10
- I- 880 | 10
- I- 880 | 10
- I- 880 | 10
- I- 880 | 12
- I- 880 | 12
- I- 880 | 12
- I- 880 | 12
- I- 880 | 12
- I- 880 | 12
- I- 880 | 12
- I- 880 | 12
- I- 880 | 12
- I- 880 | 12
- I- 880 | 12
- I- 880 | 13
- I- 880 | 13
- I- 880 | 13
- I- 880 | 13
- I- 880 | 13
- I- 880 | 13
- I- 880 | 13
- I- 880 | 13
- I- 880 | 13
- I- 880 | 13
- I- 880 | 13
- I- 880 | 13
- I- 880 | 14
- I- 880 | 14
- I- 880 | 14
- I- 880 | 14
- I- 880 | 14
- I- 880 | 14
- I- 880 | 17
- I- 880 | 17
- I- 880 | 17
- I- 880 | 17
- I- 880 | 17
- I- 880 | 17
- I- 880 | 17
- I- 880 | 17
- I- 880 | 17
- I- 880 | 17
- I- 880 | 17
- I- 880 | 17
- I- 880 | 17
- I- 880 | 17
- I- 880 | 17
- I- 880 | 17
- I- 880 | 17
- I- 880 | 17
- I- 880 | 17
- I- 880 | 17
- I- 880 | 17
- I- 880 | 19
- I- 880 | 19
- I- 880 | 19
- I- 880 | 19
- I- 880 | 19
- I- 880 | 19
- I- 880 | 19
- I- 880 | 19
- I- 880 | 19
- I- 880 | 19
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 4
- I- 880 Ramp | 4
- I- 880 Ramp | 4
- I- 880 Ramp | 4
- I- 880 Ramp | 4
- I- 880 Ramp | 4
- I- 880 Ramp | 4
- I- 880 Ramp | 4
- I- 880 Ramp | 4
- I- 880 Ramp | 4
- I- 880 Ramp | 4
- I- 880 Ramp | 4
- I- 880 Ramp | 4
- I- 880 Ramp | 4
- I- 880 Ramp | 4
- I- 880 Ramp | 4
- I- 880 Ramp | 4
- I- 880 Ramp | 4
- I- 880 Ramp | 4
- I- 880 Ramp | 4
- I- 880 Ramp | 4
- I- 880 Ramp | 4
- I- 880 Ramp | 4
- I- 880 Ramp | 4
- I- 880 Ramp | 4
- I- 880 Ramp | 4
- I- 880 Ramp | 4
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 6
- I- 880 Ramp | 6
- I- 880 Ramp | 6
- I- 880 Ramp | 6
- I- 880 Ramp | 6
- I- 880 Ramp | 6
- I- 880 Ramp | 6
- I- 880 Ramp | 6
- I- 880 Ramp | 6
- I- 880 Ramp | 6
- I- 880 Ramp | 6
- I- 880 Ramp | 6
- I- 880 Ramp | 6
- I- 880 Ramp | 6
- I- 880 Ramp | 6
- I- 880 Ramp | 6
- I- 880 Ramp | 8
- I- 880 Ramp | 8
- I- 880 Ramp | 8
- I- 980 | 2
- I- 980 | 2
- I- 980 | 2
- I- 980 | 2
- I- 980 | 2
- I- 980 | 2
- I- 980 | 2
- I- 980 | 2
- I- 980 | 3
- I- 980 | 3
- I- 980 | 3
- I- 980 | 3
- I- 980 | 3
- I- 980 | 3
- I- 980 | 3
- I- 980 | 3
- I- 980 | 3
- I- 980 | 4
- I- 980 | 4
- I- 980 | 5
- I- 980 | 5
- I- 980 | 7
- I- 980 | 7
- I- 980 | 7
- I- 980 | 7
- I- 980 | 12
- I- 980 Ramp | 3
- I- 980 Ramp | 3
- I- 980 Ramp | 3
- I- 980 Ramp | 7
-(896 rows)
SELECT * FROM toyemp WHERE name = 'sharon';
name | age | location | annualsal
DROP USER regress_seq_user;
DROP SEQUENCE seq;
+create table test_seqtab (unique1 int, unique2 int);
+insert into test_seqtab select i, i from generate_series(1,1000) s(i);
+create temp sequence testseq;
+select distinct(nextval('testseq'))
+ from test_seqtab order by 1 limit 10;
+ nextval
+---------
+ 1
+ 2
+ 3
+ 4
+ 5
+ 6
+ 7
+ 8
+ 9
+ 10
+(10 rows)
+
+drop table test_seqtab;
+ -- cache tests
+ CREATE SEQUENCE test_seq1 CACHE 10;
+ SELECT nextval('test_seq1');
+ nextval
+ ---------
+ 1
+ (1 row)
+
+ SELECT nextval('test_seq1');
+ nextval
+ ---------
+ 2
+ (1 row)
+
+ SELECT nextval('test_seq1');
+ nextval
+ ---------
+ 3
+ (1 row)
+
+ DROP SEQUENCE test_seq1;
11
(1 row)
+SELECT setseed(0);
+ setseed
+---------
+
+(1 row)
+
+-- DROP TABLE IF EXISTS asd ;
+CREATE TABLE IF NOT EXISTS asd AS
+SELECT clientid::numeric(20),
+ (clientid / 20 )::integer::numeric(20) as userid,
+ cts + ((random()* 3600 *24 )||'sec')::interval as cts,
+ (ARRAY['A','B','C','D','E','F'])[(random()*5+1)::integer] as state,
+ 0 as dim,
+ ((ARRAY['Cat','Dog','Duck'])[(clientid / 10 )% 3 +1 ]) ::text as app_name,
+ ((ARRAY['A','B'])[(clientid / 10 )% 2 +1 ]) ::text as platform
+ FROM generate_series('2016-01-01'::timestamp,'2016-10-01'::timestamp,interval '15 day') cts , generate_series( 1000,2000,10) clientid , generate_series(1,6) t
+;
+SELECT dates::timestamp as dates ,B.platform,B.app_name, B.clientid, B.userid,
+ B.state as state
+FROM ( VALUES
+('2016.08.30. 08:52:43') ,('2016.08.29. 04:57:12') ,('2016.08.26. 08:15:05') ,
+('2016.08.24. 11:49:51') ,('2016.08.22. 08:45:29') ,('2016.08.21. 04:53:47') ,('2016.08.20. 08:44:03')
+) AS D (dates)
+JOIN
+( SELECT DISTINCT clientid FROM asd
+ WHERE userid=74 ) C ON True
+INNER JOIN LATERAL (
+ SELECT DISTINCT ON (clientid,app_name,platform,state,dim) x.*
+ FROM asd x
+ INNER JOIN (SELECT p.clientid,p.app_name,p.platform , p.state, p.dim ,
+ MAX(p.cts) AS selected_cts
+ FROM asd p
+ where cts<D.dates::timestamp and state in
+ ('A','B')
+ GROUP BY p.clientid,p.app_name,p.platform,p.state,p.dim) y
+ ON y.clientid = x.clientid
+ AND y.selected_cts = x.cts
+ AND y.platform = x.platform
+ AND y.app_name=x.app_name
+ AND y.state=x.state
+ AND y.dim = x.dim
+ and x.clientid = C.clientid
+) B ON True
+ORDER BY dates desc, state;
+ dates | platform | app_name | clientid | userid | state
+--------------------------+----------+----------+----------+--------+-------
+ Tue Aug 30 08:52:43 2016 | A | Dog | 1480 | 74 | A
+ Tue Aug 30 08:52:43 2016 | B | Duck | 1490 | 74 | A
+ Tue Aug 30 08:52:43 2016 | A | Dog | 1480 | 74 | B
+ Tue Aug 30 08:52:43 2016 | B | Duck | 1490 | 74 | B
+ Mon Aug 29 04:57:12 2016 | A | Dog | 1480 | 74 | A
+ Mon Aug 29 04:57:12 2016 | B | Duck | 1490 | 74 | A
+ Mon Aug 29 04:57:12 2016 | A | Dog | 1480 | 74 | B
+ Mon Aug 29 04:57:12 2016 | B | Duck | 1490 | 74 | B
+ Fri Aug 26 08:15:05 2016 | B | Duck | 1490 | 74 | A
+ Fri Aug 26 08:15:05 2016 | A | Dog | 1480 | 74 | A
+ Fri Aug 26 08:15:05 2016 | B | Duck | 1490 | 74 | B
+ Fri Aug 26 08:15:05 2016 | A | Dog | 1480 | 74 | B
+ Wed Aug 24 11:49:51 2016 | A | Dog | 1480 | 74 | A
+ Wed Aug 24 11:49:51 2016 | B | Duck | 1490 | 74 | A
+ Wed Aug 24 11:49:51 2016 | A | Dog | 1480 | 74 | B
+ Wed Aug 24 11:49:51 2016 | B | Duck | 1490 | 74 | B
+ Mon Aug 22 08:45:29 2016 | B | Duck | 1490 | 74 | A
+ Mon Aug 22 08:45:29 2016 | A | Dog | 1480 | 74 | A
+ Mon Aug 22 08:45:29 2016 | B | Duck | 1490 | 74 | B
+ Mon Aug 22 08:45:29 2016 | A | Dog | 1480 | 74 | B
+ Sun Aug 21 04:53:47 2016 | B | Duck | 1490 | 74 | A
+ Sun Aug 21 04:53:47 2016 | A | Dog | 1480 | 74 | A
+ Sun Aug 21 04:53:47 2016 | B | Duck | 1490 | 74 | B
+ Sun Aug 21 04:53:47 2016 | A | Dog | 1480 | 74 | B
+ Sat Aug 20 08:44:03 2016 | A | Dog | 1480 | 74 | A
+ Sat Aug 20 08:44:03 2016 | B | Duck | 1490 | 74 | A
+ Sat Aug 20 08:44:03 2016 | B | Duck | 1490 | 74 | B
+ Sat Aug 20 08:44:03 2016 | A | Dog | 1480 | 74 | B
+(28 rows)
+
+DROP TABLE asd;
+SELECT setseed(0);
+ setseed
+---------
+
+(1 row)
+
+ --
+ -- Check that volatile quals aren't pushed down past a set-returning function;
+ -- while a nonvolatile qual can be, if it doesn't reference the SRF.
+ --
+ create function tattle(x int, y int) returns bool
+ volatile language plpgsql as $$
+ begin
+ raise notice 'x = %, y = %', x, y;
+ return x > y;
+ end$$;
+ explain (verbose, costs off)
+ select * from
+ (select 9 as x, unnest(array[1,2,3,11,12,13]) as u) ss
+ where tattle(x, 8);
+ QUERY PLAN
+ ----------------------------------------------------------
+ Subquery Scan on ss
+ Output: x, u
+ Filter: tattle(ss.x, 8)
+ -> ProjectSet
+ Output: 9, unnest('{1,2,3,11,12,13}'::integer[])
+ -> Result
+ (6 rows)
+
+ select * from
+ (select 9 as x, unnest(array[1,2,3,11,12,13]) as u) ss
+ where tattle(x, 8);
+ NOTICE: x = 9, y = 8
+ NOTICE: x = 9, y = 8
+ NOTICE: x = 9, y = 8
+ NOTICE: x = 9, y = 8
+ NOTICE: x = 9, y = 8
+ NOTICE: x = 9, y = 8
+ x | u
+ ---+----
+ 9 | 1
+ 9 | 2
+ 9 | 3
+ 9 | 11
+ 9 | 12
+ 9 | 13
+ (6 rows)
+
+ -- if we pretend it's stable, we get different results:
+ alter function tattle(x int, y int) stable;
+ explain (verbose, costs off)
+ select * from
+ (select 9 as x, unnest(array[1,2,3,11,12,13]) as u) ss
+ where tattle(x, 8);
+ QUERY PLAN
+ ----------------------------------------------------
+ ProjectSet
+ Output: 9, unnest('{1,2,3,11,12,13}'::integer[])
+ -> Result
+ One-Time Filter: tattle(9, 8)
+ (4 rows)
+
+ select * from
+ (select 9 as x, unnest(array[1,2,3,11,12,13]) as u) ss
+ where tattle(x, 8);
+ NOTICE: x = 9, y = 8
+ x | u
+ ---+----
+ 9 | 1
+ 9 | 2
+ 9 | 3
+ 9 | 11
+ 9 | 12
+ 9 | 13
+ (6 rows)
+
+ -- although even a stable qual should not be pushed down if it references SRF
+ explain (verbose, costs off)
+ select * from
+ (select 9 as x, unnest(array[1,2,3,11,12,13]) as u) ss
+ where tattle(x, u);
+ QUERY PLAN
+ ----------------------------------------------------------
+ Subquery Scan on ss
+ Output: x, u
+ Filter: tattle(ss.x, ss.u)
+ -> ProjectSet
+ Output: 9, unnest('{1,2,3,11,12,13}'::integer[])
+ -> Result
+ (6 rows)
+
+ select * from
+ (select 9 as x, unnest(array[1,2,3,11,12,13]) as u) ss
+ where tattle(x, u);
+ NOTICE: x = 9, y = 1
+ NOTICE: x = 9, y = 2
+ NOTICE: x = 9, y = 3
+ NOTICE: x = 9, y = 11
+ NOTICE: x = 9, y = 12
+ NOTICE: x = 9, y = 13
+ x | u
+ ---+---
+ 9 | 1
+ 9 | 2
+ 9 | 3
+ (3 rows)
+
+ drop function tattle(x int, y int);
4567890123456789
(2 rows)
-(((SELECT q1 FROM int8_tbl INTERSECT SELECT q2 FROM int8_tbl ORDER BY 1))) UNION ALL SELECT q2 FROM int8_tbl;
+(((SELECT q1 FROM int8_tbl INTERSECT SELECT q2 FROM int8_tbl))) UNION ALL SELECT q2 FROM int8_tbl ORDER BY 1;
q1
-------------------
+ -4567890123456789
+ 123
123
+ 4567890123456789
456
4567890123456789
- 123
4567890123456789
- -4567890123456789
+ 4567890123456789
(7 rows)
SELECT q1 FROM int8_tbl UNION ALL SELECT q2 FROM int8_tbl EXCEPT SELECT q1 FROM int8_tbl ORDER BY 1;
-- Should succeed
DROP TABLESPACE regress_tblspace_renamed;
DROP SCHEMA testschema CASCADE;
-NOTICE: drop cascades to 5 other objects
+NOTICE: drop cascades to 3 other objects
DETAIL: drop cascades to table testschema.foo
drop cascades to table testschema.asselect
-drop cascades to table testschema.asexecute
drop cascades to table testschema.atable
+ drop cascades to table testschema.tablespace_acl
DROP ROLE regress_tablespace_user1;
DROP ROLE regress_tablespace_user2;
# NB: temp.sql does a reconnect which transiently uses 2 connections,
# so keep this parallel group to at most 19 tests
# ----------
-test: plancache limit plpgsql copy2 temp domain rangefuncs prepare without_oid conversion truncate alter_table sequence polymorphism rowtypes returning largeobject with xml
+test: plancache limit plpgsql copy2 temp domain prepare without_oid conversion truncate alter_table sequence polymorphism rowtypes returning largeobject with xml
+ # ----------
+ # Another group of parallel tests
+ # ----------
+ test: identity
+
# event triggers cannot run concurrently with any test that runs DDL
test: event_trigger
/* initdb */
header(_("initializing database system"));
+#ifdef PGXC
+ /* Initialize nodes and GTM */
+ initdb_node(PGXC_GTM);
+ initdb_node(PGXC_COORD_1);
+ initdb_node(PGXC_COORD_2);
+ initdb_node(PGXC_DATANODE_1);
+ initdb_node(PGXC_DATANODE_2);
+#else
snprintf(buf, sizeof(buf),
- "\"%s%sinitdb\" -D \"%s/data\" --noclean --nosync%s%s > \"%s/log/initdb.log\" 2>&1",
+ "\"%s%sinitdb\" -D \"%s/data\" --no-clean --no-sync%s%s > \"%s/log/initdb.log\" 2>&1",
bindir ? bindir : "",
bindir ? "/" : "",
temp_instance,
select null::int = all ('{1,2,3}');
select 33 = all ('{1,null,3}');
select 33 = all ('{33,null,33}');
+ -- nulls later in the bitmap
+ SELECT -1 != ALL(ARRAY(SELECT NULLIF(g.i, 900) FROM generate_series(1,1000) g(i)));
-- test indexes on arrays
-create temp table arr_tbl (f1 int[] unique);
+-- PGXCTODO: related to feature request 3520520, this distribution type is changed
+-- to replication. As integer arrays are not available distribution types, this table
+-- should use roundrobin distribution if nothing is specified but roundrobin
+-- distribution cannot be safely used to check constraints on remote nodes.
+-- When global constraints are supported, this replication distribution should be removed.
+create temp table arr_tbl (f1 int[] unique) distribute by replication;
insert into arr_tbl values ('{1,2,3}');
insert into arr_tbl values ('{1,2}');
-- failure expected:
COMMIT;
-SELECT ctid,cmin,* FROM combocidtest;
+SELECT ctid,cmin,* FROM combocidtest ORDER BY ctid;
+
+ -- test for bug reported in
+ -- CABRT9RC81YUf1=jsmWopcKJEro=VoeG2ou6sPwyOUTx_qteRsg@mail.gmail.com
+ CREATE TABLE IF NOT EXISTS testcase(
+ id int PRIMARY KEY,
+ balance numeric
+ );
+ INSERT INTO testcase VALUES (1, 0);
+ BEGIN;
+ SELECT * FROM testcase WHERE testcase.id = 1 FOR UPDATE;
+ UPDATE testcase SET balance = balance + 400 WHERE id=1;
+ SAVEPOINT subxact;
+ UPDATE testcase SET balance = balance - 100 WHERE id=1;
+ ROLLBACK TO SAVEPOINT subxact;
+ -- should return one tuple
+ SELECT * FROM testcase WHERE id = 1 FOR UPDATE;
+ ROLLBACK;
+ DROP TABLE testcase;
SELECT * FROM inhg; /* Two records with three columns in order x=x, xx=text, y=y */
DROP TABLE inhg;
-CREATE TABLE inhg (x text, LIKE inhx INCLUDING INDEXES, y text); /* copies indexes */
+ CREATE TABLE test_like_id_1 (a int GENERATED ALWAYS AS IDENTITY, b text);
+ \d test_like_id_1
+ INSERT INTO test_like_id_1 (b) VALUES ('b1');
+ SELECT * FROM test_like_id_1;
+ CREATE TABLE test_like_id_2 (LIKE test_like_id_1);
+ \d test_like_id_2
+ INSERT INTO test_like_id_2 (b) VALUES ('b2');
+ SELECT * FROM test_like_id_2; -- identity was not copied
+ CREATE TABLE test_like_id_3 (LIKE test_like_id_1 INCLUDING IDENTITY);
+ \d test_like_id_3
+ INSERT INTO test_like_id_3 (b) VALUES ('b3');
+ SELECT * FROM test_like_id_3; -- identity was copied and applied
+ DROP TABLE test_like_id_1, test_like_id_2, test_like_id_3;
+
+CREATE TABLE inhg (x text, LIKE inhx INCLUDING INDEXES, y text) DISTRIBUTE BY REPLICATION; /* copies indexes */
INSERT INTO inhg VALUES (5, 10);
INSERT INTO inhg VALUES (20, 10); -- should fail
DROP TABLE inhg;
explain (costs off) delete from t1 where a = 1;
delete from t1 where a = 1;
+drop rule r1 on t1;
+
+explain (costs off, nodes off) delete from t1 where a = 1;
+delete from t1 where a = 1;
+ --
+ -- Test deferred FK check on a tuple deleted by a rolled-back subtransaction
+ --
+ create table pktable2(f1 int primary key);
+ create table fktable2(f1 int references pktable2 deferrable initially deferred);
+ insert into pktable2 values(1);
+
+ begin;
+ insert into fktable2 values(1);
+ savepoint x;
+ delete from fktable2;
+ rollback to x;
+ commit;
+
+ begin;
+ insert into fktable2 values(2);
+ savepoint x;
+ delete from fktable2;
+ rollback to x;
+ commit; -- fail
+
+ --
+ -- Test that we prevent dropping FK constraint with pending trigger events
+ --
+ begin;
+ insert into fktable2 values(2);
+ alter table fktable2 drop constraint fktable2_f1_fkey;
+ commit;
+
+ begin;
+ delete from pktable2 where f1 = 1;
+ alter table fktable2 drop constraint fktable2_f1_fkey;
+ commit;
+
+ drop table pktable2, fktable2;
SET enable_seqscan TO on;
DROP INDEX inet_idx2;
+ -- check that spgist index works correctly
+ CREATE INDEX inet_idx3 ON inet_tbl using spgist (i);
+ SET enable_seqscan TO off;
+ SELECT * FROM inet_tbl WHERE i << '192.168.1.0/24'::cidr ORDER BY i;
+ SELECT * FROM inet_tbl WHERE i <<= '192.168.1.0/24'::cidr ORDER BY i;
+ SELECT * FROM inet_tbl WHERE i && '192.168.1.0/24'::cidr ORDER BY i;
+ SELECT * FROM inet_tbl WHERE i >>= '192.168.1.0/24'::cidr ORDER BY i;
+ SELECT * FROM inet_tbl WHERE i >> '192.168.1.0/24'::cidr ORDER BY i;
+ SELECT * FROM inet_tbl WHERE i < '192.168.1.0/24'::cidr ORDER BY i;
+ SELECT * FROM inet_tbl WHERE i <= '192.168.1.0/24'::cidr ORDER BY i;
+ SELECT * FROM inet_tbl WHERE i = '192.168.1.0/24'::cidr ORDER BY i;
+ SELECT * FROM inet_tbl WHERE i >= '192.168.1.0/24'::cidr ORDER BY i;
+ SELECT * FROM inet_tbl WHERE i > '192.168.1.0/24'::cidr ORDER BY i;
+ SELECT * FROM inet_tbl WHERE i <> '192.168.1.0/24'::cidr ORDER BY i;
+
+ -- test index-only scans
+ EXPLAIN (COSTS OFF)
+ SELECT i FROM inet_tbl WHERE i << '192.168.1.0/24'::cidr ORDER BY i;
+ SELECT i FROM inet_tbl WHERE i << '192.168.1.0/24'::cidr ORDER BY i;
+
+ SET enable_seqscan TO on;
+ DROP INDEX inet_idx3;
+
-- simple tests of inet boolean and arithmetic operators
-SELECT i, ~i AS "~i" FROM inet_tbl;
-SELECT i, c, i & c AS "and" FROM inet_tbl;
-SELECT i, c, i | c AS "or" FROM inet_tbl;
-SELECT i, i + 500 AS "i+500" FROM inet_tbl;
-SELECT i, i - 500 AS "i-500" FROM inet_tbl;
-SELECT i, c, i - c AS "minus" FROM inet_tbl;
+SELECT i, ~i AS "~i" FROM inet_tbl ORDER BY i;
+SELECT i, c, i & c AS "and" FROM inet_tbl ORDER BY i, c;
+SELECT i, c, i | c AS "or" FROM inet_tbl ORDER BY i, c;
+SELECT i, i + 500 AS "i+500" FROM inet_tbl ORDER BY i;
+SELECT i, i - 500 AS "i-500" FROM inet_tbl ORDER BY i;
+SELECT i, c, i - c AS "minus" FROM inet_tbl ORDER BY i, c;
SELECT '127.0.0.1'::inet + 257;
SELECT ('127.0.0.1'::inet + 257) - 257;
SELECT '127::1'::inet + 257;
( select f1 from foo union all select f1+3 from foo ) ss
where bar.f1 = ss.f1;
-select tableoid::regclass::text as relname, bar.* from bar order by 1,2;
+--select tableoid::regclass::text as relname, bar.* from bar order by 1,2;
+ -- Check UPDATE with *partitioned* inherited target and an appendrel subquery
+ create table some_tab (a int);
+ insert into some_tab values (0);
+ create table some_tab_child () inherits (some_tab);
+ insert into some_tab_child values (1);
+ create table parted_tab (a int, b char) partition by list (a);
+ create table parted_tab_part1 partition of parted_tab for values in (1);
+ create table parted_tab_part2 partition of parted_tab for values in (2);
+ create table parted_tab_part3 partition of parted_tab for values in (3);
+ insert into parted_tab values (1, 'a'), (2, 'a'), (3, 'a');
+
+ update parted_tab set b = 'b'
+ from
+ (select a from some_tab union all select a+1 from some_tab) ss (a)
+ where parted_tab.a = ss.a;
+ select tableoid::regclass::text as relname, parted_tab.* from parted_tab order by 1,2;
+
+ truncate parted_tab;
+ insert into parted_tab values (1, 'a'), (2, 'a'), (3, 'a');
+ update parted_tab set b = 'b'
+ from
+ (select 0 from parted_tab union all select 1 from parted_tab) ss (a)
+ where parted_tab.a = ss.a;
+ select tableoid::regclass::text as relname, parted_tab.* from parted_tab order by 1,2;
+
+ drop table parted_tab;
+ drop table some_tab cascade;
+
/* Test multiple inheritance of column defaults */
CREATE TABLE firstparent (tomorrow date default now()::date + 1);
tenk1 b join lateral (values(a.unique1),(-1)) ss(x) on b.unique2 = ss.x;
-- lateral injecting a strange outer join condition
-explain (costs off)
+explain (num_nodes off, nodes off, costs off)
select * from int8_tbl a,
int8_tbl x left join lateral (select a.q1 from int4_tbl y) ss(z)
- on x.q2 = ss.z;
+ on x.q2 = ss.z
+ order by a.q1, a.q2, x.q1, x.q2, ss.z;
select * from int8_tbl a,
int8_tbl x left join lateral (select a.q1 from int4_tbl y) ss(z)
- on x.q2 = ss.z;
+ on x.q2 = ss.z
+ order by a.q1, a.q2, x.q1, x.q2, ss.z;
-- lateral reference to a join alias variable
select * from (select f1/2 as x from int4_tbl) ss1 join int4_tbl i4 on x = f1,
delete from xx1 using (select * from int4_tbl where f1 = xx1.x1) ss;
delete from xx1 using lateral (select * from int4_tbl where f1 = x1) ss;
+-- demonstrate problem with extremely slow join
+CREATE TABLE testr (a int, b int) DISTRIBUTE BY REPLICATION;
+INSERT INTO testr SELECT generate_series(1, 10000), generate_series(5001, 15000);
+CREATE TABLE testh (a int, b int);
+INSERT INTO testh SELECT generate_series(1, 10000), generate_series(8001, 18000);
+set enable_mergejoin TO false;
+set enable_hashjoin TO false;
+EXPLAIN (VERBOSE, COSTS OFF) SELECT count(*) FROM testr WHERE NOT EXISTS (SELECT * FROM testh WHERE testr.b = testh.b);
+SELECT count(*) FROM testr WHERE NOT EXISTS (SELECT * FROM testh WHERE testr.b = testh.b);
++
+ --
+ -- test planner's ability to mark joins as unique
+ --
+
+ create table j1 (id int primary key);
+ create table j2 (id int primary key);
+ create table j3 (id int);
+
+ insert into j1 values(1),(2),(3);
+ insert into j2 values(1),(2),(3);
+ insert into j3 values(1),(1);
+
+ analyze j1;
+ analyze j2;
+ analyze j3;
+
+ -- ensure join is properly marked as unique
+ explain (verbose, costs off)
+ select * from j1 inner join j2 on j1.id = j2.id;
+
+ -- ensure join is not unique when not an equi-join
+ explain (verbose, costs off)
+ select * from j1 inner join j2 on j1.id > j2.id;
+
+ -- ensure non-unique rel is not chosen as inner
+ explain (verbose, costs off)
+ select * from j1 inner join j3 on j1.id = j3.id;
+
+ -- ensure left join is marked as unique
+ explain (verbose, costs off)
+ select * from j1 left join j2 on j1.id = j2.id;
+
+ -- ensure right join is marked as unique
+ explain (verbose, costs off)
+ select * from j1 right join j2 on j1.id = j2.id;
+
+ -- ensure full join is marked as unique
+ explain (verbose, costs off)
+ select * from j1 full join j2 on j1.id = j2.id;
+
+ -- a clauseless (cross) join can't be unique
+ explain (verbose, costs off)
+ select * from j1 cross join j2;
+
+ -- ensure a natural join is marked as unique
+ explain (verbose, costs off)
+ select * from j1 natural join j2;
+
+ -- ensure a distinct clause allows the inner to become unique
+ explain (verbose, costs off)
+ select * from j1
+ inner join (select distinct id from j3) j3 on j1.id = j3.id;
+
+ -- ensure group by clause allows the inner to become unique
+ explain (verbose, costs off)
+ select * from j1
+ inner join (select id from j3 group by id) j3 on j1.id = j3.id;
+
+ drop table j1;
+ drop table j2;
+ drop table j3;
+
+ -- test more complex permutations of unique joins
+
+ create table j1 (id1 int, id2 int, primary key(id1,id2));
+ create table j2 (id1 int, id2 int, primary key(id1,id2));
+ create table j3 (id1 int, id2 int, primary key(id1,id2));
+
+ insert into j1 values(1,1),(1,2);
+ insert into j2 values(1,1);
+ insert into j3 values(1,1);
+
+ analyze j1;
+ analyze j2;
+ analyze j3;
+
+ -- ensure there's no unique join when not all columns which are part of the
+ -- unique index are seen in the join clause
+ explain (verbose, costs off)
+ select * from j1
+ inner join j2 on j1.id1 = j2.id1;
+
+ -- ensure proper unique detection with multiple join quals
+ explain (verbose, costs off)
+ select * from j1
+ inner join j2 on j1.id1 = j2.id1 and j1.id2 = j2.id2;
+
+ -- ensure we don't detect the join to be unique when quals are not part of the
+ -- join condition
+ explain (verbose, costs off)
+ select * from j1
+ inner join j2 on j1.id1 = j2.id1 where j1.id2 = 1;
+
+ -- as above, but for left joins.
+ explain (verbose, costs off)
+ select * from j1
+ left join j2 on j1.id1 = j2.id1 where j1.id2 = 1;
+
+ -- validate logic in merge joins which skips mark and restore.
+ -- it should only do this if all quals which were used to detect the unique
+ -- are present as join quals, and not plain quals.
+ set enable_nestloop to 0;
+ set enable_hashjoin to 0;
+ set enable_sort to 0;
+
+ -- create an index that will be preferred over the PK to perform the join
+ create index j1_id1_idx on j1 (id1) where id1 % 1000 = 1;
+
+ explain (costs off) select * from j1 j1
+ inner join j1 j2 on j1.id1 = j2.id1 and j1.id2 = j2.id2
+ where j1.id1 % 1000 = 1 and j2.id1 % 1000 = 1;
+
+ select * from j1 j1
+ inner join j1 j2 on j1.id1 = j2.id1 and j1.id2 = j2.id2
+ where j1.id1 % 1000 = 1 and j2.id1 % 1000 = 1;
+
+ reset enable_nestloop;
+ reset enable_hashjoin;
+ reset enable_sort;
+
+ drop table j1;
+ drop table j2;
+ drop table j3;
+
+ -- check that semijoin inner is not seen as unique for a portion of the outerrel
+ explain (verbose, costs off)
+ select t1.unique1, t2.hundred
+ from onek t1, tenk1 t2
+ where exists (select 1 from tenk1 t3
+ where t3.thousand = t1.unique1 and t3.tenthous = t2.hundred)
+ and t1.unique1 < 1;
+
+ -- ... unless it actually is unique
+ create table j3 as select unique1, tenthous from onek;
+ vacuum analyze j3;
+ create unique index on j3(unique1, tenthous);
+
+ explain (verbose, costs off)
+ select t1.unique1, t2.hundred
+ from onek t1, tenk1 t2
+ where exists (select 1 from j3
+ where j3.unique1 = t1.unique1 and j3.tenthous = t2.hundred)
+ and t1.unique1 < 1;
+
+ drop table j3;
SELECT (-12345)::money;
SELECT (-1234567890)::money;
SELECT (-12345678901234567)::money;
- SELECT (-123456789012345678)::money;
- SELECT (-9223372036854775808)::money;
SELECT 1234567890::int4::money;
SELECT 12345678901234567::int8::money;
+ SELECT 12345678901234567::numeric::money;
SELECT (-1234567890)::int4::money;
SELECT (-12345678901234567)::int8::money;
+
+INSERT INTO money_data VALUES ('$223.459');
+INSERT INTO money_data VALUES ('$323.459');
+INSERT INTO money_data VALUES ('$423.459');
+INSERT INTO money_data VALUES ('$523.459');
+SELECT sum(m) FROM money_data;
+
+CREATE TABLE money_data2 (a int, m money);
+INSERT INTO money_data2 VALUES (1, '$123.459');
+INSERT INTO money_data2 VALUES (2, '$223.459');
+INSERT INTO money_data2 VALUES (3, '$323.459');
+INSERT INTO money_data2 VALUES (4, '$423.459');
+INSERT INTO money_data2 VALUES (5, '$523.459');
+SELECT sum(m) FROM money_data2;
+DROP TABLE money_data2;
++
+ SELECT (-12345678901234567)::numeric::money;
+
+ -- Cast from money
+ SELECT '12345678901234567'::money::numeric;
+ SELECT '-12345678901234567'::money::numeric;
end;
$$;
+
+-- Check parameter handling
+BEGIN;
+DROP TABLE IF EXISTS testcase_13;
+CREATE TABLE testcase_13 (patient_id integer);
+INSERT INTO testcase_13 VALUES (1);
+DO $$
+DECLARE
+ r RECORD;
+BEGIN
+FOR r IN SELECT * FROM testcase_13 LOOP
+ RAISE INFO 'r.patient_id=%', r.patient_id;
+ IF (SELECT EXISTS (
+ SELECT FROM testcase_13 WHERE patient_id = r.patient_id
+ ))
+ THEN
+ RAISE INFO 'condition true';
+ END IF;
+ END LOOP;
+END $$;
+ROLLBACK;
++
+ -- Test use of plpgsql in a domain check constraint (cf. bug #14414)
+
+ create function plpgsql_domain_check(val int) returns boolean as $$
+ begin return val > 0; end
+ $$ language plpgsql immutable;
+
+ create domain plpgsql_domain as integer check(plpgsql_domain_check(value));
+
+ do $$
+ declare v_test plpgsql_domain;
+ begin
+ v_test := 1;
+ end;
+ $$;
+
+ do $$
+ declare v_test plpgsql_domain := 1;
+ begin
+ v_test := 0; -- fail
+ end;
+ $$;
+
+ -- Test handling of expanded array passed to a domain constraint (bug #14472)
+
+ create function plpgsql_arr_domain_check(val int[]) returns boolean as $$
+ begin return val[1] > 0; end
+ $$ language plpgsql immutable;
+
+ create domain plpgsql_arr_domain as int[] check(plpgsql_arr_domain_check(value));
+
+ do $$
+ declare v_test plpgsql_arr_domain;
+ begin
+ v_test := array[1];
+ v_test := v_test || 2;
+ end;
+ $$;
+
+ do $$
+ declare v_test plpgsql_arr_domain := array[1];
+ begin
+ v_test := 0 || v_test; -- fail
+ end;
+ $$;
+
+ --
+ -- test usage of transition tables in AFTER triggers
+ --
+
+ CREATE TABLE transition_table_base (id int PRIMARY KEY, val text);
+
+ CREATE FUNCTION transition_table_base_ins_func()
+ RETURNS trigger
+ LANGUAGE plpgsql
+ AS $$
+ DECLARE
+ t text;
+ l text;
+ BEGIN
+ t = '';
+ FOR l IN EXECUTE
+ $q$
+ EXPLAIN (TIMING off, COSTS off, VERBOSE on)
+ SELECT * FROM newtable
+ $q$ LOOP
+ t = t || l || E'\n';
+ END LOOP;
+
+ RAISE INFO '%', t;
+ RETURN new;
+ END;
+ $$;
+
+ CREATE TRIGGER transition_table_base_ins_trig
+ AFTER INSERT ON transition_table_base
+ REFERENCING OLD TABLE AS oldtable NEW TABLE AS newtable
+ FOR EACH STATEMENT
+ EXECUTE PROCEDURE transition_table_base_ins_func();
+
+ CREATE TRIGGER transition_table_base_ins_trig
+ AFTER INSERT ON transition_table_base
+ REFERENCING NEW TABLE AS newtable
+ FOR EACH STATEMENT
+ EXECUTE PROCEDURE transition_table_base_ins_func();
+
+ INSERT INTO transition_table_base VALUES (1, 'One'), (2, 'Two');
+ INSERT INTO transition_table_base VALUES (3, 'Three'), (4, 'Four');
+
+ CREATE OR REPLACE FUNCTION transition_table_base_upd_func()
+ RETURNS trigger
+ LANGUAGE plpgsql
+ AS $$
+ DECLARE
+ t text;
+ l text;
+ BEGIN
+ t = '';
+ FOR l IN EXECUTE
+ $q$
+ EXPLAIN (TIMING off, COSTS off, VERBOSE on)
+ SELECT * FROM oldtable ot FULL JOIN newtable nt USING (id)
+ $q$ LOOP
+ t = t || l || E'\n';
+ END LOOP;
+
+ RAISE INFO '%', t;
+ RETURN new;
+ END;
+ $$;
+
+ CREATE TRIGGER transition_table_base_upd_trig
+ AFTER UPDATE ON transition_table_base
+ REFERENCING OLD TABLE AS oldtable NEW TABLE AS newtable
+ FOR EACH STATEMENT
+ EXECUTE PROCEDURE transition_table_base_upd_func();
+
+ UPDATE transition_table_base
+ SET val = '*' || val || '*'
+ WHERE id BETWEEN 2 AND 3;
+
+ CREATE TABLE transition_table_level1
+ (
+ level1_no serial NOT NULL ,
+ level1_node_name varchar(255),
+ PRIMARY KEY (level1_no)
+ ) WITHOUT OIDS;
+
+ CREATE TABLE transition_table_level2
+ (
+ level2_no serial NOT NULL ,
+ parent_no int NOT NULL,
+ level1_node_name varchar(255),
+ PRIMARY KEY (level2_no)
+ ) WITHOUT OIDS;
+
+ CREATE TABLE transition_table_status
+ (
+ level int NOT NULL,
+ node_no int NOT NULL,
+ status int,
+ PRIMARY KEY (level, node_no)
+ ) WITHOUT OIDS;
+
+ CREATE FUNCTION transition_table_level1_ri_parent_del_func()
+ RETURNS TRIGGER
+ LANGUAGE plpgsql
+ AS $$
+ DECLARE n bigint;
+ BEGIN
+ PERFORM FROM p JOIN transition_table_level2 c ON c.parent_no = p.level1_no;
+ IF FOUND THEN
+ RAISE EXCEPTION 'RI error';
+ END IF;
+ RETURN NULL;
+ END;
+ $$;
+
+ CREATE TRIGGER transition_table_level1_ri_parent_del_trigger
+ AFTER DELETE ON transition_table_level1
+ REFERENCING OLD TABLE AS p
+ FOR EACH STATEMENT EXECUTE PROCEDURE
+ transition_table_level1_ri_parent_del_func();
+
+ CREATE FUNCTION transition_table_level1_ri_parent_upd_func()
+ RETURNS TRIGGER
+ LANGUAGE plpgsql
+ AS $$
+ DECLARE
+ x int;
+ BEGIN
+ WITH p AS (SELECT level1_no, sum(delta) cnt
+ FROM (SELECT level1_no, 1 AS delta FROM i
+ UNION ALL
+ SELECT level1_no, -1 AS delta FROM d) w
+ GROUP BY level1_no
+ HAVING sum(delta) < 0)
+ SELECT level1_no
+ FROM p JOIN transition_table_level2 c ON c.parent_no = p.level1_no
+ INTO x;
+ IF FOUND THEN
+ RAISE EXCEPTION 'RI error';
+ END IF;
+ RETURN NULL;
+ END;
+ $$;
+
+ CREATE TRIGGER transition_table_level1_ri_parent_upd_trigger
+ AFTER UPDATE ON transition_table_level1
+ REFERENCING OLD TABLE AS d NEW TABLE AS i
+ FOR EACH STATEMENT EXECUTE PROCEDURE
+ transition_table_level1_ri_parent_upd_func();
+
+ CREATE FUNCTION transition_table_level2_ri_child_insupd_func()
+ RETURNS TRIGGER
+ LANGUAGE plpgsql
+ AS $$
+ BEGIN
+ PERFORM FROM i
+ LEFT JOIN transition_table_level1 p
+ ON p.level1_no IS NOT NULL AND p.level1_no = i.parent_no
+ WHERE p.level1_no IS NULL;
+ IF FOUND THEN
+ RAISE EXCEPTION 'RI error';
+ END IF;
+ RETURN NULL;
+ END;
+ $$;
+
+ CREATE TRIGGER transition_table_level2_ri_child_insupd_trigger
+ AFTER INSERT OR UPDATE ON transition_table_level2
+ REFERENCING NEW TABLE AS i
+ FOR EACH STATEMENT EXECUTE PROCEDURE
+ transition_table_level2_ri_child_insupd_func();
+
+ -- create initial test data
+ INSERT INTO transition_table_level1 (level1_no)
+ SELECT generate_series(1,200);
+ ANALYZE transition_table_level1;
+
+ INSERT INTO transition_table_level2 (level2_no, parent_no)
+ SELECT level2_no, level2_no / 50 + 1 AS parent_no
+ FROM generate_series(1,9999) level2_no;
+ ANALYZE transition_table_level2;
+
+ INSERT INTO transition_table_status (level, node_no, status)
+ SELECT 1, level1_no, 0 FROM transition_table_level1;
+
+ INSERT INTO transition_table_status (level, node_no, status)
+ SELECT 2, level2_no, 0 FROM transition_table_level2;
+ ANALYZE transition_table_status;
+
+ INSERT INTO transition_table_level1(level1_no)
+ SELECT generate_series(201,1000);
+ ANALYZE transition_table_level1;
+
+ -- behave reasonably if someone tries to modify a transition table
+ CREATE FUNCTION transition_table_level2_bad_usage_func()
+ RETURNS TRIGGER
+ LANGUAGE plpgsql
+ AS $$
+ BEGIN
+ INSERT INTO d VALUES (1000000, 1000000, 'x');
+ RETURN NULL;
+ END;
+ $$;
+
+ CREATE TRIGGER transition_table_level2_bad_usage_trigger
+ AFTER DELETE ON transition_table_level2
+ REFERENCING OLD TABLE AS d
+ FOR EACH STATEMENT EXECUTE PROCEDURE
+ transition_table_level2_bad_usage_func();
+
+ DELETE FROM transition_table_level2
+ WHERE level2_no BETWEEN 301 AND 305;
+
+ DROP TRIGGER transition_table_level2_bad_usage_trigger
+ ON transition_table_level2;
+
+ -- attempt modifications which would break RI (should all fail)
+ DELETE FROM transition_table_level1
+ WHERE level1_no = 25;
+
+ UPDATE transition_table_level1 SET level1_no = -1
+ WHERE level1_no = 30;
+
+ INSERT INTO transition_table_level2 (level2_no, parent_no)
+ VALUES (10000, 10000);
+
+ UPDATE transition_table_level2 SET parent_no = 2000
+ WHERE level2_no = 40;
+
+
+ -- attempt modifications which would not break RI (should all succeed)
+ DELETE FROM transition_table_level1
+ WHERE level1_no BETWEEN 201 AND 1000;
+
+ DELETE FROM transition_table_level1
+ WHERE level1_no BETWEEN 100000000 AND 100000010;
+
+ SELECT count(*) FROM transition_table_level1;
+
+ DELETE FROM transition_table_level2
+ WHERE level2_no BETWEEN 211 AND 220;
+
+ SELECT count(*) FROM transition_table_level2;
+
+ CREATE TABLE alter_table_under_transition_tables
+ (
+ id int PRIMARY KEY,
+ name text
+ );
+
+ CREATE FUNCTION alter_table_under_transition_tables_upd_func()
+ RETURNS TRIGGER
+ LANGUAGE plpgsql
+ AS $$
+ BEGIN
+ RAISE WARNING 'old table = %, new table = %',
+ (SELECT string_agg(id || '=' || name, ',') FROM d),
+ (SELECT string_agg(id || '=' || name, ',') FROM i);
+ RAISE NOTICE 'one = %', (SELECT 1 FROM alter_table_under_transition_tables LIMIT 1);
+ RETURN NULL;
+ END;
+ $$;
+
+ -- should fail, TRUNCATE is not compatible with transition tables
+ CREATE TRIGGER alter_table_under_transition_tables_upd_trigger
+ AFTER TRUNCATE OR UPDATE ON alter_table_under_transition_tables
+ REFERENCING OLD TABLE AS d NEW TABLE AS i
+ FOR EACH STATEMENT EXECUTE PROCEDURE
+ alter_table_under_transition_tables_upd_func();
+
+ -- should work
+ CREATE TRIGGER alter_table_under_transition_tables_upd_trigger
+ AFTER UPDATE ON alter_table_under_transition_tables
+ REFERENCING OLD TABLE AS d NEW TABLE AS i
+ FOR EACH STATEMENT EXECUTE PROCEDURE
+ alter_table_under_transition_tables_upd_func();
+
+ INSERT INTO alter_table_under_transition_tables
+ VALUES (1, '1'), (2, '2'), (3, '3');
+ UPDATE alter_table_under_transition_tables
+ SET name = name || name;
+
+ -- now change 'name' to an integer to see what happens...
+ ALTER TABLE alter_table_under_transition_tables
+ ALTER COLUMN name TYPE int USING name::integer;
+ UPDATE alter_table_under_transition_tables
+ SET name = (name::text || name::text)::integer;
+
+ -- now drop column 'name'
+ ALTER TABLE alter_table_under_transition_tables
+ DROP column name;
+ UPDATE alter_table_under_transition_tables
+ SET id = id;
COPY atest2 FROM stdin; -- ok
bar true
\.
-SELECT * FROM atest1; -- ok
+SELECT * FROM atest1 ORDER BY 1; -- ok
+ -- test leaky-function protections in selfuncs
+
+ -- regress_user1 will own a table and provide a view for it.
+ SET SESSION AUTHORIZATION regress_user1;
+
+ CREATE TABLE atest12 as
+ SELECT x AS a, 10001 - x AS b FROM generate_series(1,10000) x;
+ CREATE INDEX ON atest12 (a);
+ CREATE INDEX ON atest12 (abs(a));
+ VACUUM ANALYZE atest12;
+
+ CREATE FUNCTION leak(integer,integer) RETURNS boolean
+ AS $$begin return $1 < $2; end$$
+ LANGUAGE plpgsql immutable;
+ CREATE OPERATOR <<< (procedure = leak, leftarg = integer, rightarg = integer,
+ restrict = scalarltsel);
+
+ -- view with leaky operator
+ CREATE VIEW atest12v AS
+ SELECT * FROM atest12 WHERE b <<< 5;
+ GRANT SELECT ON atest12v TO PUBLIC;
+
+ -- This plan should use nestloop, knowing that few rows will be selected.
+ EXPLAIN (COSTS OFF) SELECT * FROM atest12v x, atest12v y WHERE x.a = y.b;
+
+ -- And this one.
+ EXPLAIN (COSTS OFF) SELECT * FROM atest12 x, atest12 y
+ WHERE x.a = y.b and abs(y.a) <<< 5;
+
+ -- Check if regress_user2 can break security.
+ SET SESSION AUTHORIZATION regress_user2;
+
+ CREATE FUNCTION leak2(integer,integer) RETURNS boolean
+ AS $$begin raise notice 'leak % %', $1, $2; return $1 > $2; end$$
+ LANGUAGE plpgsql immutable;
+ CREATE OPERATOR >>> (procedure = leak2, leftarg = integer, rightarg = integer,
+ restrict = scalargtsel);
+
+ -- This should not show any "leak" notices before failing.
+ EXPLAIN (COSTS OFF) SELECT * FROM atest12 WHERE a >>> 0;
+
+ -- This plan should use hashjoin, as it will expect many rows to be selected.
+ EXPLAIN (COSTS OFF) SELECT * FROM atest12v x, atest12v y WHERE x.a = y.b;
+
+ -- Now regress_user1 grants sufficient access to regress_user2.
+ SET SESSION AUTHORIZATION regress_user1;
+ GRANT SELECT (a, b) ON atest12 TO PUBLIC;
+ SET SESSION AUTHORIZATION regress_user2;
+
+ -- Now regress_user2 will also get a good row estimate.
+ EXPLAIN (COSTS OFF) SELECT * FROM atest12v x, atest12v y WHERE x.a = y.b;
+
+ -- But not for this, due to lack of table-wide permissions needed
+ -- to make use of the expression index's statistics.
+ EXPLAIN (COSTS OFF) SELECT * FROM atest12 x, atest12 y
+ WHERE x.a = y.b and abs(y.a) <<< 5;
+
+ -- clean up (regress_user1's objects are all dropped later)
+ DROP FUNCTION leak2(integer, integer) CASCADE;
+
+
-- groups
SET SESSION AUTHORIZATION regress_user3;
shoename char(10), -- primary key
sh_avail integer, -- available # of pairs
slcolor char(10), -- preferred shoelace color
- slminlen float, -- miminum shoelace length
+ slminlen float, -- minimum shoelace length
slmaxlen float, -- maximum shoelace length
slunit char(8) -- length unit
-);
+) distribute by roundrobin;
CREATE TABLE shoelace_data (
sl_name char(10), -- primary key
-- test the views defined in CREATE_VIEWS
--
-SELECT * FROM street;
+SELECT * FROM street ORDER BY name,cname,thepath::text;
- SELECT name, #thepath FROM iexit ORDER BY 1, 2;
+ SELECT name, #thepath FROM iexit ORDER BY name COLLATE "C", 2;
SELECT * FROM toyemp WHERE name = 'sharon';
---
--- test creation of SERIAL column
---
-
+SET sequence_range = 1;
- CREATE TABLE serialTest (f1 text, f2 serial);
+ CREATE TABLE serialTest1 (f1 text, f2 serial);
- INSERT INTO serialTest VALUES ('foo');
- INSERT INTO serialTest VALUES ('bar');
- INSERT INTO serialTest VALUES ('force', 100);
- INSERT INTO serialTest VALUES ('wrong', NULL);
+ INSERT INTO serialTest1 VALUES ('foo');
+ INSERT INTO serialTest1 VALUES ('bar');
+ INSERT INTO serialTest1 VALUES ('force', 100);
+ INSERT INTO serialTest1 VALUES ('wrong', NULL);
- SELECT * FROM serialTest ORDER BY f1, f2;
-SELECT * FROM serialTest1;
++SELECT * FROM serialTest1 ORDER BY f1, f2;
-- test smallserial / bigserial
CREATE TABLE serialTest2 (f1 text, f2 serial, f3 smallserial, f4 serial2,
DROP SEQUENCE foo_seq_new;
-- renaming serial sequences
- ALTER TABLE serialtest_f2_seq RENAME TO serialtest_f2_foo;
- INSERT INTO serialTest VALUES ('more');
- SELECT * FROM serialTest ORDER BY f1, f2;
+ ALTER TABLE serialtest1_f2_seq RENAME TO serialtest1_f2_foo;
+ INSERT INTO serialTest1 VALUES ('more');
-SELECT * FROM serialTest1;
++SELECT * FROM serialTest1 ORDER BY f1, f2;
--
-- Check dependencies of serial and ordinary sequences
DROP USER regress_seq_user;
DROP SEQUENCE seq;
+create table test_seqtab (unique1 int, unique2 int);
+insert into test_seqtab select i, i from generate_series(1,1000) s(i);
+
+create temp sequence testseq;
+select distinct(nextval('testseq'))
+ from test_seqtab order by 1 limit 10;
+drop table test_seqtab;
++
+ -- cache tests
+ CREATE SEQUENCE test_seq1 CACHE 10;
+ SELECT nextval('test_seq1');
+ SELECT nextval('test_seq1');
+ SELECT nextval('test_seq1');
+
+ DROP SEQUENCE test_seq1;
select nextval('ts1');
+SELECT setseed(0);
+
+-- DROP TABLE IF EXISTS asd ;
+
+CREATE TABLE IF NOT EXISTS asd AS
+SELECT clientid::numeric(20),
+ (clientid / 20 )::integer::numeric(20) as userid,
+ cts + ((random()* 3600 *24 )||'sec')::interval as cts,
+ (ARRAY['A','B','C','D','E','F'])[(random()*5+1)::integer] as state,
+ 0 as dim,
+ ((ARRAY['Cat','Dog','Duck'])[(clientid / 10 )% 3 +1 ]) ::text as app_name,
+ ((ARRAY['A','B'])[(clientid / 10 )% 2 +1 ]) ::text as platform
+ FROM generate_series('2016-01-01'::timestamp,'2016-10-01'::timestamp,interval '15 day') cts , generate_series( 1000,2000,10) clientid , generate_series(1,6) t
+;
+
+SELECT dates::timestamp as dates ,B.platform,B.app_name, B.clientid, B.userid,
+ B.state as state
+FROM ( VALUES
+('2016.08.30. 08:52:43') ,('2016.08.29. 04:57:12') ,('2016.08.26. 08:15:05') ,
+('2016.08.24. 11:49:51') ,('2016.08.22. 08:45:29') ,('2016.08.21. 04:53:47') ,('2016.08.20. 08:44:03')
+) AS D (dates)
+JOIN
+( SELECT DISTINCT clientid FROM asd
+ WHERE userid=74 ) C ON True
+INNER JOIN LATERAL (
+ SELECT DISTINCT ON (clientid,app_name,platform,state,dim) x.*
+ FROM asd x
+ INNER JOIN (SELECT p.clientid,p.app_name,p.platform , p.state, p.dim ,
+ MAX(p.cts) AS selected_cts
+ FROM asd p
+ where cts<D.dates::timestamp and state in
+ ('A','B')
+ GROUP BY p.clientid,p.app_name,p.platform,p.state,p.dim) y
+ ON y.clientid = x.clientid
+ AND y.selected_cts = x.cts
+ AND y.platform = x.platform
+ AND y.app_name=x.app_name
+ AND y.state=x.state
+ AND y.dim = x.dim
+ and x.clientid = C.clientid
+) B ON True
+ORDER BY dates desc, state;
+
+DROP TABLE asd;
+SELECT setseed(0);
+ --
+ -- Check that volatile quals aren't pushed down past a set-returning function;
+ -- while a nonvolatile qual can be, if it doesn't reference the SRF.
+ --
+ create function tattle(x int, y int) returns bool
+ volatile language plpgsql as $$
+ begin
+ raise notice 'x = %, y = %', x, y;
+ return x > y;
+ end$$;
+
+ explain (verbose, costs off)
+ select * from
+ (select 9 as x, unnest(array[1,2,3,11,12,13]) as u) ss
+ where tattle(x, 8);
+
+ select * from
+ (select 9 as x, unnest(array[1,2,3,11,12,13]) as u) ss
+ where tattle(x, 8);
+
+ -- if we pretend it's stable, we get different results:
+ alter function tattle(x int, y int) stable;
+
+ explain (verbose, costs off)
+ select * from
+ (select 9 as x, unnest(array[1,2,3,11,12,13]) as u) ss
+ where tattle(x, 8);
+
+ select * from
+ (select 9 as x, unnest(array[1,2,3,11,12,13]) as u) ss
+ where tattle(x, 8);
+
+ -- although even a stable qual should not be pushed down if it references SRF
+ explain (verbose, costs off)
+ select * from
+ (select 9 as x, unnest(array[1,2,3,11,12,13]) as u) ss
+ where tattle(x, u);
+
+ select * from
+ (select 9 as x, unnest(array[1,2,3,11,12,13]) as u) ss
+ where tattle(x, u);
+
+ drop function tattle(x int, y int);
UPDATE update_test SET a=v.i FROM (VALUES(100, 20)) AS v(i, j)
WHERE update_test.b = v.j;
-SELECT * FROM update_test;
+SELECT * FROM update_test ORDER BY a, b, c;
+ -- fail, wrong data type:
+ UPDATE update_test SET a = v.* FROM (VALUES(100, 20)) AS v(i, j)
+ WHERE update_test.b = v.j;
+
--
-- Test multiple-set-clause syntax
--
UNION ALL
SELECT n || ' bar' FROM t WHERE length(n) < 20
)
-SELECT n, n IS OF (text) AS is_text FROM t;
+SELECT n, n IS OF (text) as is_text FROM t ORDER BY n;
+ -- In a perfect world, this would work and resolve the literal as int ...
+ -- but for now, we have to be content with resolving to text too soon.
+ WITH RECURSIVE t(n) AS (
+ SELECT '7'
+ UNION ALL
+ SELECT n+1 FROM t WHERE n < 10
+ )
+ SELECT n, n IS OF (int) AS is_int FROM t;
+
--
-- Some examples with a tree
--