Merge remote-tracking branch 'remotes/PGSQL/REL9_5_STABLE' into XL9_5_STABLE
authorPavan Deolasee <[email protected]>
Thu, 10 Dec 2015 10:08:17 +0000 (15:38 +0530)
committerPavan Deolasee <[email protected]>
Thu, 10 Dec 2015 10:08:17 +0000 (15:38 +0530)
178 files changed:
1  2 
configure
configure.in
contrib/pg_stat_statements/pg_stat_statements.c
doc/bug.template
doc/src/sgml/backup.sgml
doc/src/sgml/catalogs.sgml
doc/src/sgml/config.sgml
doc/src/sgml/datatype.sgml
doc/src/sgml/dblink.sgml
doc/src/sgml/ddl.sgml
doc/src/sgml/func.sgml
doc/src/sgml/high-availability.sgml
doc/src/sgml/indices.sgml
doc/src/sgml/keywords.sgml
doc/src/sgml/libpq.sgml
doc/src/sgml/maintenance.sgml
doc/src/sgml/manage-ag.sgml
doc/src/sgml/mvcc.sgml
doc/src/sgml/pageinspect.sgml
doc/src/sgml/pgstatstatements.sgml
doc/src/sgml/plpgsql.sgml
doc/src/sgml/postgres.sgml
doc/src/sgml/recovery-config.sgml
doc/src/sgml/ref/alter_database.sgml
doc/src/sgml/ref/alter_large_object.sgml
doc/src/sgml/ref/alter_table.sgml
doc/src/sgml/ref/create_database.sgml
doc/src/sgml/ref/create_function.sgml
doc/src/sgml/ref/create_table.sgml
doc/src/sgml/ref/pg_ctl-ref.sgml
doc/src/sgml/ref/pgbench.sgml
doc/src/sgml/ref/pgupgrade.sgml
doc/src/sgml/ref/vacuumdb.sgml
doc/src/sgml/rules.sgml
doc/src/sgml/runtime.sgml
src/Makefile.global.in
src/Makefile.shlib
src/backend/Makefile
src/backend/access/common/heaptuple.c
src/backend/access/heap/heapam.c
src/backend/access/rmgrdesc/xactdesc.c
src/backend/access/transam/slru.c
src/backend/access/transam/subtrans.c
src/backend/access/transam/twophase.c
src/backend/access/transam/xact.c
src/backend/access/transam/xlog.c
src/backend/catalog/Makefile
src/backend/catalog/dependency.c
src/backend/catalog/heap.c
src/backend/commands/analyze.c
src/backend/commands/copy.c
src/backend/commands/event_trigger.c
src/backend/commands/explain.c
src/backend/commands/portalcmds.c
src/backend/commands/tablecmds.c
src/backend/commands/tablespace.c
src/backend/commands/trigger.c
src/backend/commands/vacuum.c
src/backend/executor/execAmi.c
src/backend/executor/execMain.c
src/backend/executor/execQual.c
src/backend/executor/nodeModifyTable.c
src/backend/executor/spi.c
src/backend/main/main.c
src/backend/nodes/copyfuncs.c
src/backend/nodes/equalfuncs.c
src/backend/nodes/nodeFuncs.c
src/backend/nodes/outfuncs.c
src/backend/nodes/readfuncs.c
src/backend/optimizer/path/allpaths.c
src/backend/optimizer/path/costsize.c
src/backend/optimizer/plan/createplan.c
src/backend/optimizer/plan/planner.c
src/backend/optimizer/plan/setrefs.c
src/backend/optimizer/plan/subselect.c
src/backend/optimizer/prep/prepjointree.c
src/backend/optimizer/util/pathnode.c
src/backend/optimizer/util/pgxcship.c
src/backend/optimizer/util/plancat.c
src/backend/optimizer/util/relnode.c
src/backend/parser/analyze.c
src/backend/parser/gram.y
src/backend/parser/parse_agg.c
src/backend/parser/parse_clause.c
src/backend/parser/parse_expr.c
src/backend/parser/parse_relation.c
src/backend/pgxc/pool/execRemote.c
src/backend/postmaster/pgstat.c
src/backend/postmaster/postmaster.c
src/backend/rewrite/rewriteHandler.c
src/backend/storage/buffer/bufmgr.c
src/backend/storage/lmgr/lock.c
src/backend/storage/lmgr/lwlock.c
src/backend/storage/lmgr/proc.c
src/backend/tcop/postgres.c
src/backend/tcop/pquery.c
src/backend/utils/adt/numeric.c
src/backend/utils/adt/pseudotypes.c
src/backend/utils/adt/ri_triggers.c
src/backend/utils/adt/rowtypes.c
src/backend/utils/adt/ruleutils.c
src/backend/utils/adt/selfuncs.c
src/backend/utils/adt/timestamp.c
src/backend/utils/cache/lsyscache.c
src/backend/utils/cache/plancache.c
src/backend/utils/cache/relcache.c
src/backend/utils/cache/syscache.c
src/backend/utils/errcodes.txt
src/backend/utils/init/miscinit.c
src/backend/utils/init/postinit.c
src/backend/utils/misc/guc.c
src/backend/utils/misc/postgresql.conf.sample
src/backend/utils/mmgr/portalmem.c
src/backend/utils/sort/tuplesort.c
src/backend/utils/sort/tuplestore.c
src/bin/pg_ctl/pg_ctl.c
src/bin/pg_dump/pg_dump.c
src/bin/pg_dump/pg_dump.h
src/bin/pg_dump/pg_dumpall.c
src/bin/pg_rewind/filemap.c
src/bin/pg_xlogdump/rmgrdesc.c
src/bin/pgbench/pgbench.c
src/bin/psql/command.c
src/bin/psql/describe.c
src/bin/psql/tab-complete.c
src/include/access/xlog.h
src/include/catalog/catversion.h
src/include/catalog/dependency.h
src/include/catalog/indexing.h
src/include/catalog/pg_class.h
src/include/catalog/pg_proc.h
src/include/catalog/pg_type.h
src/include/libpq/libpq-be.h
src/include/miscadmin.h
src/include/nodes/execnodes.h
src/include/nodes/nodes.h
src/include/nodes/parsenodes.h
src/include/nodes/plannodes.h
src/include/optimizer/cost.h
src/include/parser/parse_func.h
src/include/pg_config.h.win32
src/include/storage/lwlock.h
src/include/utils/builtins.h
src/include/utils/guc_tables.h
src/include/utils/lsyscache.h
src/include/utils/plancache.h
src/include/utils/portal.h
src/include/utils/syscache.h
src/pl/plpgsql/src/pl_exec.c
src/test/regress/expected/alter_generic.out
src/test/regress/expected/alter_table.out
src/test/regress/expected/arrays.out
src/test/regress/expected/gist.out
src/test/regress/expected/inherit.out
src/test/regress/expected/insert_conflict.out
src/test/regress/expected/join.out
src/test/regress/expected/numeric.out
src/test/regress/expected/plpgsql.out
src/test/regress/expected/rules.out
src/test/regress/expected/sanity_check.out
src/test/regress/expected/tablesample.out
src/test/regress/expected/transactions.out
src/test/regress/expected/triggers.out
src/test/regress/expected/txid.out
src/test/regress/output/misc.source
src/test/regress/parallel_schedule
src/test/regress/pg_regress.c
src/test/regress/serial_schedule
src/test/regress/sql/alter_table.sql
src/test/regress/sql/arrays.sql
src/test/regress/sql/inherit.sql
src/test/regress/sql/join.sql
src/test/regress/sql/numeric.sql
src/test/regress/sql/plpgsql.sql
src/test/regress/sql/privileges.sql
src/test/regress/sql/rowsecurity.sql
src/test/regress/sql/rules.sql
src/test/regress/sql/transactions.sql

diff --cc configure
index 0467e3d3930d72db31779db970b2fb4166cf9230,2a22da9d3e15709a9ba36a1dc6ef719aa5f2200c..031b4fa22cc91cc09fe6d2ebc15c436fc97819d0
+++ b/configure
@@@ -1,8 -1,8 +1,8 @@@
  #! /bin/sh
  # Guess values for system-dependent variables and create Makefiles.
- # Generated by GNU Autoconf 2.69 for PostgreSQL 9.5alpha1 (Postgres-XL 9.5alpha1).
 -# Generated by GNU Autoconf 2.69 for PostgreSQL 9.5beta1.
++# Generated by GNU Autoconf 2.69 for PostgreSQL 9.5beta1 (Postgres-XL 9.5beta1).
  #
 -# Report bugs to <pgsql-bugs@postgresql.org>.
 +# Report bugs to <bugs@postgres-xl.org>.
  #
  #
  # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@@ -582,10 -582,9 +582,10 @@@ MAKEFLAGS
  # Identity of this package.
  PACKAGE_NAME='PostgreSQL'
  PACKAGE_TARNAME='postgresql'
- PACKAGE_VERSION='9.5alpha1 (Postgres-XL 9.5alpha1)'
- PACKAGE_XC_VERSION='9.5alpha1'
- PACKAGE_STRING='PostgreSQL 9.5alpha1 (Postgres-XL 9.5alpha1)'
 -PACKAGE_VERSION='9.5beta1'
 -PACKAGE_STRING='PostgreSQL 9.5beta1'
 -PACKAGE_BUGREPORT='[email protected]'
++PACKAGE_VERSION='9.5beta1 (Postgres-XL 9.5beta1)'
++PACKAGE_XC_VERSION='9.5beta1'
++PACKAGE_STRING='PostgreSQL 9.5beta1 (Postgres-XL 9.5beta1)'
 +PACKAGE_BUGREPORT='[email protected]'
  PACKAGE_URL=''
  
  ac_unique_file="src/backend/access/common/heaptuple.c"
@@@ -1396,7 -1395,7 +1397,7 @@@ if test "$ac_init_help" = "long"; the
    # Omit some internal or obsolete options to make the list less imposing.
    # This message is too long to be a string in the A/UX 3.1 sh.
    cat <<_ACEOF
- \`configure' configures PostgreSQL 9.5alpha1 (Postgres-XL 9.5alpha1) to adapt to many kinds of systems.
 -\`configure' configures PostgreSQL 9.5beta1 to adapt to many kinds of systems.
++\`configure' configures PostgreSQL 9.5beta1 (Postgres-XL 9.5beta1) to adapt to many kinds of systems.
  
  Usage: $0 [OPTION]... [VAR=VALUE]...
  
  
  if test -n "$ac_init_help"; then
    case $ac_init_help in
-      short | recursive ) echo "Configuration of PostgreSQL 9.5alpha1 (Postgres-XL 9.5alpha1):";;
 -     short | recursive ) echo "Configuration of PostgreSQL 9.5beta1:";;
++     short | recursive ) echo "Configuration of PostgreSQL 9.5beta1 (Postgres-XL 9.5beta1):";;
     esac
    cat <<\_ACEOF
  
  test -n "$ac_init_help" && exit $ac_status
  if $ac_init_version; then
    cat <<\_ACEOF
- PostgreSQL configure 9.5alpha1 (Postgres-XL 9.5alpha1)
 -PostgreSQL configure 9.5beta1
++PostgreSQL configure 9.5beta1 (Postgres-XL 9.5beta1)
  generated by GNU Autoconf 2.69
  
  Copyright (C) 2012 Free Software Foundation, Inc.
@@@ -2322,7 -2321,7 +2323,7 @@@ cat >config.log <<_ACEO
  This file contains any messages produced by compilers while
  running configure, to aid debugging if configure makes a mistake.
  
- It was created by PostgreSQL $as_me 9.5alpha1 (Postgres-XL 9.5alpha1), which was
 -It was created by PostgreSQL $as_me 9.5beta1, which was
++It was created by PostgreSQL $as_me 9.5beta1 (Postgres-XL 9.5beta1), which was
  generated by GNU Autoconf 2.69.  Invocation command line was
  
    $ $0 $@
@@@ -15507,10 -15582,8 +15598,11 @@@ cat >>confdefs.h <<_ACEO
  _ACEOF
  
  
 +# For PGXC, set -DPGXC by default. This can be overriden with -UPGXC if the user sets it.
 +# For Postgres-XL, set both -DPGXC and -DXCP  
 +CFLAGS="-DPGXC -DXCP $CFLAGS"
  
  # Begin output steps
  
  { $as_echo "$as_me:${as_lineno-$LINENO}: using compiler=$cc_string" >&5
@@@ -16069,7 -16142,7 +16161,7 @@@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_wr
  # report actual input values of CONFIG_FILES etc. instead of their
  # values after options handling.
  ac_log="
- This file was extended by PostgreSQL $as_me 9.5alpha1 (Postgres-XL 9.5alpha1), which was
 -This file was extended by PostgreSQL $as_me 9.5beta1, which was
++This file was extended by PostgreSQL $as_me 9.5beta1 (Postgres-XL 9.5beta1), which was
  generated by GNU Autoconf 2.69.  Invocation command line was
  
    CONFIG_FILES    = $CONFIG_FILES
@@@ -16139,7 -16212,7 +16231,7 @@@ _ACEO
  cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
  ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
  ac_cs_version="\\
- PostgreSQL config.status 9.5alpha1 (Postgres-XL 9.5alpha1)
 -PostgreSQL config.status 9.5beta1
++PostgreSQL config.status 9.5beta1 (Postgres-XL 9.5beta1)
  configured by $0, generated by GNU Autoconf 2.69,
    with options \\"\$ac_cs_config\\"
  
diff --cc configure.in
index 1455d25a5cb7692b412af2e796699f690da96c11,55ea5ed9ba05707f9e2141ec3eef4bf2c1e4dab9..5d6bf5da2c43a7ebd288c55891e030cfea2d9c3f
@@@ -17,7 -17,7 +17,7 @@@ dnl Read the Autoconf manual for detail
  dnl
  m4_pattern_forbid(^PGAC_)dnl to catch undefined macros
  
- AC_INIT([PostgreSQL], [9.5alpha1 (Postgres-XL 9.5alpha1)], [[email protected]])
 -AC_INIT([PostgreSQL], [9.5beta1], [[email protected]])
++AC_INIT([PostgreSQL], [9.5beta1 (Postgres-XL 9.5beta1)], [[email protected]])
  
  m4_if(m4_defn([m4_PACKAGE_VERSION]), [2.69], [], [m4_fatal([Autoconf version 2.69 is required.
  Untested combinations of 'autoconf' and PostgreSQL versions are not
index 561f4216a96d3e435edbc2df00a87e715ad261d4,8a0f08cfc02ca35c048144df18e14a3d0d117349..b42afe465c5049ac1064e0401f40059fde3b51b9
@@@ -27,7 -27,7 +27,7 @@@ System Configuration
  
    Operating System (example: Linux 2.4.18)    :
  
-   PostgreSQL version (example: PostgreSQL 9.5alpha1):  Postgres-XL 9.5alpha1
 -  PostgreSQL version (example: PostgreSQL 9.5beta1):  PostgreSQL 9.5beta1
++  PostgreSQL version (example: PostgreSQL 9.5beta1):  Postgres-XL 9.5beta1
  
    Compiler used (example: gcc 3.3.5)          :
  
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
index 31ad4b83f51cc40bbb913c7e4bfcdf7becb50e25,fe5a076fe120da59ca2d7e96ecd59f6039db43f7..c69baac498a5612381a5116ab0256232b0d4c408
mode 100755,100644..100755
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
index e5a677925dba0ea4f743bdb2972dda304df08f26,9fea2702bac47aae519867c1b7e8c68efc6a58c7..53f40d9db8d2a6590d2d4f394dd84835977346d0
mode 100755,100644..100755
Simple merge
Simple merge
index f75977bed2b40037e3acabb5742520866bee2bf8,a2d0b0cbe1e2e99de441e94b86922d1c588b1516..c001195d56214983a279f93635b35445b3f79418
mode 100755,100644..100755
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
index d27267586043c228e2cd81683896cd03ef47c40e,25130ecf124805565f61c17045c9589445c10e8c..2680d6ceffa997a2d1e97cacb4287ee7172eaceb
@@@ -39,10 -39,10 +39,11 @@@ POSTGRES_BKI_SRCS = $(addprefix $(top_s
        pg_ts_config.h pg_ts_config_map.h pg_ts_dict.h \
        pg_ts_parser.h pg_ts_template.h pg_extension.h \
        pg_foreign_data_wrapper.h pg_foreign_server.h pg_user_mapping.h \
 +      pgxc_class.h pgxc_node.h pgxc_group.h \
        pg_foreign_table.h pg_policy.h pg_replication_origin.h \
-       pg_tablesample_method.h pg_default_acl.h pg_seclabel.h pg_shseclabel.h \
-       pg_collation.h pg_range.h pg_transform.h toasting.h indexing.h \
+       pg_default_acl.h pg_seclabel.h pg_shseclabel.h \
+       pg_collation.h pg_range.h pg_transform.h \
+       toasting.h indexing.h \
      )
  
  # location of Catalog.pm
index 1f1a28da2dfdf3d7ab589aebc22bbb3f16dddac2,90b1cd835f89edad6200872360d4dabee620ce37..4397575d8f1428aac4c3b7d7b3ad5915b245eaf1
@@@ -172,11 -157,9 +172,12 @@@ static const Oid object_classes[] = 
        UserMappingRelationId,          /* OCLASS_USER_MAPPING */
        DefaultAclRelationId,           /* OCLASS_DEFACL */
        ExtensionRelationId,            /* OCLASS_EXTENSION */
 +#ifdef PGXC
 +      PgxcClassRelationId,            /* OCLASS_PGXCCLASS */
 +#endif
        EventTriggerRelationId,         /* OCLASS_EVENT_TRIGGER */
-       PolicyRelationId                        /* OCLASS_POLICY */
+       PolicyRelationId,                       /* OCLASS_POLICY */
+       TransformRelationId                     /* OCLASS_TRANSFORM */
  };
  
  
Simple merge
Simple merge
Simple merge
index 44d03f8fab9c4f8b40b05a6f7b83eac1de45767d,3d1cb0b8e3057f3e8ef7128e1ba2a0cab5926800..44972b643272ae961b40d8238492f1a9fb46db98
@@@ -1166,22 -1166,8 +1166,11 @@@ EventTriggerSupportsObjectClass(ObjectC
                case OCLASS_USER_MAPPING:
                case OCLASS_DEFACL:
                case OCLASS_EXTENSION:
-               case OCLASS_PGXC_NODE:
-               case OCLASS_PGXC_GROUP:
 +#ifdef PGXC
 +              case OCLASS_PGXC_CLASS:
 +#endif
                case OCLASS_POLICY:
                        return true;
-               case MAX_OCLASS:
-                       /*
-                        * This shouldn't ever happen, but we keep the case to avoid a
-                        * compiler warning without a "default" clause in the switch.
-                        */
-                       Assert(false);
-                       break;
        }
  
        return true;
index 2b1aa00f33283f040418ea7178bd63470a36c423,5d06fa4ea65c4a751c38daaefb05b032a0b7aaca..d17378692975fbe72d35bb9ea55c7809b1c64bb6
@@@ -1163,76 -1104,6 +1152,73 @@@ ExplainNode(PlanState *planstate, List 
                        if (((Scan *) plan)->scanrelid > 0)
                                ExplainScanTarget((Scan *) plan, es);
                        break;
-               case T_SampleScan:
-                       ExplainScanTarget((Scan *) plan, es);
-                       break;
 +#ifdef PGXC
 +              case T_RemoteQuery:
 +                      /* Emit node execution list */
 +                      ExplainExecNodes(((RemoteQuery *)plan)->exec_nodes, es);
 +                      ExplainScanTarget((Scan *) plan, es);
 +                      break;
 +#endif
 +#ifdef XCP
 +              case T_RemoteSubplan:
 +                      {
 +                              RemoteSubplan  *rsubplan = (RemoteSubplan *) plan;
 +                              List *nodeNameList = NIL;
 +                              ListCell *lc;
 +
 +                              foreach(lc, rsubplan->nodeList)
 +                              {
 +                                      char *nodename = get_pgxc_nodename(
 +                                                      PGXCNodeGetNodeOid(lfirst_int(lc),
 +                                                                                         PGXC_NODE_DATANODE));
 +                                      nodeNameList = lappend(nodeNameList, nodename);
 +                              }
 +
 +                              /* print out destination nodes */
 +                              if (es->format == EXPLAIN_FORMAT_TEXT)
 +                              {
 +                                      if (nodeNameList)
 +                                      {
 +                                              if (es->nodes)
 +                                              {
 +                                                      bool                    first = true;
 +                                                      ListCell           *lc;
 +                                                      foreach(lc, nodeNameList)
 +                                                      {
 +                                                              char *nodename = (char *) lfirst(lc);
 +                                                              if (first)
 +                                                              {
 +                                                                      appendStringInfo(es->str, " on %s (%s",
 +                                                                                                       rsubplan->execOnAll ? "all" : "any",
 +                                                                                                       nodename);
 +                                                                      first = false;
 +                                                              }
 +                                                              else
 +                                                                      appendStringInfo(es->str, ",%s", nodename);
 +                                                      }
 +                                                      appendStringInfoChar(es->str, ')');
 +                                              }
 +                                              else
 +                                              {
 +                                                      appendStringInfo(es->str, " on %s",
 +                                                                               rsubplan->execOnAll ? "all" : "any");
 +                                              }
 +                                      }
 +                                      else
 +                                      {
 +                                              appendStringInfo(es->str, " on local node");
 +                                      }
 +                              }
 +                              else
 +                              {
 +                                      ExplainPropertyText("Replicated",
 +                                                                              rsubplan->execOnAll ? "no" : "yes",
 +                                                                              es);
 +                                      ExplainPropertyList("Node List", nodeNameList, es);
 +                              }
 +                      }
 +                      break;
 +#endif /* XCP */
                case T_IndexScan:
                        {
                                IndexScan  *indexscan = (IndexScan *) plan;
Simple merge
index 4291106a282e2f03e6113c2cfed39a5d935401d5,5dfdb8dd9aec3257031c7ec68eeaac9d453faf9e..8e7fc4d17b007027cac04c7decda9f8d2bd7a3fa
@@@ -434,18 -416,9 +436,19 @@@ static ObjectAddress ATExecAddOf(Relati
  static void ATExecDropOf(Relation rel, LOCKMODE lockmode);
  static void ATExecReplicaIdentity(Relation rel, ReplicaIdentityStmt *stmt, LOCKMODE lockmode);
  static void ATExecGenericOptions(Relation rel, List *options);
 +#ifdef PGXC
 +static void AtExecDistributeBy(Relation rel, DistributeBy *options);
 +static void AtExecSubCluster(Relation rel, PGXCSubCluster *options);
 +static void AtExecAddNode(Relation rel, List *options);
 +static void AtExecDeleteNode(Relation rel, List *options);
 +static void ATCheckCmd(Relation rel, AlterTableCmd *cmd);
 +static RedistribState *BuildRedistribCommands(Oid relid, List *subCmds);
 +static Oid *delete_node_list(Oid *old_oids, int old_num, Oid *del_oids, int del_num, int *new_num);
 +static Oid *add_node_list(Oid *old_oids, int old_num, Oid *add_oids, int add_num, int *new_num);
 +#endif
  static void ATExecEnableRowSecurity(Relation rel);
  static void ATExecDisableRowSecurity(Relation rel);
+ static void ATExecForceNoForceRowSecurity(Relation rel, bool force_rls);
  
  static void copy_relation_data(SMgrRelation rel, SMgrRelation dst,
                                   ForkNumber forkNum, char relpersistence);
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
index 2541494b8773286fdf9f8aea0b5c032ac4262338,c517dfd9d69c6264ecdd0c4904b8b8337ccea099..6ed46aedc978e2b349eb652104333a62c92e4c1b
@@@ -2873,24 -2888,13 +2893,18 @@@ range_table_mutator(List *rtable
                switch (rte->rtekind)
                {
                        case RTE_RELATION:
-                               if (rte->tablesample)
-                               {
-                                       CHECKFLATCOPY(newrte->tablesample, rte->tablesample,
-                                                                 TableSampleClause);
-                                       MUTATE(newrte->tablesample->args,
-                                                  newrte->tablesample->args,
-                                                  List *);
-                                       MUTATE(newrte->tablesample->repeatable,
-                                                  newrte->tablesample->repeatable,
-                                                  Node *);
-                               }
+                               MUTATE(newrte->tablesample, rte->tablesample,
+                                          TableSampleClause *);
+                               /* we don't bother to copy eref, aliases, etc; OK? */
                                break;
                        case RTE_CTE:
- #endif /* PGXC */
-                               /* we don't bother to copy eref, aliases, etc; OK? */
+                               /* nothing to do */
+                               break;
 +#ifdef PGXC
 +                      case RTE_REMOTE_DUMMY:
++                              /* nothing to do */
 +                              break;
++#endif /* PGXC */
                        case RTE_SUBQUERY:
                                if (!(flags & QTW_IGNORE_RT_SUBQUERIES))
                                {
index 84e25e16fd840b83016e63151a894ac2d699943c,991b4c217565728e254f220e01efa769e5e6d5d6..ec4e7eb138ed5ba1edc049aeb8f79e329a7d51ec
@@@ -540,19 -338,10 +540,19 @@@ _outModifyTable(StringInfo str, const M
        WRITE_NODE_FIELD(rowMarks);
        WRITE_INT_FIELD(epqParam);
        WRITE_ENUM_FIELD(onConflictAction, OnConflictAction);
 +#ifdef XCP    
 +      if (portable_output)
 +              WRITE_RELID_LIST_FIELD(arbiterIndexes);
 +      else
 +      {
 +#endif
        WRITE_NODE_FIELD(arbiterIndexes);
 +#ifdef XCP
 +      }
 +#endif
        WRITE_NODE_FIELD(onConflictSet);
        WRITE_NODE_FIELD(onConflictWhere);
-       WRITE_INT_FIELD(exclRelRTI);
+       WRITE_UINT_FIELD(exclRelRTI);
        WRITE_NODE_FIELD(exclRelTlist);
  }
  
@@@ -3727,6 -2568,16 +3669,27 @@@ _outRangeTblFunction(StringInfo str, co
        WRITE_BITMAPSET_FIELD(funcparams);
  }
  
+ static void
+ _outTableSampleClause(StringInfo str, const TableSampleClause *node)
+ {
+       WRITE_NODE_TYPE("TABLESAMPLECLAUSE");
++#ifdef XCP
++      if (portable_output)
++      {
++              WRITE_FUNCID_FIELD(tsmhandler);
++      }
++      else
++      {
++#endif
+       WRITE_OID_FIELD(tsmhandler);
++#ifdef XCP
++      }
++#endif
+       WRITE_NODE_FIELD(args);
+       WRITE_NODE_FIELD(repeatable);
+ }
  static void
  _outAExpr(StringInfo str, const A_Expr *node)
  {
@@@ -4135,11 -2998,9 +4110,14 @@@ _outNode(StringInfo str, const void *ob
                        case T_SeqScan:
                                _outSeqScan(str, obj);
                                break;
 +#ifdef PGXC
 +                      case T_RemoteQuery:
 +                              _outRemoteQuery(str, obj);
 +                              break;
 +#endif
+                       case T_SampleScan:
+                               _outSampleScan(str, obj);
+                               break;
                        case T_IndexScan:
                                _outIndexScan(str, obj);
                                break;
index 36fe01dc391fdc2ca6bec77bca491c9376ecf591,32b23fff097ca576e20f928ed8da3b2b6437600a..980ccf2ce8bc3c0ab2e17064f5b3a91f8ffd08af
@@@ -2105,1464 -1351,140 +2026,1486 @@@ _readRangeTblFunction(void
        READ_DONE();
  }
  
+ /*
+  * _readTableSampleClause
+  */
+ static TableSampleClause *
+ _readTableSampleClause(void)
+ {
+       READ_LOCALS(TableSampleClause);
++#ifdef XCP
++      if (portable_input)
++      {
++              READ_FUNCID_FIELD(tsmhandler);
++      }
++      else
++      {
++#endif
+       READ_OID_FIELD(tsmhandler);
++#ifdef XCP
++      }
++#endif
+       READ_NODE_FIELD(args);
+       READ_NODE_FIELD(repeatable);
+       READ_DONE();
+ }
  
 +#ifdef XCP
  /*
 - * parseNodeString
 - *
 - * Given a character string representing a node tree, parseNodeString creates
 - * the internal node structure.
 - *
 - * The string to be read must already have been loaded into pg_strtok().
 + * _readPlan
   */
 -Node *
 -parseNodeString(void)
 +static Plan *
 +_readPlan(void)
  {
 -      void       *return_value;
 +      READ_PLAN_FIELDS(Plan);
  
 -      READ_TEMP_LOCALS();
 +      READ_DONE();
 +}
  
 -      token = pg_strtok(&length);
  
 -#define MATCH(tokname, namelen) \
 -      (length == namelen && memcmp(token, tokname, namelen) == 0)
 +/*
 + * _readResult
 + */
 +static Result *
 +_readResult(void)
 +{
 +      READ_PLAN_FIELDS(Result);
  
 -      if (MATCH("QUERY", 5))
 -              return_value = _readQuery();
 -      else if (MATCH("WITHCHECKOPTION", 15))
 -              return_value = _readWithCheckOption();
 -      else if (MATCH("SORTGROUPCLAUSE", 15))
 -              return_value = _readSortGroupClause();
 -      else if (MATCH("GROUPINGSET", 11))
 -              return_value = _readGroupingSet();
 -      else if (MATCH("WINDOWCLAUSE", 12))
 -              return_value = _readWindowClause();
 -      else if (MATCH("ROWMARKCLAUSE", 13))
 -              return_value = _readRowMarkClause();
 -      else if (MATCH("COMMONTABLEEXPR", 15))
 -              return_value = _readCommonTableExpr();
 -      else if (MATCH("SETOPERATIONSTMT", 16))
 -              return_value = _readSetOperationStmt();
 -      else if (MATCH("ALIAS", 5))
 -              return_value = _readAlias();
 -      else if (MATCH("RANGEVAR", 8))
 -              return_value = _readRangeVar();
 -      else if (MATCH("INTOCLAUSE", 10))
 -              return_value = _readIntoClause();
 -      else if (MATCH("VAR", 3))
 -              return_value = _readVar();
 -      else if (MATCH("CONST", 5))
 -              return_value = _readConst();
 -      else if (MATCH("PARAM", 5))
 -              return_value = _readParam();
 -      else if (MATCH("AGGREF", 6))
 -              return_value = _readAggref();
 -      else if (MATCH("GROUPINGFUNC", 12))
 -              return_value = _readGroupingFunc();
 -      else if (MATCH("WINDOWFUNC", 10))
 -              return_value = _readWindowFunc();
 -      else if (MATCH("ARRAYREF", 8))
 -              return_value = _readArrayRef();
 -      else if (MATCH("FUNCEXPR", 8))
 -              return_value = _readFuncExpr();
 -      else if (MATCH("NAMEDARGEXPR", 12))
 -              return_value = _readNamedArgExpr();
 -      else if (MATCH("OPEXPR", 6))
 -              return_value = _readOpExpr();
 -      else if (MATCH("DISTINCTEXPR", 12))
 -              return_value = _readDistinctExpr();
 -      else if (MATCH("NULLIFEXPR", 10))
 -              return_value = _readNullIfExpr();
 -      else if (MATCH("SCALARARRAYOPEXPR", 17))
 -              return_value = _readScalarArrayOpExpr();
 -      else if (MATCH("BOOLEXPR", 8))
 -              return_value = _readBoolExpr();
 -      else if (MATCH("SUBLINK", 7))
 -              return_value = _readSubLink();
 -      else if (MATCH("FIELDSELECT", 11))
 -              return_value = _readFieldSelect();
 -      else if (MATCH("FIELDSTORE", 10))
 -              return_value = _readFieldStore();
 -      else if (MATCH("RELABELTYPE", 11))
 -              return_value = _readRelabelType();
 -      else if (MATCH("COERCEVIAIO", 11))
 -              return_value = _readCoerceViaIO();
 -      else if (MATCH("ARRAYCOERCEEXPR", 15))
 -              return_value = _readArrayCoerceExpr();
 -      else if (MATCH("CONVERTROWTYPEEXPR", 18))
 -              return_value = _readConvertRowtypeExpr();
 -      else if (MATCH("COLLATE", 7))
 -              return_value = _readCollateExpr();
 -      else if (MATCH("CASE", 4))
 -              return_value = _readCaseExpr();
 -      else if (MATCH("WHEN", 4))
 -              return_value = _readCaseWhen();
 -      else if (MATCH("CASETESTEXPR", 12))
 -              return_value = _readCaseTestExpr();
 -      else if (MATCH("ARRAY", 5))
 -              return_value = _readArrayExpr();
 -      else if (MATCH("ROW", 3))
 -              return_value = _readRowExpr();
 -      else if (MATCH("ROWCOMPARE", 10))
 -              return_value = _readRowCompareExpr();
 -      else if (MATCH("COALESCE", 8))
 -              return_value = _readCoalesceExpr();
 -      else if (MATCH("MINMAX", 6))
 -              return_value = _readMinMaxExpr();
 -      else if (MATCH("XMLEXPR", 7))
 -              return_value = _readXmlExpr();
 -      else if (MATCH("NULLTEST", 8))
 -              return_value = _readNullTest();
 -      else if (MATCH("BOOLEANTEST", 11))
 -              return_value = _readBooleanTest();
 -      else if (MATCH("COERCETODOMAIN", 14))
 -              return_value = _readCoerceToDomain();
 -      else if (MATCH("COERCETODOMAINVALUE", 19))
 -              return_value = _readCoerceToDomainValue();
 -      else if (MATCH("SETTODEFAULT", 12))
 -              return_value = _readSetToDefault();
 -      else if (MATCH("CURRENTOFEXPR", 13))
 -              return_value = _readCurrentOfExpr();
 -      else if (MATCH("INFERENCEELEM", 13))
 -              return_value = _readInferenceElem();
 +      READ_NODE_FIELD(resconstantqual);
 +
 +      READ_DONE();
 +}
 +
 +
 +/*
 + * _readModifyTable
 + */
 +static ModifyTable *
 +_readModifyTable(void)
 +{
 +      READ_PLAN_FIELDS(ModifyTable);
 +
 +      READ_ENUM_FIELD(operation, CmdType);
 +      READ_BOOL_FIELD(canSetTag);
 +      READ_UINT_FIELD(nominalRelation);
 +      READ_NODE_FIELD(resultRelations);
 +      READ_INT_FIELD(resultRelIndex);
 +      READ_NODE_FIELD(plans);
 +      READ_NODE_FIELD(withCheckOptionLists);
 +      READ_NODE_FIELD(returningLists);
 +      READ_NODE_FIELD(fdwPrivLists);
 +      READ_NODE_FIELD(rowMarks);
 +      READ_INT_FIELD(epqParam);
 +      READ_ENUM_FIELD(onConflictAction, OnConflictAction);
 +#ifdef XCP
 +      if (portable_input)
 +              READ_RELID_LIST_FIELD(arbiterIndexes);
 +      else
 +#endif
 +      READ_NODE_FIELD(arbiterIndexes);
 +      READ_NODE_FIELD(onConflictSet);
 +      READ_NODE_FIELD(onConflictWhere);
 +      READ_INT_FIELD(exclRelRTI);
 +      READ_NODE_FIELD(exclRelTlist);
 +
 +      READ_DONE();
 +}
 +
 +
 +/*
 + * _readAppend
 + */
 +static Append *
 +_readAppend(void)
 +{
 +      READ_PLAN_FIELDS(Append);
 +
 +      READ_NODE_FIELD(appendplans);
 +
 +      READ_DONE();
 +}
 +
 +
 +/*
 + * _readMergeAppend
 + */
 +static MergeAppend *
 +_readMergeAppend(void)
 +{
 +      int i;
 +      READ_PLAN_FIELDS(MergeAppend);
 +
 +      READ_NODE_FIELD(mergeplans);
 +      READ_INT_FIELD(numCols);
 +
 +      token = pg_strtok(&length);             /* skip :sortColIdx */
 +      local_node->sortColIdx = (AttrNumber *) palloc(local_node->numCols * sizeof(AttrNumber));
 +      for (i = 0; i < local_node->numCols; i++)
 +      {
 +              token = pg_strtok(&length);
 +              local_node->sortColIdx[i] = atoi(token);
 +      }
 +
 +      token = pg_strtok(&length);             /* skip :sortOperators */
 +      local_node->sortOperators = (Oid *) palloc(local_node->numCols * sizeof(Oid));
 +      for (i = 0; i < local_node->numCols; i++)
 +      {
 +              token = pg_strtok(&length);
 +              if (portable_input)
 +              {
 +                      char       *nspname; /* namespace name */
 +                      char       *oprname; /* operator name */
 +                      char       *leftnspname; /* left type namespace */
 +                      char       *leftname; /* left type name */
 +                      Oid                     oprleft; /* left type */
 +                      char       *rightnspname; /* right type namespace */
 +                      char       *rightname; /* right type name */
 +                      Oid                     oprright; /* right type */
 +                      /* token is already set to nspname */
 +                      nspname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* get operator name */
 +                      oprname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* left type namespace */
 +                      leftnspname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* left type name */
 +                      leftname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* right type namespace */
 +                      rightnspname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* right type name */
 +                      rightname = nullable_string(token, length);
 +                      if (leftname)
 +                              oprleft = get_typname_typid(leftname,
 +                                                                                      NSP_OID(leftnspname));
 +                      else
 +                              oprleft = InvalidOid;
 +                      if (rightname)
 +                              oprright = get_typname_typid(rightname,
 +                                                                                       NSP_OID(rightnspname));
 +                      else
 +                              oprright = InvalidOid;
 +                      local_node->sortOperators[i] = get_operid(oprname,
 +                                                                                                        oprleft,
 +                                                                                                        oprright,
 +                                                                                                        NSP_OID(nspname));
 +              }
 +              else
 +              local_node->sortOperators[i] = atooid(token);
 +      }
 +
 +      token = pg_strtok(&length);             /* skip :collations */
 +      local_node->collations = (Oid *) palloc(local_node->numCols * sizeof(Oid));
 +      for (i = 0; i < local_node->numCols; i++)
 +      {
 +              token = pg_strtok(&length);
 +              if (portable_input)
 +              {
 +                      char       *nspname; /* namespace name */
 +                      char       *collname; /* collation name */
 +                      int             collencoding; /* collation encoding */
 +                      /* the token is already read */
 +                      nspname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* get collname */
 +                      collname = nullable_string(token, length);
  +                      token = pg_strtok(&length); /* get collencoding */
 +                      collencoding = atoi(token);
 +                      if (collname)
 +                              local_node->collations[i] = get_collid(collname,
 +                                                                                                         collencoding,
 +                                                                                                         NSP_OID(nspname));
 +                      else
 +                              local_node->collations[i] = InvalidOid;
 +              }
 +              else
 +              local_node->collations[i] = atooid(token);
 +      }
 +
 +      token = pg_strtok(&length);             /* skip :nullsFirst */
 +      local_node->nullsFirst = (bool *) palloc(local_node->numCols * sizeof(bool));
 +      for (i = 0; i < local_node->numCols; i++)
 +      {
 +              token = pg_strtok(&length);
 +              local_node->nullsFirst[i] = strtobool(token);
 +      }
 +
 +      READ_DONE();
 +}
 +
 +
 +/*
 + * _readRecursiveUnion
 + */
 +static RecursiveUnion *
 +_readRecursiveUnion(void)
 +{
 +      int i;
 +      READ_PLAN_FIELDS(RecursiveUnion);
 +
 +      READ_INT_FIELD(wtParam);
 +      READ_INT_FIELD(numCols);
 +
 +      token = pg_strtok(&length);             /* skip :dupColIdx */
 +      local_node->dupColIdx = (AttrNumber *) palloc(local_node->numCols * sizeof(AttrNumber));
 +      for (i = 0; i < local_node->numCols; i++)
 +      {
 +              token = pg_strtok(&length);
 +              local_node->dupColIdx[i] = atoi(token);
 +      }
 +
 +      token = pg_strtok(&length);             /* skip :dupOperators */
 +      local_node->dupOperators = (Oid *) palloc(local_node->numCols * sizeof(Oid));
 +      for (i = 0; i < local_node->numCols; i++)
 +      {
 +              token = pg_strtok(&length);
 +              local_node->dupOperators[i] = atooid(token);
 +      }
 +
 +      READ_LONG_FIELD(numGroups);
 +
 +      READ_DONE();
 +}
 +
 +
 +/*
 + * _readBitmapAnd
 + */
 +static BitmapAnd *
 +_readBitmapAnd(void)
 +{
 +      READ_PLAN_FIELDS(BitmapAnd);
 +
 +      READ_NODE_FIELD(bitmapplans);
 +
 +      READ_DONE();
 +}
 +
 +
 +/*
 + * _readBitmapOr
 + */
 +static BitmapOr *
 +_readBitmapOr(void)
 +{
 +      READ_PLAN_FIELDS(BitmapOr);
 +
 +      READ_NODE_FIELD(bitmapplans);
 +
 +      READ_DONE();
 +}
 +
 +
 +/*
 + * _readScan
 + */
 +static Scan *
 +_readScan(void)
 +{
 +      READ_SCAN_FIELDS(Scan);
 +
 +      READ_DONE();
 +}
 +
 +
 +/*
 + * _readSeqScan
 + */
 +static SeqScan *
 +_readSeqScan(void)
 +{
 +      READ_SCAN_FIELDS(SeqScan);
 +
 +      READ_DONE();
 +}
 +
 +/*
 + * _readSampleScan
 + */
 +static SampleScan *
 +_readSampleScan(void)
 +{
 +      READ_SCAN_FIELDS(SampleScan);
++      READ_NODE_FIELD(tablesample);
 +
 +      READ_DONE();
 +}
 +
 +/*
 + * _readIndexScan
 + */
 +static IndexScan *
 +_readIndexScan(void)
 +{
 +      READ_SCAN_FIELDS(IndexScan);
 +
 +      if (portable_input)
 +              READ_RELID_FIELD(indexid);
 +      else
 +              READ_OID_FIELD(indexid);
 +      READ_NODE_FIELD(indexqual);
 +      READ_NODE_FIELD(indexqualorig);
 +      READ_NODE_FIELD(indexorderby);
 +      READ_NODE_FIELD(indexorderbyorig);
 +      READ_NODE_FIELD(indexorderbyops);
 +      READ_ENUM_FIELD(indexorderdir, ScanDirection);
 +
 +      READ_DONE();
 +}
 +
 +
 +/*
 + * _readIndexOnlyScan
 + */
 +static IndexOnlyScan *
 +_readIndexOnlyScan(void)
 +{
 +      READ_SCAN_FIELDS(IndexOnlyScan);
 +
 +      if (portable_input)
 +              READ_RELID_FIELD(indexid);
 +      else
 +              READ_OID_FIELD(indexid);
 +      READ_NODE_FIELD(indexqual);
 +      READ_NODE_FIELD(indexorderby);
 +      READ_NODE_FIELD(indextlist);
 +      READ_ENUM_FIELD(indexorderdir, ScanDirection);
 +
 +      READ_DONE();
 +}
 +
 +
 +/*
 + * _readBitmapIndexScan
 + */
 +static BitmapIndexScan *
 +_readBitmapIndexScan(void)
 +{
 +      READ_SCAN_FIELDS(BitmapIndexScan);
 +
 +      if (portable_input)
 +              READ_RELID_FIELD(indexid);
 +      else
 +              READ_OID_FIELD(indexid);
 +      READ_NODE_FIELD(indexqual);
 +      READ_NODE_FIELD(indexqualorig);
 +
 +      READ_DONE();
 +}
 +
 +
 +/*
 + * _readBitmapHeapScan
 + */
 +static BitmapHeapScan *
 +_readBitmapHeapScan(void)
 +{
 +      READ_SCAN_FIELDS(BitmapHeapScan);
 +
 +      READ_NODE_FIELD(bitmapqualorig);
 +
 +      READ_DONE();
 +}
 +
 +
 +/*
 + * _readTidScan
 + */
 +static TidScan *
 +_readTidScan(void)
 +{
 +      READ_SCAN_FIELDS(TidScan);
 +
 +      READ_NODE_FIELD(tidquals);
 +
 +      READ_DONE();
 +}
 +
 +
 +/*
 + * _readSubqueryScan
 + */
 +static SubqueryScan *
 +_readSubqueryScan(void)
 +{
 +      READ_SCAN_FIELDS(SubqueryScan);
 +
 +      READ_NODE_FIELD(subplan);
 +
 +      READ_DONE();
 +}
 +
 +
 +/*
 + * _readFunctionScan
 + */
 +static FunctionScan *
 +_readFunctionScan(void)
 +{
 +      READ_SCAN_FIELDS(FunctionScan);
 +
 +      READ_NODE_FIELD(functions);
 +      READ_BOOL_FIELD(funcordinality);
 +
 +      READ_DONE();
 +}
 +
 +
 +/*
 + * _readValuesScan
 + */
 +static ValuesScan *
 +_readValuesScan(void)
 +{
 +      READ_SCAN_FIELDS(ValuesScan);
 +
 +      READ_NODE_FIELD(values_lists);
 +
 +      READ_DONE();
 +}
 +
 +
 +/*
 + * _readCteScan
 + */
 +static CteScan *
 +_readCteScan(void)
 +{
 +      READ_SCAN_FIELDS(CteScan);
 +
 +      READ_INT_FIELD(ctePlanId);
 +      READ_INT_FIELD(cteParam);
 +
 +      READ_DONE();
 +}
 +
 +
 +/*
 + * _readWorkTableScan
 + */
 +static WorkTableScan *
 +_readWorkTableScan(void)
 +{
 +      READ_SCAN_FIELDS(WorkTableScan);
 +
 +      READ_INT_FIELD(wtParam);
 +
 +      READ_DONE();
 +}
 +
 +
 +/*
 + * _readJoin
 + */
 +static Join *
 +_readJoin(void)
 +{
 +      READ_JOIN_FIELDS(Join);
 +
 +      READ_DONE();
 +}
 +
 +
 +/*
 + * _readNestLoop
 + */
 +static NestLoop *
 +_readNestLoop(void)
 +{
 +      READ_JOIN_FIELDS(NestLoop);
 +
 +      READ_NODE_FIELD(nestParams);
 +
 +      READ_DONE();
 +}
 +
 +
 +/*
 + * _readMergeJoin
 + */
 +static MergeJoin *
 +_readMergeJoin(void)
 +{
 +      int                     numCols;
 +      int                     i;
 +      READ_JOIN_FIELDS(MergeJoin);
 +
 +      READ_NODE_FIELD(mergeclauses);
 +      numCols = list_length(local_node->mergeclauses);
 +
 +
 +      token = pg_strtok(&length);             /* skip :mergeFamilies */
 +      local_node->mergeFamilies = (Oid *) palloc(numCols * sizeof(Oid));
 +      for (i = 0; i < numCols; i++)
 +      {
 +              token = pg_strtok(&length);
 +              local_node->mergeFamilies[i] = atooid(token);
 +      }
 +
 +      token = pg_strtok(&length);             /* skip :mergeCollations */
 +      local_node->mergeCollations = (Oid *) palloc(numCols * sizeof(Oid));
 +      for (i = 0; i < numCols; i++)
 +      {
 +              token = pg_strtok(&length);
 +              if (portable_input)
 +              {
 +                      char       *nspname; /* namespace name */
 +                      char       *collname; /* collation name */
 +                      int             collencoding; /* collation encoding */
 +                      /* the token is already read */
 +                      nspname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* get collname */
 +                      collname = nullable_string(token, length);
  +                      token = pg_strtok(&length); /* get collencoding */
 +                      collencoding = atoi(token);
 +                      if (collname)
 +                              local_node->mergeCollations[i] = get_collid(collname,
 +                                                                                                                      collencoding,
 +                                                                                                                      NSP_OID(nspname));
 +                      else
 +                              local_node->mergeCollations[i] = InvalidOid;
 +              }
 +              else
 +              local_node->mergeCollations[i] = atooid(token);
 +      }
 +
 +      token = pg_strtok(&length);             /* skip :mergeStrategies */
 +      local_node->mergeStrategies = (int *) palloc(numCols * sizeof(int));
 +      for (i = 0; i < numCols; i++)
 +      {
 +              token = pg_strtok(&length);
 +              local_node->mergeStrategies[i] = atoi(token);
 +      }
 +
 +      token = pg_strtok(&length);             /* skip :mergeNullsFirst */
 +      local_node->mergeNullsFirst = (bool *) palloc(numCols * sizeof(bool));
 +      for (i = 0; i < numCols; i++)
 +      {
 +              token = pg_strtok(&length);
 +              local_node->mergeNullsFirst[i] = strtobool(token);
 +      }
 +
 +      READ_DONE();
 +}
 +
 +
 +/*
 + * _readHashJoin
 + */
 +static HashJoin *
 +_readHashJoin(void)
 +{
 +      READ_JOIN_FIELDS(HashJoin);
 +
 +      READ_NODE_FIELD(hashclauses);
 +
 +      READ_DONE();
 +}
 +
 +
 +/*
 + * _readMaterial
 + */
 +static Material *
 +_readMaterial(void)
 +{
 +      READ_PLAN_FIELDS(Material);
 +
 +      READ_DONE();
 +}
 +
 +
 +/*
 + * _readSort
 + */
 +static Sort *
 +_readSort(void)
 +{
 +      int i;
 +      READ_PLAN_FIELDS(Sort);
 +
 +      READ_INT_FIELD(numCols);
 +
 +      token = pg_strtok(&length);             /* skip :sortColIdx */
 +      local_node->sortColIdx = (AttrNumber *) palloc(local_node->numCols * sizeof(AttrNumber));
 +      for (i = 0; i < local_node->numCols; i++)
 +      {
 +              token = pg_strtok(&length);
 +              local_node->sortColIdx[i] = atoi(token);
 +      }
 +
 +      token = pg_strtok(&length);             /* skip :sortOperators */
 +      local_node->sortOperators = (Oid *) palloc(local_node->numCols * sizeof(Oid));
 +      for (i = 0; i < local_node->numCols; i++)
 +      {
 +              token = pg_strtok(&length);
 +              if (portable_input)
 +              {
 +                      char       *nspname; /* namespace name */
 +                      char       *oprname; /* operator name */
 +                      char       *leftnspname; /* left type namespace */
 +                      char       *leftname; /* left type name */
 +                      Oid                     oprleft; /* left type */
 +                      char       *rightnspname; /* right type namespace */
 +                      char       *rightname; /* right type name */
 +                      Oid                     oprright; /* right type */
 +                      /* token is already set to nspname */
 +                      nspname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* get operator name */
 +                      oprname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* left type namespace */
 +                      leftnspname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* left type name */
 +                      leftname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* right type namespace */
 +                      rightnspname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* right type name */
 +                      rightname = nullable_string(token, length);
 +                      if (leftname)
 +                              oprleft = get_typname_typid(leftname,
 +                                                                                      NSP_OID(leftnspname));
 +                      else
 +                              oprleft = InvalidOid;
 +                      if (rightname)
 +                              oprright = get_typname_typid(rightname,
 +                                                                                       NSP_OID(rightnspname));
 +                      else
 +                              oprright = InvalidOid;
 +                      local_node->sortOperators[i] = get_operid(oprname,
 +                                                                                                        oprleft,
 +                                                                                                        oprright,
 +                                                                                                        NSP_OID(nspname));
 +              }
 +              else
 +              local_node->sortOperators[i] = atooid(token);
 +      }
 +
 +      token = pg_strtok(&length);             /* skip :collations */
 +      local_node->collations = (Oid *) palloc(local_node->numCols * sizeof(Oid));
 +      for (i = 0; i < local_node->numCols; i++)
 +      {
 +              token = pg_strtok(&length);
 +              if (portable_input)
 +              {
 +                      char       *nspname; /* namespace name */
 +                      char       *collname; /* collation name */
 +                      int             collencoding; /* collation encoding */
 +                      /* the token is already read */
 +                      nspname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* get collname */
 +                      collname = nullable_string(token, length);
  +                      token = pg_strtok(&length); /* get collencoding */
 +                      collencoding = atoi(token);
 +                      if (collname)
 +                              local_node->collations[i] = get_collid(collname,
 +                                                                                                         collencoding,
 +                                                                                                         NSP_OID(nspname));
 +                      else
 +                              local_node->collations[i] = InvalidOid;
 +              }
 +              else
 +              local_node->collations[i] = atooid(token);
 +      }
 +
 +      token = pg_strtok(&length);             /* skip :nullsFirst */
 +      local_node->nullsFirst = (bool *) palloc(local_node->numCols * sizeof(bool));
 +      for (i = 0; i < local_node->numCols; i++)
 +      {
 +              token = pg_strtok(&length);
 +              local_node->nullsFirst[i] = strtobool(token);
 +      }
 +
 +      READ_DONE();
 +}
 +
 +
 +/*
 + * _readGroup
 + */
 +static Group *
 +_readGroup(void)
 +{
 +      int i;
 +      READ_PLAN_FIELDS(Group);
 +
 +      READ_INT_FIELD(numCols);
 +
 +      token = pg_strtok(&length);             /* skip :grpColIdx */
 +      local_node->grpColIdx = (AttrNumber *) palloc(local_node->numCols * sizeof(AttrNumber));
 +      for (i = 0; i < local_node->numCols; i++)
 +      {
 +              token = pg_strtok(&length);
 +              local_node->grpColIdx[i] = atoi(token);
 +      }
 +
 +      token = pg_strtok(&length);             /* skip :grpOperators */
 +      local_node->grpOperators = (Oid *) palloc(local_node->numCols * sizeof(Oid));
 +      for (i = 0; i < local_node->numCols; i++)
 +      {
 +              token = pg_strtok(&length);
 +              if (portable_input)
 +              {
 +                      char       *nspname; /* namespace name */
 +                      char       *oprname; /* operator name */
 +                      char       *leftnspname; /* left type namespace */
 +                      char       *leftname; /* left type name */
 +                      Oid                     oprleft; /* left type */
 +                      char       *rightnspname; /* right type namespace */
 +                      char       *rightname; /* right type name */
 +                      Oid                     oprright; /* right type */
 +                      /* token is already set to nspname */
 +                      nspname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* get operator name */
 +                      oprname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* left type namespace */
 +                      leftnspname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* left type name */
 +                      leftname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* right type namespace */
 +                      rightnspname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* right type name */
 +                      rightname = nullable_string(token, length);
 +                      if (leftname)
 +                              oprleft = get_typname_typid(leftname,
 +                                                                                      NSP_OID(leftnspname));
 +                      else
 +                              oprleft = InvalidOid;
 +                      if (rightname)
 +                              oprright = get_typname_typid(rightname,
 +                                                                                       NSP_OID(rightnspname));
 +                      else
 +                              oprright = InvalidOid;
 +                      local_node->grpOperators[i] = get_operid(oprname,
 +                                                                                                       oprleft,
 +                                                                                                       oprright,
 +                                                                                                       NSP_OID(nspname));
 +              }
 +              else
 +                      local_node->grpOperators[i] = atooid(token);
 +      }
 +
 +      READ_DONE();
 +}
 +
 +
 +/*
 + * _readAgg
 + */
 +static Agg *
 +_readAgg(void)
 +{
 +      int i;
 +      READ_PLAN_FIELDS(Agg);
 +
 +      READ_ENUM_FIELD(aggstrategy, AggStrategy);
 +      READ_ENUM_FIELD(aggdistribution, AggDistribution);
 +      READ_INT_FIELD(numCols);
 +
 +      token = pg_strtok(&length);             /* skip :grpColIdx */
 +      local_node->grpColIdx = (AttrNumber *) palloc(local_node->numCols * sizeof(AttrNumber));
 +      for (i = 0; i < local_node->numCols; i++)
 +      {
 +              token = pg_strtok(&length);
 +              local_node->grpColIdx[i] = atoi(token);
 +      }
 +
 +      token = pg_strtok(&length);             /* skip :grpOperators */
 +      local_node->grpOperators = (Oid *) palloc(local_node->numCols * sizeof(Oid));
 +      for (i = 0; i < local_node->numCols; i++)
 +      {
 +              token = pg_strtok(&length);
 +              if (portable_input)
 +              {
 +                      char       *nspname; /* namespace name */
 +                      char       *oprname; /* operator name */
 +                      char       *leftnspname; /* left type namespace */
 +                      char       *leftname; /* left type name */
 +                      Oid                     oprleft; /* left type */
 +                      char       *rightnspname; /* right type namespace */
 +                      char       *rightname; /* right type name */
 +                      Oid                     oprright; /* right type */
 +                      /* token is already set to nspname */
 +                      nspname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* get operator name */
 +                      oprname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* left type namespace */
 +                      leftnspname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* left type name */
 +                      leftname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* right type namespace */
 +                      rightnspname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* right type name */
 +                      rightname = nullable_string(token, length);
 +                      if (leftname)
 +                              oprleft = get_typname_typid(leftname,
 +                                                                                      NSP_OID(leftnspname));
 +                      else
 +                              oprleft = InvalidOid;
 +                      if (rightname)
 +                              oprright = get_typname_typid(rightname,
 +                                                                                       NSP_OID(rightnspname));
 +                      else
 +                              oprright = InvalidOid;
 +                      local_node->grpOperators[i] = get_operid(oprname,
 +                                                                                                       oprleft,
 +                                                                                                       oprright,
 +                                                                                                       NSP_OID(nspname));
 +              }
 +              else
 +                      local_node->grpOperators[i] = atooid(token);
 +      }
 +
 +      READ_LONG_FIELD(numGroups);
 +
 +      READ_NODE_FIELD(groupingSets);
 +      READ_NODE_FIELD(chain);
 +
 +      READ_DONE();
 +}
 +
 +
 +/*
 + * _readWindowAgg
 + */
 +static WindowAgg *
 +_readWindowAgg(void)
 +{
 +      int i;
 +      READ_PLAN_FIELDS(WindowAgg);
 +
 +      READ_INT_FIELD(winref);
 +      READ_INT_FIELD(partNumCols);
 +
 +      token = pg_strtok(&length);             /* skip :partColIdx */
 +      local_node->partColIdx = (AttrNumber *) palloc(local_node->partNumCols * sizeof(AttrNumber));
 +      for (i = 0; i < local_node->partNumCols; i++)
 +      {
 +              token = pg_strtok(&length);
 +              local_node->partColIdx[i] = atoi(token);
 +      }
 +
 +      token = pg_strtok(&length);             /* skip :partOperators */
 +      local_node->partOperators = (Oid *) palloc(local_node->partNumCols * sizeof(Oid));
 +      for (i = 0; i < local_node->partNumCols; i++)
 +      {
 +              token = pg_strtok(&length);
 +              if (portable_input)
 +              {
 +                      char       *nspname; /* namespace name */
 +                      char       *oprname; /* operator name */
 +                      char       *leftnspname; /* left type namespace */
 +                      char       *leftname; /* left type name */
 +                      Oid                     oprleft; /* left type */
 +                      char       *rightnspname; /* right type namespace */
 +                      char       *rightname; /* right type name */
 +                      Oid                     oprright; /* right type */
 +                      /* token is already set to nspname */
 +                      nspname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* get operator name */
 +                      oprname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* left type namespace */
 +                      leftnspname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* left type name */
 +                      leftname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* right type namespace */
 +                      rightnspname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* right type name */
 +                      rightname = nullable_string(token, length);
 +                      if (leftname)
 +                              oprleft = get_typname_typid(leftname,
 +                                                                                      NSP_OID(leftnspname));
 +                      else
 +                              oprleft = InvalidOid;
 +                      if (rightname)
 +                              oprright = get_typname_typid(rightname,
 +                                                                                       NSP_OID(rightnspname));
 +                      else
 +                              oprright = InvalidOid;
 +                      local_node->partOperators[i] = get_operid(oprname,
 +                                                                                                        oprleft,
 +                                                                                                        oprright,
 +                                                                                                        NSP_OID(nspname));
 +              }
 +              else
 +                      local_node->partOperators[i] = atooid(token);
 +      }
 +
 +      READ_INT_FIELD(ordNumCols);
 +
 +      token = pg_strtok(&length);             /* skip :ordColIdx */
 +      local_node->ordColIdx = (AttrNumber *) palloc(local_node->ordNumCols * sizeof(AttrNumber));
 +      for (i = 0; i < local_node->ordNumCols; i++)
 +      {
 +              token = pg_strtok(&length);
 +              local_node->ordColIdx[i] = atoi(token);
 +      }
 +
 +      token = pg_strtok(&length);             /* skip :ordOperators */
 +      local_node->ordOperators = (Oid *) palloc(local_node->ordNumCols * sizeof(Oid));
 +      for (i = 0; i < local_node->ordNumCols; i++)
 +      {
 +              token = pg_strtok(&length);
 +              if (portable_input)
 +              {
 +                      char       *nspname; /* namespace name */
 +                      char       *oprname; /* operator name */
 +                      char       *leftnspname; /* left type namespace */
 +                      char       *leftname; /* left type name */
 +                      Oid                     oprleft; /* left type */
 +                      char       *rightnspname; /* right type namespace */
 +                      char       *rightname; /* right type name */
 +                      Oid                     oprright; /* right type */
 +                      /* token is already set to nspname */
 +                      nspname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* get operator name */
 +                      oprname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* left type namespace */
 +                      leftnspname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* left type name */
 +                      leftname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* right type namespace */
 +                      rightnspname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* right type name */
 +                      rightname = nullable_string(token, length);
 +                      if (leftname)
 +                              oprleft = get_typname_typid(leftname,
 +                                                                                      NSP_OID(leftnspname));
 +                      else
 +                              oprleft = InvalidOid;
 +                      if (rightname)
 +                              oprright = get_typname_typid(rightname,
 +                                                                                       NSP_OID(rightnspname));
 +                      else
 +                              oprright = InvalidOid;
 +                      local_node->ordOperators[i] = get_operid(oprname,
 +                                                                                                       oprleft,
 +                                                                                                       oprright,
 +                                                                                                       NSP_OID(nspname));
 +              }
 +              else
 +                      local_node->ordOperators[i] = atooid(token);
 +      }
 +
 +      READ_INT_FIELD(frameOptions);
 +      READ_NODE_FIELD(startOffset);
 +      READ_NODE_FIELD(endOffset);
 +
 +      READ_DONE();
 +}
 +
 +
 +/*
 + * _readUnique
 + */
 +static Unique *
 +_readUnique(void)
 +{
 +      int i;
 +      READ_PLAN_FIELDS(Unique);
 +
 +      READ_INT_FIELD(numCols);
 +
 +      token = pg_strtok(&length);             /* skip :uniqColIdx */
 +      local_node->uniqColIdx = (AttrNumber *) palloc(local_node->numCols * sizeof(AttrNumber));
 +      for (i = 0; i < local_node->numCols; i++)
 +      {
 +              token = pg_strtok(&length);
 +              local_node->uniqColIdx[i] = atoi(token);
 +      }
 +
 +      token = pg_strtok(&length);             /* skip :uniqOperators */
 +      local_node->uniqOperators = (Oid *) palloc(local_node->numCols * sizeof(Oid));
 +      for (i = 0; i < local_node->numCols; i++)
 +      {
 +              token = pg_strtok(&length);
 +              if (portable_input)
 +              {
 +                      char       *nspname; /* namespace name */
 +                      char       *oprname; /* operator name */
 +                      char       *leftnspname; /* left type namespace */
 +                      char       *leftname; /* left type name */
 +                      Oid                     oprleft; /* left type */
 +                      char       *rightnspname; /* right type namespace */
 +                      char       *rightname; /* right type name */
 +                      Oid                     oprright; /* right type */
 +                      /* token is already set to nspname */
 +                      nspname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* get operator name */
 +                      oprname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* left type namespace */
 +                      leftnspname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* left type name */
 +                      leftname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* right type namespace */
 +                      rightnspname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* right type name */
 +                      rightname = nullable_string(token, length);
 +                      if (leftname)
 +                              oprleft = get_typname_typid(leftname,
 +                                                                                      NSP_OID(leftnspname));
 +                      else
 +                              oprleft = InvalidOid;
 +                      if (rightname)
 +                              oprright = get_typname_typid(rightname,
 +                                                                                       NSP_OID(rightnspname));
 +                      else
 +                              oprright = InvalidOid;
 +                      local_node->uniqOperators[i] = get_operid(oprname,
 +                                                                                                        oprleft,
 +                                                                                                        oprright,
 +                                                                                                        NSP_OID(nspname));
 +              }
 +              else
 +                      local_node->uniqOperators[i] = atooid(token);
 +      }
 +
 +      READ_DONE();
 +}
 +
 +
 +/*
 + * _readHash
 + */
 +static Hash *
 +_readHash(void)
 +{
 +      READ_PLAN_FIELDS(Hash);
 +
 +      if (portable_input)
 +              READ_RELID_FIELD(skewTable);
 +      else
 +              READ_OID_FIELD(skewTable);
 +      READ_INT_FIELD(skewColumn);
 +      READ_BOOL_FIELD(skewInherit);
 +      if (portable_input)
 +              READ_TYPID_FIELD(skewColType);
 +      else
 +              READ_OID_FIELD(skewColType);
 +      READ_INT_FIELD(skewColTypmod);
 +
 +      READ_DONE();
 +}
 +
 +
 +/*
 + * _readSetOp
 + */
 +static SetOp *
 +_readSetOp(void)
 +{
 +      int i;
 +      READ_PLAN_FIELDS(SetOp);
 +
 +      READ_ENUM_FIELD(cmd, SetOpCmd);
 +      READ_ENUM_FIELD(strategy, SetOpStrategy);
 +      READ_INT_FIELD(numCols);
 +
 +      token = pg_strtok(&length);             /* skip :dupColIdx */
 +      local_node->dupColIdx = (AttrNumber *) palloc(local_node->numCols * sizeof(AttrNumber));
 +      for (i = 0; i < local_node->numCols; i++)
 +      {
 +              token = pg_strtok(&length);
 +              local_node->dupColIdx[i] = atoi(token);
 +      }
 +
 +      token = pg_strtok(&length);             /* skip :dupOperators */
 +      local_node->dupOperators = (Oid *) palloc(local_node->numCols * sizeof(Oid));
 +      for (i = 0; i < local_node->numCols; i++)
 +      {
 +              token = pg_strtok(&length);
 +              local_node->dupOperators[i] = atooid(token);
 +      }
 +
 +      READ_INT_FIELD(flagColIdx);
 +      READ_INT_FIELD(firstFlag);
 +      READ_LONG_FIELD(numGroups);
 +
 +      READ_DONE();
 +}
 +
 +
 +/*
 + * _readLimit
 + */
 +static Limit *
 +_readLimit(void)
 +{
 +      READ_PLAN_FIELDS(Limit);
 +
 +      READ_NODE_FIELD(limitOffset);
 +      READ_NODE_FIELD(limitCount);
 +
 +      READ_DONE();
 +}
 +
 +
 +/*
 + * _readRemoteSubplan
 + */
 +static RemoteSubplan *
 +_readRemoteSubplan(void)
 +{
 +      READ_SCAN_FIELDS(RemoteSubplan);
 +
 +      READ_CHAR_FIELD(distributionType);
 +      READ_INT_FIELD(distributionKey);
 +      READ_NODE_FIELD(distributionNodes);
 +      READ_NODE_FIELD(distributionRestrict);
 +      READ_NODE_FIELD(nodeList);
 +      READ_BOOL_FIELD(execOnAll);
 +      READ_NODE_FIELD(sort);
 +      READ_STRING_FIELD(cursor);
 +      READ_INT_FIELD(unique);
 +
 +      READ_DONE();
 +}
 +
 +
 +/*
 + * _readRemoteStmt
 + */
 +static RemoteStmt *
 +_readRemoteStmt(void)
 +{
 +      int i;
 +      READ_LOCALS(RemoteStmt);
 +
 +      READ_ENUM_FIELD(commandType, CmdType);
 +      READ_BOOL_FIELD(hasReturning);
 +      READ_NODE_FIELD(planTree);
 +      READ_NODE_FIELD(rtable);
 +      READ_NODE_FIELD(resultRelations);
 +      READ_NODE_FIELD(subplans);
 +      READ_INT_FIELD(nParamExec);
 +      READ_INT_FIELD(nParamRemote);
 +      if (local_node->nParamRemote > 0)
 +      {
 +              local_node->remoteparams = (RemoteParam *) palloc(
 +                              local_node->nParamRemote * sizeof(RemoteParam));
 +              for (i = 0; i < local_node->nParamRemote; i++)
 +              {
 +                      RemoteParam *rparam = &(local_node->remoteparams[i]);
 +                      token = pg_strtok(&length); /* skip  :paramkind */
 +                      token = pg_strtok(&length);
 +                      rparam->paramkind = (ParamKind) atoi(token);
 +
 +                      token = pg_strtok(&length); /* skip  :paramid */
 +                      token = pg_strtok(&length);
 +                      rparam->paramid = atoi(token);
 +
 +                      token = pg_strtok(&length); /* skip  :paramtype */
 +                      if (portable_input)
 +                      {
 +                              char       *nspname; /* namespace name */
 +                              char       *typname; /* data type name */
 +                              token = pg_strtok(&length); /* get nspname */
 +                              nspname = nullable_string(token, length);
 +                              token = pg_strtok(&length); /* get typname */
 +                              typname = nullable_string(token, length);
 +                              if (typname)
 +                                      rparam->paramtype = get_typname_typid(typname,
 +                                                                                                                NSP_OID(nspname));
 +                              else
 +                                      rparam->paramtype = InvalidOid;
 +                      }
 +                      else
 +                      {
 +                              token = pg_strtok(&length);
 +                              rparam->paramtype = atooid(token);
 +                      }
 +              }
 +      }
 +      else
 +              local_node->remoteparams = NULL;
 +
 +      READ_NODE_FIELD(rowMarks);
 +      READ_CHAR_FIELD(distributionType);
 +      READ_INT_FIELD(distributionKey);
 +      READ_NODE_FIELD(distributionNodes);
 +      READ_NODE_FIELD(distributionRestrict);
 +
 +      READ_DONE();
 +}
 +
 +
 +/*
 + * _readSimpleSort
 + */
 +static SimpleSort *
 +_readSimpleSort(void)
 +{
 +      int i;
 +      READ_LOCALS(SimpleSort);
 +
 +      READ_INT_FIELD(numCols);
 +
 +      token = pg_strtok(&length);             /* skip :sortColIdx */
 +      local_node->sortColIdx = (AttrNumber *) palloc(local_node->numCols * sizeof(AttrNumber));
 +      for (i = 0; i < local_node->numCols; i++)
 +      {
 +              token = pg_strtok(&length);
 +              local_node->sortColIdx[i] = atoi(token);
 +      }
 +
 +      token = pg_strtok(&length);             /* skip :sortOperators */
 +      local_node->sortOperators = (Oid *) palloc(local_node->numCols * sizeof(Oid));
 +      for (i = 0; i < local_node->numCols; i++)
 +      {
 +              token = pg_strtok(&length);
 +              if (portable_input)
 +              {
 +                      char       *nspname; /* namespace name */
 +                      char       *oprname; /* operator name */
 +                      char       *leftnspname; /* left type namespace */
 +                      char       *leftname; /* left type name */
 +                      Oid                     oprleft; /* left type */
 +                      char       *rightnspname; /* right type namespace */
 +                      char       *rightname; /* right type name */
 +                      Oid                     oprright; /* right type */
 +                      /* token is already set to nspname */
 +                      nspname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* get operator name */
 +                      oprname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* left type namespace */
 +                      leftnspname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* left type name */
 +                      leftname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* right type namespace */
 +                      rightnspname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* right type name */
 +                      rightname = nullable_string(token, length);
 +                      if (leftname)
 +                              oprleft = get_typname_typid(leftname,
 +                                                                                      NSP_OID(leftnspname));
 +                      else
 +                              oprleft = InvalidOid;
 +                      if (rightname)
 +                              oprright = get_typname_typid(rightname,
 +                                                                                       NSP_OID(rightnspname));
 +                      else
 +                              oprright = InvalidOid;
 +                      local_node->sortOperators[i] = get_operid(oprname,
 +                                                                                                        oprleft,
 +                                                                                                        oprright,
 +                                                                                                        NSP_OID(nspname));
 +              }
 +              else
 +                      local_node->sortOperators[i] = atooid(token);
 +      }
 +
 +      token = pg_strtok(&length);             /* skip :sortCollations */
 +      local_node->sortCollations = (Oid *) palloc(local_node->numCols * sizeof(Oid));
 +      for (i = 0; i < local_node->numCols; i++)
 +      {
 +              token = pg_strtok(&length);
 +              if (portable_input)
 +              {
 +                      char       *nspname; /* namespace name */
 +                      char       *collname; /* collation name */
 +                      int             collencoding; /* collation encoding */
 +                      /* the token is already read */
 +                      nspname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* get collname */
 +                      collname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* get nargs */
 +                      collencoding = atoi(token);
 +                      if (collname)
 +                              local_node->sortCollations[i] = get_collid(collname,
 +                                                                                                         collencoding,
 +                                                                                                         NSP_OID(nspname));
 +                      else
 +                              local_node->sortCollations[i] = InvalidOid;
 +              }
 +              else
 +                      local_node->sortCollations[i] = atooid(token);
 +      }
 +
 +      token = pg_strtok(&length);             /* skip :nullsFirst */
 +      local_node->nullsFirst = (bool *) palloc(local_node->numCols * sizeof(bool));
 +      for (i = 0; i < local_node->numCols; i++)
 +      {
 +              token = pg_strtok(&length);
 +              local_node->nullsFirst[i] = strtobool(token);
 +      }
 +
 +      READ_DONE();
 +}
 +
 +
 +/*
 + * _readNestLoopParam
 + */
 +static NestLoopParam *
 +_readNestLoopParam(void)
 +{
 +      READ_LOCALS(NestLoopParam);
 +
 +      READ_INT_FIELD(paramno);
 +      READ_NODE_FIELD(paramval);
 +
 +      READ_DONE();
 +}
 +
 +
 +/*
 + * _readPlanRowMark
 + */
 +static PlanRowMark *
 +_readPlanRowMark(void)
 +{
 +      READ_LOCALS(PlanRowMark);
 +
 +      READ_UINT_FIELD(rti);
 +      READ_UINT_FIELD(prti);
 +      READ_UINT_FIELD(rowmarkId);
 +      READ_ENUM_FIELD(markType, RowMarkType);
 +      READ_INT_FIELD(allMarkTypes);
 +      READ_ENUM_FIELD(strength, LockClauseStrength);
 +      READ_ENUM_FIELD(waitPolicy, LockWaitPolicy);
 +      READ_BOOL_FIELD(isParent);
 +
 +      READ_DONE();
 +}
 +
 +/*
 + * _readLockRows
 + */
 +static LockRows *
 +_readLockRows(void)
 +{
 +      READ_PLAN_FIELDS(LockRows);
 +
 +      READ_NODE_FIELD(rowMarks);
 +      READ_INT_FIELD(epqParam);
 +
 +      READ_DONE();
 +}
 +
 +#endif /* XCP */
 +
 +
 +/*
 + * parseNodeString
 + *
 + * Given a character string representing a node tree, parseNodeString creates
 + * the internal node structure.
 + *
 + * The string to be read must already have been loaded into pg_strtok().
 + */
 +Node *
 +parseNodeString(void)
 +{
 +      void       *return_value;
 +
 +      READ_TEMP_LOCALS();
 +
 +      token = pg_strtok(&length);
 +
 +#define MATCH(tokname, namelen) \
 +      (length == namelen && memcmp(token, tokname, namelen) == 0)
 +
 +      if (MATCH("QUERY", 5))
 +              return_value = _readQuery();
 +      else if (MATCH("WITHCHECKOPTION", 15))
 +              return_value = _readWithCheckOption();
 +      else if (MATCH("SORTGROUPCLAUSE", 15))
 +              return_value = _readSortGroupClause();
 +      else if (MATCH("GROUPINGSET", 11))
 +              return_value = _readGroupingSet();
 +      else if (MATCH("WINDOWCLAUSE", 12))
 +              return_value = _readWindowClause();
 +      else if (MATCH("ROWMARKCLAUSE", 13))
 +              return_value = _readRowMarkClause();
 +      else if (MATCH("COMMONTABLEEXPR", 15))
 +              return_value = _readCommonTableExpr();
-       else if (MATCH("RANGETABLESAMPLE", 16))
-               return_value = _readRangeTableSample();
-       else if (MATCH("TABLESAMPLECLAUSE", 17))
-               return_value = _readTableSampleClause();
 +      else if (MATCH("SETOPERATIONSTMT", 16))
 +              return_value = _readSetOperationStmt();
 +      else if (MATCH("ALIAS", 5))
 +              return_value = _readAlias();
 +      else if (MATCH("RANGEVAR", 8))
 +              return_value = _readRangeVar();
 +      else if (MATCH("INTOCLAUSE", 10))
 +              return_value = _readIntoClause();
 +      else if (MATCH("VAR", 3))
 +              return_value = _readVar();
 +      else if (MATCH("CONST", 5))
 +              return_value = _readConst();
 +      else if (MATCH("PARAM", 5))
 +              return_value = _readParam();
 +      else if (MATCH("AGGREF", 6))
 +              return_value = _readAggref();
 +      else if (MATCH("GROUPINGFUNC", 12))
 +              return_value = _readGroupingFunc();
 +      else if (MATCH("WINDOWFUNC", 10))
 +              return_value = _readWindowFunc();
 +      else if (MATCH("ARRAYREF", 8))
 +              return_value = _readArrayRef();
 +      else if (MATCH("FUNCEXPR", 8))
 +              return_value = _readFuncExpr();
 +      else if (MATCH("NAMEDARGEXPR", 12))
 +              return_value = _readNamedArgExpr();
 +      else if (MATCH("OPEXPR", 6))
 +              return_value = _readOpExpr();
 +      else if (MATCH("DISTINCTEXPR", 12))
 +              return_value = _readDistinctExpr();
 +      else if (MATCH("NULLIFEXPR", 10))
 +              return_value = _readNullIfExpr();
 +      else if (MATCH("SCALARARRAYOPEXPR", 17))
 +              return_value = _readScalarArrayOpExpr();
 +      else if (MATCH("BOOLEXPR", 8))
 +              return_value = _readBoolExpr();
 +      else if (MATCH("SUBLINK", 7))
 +              return_value = _readSubLink();
 +#ifdef XCP
 +      else if (MATCH("SUBPLAN", 7))
 +              return_value = _readSubPlan();
 +#endif
 +      else if (MATCH("FIELDSELECT", 11))
 +              return_value = _readFieldSelect();
 +      else if (MATCH("FIELDSTORE", 10))
 +              return_value = _readFieldStore();
 +      else if (MATCH("RELABELTYPE", 11))
 +              return_value = _readRelabelType();
 +      else if (MATCH("COERCEVIAIO", 11))
 +              return_value = _readCoerceViaIO();
 +      else if (MATCH("ARRAYCOERCEEXPR", 15))
 +              return_value = _readArrayCoerceExpr();
 +      else if (MATCH("CONVERTROWTYPEEXPR", 18))
 +              return_value = _readConvertRowtypeExpr();
 +      else if (MATCH("COLLATE", 7))
 +              return_value = _readCollateExpr();
 +      else if (MATCH("CASE", 4))
 +              return_value = _readCaseExpr();
 +      else if (MATCH("WHEN", 4))
 +              return_value = _readCaseWhen();
 +      else if (MATCH("CASETESTEXPR", 12))
 +              return_value = _readCaseTestExpr();
 +      else if (MATCH("ARRAY", 5))
 +              return_value = _readArrayExpr();
 +      else if (MATCH("ROW", 3))
 +              return_value = _readRowExpr();
 +      else if (MATCH("ROWCOMPARE", 10))
 +              return_value = _readRowCompareExpr();
 +      else if (MATCH("COALESCE", 8))
 +              return_value = _readCoalesceExpr();
 +      else if (MATCH("MINMAX", 6))
 +              return_value = _readMinMaxExpr();
 +      else if (MATCH("XMLEXPR", 7))
 +              return_value = _readXmlExpr();
 +      else if (MATCH("NULLTEST", 8))
 +              return_value = _readNullTest();
 +      else if (MATCH("BOOLEANTEST", 11))
 +              return_value = _readBooleanTest();
 +      else if (MATCH("COERCETODOMAIN", 14))
 +              return_value = _readCoerceToDomain();
 +      else if (MATCH("COERCETODOMAINVALUE", 19))
 +              return_value = _readCoerceToDomainValue();
 +      else if (MATCH("SETTODEFAULT", 12))
 +              return_value = _readSetToDefault();
 +      else if (MATCH("CURRENTOFEXPR", 13))
 +              return_value = _readCurrentOfExpr();
 +      else if (MATCH("INFERENCEELEM", 13))
 +              return_value = _readInferenceElem();
        else if (MATCH("TARGETENTRY", 11))
                return_value = _readTargetEntry();
        else if (MATCH("RANGETBLREF", 11))
index 91797b8f68a752066e64651db0e048785274a21e,8fc1cfd15f5330a44c537eec49e5eecc93dac27f..4f3996ea32814f88852fa7eb4375a3c2a5a69fe7
@@@ -22,8 -17,8 +22,9 @@@
  
  #include <math.h>
  
 +#include "catalog/pg_namespace.h"
  #include "access/sysattr.h"
+ #include "access/tsmapi.h"
  #include "catalog/pg_class.h"
  #include "catalog/pg_operator.h"
  #include "foreign/fdwapi.h"
Simple merge
Simple merge
Simple merge
index 121393702a2a3979219cfccdd2317ac4111d2b50,9bf1c662b5371614dabb5b39dd90a5f676f3678f..d9c314ef224cf829fa0f1af48bf3f88d3193dac1
@@@ -1100,12 -1100,8 +1104,11 @@@ pull_up_simple_subquery(PlannerInfo *ro
                                case RTE_VALUES:
                                        child_rte->lateral = true;
                                        break;
-                               case RTE_RELATION:
                                case RTE_JOIN:
                                case RTE_CTE:
 +#ifdef XCP    
 +                              case RTE_REMOTE_DUMMY:
 +#endif
                                        /* these can't contain any lateral references */
                                        break;
                        }
@@@ -1931,12 -1950,8 +1957,11 @@@ replace_vars_in_jointree(Node *jtnode
                                                        pullup_replace_vars((Node *) rte->values_lists,
                                                                                                context);
                                                break;
-                                       case RTE_RELATION:
                                        case RTE_JOIN:
                                        case RTE_CTE:
 +#ifdef XCP
 +                                      case RTE_REMOTE_DUMMY:
 +#endif                                        
                                                /* these shouldn't be marked LATERAL */
                                                Assert(false);
                                                break;
index 027d28e26111f48f6723f8eb97ec50a7f2c6b629,4336ca1b782fabadd1f96f38e95ab96a96079cbb..b4a722e027b4abab28f34a12e5acb7b626de7257
@@@ -1714,20 -726,7 +1714,20 @@@ create_samplescan_path(PlannerInfo *roo
                                                                                                         required_outer);
        pathnode->pathkeys = NIL;       /* samplescan has unordered result */
  
-       cost_samplescan(pathnode, root, rel);
 +#ifdef XCP
 +      set_scanpath_distribution(root, rel, pathnode);
 +      if (rel->baserestrictinfo)
 +      {
 +              ListCell *lc;
 +              foreach (lc, rel->baserestrictinfo)
 +              {
 +                      RestrictInfo *ri = (RestrictInfo *) lfirst(lc);
 +                      restrict_distribution(root, ri, pathnode);
 +              }
 +      }
 +#endif
 +
+       cost_samplescan(pathnode, root, rel, pathnode->param_info);
  
        return pathnode;
  }
@@@ -3051,15 -1805,8 +3053,13 @@@ reparameterize_path(PlannerInfo *root, 
                                                                                                                loop_count);
                        }
                case T_SubqueryScan:
 +#ifdef XCP
 +                      return create_subqueryscan_path(root, rel, path->pathkeys,
 +                                                                                      required_outer, path->distribution);
 +#else
                        return create_subqueryscan_path(root, rel, path->pathkeys,
                                                                                        required_outer);
-               case T_SampleScan:
-                       return (Path *) create_samplescan_path(root, rel, required_outer);
 +#endif
                default:
                        break;
        }
index db0a38a957767d02b8387ac6c7b74e7c5a90499e,0000000000000000000000000000000000000000..a19ce71866cc21c2fd8323df736cc020171ab65c
mode 100644,000000..100644
--- /dev/null
@@@ -1,2023 -1,0 +1,2027 @@@
 +/*-------------------------------------------------------------------------
 + *
 + * pgxcship.c
 + *            Routines to evaluate expression shippability to remote nodes
 + *
 + * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2010-2012, Postgres-XC Development Group
 + * Portions Copyright (c) 1994, Regents of the University of California
 + *
 + *
 + * IDENTIFICATION
 + *      src/backend/optimizer/util/pgxcship.c
 + *
 + *-------------------------------------------------------------------------
 + */
 +
 +#include "postgres.h"
 +#include "catalog/pg_class.h"
 +#include "catalog/pg_inherits_fn.h"
 +#include "catalog/pg_namespace.h"
 +#include "catalog/pg_proc.h"
 +#ifdef PGXC
 +#include "catalog/pg_trigger.h"
 +#endif
 +#include "catalog/pg_type.h"
 +#include "catalog/pgxc_node.h"
 +#include "commands/trigger.h"
 +#include "nodes/nodeFuncs.h"
 +#include "nodes/relation.h"
 +#include "optimizer/clauses.h"
 +#include "optimizer/pgxcplan.h"
 +#include "optimizer/pgxcship.h"
 +#include "optimizer/tlist.h"
 +#include "parser/parsetree.h"
 +#include "parser/parse_coerce.h"
 +#include "parser/parse_type.h"
 +#include "pgxc/locator.h"
 +#include "pgxc/pgxcnode.h"
 +#include "utils/lsyscache.h"
 +#include "utils/rel.h"
 +
 +
 +/*
 + * Shippability_context
 + * This context structure is used by the Fast Query Shipping walker, to gather
 + * information during analysing query for Fast Query Shipping.
 + */
 +typedef struct
 +{
 +      bool            sc_for_expr;            /* if false, the we are checking shippability
 +                                                                       * of the Query, otherwise, we are checking
 +                                                                       * shippability of a stand-alone expression.
 +                                                                       */
 +      Bitmapset       *sc_shippability;       /* The conditions for (un)shippability of the
 +                                                                       * query.
 +                                                                       */
 +      Query           *sc_query;                      /* the query being analysed for FQS */
 +      int                     sc_query_level;         /* level of the query */
 +      int                     sc_max_varlevelsup;     /* maximum upper level referred to by any
 +                                                                       * variable reference in the query. If this
 +                                                                       * value is greater than 0, the query is not
 +                                                                       * shippable, if shipped alone.
 +                                                                       */
 +      ExecNodes       *sc_exec_nodes;         /* nodes where the query should be executed */
 +      ExecNodes       *sc_subquery_en;        /* ExecNodes produced by merging the ExecNodes
 +                                                                       * for individual subqueries. This gets
 +                                                                       * ultimately merged with sc_exec_nodes.
 +                                                                       */
 +      bool            sc_groupby_has_distcol; /* GROUP BY clause has distribution column */
 +} Shippability_context;
 +
 +/*
 + * ShippabilityStat
 + * List of reasons why a query/expression is not shippable to remote nodes.
 + */
 +typedef enum
 +{
 +      SS_UNSHIPPABLE_EXPR = 0,        /* it has unshippable expression */
 +      SS_NEED_SINGLENODE,                     /* Has expressions which can be evaluated when
 +                                                               * there is only a single node involved.
 +                                                               * Athought aggregates too fit in this class, we
 +                                                               * have a separate status to report aggregates,
 +                                                               * see below.
 +                                                               */
 +      SS_NEEDS_COORD,                         /* the query needs Coordinator */
 +      SS_VARLEVEL,                            /* one of its subqueries has a VAR
 +                                                               * referencing an upper level query
 +                                                               * relation
 +                                                               */
 +      SS_NO_NODES,                            /* no suitable nodes can be found to ship
 +                                                               * the query
 +                                                               */
 +      SS_UNSUPPORTED_EXPR,            /* it has expressions currently unsupported
 +                                                               * by FQS, but such expressions might be
 +                                                               * supported by FQS in future
 +                                                               */
 +      SS_HAS_AGG_EXPR,                        /* it has aggregate expressions */
 +      SS_UNSHIPPABLE_TYPE,            /* the type of expression is unshippable */
 +      SS_UNSHIPPABLE_TRIGGER,         /* the type of trigger is unshippable */
 +      SS_UPDATES_DISTRIBUTION_COLUMN  /* query updates the distribution column */
 +} ShippabilityStat;
 +
 +/* Manipulation of shippability reason */
 +static bool pgxc_test_shippability_reason(Shippability_context *context,
 +                                                                                ShippabilityStat reason);
 +static void pgxc_set_shippability_reason(Shippability_context *context,
 +                                                                               ShippabilityStat reason);
 +static void pgxc_reset_shippability_reason(Shippability_context *context,
 +                                                                                 ShippabilityStat reason);
 +
 +/* Evaluation of shippability */
 +static bool pgxc_shippability_walker(Node *node, Shippability_context *sc_context);
 +static void pgxc_set_exprtype_shippability(Oid exprtype, Shippability_context *sc_context);
 +
 +/* Fast-query shipping (FQS) functions */
 +static ExecNodes *pgxc_FQS_get_relation_nodes(RangeTblEntry *rte,
 +                                                                                        Index varno,
 +                                                                                        Query *query);
 +static ExecNodes *pgxc_FQS_find_datanodes(Query *query);
 +static bool pgxc_query_needs_coord(Query *query);
 +static bool pgxc_query_contains_only_pg_catalog(List *rtable);
 +static bool pgxc_is_var_distrib_column(Var *var, List *rtable);
 +static bool pgxc_distinct_has_distcol(Query *query);
 +static bool pgxc_targetlist_has_distcol(Query *query);
 +static ExecNodes *pgxc_FQS_find_datanodes_recurse(Node *node, Query *query,
 +                                                                                      Bitmapset **relids);
 +static ExecNodes *pgxc_FQS_datanodes_for_rtr(Index varno, Query *query);
 +
 +/*
 + * Set the given reason in Shippability_context indicating why the query can not be
 + * shipped directly to remote nodes.
 + */
 +static void
 +pgxc_set_shippability_reason(Shippability_context *context, ShippabilityStat reason)
 +{
 +      context->sc_shippability = bms_add_member(context->sc_shippability, reason);
 +}
 +
 +/*
 + * pgxc_reset_shippability_reason
 + * Reset reason why the query cannot be shipped to remote nodes
 + */
 +static void
 +pgxc_reset_shippability_reason(Shippability_context *context, ShippabilityStat reason)
 +{
 +      context->sc_shippability = bms_del_member(context->sc_shippability, reason);
 +      return;
 +}
 +
 +
 +/*
 + * See if a given reason is why the query can not be shipped directly
 + * to the remote nodes.
 + */
 +static bool
 +pgxc_test_shippability_reason(Shippability_context *context, ShippabilityStat reason)
 +{
 +      return bms_is_member(reason, context->sc_shippability);
 +}
 +
 +
 +/*
 + * pgxc_set_exprtype_shippability
 + * Set the expression type shippability. For now composite types
 + * derived from view definitions are not shippable.
 + */
 +static void
 +pgxc_set_exprtype_shippability(Oid exprtype, Shippability_context *sc_context)
 +{
 +      char    typerelkind;
 +
 +      typerelkind = get_rel_relkind(typeidTypeRelid(exprtype));
 +
 +      if (typerelkind == RELKIND_SEQUENCE ||
 +              typerelkind == RELKIND_VIEW             ||
 +              typerelkind == RELKIND_FOREIGN_TABLE)
 +              pgxc_set_shippability_reason(sc_context, SS_UNSHIPPABLE_TYPE);
 +}
 +
 +/*
 + * pgxc_FQS_datanodes_for_rtr
 + * For a given RangeTblRef find the datanodes where corresponding data is
 + * located.
 + */
 +static ExecNodes *
 +pgxc_FQS_datanodes_for_rtr(Index varno, Query *query)
 +{
 +      RangeTblEntry *rte = rt_fetch(varno, query->rtable);
 +      switch (rte->rtekind)
 +      {
 +              case RTE_RELATION:
 +              {
 +                      /* For anything, other than a table, we can't find the datanodes */
 +                      if (rte->relkind != RELKIND_RELATION)
 +                              return NULL;
 +                      /*
 +                       * In case of inheritance, child tables can have completely different
 +                       * Datanode distribution than parent. To handle inheritance we need
 +                       * to merge the Datanodes of the children table as well. The inheritance
 +                       * is resolved during planning, so we may not have the RTEs of the
 +                       * children here. Also, the exact method of merging Datanodes of the
 +                       * children is not known yet. So, when inheritance is requested, query
 +                       * can not be shipped.
 +                       * See prologue of has_subclass, we might miss on the optimization
 +                       * because has_subclass can return true even if there aren't any
 +                       * subclasses, but it's ok.
 +                       */
 +                      if (rte->inh && has_subclass(rte->relid))
 +                              return NULL;
 +
 +                      return pgxc_FQS_get_relation_nodes(rte, varno, query);
 +              }
 +              break;
 +
 +              /* For any other type of RTE, we return NULL for now */
 +              case RTE_JOIN:
 +              case RTE_CTE:
 +              case RTE_SUBQUERY:
 +              case RTE_FUNCTION:
 +              case RTE_VALUES:
 +              default:
 +                      return NULL;
 +      }
 +}
 +
 +/*
 + * pgxc_FQS_find_datanodes_recurse
 + * Recursively find whether the sub-tree of From Expr rooted under given node is
 + * pushable and if yes where.
 + */
 +static ExecNodes *
 +pgxc_FQS_find_datanodes_recurse(Node *node, Query *query, Bitmapset **relids)
 +{
 +      List            *query_rtable = query->rtable;
 +
 +      if (!node)
 +              return NULL;
 +
 +      switch(nodeTag(node))
 +      {
 +              case T_FromExpr:
 +              {
 +                      FromExpr        *from_expr = (FromExpr *)node;
 +                      ListCell        *lcell;
 +                      bool            first;
 +                      Bitmapset       *from_relids;
 +                      ExecNodes       *result_en;
 +
 +                      /*
 +                       * For INSERT commands, we won't have any entries in the from list.
 +                       * Get the datanodes using the resultRelation index.
 +                       */
 +                      if (query->commandType != CMD_SELECT && !from_expr->fromlist)
 +                      {
 +                              *relids = bms_make_singleton(query->resultRelation);
 +                              return pgxc_FQS_datanodes_for_rtr(query->resultRelation,
 +                                                                                                              query);
 +                      }
 +
 +                      /*
 +                       * All the entries in the From list are considered to be INNER
 +                       * joined with the quals as the JOIN condition. Get the datanodes
 +                       * for the first entry in the From list. For every subsequent entry
 +                       * determine whether the join between the relation in that entry and
 +                       * the cumulative JOIN of previous entries can be pushed down to the
 +                       * datanodes and the corresponding set of datanodes where the join
 +                       * can be pushed down.
 +                       */
 +                      first = true;
 +                      result_en = NULL;
 +                      from_relids = NULL;
 +                      foreach (lcell, from_expr->fromlist)
 +                      {
 +                              Node    *fromlist_entry = lfirst(lcell);
 +                              Bitmapset *fle_relids = NULL;
 +                              ExecNodes       *tmp_en;
 +                              ExecNodes *en = pgxc_FQS_find_datanodes_recurse(fromlist_entry,
 +                                                                                                                              query, &fle_relids);
 +                              /*
 +                               * If any entry in fromlist is not shippable, jointree is not
 +                               * shippable
 +                               */
 +                              if (!en)
 +                              {
 +                                      FreeExecNodes(&result_en);
 +                                      return NULL;
 +                              }
 +
 +                              /* FQS does't ship a DML with more than one relation involved */
 +                              if (!first && query->commandType != CMD_SELECT)
 +                              {
 +                                      FreeExecNodes(&result_en);
 +                                      return NULL;
 +                              }
 +
 +                              if (first)
 +                              {
 +                                      first = false;
 +                                      result_en = en;
 +                                      from_relids = fle_relids;
 +                                      continue;
 +                              }
 +
 +                              tmp_en = result_en;
 +                              /*
 +                               * Check whether the JOIN is pushable to the datanodes and
 +                               * find the datanodes where the JOIN can be pushed to
 +                               */
 +                              result_en = pgxc_is_join_shippable(result_en, en, from_relids,
 +                                                                              fle_relids, JOIN_INNER,
 +                                                                              make_ands_implicit((Expr *)from_expr->quals),
 +                                                                              query_rtable);
 +                              from_relids = bms_join(from_relids, fle_relids);
 +                              FreeExecNodes(&tmp_en);
 +                      }
 +
 +                      *relids = from_relids;
 +                      return result_en;
 +              }
 +                      break;
 +
 +              case T_RangeTblRef:
 +              {
 +                      RangeTblRef *rtr = (RangeTblRef *)node;
 +                      *relids = bms_make_singleton(rtr->rtindex);
 +                      return pgxc_FQS_datanodes_for_rtr(rtr->rtindex, query);
 +              }
 +                      break;
 +
 +              case T_JoinExpr:
 +              {
 +                      JoinExpr *join_expr = (JoinExpr *)node;
 +                      Bitmapset *l_relids = NULL;
 +                      Bitmapset *r_relids = NULL;
 +                      ExecNodes *len;
 +                      ExecNodes *ren;
 +                      ExecNodes *result_en;
 +
 +                      /* FQS does't ship a DML with more than one relation involved */
 +                      if (query->commandType != CMD_SELECT)
 +                              return NULL;
 +
 +                      len = pgxc_FQS_find_datanodes_recurse(join_expr->larg, query,
 +                                                                                                                              &l_relids);
 +                      ren = pgxc_FQS_find_datanodes_recurse(join_expr->rarg, query,
 +                                                                                                                              &r_relids);
 +                      /* If either side of JOIN is unshippable, JOIN is unshippable */
 +                      if (!len || !ren)
 +                      {
 +                              FreeExecNodes(&len);
 +                              FreeExecNodes(&ren);
 +                              return NULL;
 +                      }
 +                      /*
 +                       * Check whether the JOIN is pushable or not, and find the datanodes
 +                       * where the JOIN can be pushed to.
 +                       */
 +                      result_en = pgxc_is_join_shippable(ren, len, r_relids, l_relids,
 +                                                                                              join_expr->jointype,
 +                                                                                              make_ands_implicit((Expr *)join_expr->quals),
 +                                                                                              query_rtable);
 +                      FreeExecNodes(&len);
 +                      FreeExecNodes(&ren);
 +                      *relids = bms_join(l_relids, r_relids);
 +                      return result_en;
 +              }
 +                      break;
 +
 +              default:
 +                      *relids = NULL;
 +                      return NULL;
 +                      break;
 +      }
 +      /* Keep compiler happy */
 +      return NULL;
 +}
 +
 +/*
 + * pgxc_FQS_find_datanodes
 + * Find the list of nodes where to ship query.
 + */
 +static ExecNodes *
 +pgxc_FQS_find_datanodes(Query *query)
 +{
 +      Bitmapset       *relids = NULL;
 +      ExecNodes       *exec_nodes;
 +
 +      /*
 +       * For SELECT, the datanodes required to execute the query is obtained from
 +       * the join tree of the query
 +       */
 +      exec_nodes = pgxc_FQS_find_datanodes_recurse((Node *)query->jointree,
 +                                                                                                              query, &relids);
 +      bms_free(relids);
 +      relids = NULL;
 +
 +      /* If we found the datanodes to ship, use them */
 +      if (exec_nodes && exec_nodes->nodeList)
 +      {
 +              /*
 +               * If relations involved in the query are such that ultimate JOIN is
 +               * replicated JOIN, choose only one of them. If one of them is a
 +               * preferred node choose that one, otherwise choose the first one.
 +               */
 +              if (IsLocatorReplicated(exec_nodes->baselocatortype) &&
 +                      exec_nodes->accesstype == RELATION_ACCESS_READ)
 +              {
 +                      List *tmp_list = exec_nodes->nodeList;
 +                      exec_nodes->nodeList = GetPreferredReplicationNode(exec_nodes->nodeList);
 +                      list_free(tmp_list);
 +              }
 +              return exec_nodes;
 +      }
 +      /*
 +       * If we found the expression which can decide which can be used to decide
 +       * where to ship the query, use that
 +       */
 +      else if (exec_nodes && exec_nodes->en_expr)
 +              return exec_nodes;
 +      /* No way to figure out datanodes to ship the query to */
 +      return NULL;
 +}
 +
 +
 +/*
 + * pgxc_FQS_get_relation_nodes
 + * Return ExecNodes structure so as to decide which node the query should
 + * execute on. If it is possible to set the node list directly, set it.
 + * Otherwise set the appropriate distribution column expression or relid in
 + * ExecNodes structure.
 + */
 +static ExecNodes *
 +pgxc_FQS_get_relation_nodes(RangeTblEntry *rte, Index varno, Query *query)
 +{
 +      CmdType command_type = query->commandType;
 +      bool for_update = query->rowMarks ? true : false;
 +      ExecNodes       *rel_exec_nodes;
 +      RelationAccessType rel_access = RELATION_ACCESS_READ;
 +      RelationLocInfo *rel_loc_info;
 +
 +      Assert(rte == rt_fetch(varno, (query->rtable)));
 +
 +      switch (command_type)
 +      {
 +              case CMD_SELECT:
 +                      if (for_update)
 +                              rel_access = RELATION_ACCESS_READ_FOR_UPDATE;
 +                      else
 +                              rel_access = RELATION_ACCESS_READ;
 +                      break;
 +
 +              case CMD_UPDATE:
 +              case CMD_DELETE:
 +                      rel_access = RELATION_ACCESS_UPDATE;
 +                      break;
 +
 +              case CMD_INSERT:
 +                      rel_access = RELATION_ACCESS_INSERT;
 +                      break;
 +
 +              default:
 +                      /* should not happen, but */
 +                      elog(ERROR, "Unrecognised command type %d", command_type);
 +                      break;
 +      }
 +
 +      rel_loc_info = GetRelationLocInfo(rte->relid);
 +      /* If we don't know about the distribution of relation, bail out */
 +      if (!rel_loc_info)
 +              return NULL;
 +
 +      /*
 +       * Find out the datanodes to execute this query on.
 +       * PGXC_FQS_TODO: for now, we apply node reduction only when there is only
 +       * one relation involved in the query. If there are multiple distributed
 +       * tables in the query and we apply node reduction here, we may fail to ship
 +       * the entire join. We should apply node reduction transitively.
 +       */
 +      if (list_length(query->rtable) == 1)
 +              rel_exec_nodes = GetRelationNodesByQuals(rte->relid, varno,
 +                                                                                               query->jointree->quals, rel_access);
 +      else
 +              rel_exec_nodes = GetRelationNodes(rel_loc_info, (Datum) 0,
 +                                                                                true, rel_access);
 +
 +      if (!rel_exec_nodes)
 +              return NULL;
 +
 +      if (rel_access == RELATION_ACCESS_INSERT &&
 +                       IsRelationDistributedByValue(rel_loc_info))
 +      {
 +              ListCell *lc;
 +              TargetEntry *tle;
 +              /*
 +               * If the INSERT is happening on a table distributed by value of a
 +               * column, find out the
 +               * expression for distribution column in the targetlist, and stick in
 +               * in ExecNodes, and clear the nodelist. Execution will find
 +               * out where to insert the row.
 +               */
 +              /* It is a partitioned table, get value by looking in targetList */
 +              foreach(lc, query->targetList)
 +              {
 +                      tle = (TargetEntry *) lfirst(lc);
 +
 +                      if (tle->resjunk)
 +                              continue;
 +                      if (strcmp(tle->resname, GetRelationDistribColumn(rel_loc_info)) == 0)
 +                              break;
 +              }
 +              /* Not found, bail out */
 +              if (!lc)
 +                      return NULL;
 +
 +              Assert(tle);
 +              /* We found the TargetEntry for the partition column */
 +              list_free(rel_exec_nodes->primarynodelist);
 +              rel_exec_nodes->primarynodelist = NULL;
 +              list_free(rel_exec_nodes->nodeList);
 +              rel_exec_nodes->nodeList = NULL;
 +              rel_exec_nodes->en_expr = tle->expr;
 +              rel_exec_nodes->en_relid = rel_loc_info->relid;
 +      }
 +      return rel_exec_nodes;
 +}
 +
 +bool
 +pgxc_query_has_distcolgrouping(Query *query)
 +{
 +      ListCell        *lcell;
 +      foreach (lcell, query->groupClause)
 +      {
 +              SortGroupClause         *sgc = lfirst(lcell);
 +              Node                            *sgc_expr;
 +              if (!IsA(sgc, SortGroupClause))
 +                      continue;
 +              sgc_expr = get_sortgroupclause_expr(sgc, query->targetList);
 +              if (IsA(sgc_expr, Var) &&
 +                      pgxc_is_var_distrib_column((Var *)sgc_expr, query->rtable))
 +                      return true;
 +      }
 +      return false;
 +}
 +
 +static bool
 +pgxc_distinct_has_distcol(Query *query)
 +{
 +      ListCell        *lcell;
 +      foreach (lcell, query->distinctClause)
 +      {
 +              SortGroupClause         *sgc = lfirst(lcell);
 +              Node                            *sgc_expr;
 +              if (!IsA(sgc, SortGroupClause))
 +                      continue;
 +              sgc_expr = get_sortgroupclause_expr(sgc, query->targetList);
 +              if (IsA(sgc_expr, Var) &&
 +                      pgxc_is_var_distrib_column((Var *)sgc_expr, query->rtable))
 +                      return true;
 +      }
 +      return false;
 +}
 +
 +/*
 + * pgxc_shippability_walker
 + * walks the query/expression tree routed at the node passed in, gathering
 + * information which will help decide whether the query to which this node
 + * belongs is shippable to the Datanodes.
 + *
 + * The function should try to walk the entire tree analysing each subquery for
 + * shippability. If a subquery is shippable but not the whole query, we would be
 + * able to create a RemoteQuery node for that subquery, shipping it to the
 + * Datanode.
 + *
 + * Return value of this function is governed by the same rules as
 + * expression_tree_walker(), see prologue of that function for details.
 + */
 +static bool
 +pgxc_shippability_walker(Node *node, Shippability_context *sc_context)
 +{
 +      if (node == NULL)
 +              return false;
 +
 +      /* Below is the list of nodes that can appear in a query, examine each
 +       * kind of node and find out under what conditions query with this node can
 +       * be shippable. For each node, update the context (add fields if
 +       * necessary) so that decision whether to FQS the query or not can be made.
 +       * Every node which has a result is checked to see if the result type of that
 +       * expression is shippable.
 +       */
 +      switch(nodeTag(node))
 +      {
 +              /* Constants are always shippable */
 +              case T_Const:
 +                      pgxc_set_exprtype_shippability(exprType(node), sc_context);
 +                      break;
 +
 +                      /*
 +                       * For placeholder nodes the shippability of the node, depends upon the
 +                       * expression which they refer to. It will be checked separately, when
 +                       * that expression is encountered.
 +                       */
 +              case T_CaseTestExpr:
 +                      pgxc_set_exprtype_shippability(exprType(node), sc_context);
 +                      break;
 +
 +                      /*
 +                       * record_in() function throws error, thus requesting a result in the
 +                       * form of anonymous record from datanode gets into error. Hence, if the
 +                       * top expression of a target entry is ROW(), it's not shippable.
 +                       */
 +              case T_TargetEntry:
 +              {
 +                      TargetEntry *tle = (TargetEntry *)node;
 +                      if (tle->expr)
 +                      {
 +                              char typtype = get_typtype(exprType((Node *)tle->expr));
 +                              if (!typtype || typtype == TYPTYPE_PSEUDO)
 +                                      pgxc_set_shippability_reason(sc_context, SS_UNSHIPPABLE_EXPR);
 +                      }
 +              }
 +              break;
 +
 +              case T_SortGroupClause:
 +                      if (sc_context->sc_for_expr)
 +                              pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR);
 +                      break;
 +
 +              case T_CoerceViaIO:
 +              {
 +                      CoerceViaIO             *cvio = (CoerceViaIO *)node;
 +                      Oid                             input_type = exprType((Node *)cvio->arg);
 +                      Oid                             output_type = cvio->resulttype;
 +                      CoercionContext cc;
 +
 +                      cc = cvio->coerceformat == COERCE_IMPLICIT_CAST ? COERCION_IMPLICIT :
 +                              COERCION_EXPLICIT;
 +                      /*
 +                       * Internally we use IO coercion for types which do not have casting
 +                       * defined for them e.g. cstring::date. If such casts are sent to
 +                       * the datanode, those won't be accepted. Hence such casts are
 +                       * unshippable. Since it will be shown as an explicit cast.
 +                       */
 +                      if (!can_coerce_type(1, &input_type, &output_type, cc))
 +                              pgxc_set_shippability_reason(sc_context, SS_UNSHIPPABLE_EXPR);
 +                      pgxc_set_exprtype_shippability(exprType(node), sc_context);
 +              }
 +              break;
 +              /*
 +               * Nodes, which are shippable if the tree rooted under these nodes is
 +               * shippable
 +               */
 +              case T_CoerceToDomainValue:
 +                      /*
 +                       * PGXCTODO: mostly, CoerceToDomainValue node appears in DDLs,
 +                       * do we handle DDLs here?
 +                       */
 +              case T_FieldSelect:
 +              case T_NamedArgExpr:
 +              case T_RelabelType:
 +              case T_BoolExpr:
 +                      /*
 +                       * PGXCTODO: we might need to take into account the kind of boolean
 +                       * operator we have in the quals and see if the corresponding
 +                       * function is immutable.
 +                       */
 +              case T_ArrayCoerceExpr:
 +              case T_ConvertRowtypeExpr:
 +              case T_CaseExpr:
 +              case T_ArrayExpr:
 +              case T_RowExpr:
 +              case T_CollateExpr:
 +              case T_CoalesceExpr:
 +              case T_XmlExpr:
 +              case T_NullTest:
 +              case T_BooleanTest:
 +              case T_CoerceToDomain:
 +                      pgxc_set_exprtype_shippability(exprType(node), sc_context);
 +                      break;
 +
 +              case T_List:
 +              case T_RangeTblRef:
 +                      break;
 +
 +              case T_ArrayRef:
 +                      /*
 +                       * When multiple values of of an array are updated at once
 +                       * FQS planner cannot yet handle SQL representation correctly.
 +                       * So disable FQS in this case and let standard planner manage it.
 +                       */
 +              case T_FieldStore:
 +                      /*
 +                       * PostgreSQL deparsing logic does not handle the FieldStore
 +                       * for more than one fields (see processIndirection()). So, let's
 +                       * handle it through standard planner, where whole row will be
 +                       * constructed.
 +                       */
 +              case T_SetToDefault:
 +                      /*
 +                       * PGXCTODO: we should actually check whether the default value to
 +                       * be substituted is shippable to the Datanode. Some cases like
 +                       * nextval() of a sequence can not be shipped to the Datanode, hence
 +                       * for now default values can not be shipped to the Datanodes
 +                       */
 +                      pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR);
 +                      pgxc_set_exprtype_shippability(exprType(node), sc_context);
 +                      break;
 +
 +              case T_Var:
 +              {
 +                      Var     *var = (Var *)node;
 +                      /*
 +                       * if a subquery references an upper level variable, that query is
 +                       * not shippable, if shipped alone.
 +                       */
 +                      if (var->varlevelsup > sc_context->sc_max_varlevelsup)
 +                              sc_context->sc_max_varlevelsup = var->varlevelsup;
 +                      pgxc_set_exprtype_shippability(exprType(node), sc_context);
 +              }
 +              break;
 +
 +              case T_Param:
 +              {
 +                      Param *param = (Param *)node;
 +                      /* PGXCTODO: Can we handle internally generated parameters? */
 +                      if (param->paramkind != PARAM_EXTERN)
 +                              pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR);
 +                      pgxc_set_exprtype_shippability(exprType(node), sc_context);
 +              }
 +              break;
 +
 +              case T_CurrentOfExpr:
 +              {
 +                      /*
 +                       * Ideally we should not see CurrentOf expression here, it
 +                       * should have been replaced by the CTID = ? expression. But
 +                       * still, no harm in shipping it as is.
 +                       */
 +                      pgxc_set_exprtype_shippability(exprType(node), sc_context);
 +              }
 +              break;
 +
 +              case T_Aggref:
 +              {
 +                      Aggref *aggref = (Aggref *)node;
 +                      /*
 +                       * An aggregate is completely shippable to the Datanode, if the
 +                       * whole group resides on that Datanode. This will be clear when
 +                       * we see the GROUP BY clause.
 +                       * agglevelsup is minimum of variable's varlevelsup, so we will
 +                       * set the sc_max_varlevelsup when we reach the appropriate
 +                       * VARs in the tree.
 +                       */
 +                      pgxc_set_shippability_reason(sc_context, SS_HAS_AGG_EXPR);
 +                      /*
 +                       * If a stand-alone expression to be shipped, is an
 +                       * 1. aggregate with ORDER BY, DISTINCT directives, it needs all
 +                       * the qualifying rows
 +                       * 2. aggregate without collection function
 +                       * 3. (PGXCTODO:)aggregate with polymorphic transition type, the
 +                       *    the transition type needs to be resolved to correctly interpret
 +                       *    the transition results from Datanodes.
 +                       * Hence, such an expression can not be shipped to the datanodes.
 +                       */
 +                      if (aggref->aggorder ||
 +                              aggref->aggdistinct ||
 +                              aggref->agglevelsup ||
 +                              !aggref->agghas_collectfn ||
 +                              IsPolymorphicType(aggref->aggtrantype))
 +                              pgxc_set_shippability_reason(sc_context, SS_NEED_SINGLENODE);
 +
 +                      pgxc_set_exprtype_shippability(exprType(node), sc_context);
 +              }
 +              break;
 +
 +              case T_FuncExpr:
 +              {
 +                      FuncExpr        *funcexpr = (FuncExpr *)node;
 +                      /*
 +                       * PGXC_FQS_TODO: it's too restrictive not to ship non-immutable
 +                       * functions to the Datanode. We need a better way to see what
 +                       * can be shipped to the Datanode and what can not be.
 +                       */
 +                      if (!pgxc_is_func_shippable(funcexpr->funcid))
 +                              pgxc_set_shippability_reason(sc_context, SS_UNSHIPPABLE_EXPR);
 +
 +                      /*
 +                       * If this is a stand alone expression and the function returns a
 +                       * set of rows, we need to handle it along with the final result of
 +                       * other expressions. So, it can not be shippable.
 +                       */
 +                      if (funcexpr->funcretset && sc_context->sc_for_expr)
 +                              pgxc_set_shippability_reason(sc_context, SS_UNSHIPPABLE_EXPR);
 +
 +                      pgxc_set_exprtype_shippability(exprType(node), sc_context);
 +              }
 +              break;
 +
 +              case T_OpExpr:
 +              case T_DistinctExpr:    /* struct-equivalent to OpExpr */
 +              case T_NullIfExpr:              /* struct-equivalent to OpExpr */
 +              {
 +                      /*
 +                       * All of these three are structurally equivalent to OpExpr, so
 +                       * cast the node to OpExpr and check if the operator function is
 +                       * immutable. See PGXC_FQS_TODO item for FuncExpr.
 +                       */
 +                      OpExpr *op_expr = (OpExpr *)node;
 +                      Oid             opfuncid = OidIsValid(op_expr->opfuncid) ?
 +                              op_expr->opfuncid : get_opcode(op_expr->opno);
 +                      if (!OidIsValid(opfuncid) ||
 +                              !pgxc_is_func_shippable(opfuncid))
 +                              pgxc_set_shippability_reason(sc_context, SS_UNSHIPPABLE_EXPR);
 +
 +                      pgxc_set_exprtype_shippability(exprType(node), sc_context);
 +              }
 +              break;
 +
 +              case T_ScalarArrayOpExpr:
 +              {
 +                      /*
 +                       * Check if the operator function is shippable to the Datanode
 +                       * PGXC_FQS_TODO: see immutability note for FuncExpr above
 +                       */
 +                      ScalarArrayOpExpr *sao_expr = (ScalarArrayOpExpr *)node;
 +                      Oid             opfuncid = OidIsValid(sao_expr->opfuncid) ?
 +                              sao_expr->opfuncid : get_opcode(sao_expr->opno);
 +                      if (!OidIsValid(opfuncid) ||
 +                              !pgxc_is_func_shippable(opfuncid))
 +                              pgxc_set_shippability_reason(sc_context, SS_UNSHIPPABLE_EXPR);
 +              }
 +              break;
 +
 +              case T_RowCompareExpr:
 +              case T_MinMaxExpr:
 +              {
 +                      /*
 +                       * PGXCTODO should we be checking the comparision operator
 +                       * functions as well, as we did for OpExpr OR that check is
 +                       * unnecessary. Operator functions are always shippable?
 +                       * Otherwise this node should be treated similar to other
 +                       * "shell" nodes.
 +                       */
 +                      pgxc_set_exprtype_shippability(exprType(node), sc_context);
 +              }
 +              break;
 +
 +              case T_Query:
 +              {
 +                      Query *query = (Query *)node;
 +
 +                      /* PGXCTODO : If the query has a returning list, it is not shippable as of now */
 +                      if (query->returningList)
 +                              pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR);
 +
 +                      /* A stand-alone expression containing Query is not shippable */
 +                      if (sc_context->sc_for_expr)
 +                      {
 +                              pgxc_set_shippability_reason(sc_context, SS_UNSHIPPABLE_EXPR);
 +                              break;
 +                      }
 +                      /*
 +                       * We are checking shippability of whole query, go ahead. The query
 +                       * in the context should be same as the query being checked
 +                       */
 +                      Assert(query == sc_context->sc_query);
 +
 +                      /* CREATE TABLE AS is not supported in FQS */
 +                      if (query->commandType == CMD_UTILITY &&
 +                              IsA(query->utilityStmt, CreateTableAsStmt))
 +                              pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR);
 +
 +                      if (query->hasRecursive)
 +                              pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR);
 +                      /*
 +                       * If the query needs Coordinator for evaluation or the query can be
 +                       * completed on Coordinator itself, we don't ship it to the Datanode
 +                       */
 +                      if (pgxc_query_needs_coord(query))
 +                              pgxc_set_shippability_reason(sc_context, SS_NEEDS_COORD);
 +
 +                      /* PGXCTODO: It should be possible to look at the Query and find out
 +                       * whether it can be completely evaluated on the Datanode just like SELECT
 +                       * queries. But we need to be careful while finding out the Datanodes to
 +                       * execute the query on, esp. for the result relations. If one happens to
 +                       * remove/change this restriction, make sure you change
 +                       * pgxc_FQS_get_relation_nodes appropriately.
 +                       * For now DMLs with single rtable entry are candidates for FQS
 +                       */
 +                      if (query->commandType != CMD_SELECT && list_length(query->rtable) > 1)
 +                              pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR);
 +
 +                      /*
 +                       * In following conditions query is shippable when there is only one
 +                       * Datanode involved
 +                       * 1. the query has aggregagtes without grouping by distribution
 +                       *    column
 +                       * 2. the query has window functions
 +                       * 3. the query has ORDER BY clause
 +                       * 4. the query has Distinct clause without distribution column in
 +                       *    distinct clause
 +                       * 5. the query has limit and offset clause
 +                       */
 +                      if (query->hasWindowFuncs || query->sortClause ||
 +                              query->limitOffset || query->limitCount)
 +                              pgxc_set_shippability_reason(sc_context, SS_NEED_SINGLENODE);
 +
 +                      /*
 +                       * Presence of aggregates or having clause, implies grouping. In
 +                       * such cases, the query won't be shippable unless 1. there is only
 +                       * a single node involved 2. GROUP BY clause has distribution column
 +                       * in it. In the later case aggregates for a given group are entirely
 +                       * computable on a single datanode, because all the rows
 +                       * participating in particular group reside on that datanode.
 +                       * The distribution column can be of any relation
 +                       * participating in the query. All the rows of that relation with
 +                       * the same value of distribution column reside on same node.
 +                       */
 +                      if ((query->hasAggs || query->havingQual) &&
 +                              !pgxc_query_has_distcolgrouping(query))
 +                              pgxc_set_shippability_reason(sc_context, SS_NEED_SINGLENODE);
 +
 +                      /*
 +                       * If distribution column of any relation is present in the distinct
 +                       * clause, values for that column across nodes will differ, thus two
 +                       * nodes won't be able to produce same result row. Hence in such
 +                       * case, we can execute the queries on many nodes managing to have
 +                       * distinct result.
 +                       */
 +                      if (query->distinctClause && !pgxc_distinct_has_distcol(query))
 +                              pgxc_set_shippability_reason(sc_context, SS_NEED_SINGLENODE);
 +
 +                      
 +                      if ((query->commandType == CMD_UPDATE) &&
 +                                      pgxc_targetlist_has_distcol(query))
 +                              pgxc_set_shippability_reason(sc_context, SS_UPDATES_DISTRIBUTION_COLUMN);
 +
 +
 +                      /*
 +                       * walk the entire query tree to analyse the query. We will walk the
 +                       * range table, when examining the FROM clause. No need to do it
 +                       * here
 +                       */
 +                      if (query_tree_walker(query, pgxc_shippability_walker,
 +                                                                      sc_context, QTW_IGNORE_RANGE_TABLE ))
 +                              return true;
 +
 +                      /*
 +                       * PGXC_FQS_TODO:
 +                       * There is a subquery in this query, which references Vars in the upper
 +                       * query. For now stop shipping such queries. We should get rid of this
 +                       * condition.
 +                       */
 +                      if (sc_context->sc_max_varlevelsup != 0)
 +                              pgxc_set_shippability_reason(sc_context, SS_VARLEVEL);
 +
 +                      /*
 +                       * Walk the join tree of the query and find the
 +                       * Datanodes needed for evaluating this query
 +                       */
 +                      sc_context->sc_exec_nodes = pgxc_FQS_find_datanodes(query);
 +              }
 +              break;
 +
 +              case T_FromExpr:
 +              {
 +                      /* We don't expect FromExpr in a stand-alone expression */
 +                      if (sc_context->sc_for_expr)
 +                              pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR);
 +
 +                      /*
 +                       * We will examine the jointree of query separately to determine the
 +                       * set of datanodes where to execute the query.
 +                       * If this is an INSERT query with quals, resulting from say
 +                       * conditional rule, we can not handle those in FQS, since there is
 +                       * not SQL representation for such quals.
 +                       */
 +                      if (sc_context->sc_query->commandType == CMD_INSERT &&
 +                              ((FromExpr *)node)->quals)
 +                              pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR);
 +
 +              }
 +              break;
 +
 +              case T_WindowFunc:
 +              {
 +                      WindowFunc *winf = (WindowFunc *)node;
 +                      /*
 +                       * A window function can be evaluated on a Datanode if there is
 +                       * only one Datanode involved.
 +                       */
 +                      pgxc_set_shippability_reason(sc_context, SS_NEED_SINGLENODE);
 +
 +                      /*
 +                       * A window function is not shippable as part of a stand-alone
 +                       * expression. If the window function is non-immutable, it can not
 +                       * be shipped to the datanodes.
 +                       */
 +                      if (sc_context->sc_for_expr ||
 +                              !pgxc_is_func_shippable(winf->winfnoid))
 +                              pgxc_set_shippability_reason(sc_context, SS_UNSHIPPABLE_EXPR);
 +
 +                      pgxc_set_exprtype_shippability(exprType(node), sc_context);
 +              }
 +              break;
++      
++              case T_GroupingFunc:
++                      pgxc_set_shippability_reason(sc_context, SS_NEED_SINGLENODE);
++                      break;
 +
 +              case T_WindowClause:
 +              {
 +                      /*
 +                       * A window function can be evaluated on a Datanode if there is
 +                       * only one Datanode involved.
 +                       */
 +                      pgxc_set_shippability_reason(sc_context, SS_NEED_SINGLENODE);
 +
 +                      /*
 +                       * A window function is not shippable as part of a stand-alone
 +                       * expression
 +                       */
 +                      if (sc_context->sc_for_expr)
 +                              pgxc_set_shippability_reason(sc_context, SS_UNSHIPPABLE_EXPR);
 +              }
 +              break;
 +
 +              case T_JoinExpr:
 +                      /* We don't expect JoinExpr in a stand-alone expression */
 +                      if (sc_context->sc_for_expr)
 +                              pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR);
 +
 +                      /*
 +                       * The shippability of join will be deduced while
 +                       * examining the jointree of the query. Nothing to do here
 +                       */
 +                      break;
 +
 +              case T_SubLink:
 +              {
 +                      /*
 +                       * We need to walk the tree in sublink to check for its
 +                       * shippability. We need to call pgxc_is_query_shippable() on Query
 +                       * instead of this function so that every subquery gets a different
 +                       * context for itself. We should avoid the default expression walker
 +                       * getting called on the subquery. At the same time we don't want to
 +                       * miss any other member (current or future) of this structure, from
 +                       * being scanned. So, copy the SubLink structure with subselect
 +                       * being NULL and call expression_tree_walker on the copied
 +                       * structure.
 +                       */
 +                      SubLink         sublink = *(SubLink *)node;
 +                      ExecNodes       *sublink_en;
 +                      /*
 +                       * Walk the query and find the nodes where the query should be
 +                       * executed and node distribution. Merge this with the existing
 +                       * node list obtained for other subqueries. If merging fails, we
 +                       * can not ship the whole query.
 +                       */
 +                      if (IsA(sublink.subselect, Query))
 +                              sublink_en = pgxc_is_query_shippable((Query *)(sublink.subselect),
 +                                                                                                       sc_context->sc_query_level);
 +                      else
 +                              sublink_en = NULL;
 +
 +                      /* PGXCTODO free the old sc_subquery_en. */
 +                      /* If we already know that this query does not have a set of nodes
 +                       * to evaluate on, don't bother to merge again.
 +                       */
 +                      if (!pgxc_test_shippability_reason(sc_context, SS_NO_NODES))
 +                      {
 +                              /*
 +                               * If this is the first time we are finding out the nodes for
 +                               * SubLink, we don't have anything to merge, just assign.
 +                               */
 +                              if (!sc_context->sc_subquery_en)
 +                                      sc_context->sc_subquery_en = sublink_en;
 +                              /*
 +                               * Merge if only the accumulated SubLink ExecNodes and the
 +                               * ExecNodes for this subquery are both replicated.
 +                               */
 +                              else if (sublink_en && IsExecNodesReplicated(sublink_en) &&
 +                                                      IsExecNodesReplicated(sc_context->sc_subquery_en))
 +                              {
 +                                      sc_context->sc_subquery_en = pgxc_merge_exec_nodes(sublink_en,
 +                                                                                                                                 sc_context->sc_subquery_en);
 +                              }
 +                              else
 +                                      sc_context->sc_subquery_en = NULL;
 +
 +                              /*
 +                               * If we didn't find a cumulative ExecNodes, set shippability
 +                               * reason, so that we don't bother merging future sublinks.
 +                               */
 +                              if (!sc_context->sc_subquery_en)
 +                                      pgxc_set_shippability_reason(sc_context, SS_NO_NODES);
 +                      }
 +                      else
 +                              Assert(!sc_context->sc_subquery_en);
 +
 +                      /* Check if the type of sublink result is shippable */
 +                      pgxc_set_exprtype_shippability(exprType(node), sc_context);
 +
 +                      /* Wipe out subselect as explained above and walk the copied tree */
 +                      sublink.subselect = NULL;
 +                      return expression_tree_walker((Node *)&sublink, pgxc_shippability_walker,
 +                                                                                      sc_context);
 +              }
 +              break;
 +
 +              case T_SubPlan:
 +              case T_AlternativeSubPlan:
 +              case T_CommonTableExpr:
 +              case T_SetOperationStmt:
 +              case T_PlaceHolderVar:
 +              case T_AppendRelInfo:
 +              case T_PlaceHolderInfo:
 +              case T_OnConflictExpr:
 +              case T_WithCheckOption:
 +              {
 +                      /* PGXCTODO: till we exhaust this list */
 +                      pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR);
 +                      /*
 +                       * These expressions are not supported for shippability entirely, so
 +                       * there is no need to walk trees underneath those. If we do so, we
 +                       * might walk the trees with wrong context there.
 +                       */
 +                      return false;
 +              }
 +              break;
 +
 +              default:
 +                      elog(ERROR, "unrecognized node type: %d",
 +                               (int) nodeTag(node));
 +                      break;
 +      }
 +
 +      return expression_tree_walker(node, pgxc_shippability_walker, (void *)sc_context);
 +}
 +
 +
 +/*
 + * pgxc_query_needs_coord
 + * Check if the query needs Coordinator for evaluation or it can be completely
 + * evaluated on Coordinator. Return true if so, otherwise return false.
 + */
 +static bool
 +pgxc_query_needs_coord(Query *query)
 +{
 +      /*
 +       * If the query involves just the catalog tables, and is not an EXEC DIRECT
 +       * statement, it can be evaluated completely on the Coordinator. No need to
 +       * involve Datanodes.
 +       */
 +      if (pgxc_query_contains_only_pg_catalog(query->rtable))
 +              return true;
 +
 +      return false;
 +}
 +
 +
 +/*
 + * pgxc_is_var_distrib_column
 + * Check if given var is a distribution key.
 + */
 +static
 +bool pgxc_is_var_distrib_column(Var *var, List *rtable)
 +{
 +      RangeTblEntry   *rte = rt_fetch(var->varno, rtable);
 +      RelationLocInfo *rel_loc_info;
 +
 +      /* distribution column only applies to the relations */
 +      if (rte->rtekind != RTE_RELATION ||
 +              rte->relkind != RELKIND_RELATION)
 +              return false;
 +      rel_loc_info = GetRelationLocInfo(rte->relid);
 +      if (!rel_loc_info)
 +              return false;
 +      if (var->varattno == rel_loc_info->partAttrNum)
 +              return true;
 +      return false;
 +}
 +
 +
 +/*
 + * Returns whether or not the rtable (and its subqueries)
 + * only contain pg_catalog entries.
 + */
 +static bool
 +pgxc_query_contains_only_pg_catalog(List *rtable)
 +{
 +      ListCell *item;
 +
 +      /* May be complicated. Before giving up, just check for pg_catalog usage */
 +      foreach(item, rtable)
 +      {
 +              RangeTblEntry *rte = (RangeTblEntry *) lfirst(item);
 +
 +              if (rte->rtekind == RTE_RELATION)
 +              {
 +                      if (get_rel_namespace(rte->relid) != PG_CATALOG_NAMESPACE)
 +                              return false;
 +              }
 +              else if (rte->rtekind == RTE_SUBQUERY &&
 +                               !pgxc_query_contains_only_pg_catalog(rte->subquery->rtable))
 +                      return false;
 +      }
 +      return true;
 +}
 +
 +
/*
 * pgxc_is_query_shippable
 * This function calls the query walker to analyse the query to gather
 * information like constraints under which the query can be shippable
 * (the SS_* shippability reasons), nodes on which the query is going to
 * be executed etc.
 * Based on the information gathered, it decides whether the query can be
 * executed on Datanodes directly without involving Coordinator.
 * If the query is shippable this routine also returns the nodes where the query
 * should be shipped. If the query is not shippable, it returns NULL.
 *
 * query_level is the subquery nesting depth (0 for the top-level query).
 */
ExecNodes *
pgxc_is_query_shippable(Query *query, int query_level)
{
	Shippability_context sc_context;
	ExecNodes	*exec_nodes;
	bool		canShip = true;		/* assume shippable until a reason says otherwise */
	Bitmapset	*shippability;		/* working copy of the reasons gathered */

	memset(&sc_context, 0, sizeof(sc_context));
	/* let's assume that by default query is shippable */
	sc_context.sc_query = query;
	sc_context.sc_query_level = query_level;
	sc_context.sc_for_expr = false;		/* checking a whole query, not a stand-alone expression */

	/*
	 * We might have already decided not to ship the query to the Datanodes, but
	 * still walk it anyway to find out if there are any subqueries which can be
	 * shipped.
	 */
	pgxc_shippability_walker((Node *)query, &sc_context);

	exec_nodes = sc_context.sc_exec_nodes;
	/*
	 * The shippability context contains two ExecNodes: one for the SubLinks
	 * involved in the Query and the other for the relations involved in the
	 * FROM clause. They are computed at different times while scanning the
	 * query. Merge both of them if they are both replicated. If the query
	 * doesn't have SubLinks, we don't need to consider the corresponding
	 * ExecNodes.
	 * PGXC_FQS_TODO:
	 * Merge the subquery ExecNodes if both of them are replicated.
	 * The logic to merge node lists with other distribution
	 * strategy is not clear yet.
	 */
	if (query->hasSubLinks)
	{
		if (exec_nodes && IsExecNodesReplicated(exec_nodes) &&
			sc_context.sc_subquery_en &&
			IsExecNodesReplicated(sc_context.sc_subquery_en))
			exec_nodes = pgxc_merge_exec_nodes(exec_nodes,
											   sc_context.sc_subquery_en);
		else
			exec_nodes = NULL;
	}

	/*
	 * Look at the information gathered by the walker in Shippability_context
	 * and that in the Query structure to decide whether we should ship this
	 * query directly to the Datanode or not.
	 */

	/*
	 * If the planner was not able to find the Datanodes to execute the
	 * query on, the query is not completely shippable. So, return NULL.
	 */
	if (!exec_nodes)
		return NULL;

	/* Copy the shippability reasons. We modify the copy for easier handling.
	 * The original can be saved away */
	shippability = bms_copy(sc_context.sc_shippability);

	/*
	 * If the query has an expression which renders the shippability to single
	 * node, and query needs to be shipped to more than one node, it can not be
	 * shipped
	 */
	if (bms_is_member(SS_NEED_SINGLENODE, shippability))
	{
		/*
		 * If nodeList has no nodes, the ExecNodes will have other means to
		 * know the nodes where to execute, like a distribution column
		 * expression. We can't tell how many nodes the query will be executed
		 * on, hence treat that as multiple nodes.
		 */
		if (list_length(exec_nodes->nodeList) != 1)
			canShip = false;

		/* We handled the reason here, reset it */
		shippability = bms_del_member(shippability, SS_NEED_SINGLENODE);
	}

	/*
	 * If HAS_AGG_EXPR is set but NEED_SINGLENODE is not set, it means the
	 * aggregates are entirely shippable, so don't worry about it.
	 */
	shippability = bms_del_member(shippability, SS_HAS_AGG_EXPR);

	/* Can not ship the query for some reason */
	if (!bms_is_empty(shippability))
		canShip = false;

	/* Always keep this at the end before checking canShip and return */
	if (!canShip && exec_nodes)
		FreeExecNodes(&exec_nodes);
	/* If query is to be shipped, we should know where to execute the query */
	Assert (!canShip || exec_nodes);

	bms_free(shippability);
	shippability = NULL;

	return exec_nodes;
}
 +
 +
 +/*
 + * pgxc_is_expr_shippable
 + * Check whether the given expression can be shipped to datanodes.
 + *
 + * Note on has_aggs
 + * The aggregate expressions are not shippable if they can not be completely
 + * evaluated on a single datanode. But this function does not have enough
 + * context to determine the set of datanodes where the expression will be
 + * evaluated. Hence, the caller of this function can handle aggregate
 + * expressions, it passes a non-NULL value for has_aggs. This function returns
 + * whether the expression has any aggregates or not through this argument. If a
 + * caller passes NULL value for has_aggs, this function assumes that the caller
 + * can not handle the aggregates and deems the expression has unshippable.
 + */
 +bool
 +pgxc_is_expr_shippable(Expr *node, bool *has_aggs)
 +{
 +      Shippability_context sc_context;
 +
 +      /* Create the FQS context */
 +      memset(&sc_context, 0, sizeof(sc_context));
 +      sc_context.sc_query = NULL;
 +      sc_context.sc_query_level = 0;
 +      sc_context.sc_for_expr = true;
 +
 +      /* Walk the expression to check its shippability */
 +      pgxc_shippability_walker((Node *)node, &sc_context);
 +
 +      /*
 +       * If caller is interested in knowing, whether the expression has aggregates
 +       * let the caller know about it. The caller is capable of handling such
 +       * expressions. Otherwise assume such an expression as not shippable.
 +       */
 +      if (has_aggs)
 +              *has_aggs = pgxc_test_shippability_reason(&sc_context, SS_HAS_AGG_EXPR);
 +      else if (pgxc_test_shippability_reason(&sc_context, SS_HAS_AGG_EXPR))
 +              return false;
 +      /* Done with aggregate expression shippability. Delete the status */
 +      pgxc_reset_shippability_reason(&sc_context, SS_HAS_AGG_EXPR);
 +
 +      /* If there are reasons why the expression is unshippable, return false */
 +      if (!bms_is_empty(sc_context.sc_shippability))
 +              return false;
 +
 +      /* If nothing wrong found, the expression is shippable */
 +      return true;
 +}
 +
 +
 +/*
 + * pgxc_is_func_shippable
 + * Determine if a function is shippable
 + */
 +bool
 +pgxc_is_func_shippable(Oid funcid)
 +{
 +      /*
 +       * For the time being a function is thought as shippable
 +       * only if it is immutable.
 +       */
 +      return func_volatile(funcid) == PROVOLATILE_IMMUTABLE;
 +}
 +
 +
 +/*
 + * pgxc_find_dist_equijoin_qual
 + * Check equijoin conditions on given relations
 + */
 +Expr *
 +pgxc_find_dist_equijoin_qual(Relids varnos_1,
 +              Relids varnos_2, Oid distcol_type, Node *quals, List *rtable)
 +{
 +      List            *lquals;
 +      ListCell        *qcell;
 +
 +      /* If no quals, no equijoin */
 +      if (!quals)
 +              return false;
 +      /*
 +       * Make a copy of the argument bitmaps, it will be modified by
 +       * bms_first_member().
 +       */
 +      varnos_1 = bms_copy(varnos_1);
 +      varnos_2 = bms_copy(varnos_2);
 +
 +      if (!IsA(quals, List))
 +              lquals = make_ands_implicit((Expr *)quals);
 +      else
 +              lquals = (List *)quals;
 +
 +      foreach(qcell, lquals)
 +      {
 +              Expr *qual_expr = (Expr *)lfirst(qcell);
 +              OpExpr *op;
 +              Var *lvar;
 +              Var *rvar;
 +
 +              if (!IsA(qual_expr, OpExpr))
 +                      continue;
 +              op = (OpExpr *)qual_expr;
 +              /* If not a binary operator, it can not be '='. */
 +              if (list_length(op->args) != 2)
 +                      continue;
 +
 +              /*
 +               * Check if both operands are Vars, if not check next expression */
 +              if (IsA(linitial(op->args), Var) && IsA(lsecond(op->args), Var))
 +              {
 +                      lvar = (Var *)linitial(op->args);
 +                      rvar = (Var *)lsecond(op->args);
 +              }
 +              else
 +                      continue;
 +
 +              /*
 +               * If the data types of both the columns are not same, continue. Hash
 +               * and Modulo of a the same bytes will be same if the data types are
 +               * same. So, only when the data types of the columns are same, we can
 +               * ship a distributed JOIN to the Datanodes
 +               */
 +              if (exprType((Node *)lvar) != exprType((Node *)rvar))
 +                      continue;
 +
 +              /* if the vars do not correspond to the required varnos, continue. */
 +              if ((bms_is_member(lvar->varno, varnos_1) && bms_is_member(rvar->varno, varnos_2)) ||
 +                      (bms_is_member(lvar->varno, varnos_2) && bms_is_member(rvar->varno, varnos_1)))
 +              {
 +                      if (!pgxc_is_var_distrib_column(lvar, rtable) ||
 +                              !pgxc_is_var_distrib_column(rvar, rtable))
 +                              continue;
 +              }
 +              else
 +                      continue;
 +              /*
 +               * If the operator is not an assignment operator, check next
 +               * constraint. An operator is an assignment operator if it's
 +               * mergejoinable or hashjoinable. Beware that not every assignment
 +               * operator is mergejoinable or hashjoinable, so we might leave some
 +               * oportunity. But then we have to rely on the opname which may not
 +               * be something we know to be equality operator as well.
 +               */
 +              if (!op_mergejoinable(op->opno, exprType((Node *)lvar)) &&
 +                      !op_hashjoinable(op->opno, exprType((Node *)lvar)))
 +                      continue;
 +              /* Found equi-join condition on distribution columns */
 +              return qual_expr;
 +      }
 +      return NULL;
 +}
 +
 +
 +/*
 + * pgxc_merge_exec_nodes
 + * The routine combines the two exec_nodes passed such that the resultant
 + * exec_node corresponds to the JOIN of respective relations.
 + * If both exec_nodes can not be merged, it returns NULL.
 + */
 +ExecNodes *
 +pgxc_merge_exec_nodes(ExecNodes *en1, ExecNodes *en2)
 +{
 +      ExecNodes       *merged_en = makeNode(ExecNodes);
 +      ExecNodes       *tmp_en;
 +
 +      /* If either of exec_nodes are NULL, return the copy of other one */
 +      if (!en1)
 +      {
 +              tmp_en = copyObject(en2);
 +              return tmp_en;
 +      }
 +      if (!en2)
 +      {
 +              tmp_en = copyObject(en1);
 +              return tmp_en;
 +      }
 +
 +      /* Following cases are not handled in this routine */
 +      /* PGXC_FQS_TODO how should we handle table usage type? */
 +      if (en1->primarynodelist || en2->primarynodelist ||
 +              en1->en_expr || en2->en_expr ||
 +              OidIsValid(en1->en_relid) || OidIsValid(en2->en_relid) ||
 +              en1->accesstype != RELATION_ACCESS_READ || en2->accesstype != RELATION_ACCESS_READ)
 +              return NULL;
 +
 +      if (IsExecNodesReplicated(en1) &&
 +              IsExecNodesReplicated(en2))
 +      {
 +              /*
 +               * Replicated/replicated join case
 +               * Check that replicated relation is not disjoint
 +               * with initial relation which is also replicated.
 +               * If there is a common portion of the node list between
 +               * the two relations, other rtables have to be checked on
 +               * this restricted list.
 +               */
 +              merged_en->nodeList = list_intersection_int(en1->nodeList,
 +                                                                                                      en2->nodeList);
 +              merged_en->baselocatortype = LOCATOR_TYPE_REPLICATED;
 +              if (!merged_en->nodeList)
 +                      FreeExecNodes(&merged_en);
 +              return merged_en;
 +      }
 +
 +      if (IsExecNodesReplicated(en1) &&
 +              IsExecNodesColumnDistributed(en2))
 +      {
 +              List    *diff_nodelist = NULL;
 +              /*
 +               * Replicated/distributed join case.
 +               * Node list of distributed table has to be included
 +               * in node list of replicated table.
 +               */
 +              diff_nodelist = list_difference_int(en2->nodeList, en1->nodeList);
 +              /*
 +               * If the difference list is not empty, this means that node list of
 +               * distributed table is not completely mapped by node list of replicated
 +               * table, so go through standard planner.
 +               */
 +              if (diff_nodelist)
 +                      FreeExecNodes(&merged_en);
 +              else
 +              {
 +                      merged_en->nodeList = list_copy(en2->nodeList);
 +                      merged_en->baselocatortype = LOCATOR_TYPE_DISTRIBUTED;
 +              }
 +              return merged_en;
 +      }
 +
 +      if (IsExecNodesColumnDistributed(en1) &&
 +              IsExecNodesReplicated(en2))
 +      {
 +              List *diff_nodelist = NULL;
 +              /*
 +               * Distributed/replicated join case.
 +               * Node list of distributed table has to be included
 +               * in node list of replicated table.
 +               */
 +              diff_nodelist = list_difference_int(en1->nodeList, en2->nodeList);
 +
 +              /*
 +               * If the difference list is not empty, this means that node list of
 +               * distributed table is not completely mapped by node list of replicated
 +                       * table, so go through standard planner.
 +               */
 +              if (diff_nodelist)
 +                      FreeExecNodes(&merged_en);
 +              else
 +              {
 +                      merged_en->nodeList = list_copy(en1->nodeList);
 +                      merged_en->baselocatortype = LOCATOR_TYPE_DISTRIBUTED;
 +              }
 +              return merged_en;
 +      }
 +
 +      if (IsExecNodesColumnDistributed(en1) &&
 +              IsExecNodesColumnDistributed(en2))
 +      {
 +              /*
 +               * Distributed/distributed case
 +               * If the caller has suggested that this is an equi-join between two
 +               * distributed results, check that they have the same nodes in the distribution
 +               * node list. The caller is expected to fully decide whether to merge
 +               * the nodes or not.
 +               */
 +              if (!list_difference_int(en1->nodeList, en2->nodeList) &&
 +                      !list_difference_int(en2->nodeList, en1->nodeList))
 +              {
 +                      merged_en->nodeList = list_copy(en1->nodeList);
 +                      if (en1->baselocatortype == en2->baselocatortype)
 +                              merged_en->baselocatortype = en1->baselocatortype;
 +                      else
 +                              merged_en->baselocatortype = LOCATOR_TYPE_DISTRIBUTED;
 +              }
 +              else
 +                      FreeExecNodes(&merged_en);
 +              return merged_en;
 +      }
 +
 +      ereport(ERROR,
 +                      (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 +                       errmsg("Postgres-XC does not support this distribution type yet"),
 +                       errdetail("The feature is not currently supported")));
 +
 +      /* Keep compiler happy */
 +      return NULL;
 +}
 +
 +
 +/*
 + * pgxc_check_index_shippability
 + * Check shippability of index described by given conditions. This generic
 + * function can be called even if the index is not yet defined.
 + */
 +bool
 +pgxc_check_index_shippability(RelationLocInfo *relLocInfo,
 +                                                        bool is_primary,
 +                                                        bool is_unique,
 +                                                        bool is_exclusion,
 +                                                        List *indexAttrs,
 +                                                        List *indexExprs)
 +{
 +      bool            result = true;
 +      ListCell   *lc;
 +
 +      /*
 +       * Leave if no locator information, in this case shippability has no
 +       * meaning.
 +       */
 +      if (!relLocInfo)
 +              return result;
 +
 +      /*
 +       * Scan the expressions used in index and check the shippability of each
 +       * of them. If only one is not-shippable, the index is considered as non
 +       * shippable. It is important to check the shippability of the expressions
 +       * before refining scan on the index columns and distribution type of
 +       * parent relation.
 +       */
 +      foreach(lc, indexExprs)
 +      {
 +              if (!pgxc_is_expr_shippable((Expr *) lfirst(lc), NULL))
 +              {
 +                      /* One of the expressions is not shippable, so leave */
 +                      result = false;
 +                      goto finish;
 +              }
 +      }
 +
 +      /*
 +       * Check if relation is distributed on a single node, in this case
 +       * the constraint can be shipped in all the cases.
 +       */
 +      if (list_length(relLocInfo->nodeList) == 1)
 +              return result;
 +
 +      /*
 +       * Check the case of EXCLUSION index.
 +       * EXCLUSION constraints are shippable only for replicated relations as
 +       * such constraints need that one tuple is checked on all the others, and
 +       * if this tuple is correctly excluded of the others, the constraint is
 +       * verified.
 +       */
 +      if (is_exclusion)
 +      {
 +              if (!IsRelationReplicated(relLocInfo))
 +              {
 +                      result = false;
 +                      goto finish;
 +              }
 +      }
 +
 +      /*
 +       * Check the case of PRIMARY KEY INDEX and UNIQUE index.
 +       * Those constraints are shippable if the parent relation is replicated
 +       * or if the column
 +       */
 +      if (is_unique ||
 +              is_primary)
 +      {
 +              /*
 +               * Perform different checks depending on distribution type of parent
 +               * relation.
 +               */
 +              switch(relLocInfo->locatorType)
 +              {
 +                      case LOCATOR_TYPE_REPLICATED:
 +                              /* In the replicated case this index is shippable */
 +                              result = true;
 +                              break;
 +
 +                      case LOCATOR_TYPE_RROBIN:
 +                              /*
 +                               * Index on roundrobin parent table cannot be safely shipped
 +                               * because of the random behavior of data balancing.
 +                               */
 +                              result = false;
 +                              break;
 +
 +                      case LOCATOR_TYPE_HASH:
 +                      case LOCATOR_TYPE_MODULO:
 +                              /*
 +                               * Unique indexes on Hash and Modulo tables are shippable if the
 +                               * index expression contains all the distribution expressions of
 +                               * its parent relation.
 +                               *
 +                               * Here is a short example with concatenate that cannot be
 +                               * shipped:
 +                               * CREATE TABLE aa (a text, b text) DISTRIBUTE BY HASH(a);
 +                               * CREATE UNIQUE INDEX aap ON aa((a || b));
 +                               * INSERT INTO aa VALUES ('a', 'abb');
 +                               * INSERT INTO aa VALUES ('aab', b); -- no error ??!
 +                               * The output uniqueness is not guaranteed as both INSERT will
 +                               * go to different nodes. For such simple reasons unique
 +                               * indexes on distributed tables are not shippable.
 +                               * Shippability is not even ensured if all the expressions
 +                               * used as Var are only distributed columns as the hash output of
 +                               * their value combination does not ensure that query will
 +                               * be directed to the correct remote node. Uniqueness is not even
 +                               * protected if the index expression contains only the distribution
 +                               * column like for that with a cluster of 2 Datanodes:
 +                               * CREATE TABLE aa (a int) DISTRIBUTE BY HASH(a);
 +                               * CREATE UNIQUE INDEX aap ON (abs(a));
 +                               * INSERT INTO aa (2); -- to Datanode 1
 +                               * INSERT INTO aa (-2); -- to Datanode 2, breaks uniqueness
 +                               *
 +                               * PGXCTODO: for the time being distribution key can only be
 +                               * defined on a single column, so this will need to be changed
 +                               * onde a relation distribution will be able to be defined based
 +                               * on an expression of multiple columns.
 +                               */
 +
 +                              /* Index contains expressions, it cannot be shipped safely */
 +                              if (indexExprs != NIL)
 +                              {
 +                                      result = false;
 +                                      break;
 +                              }
 +
 +                              /* Nothing to do if no attributes */
 +                              if (indexAttrs == NIL)
 +                                      break;
 +
 +                              /*
 +                               * Check that distribution column is included in the list of
 +                               * index columns.
 +                               */
 +                              if (!list_member_int(indexAttrs, relLocInfo->partAttrNum))
 +                              {
 +                                      /*
 +                                       * Distribution column is not in index column list
 +                                       * So index can be enforced remotely.
 +                                       */
 +                                      result = false;
 +                                      break;
 +                              }
 +
 +                              /*
 +                               * by being here we are now sure that the index can be enforced
 +                               * remotely as the distribution column is included in index.
 +                               */
 +                              break;
 +
 +                      /* Those types are not supported yet */
 +                      case LOCATOR_TYPE_RANGE:
 +                      case LOCATOR_TYPE_NONE:
 +                      case LOCATOR_TYPE_DISTRIBUTED:
 +                      case LOCATOR_TYPE_CUSTOM:
 +                      default:
 +                              /* Should not come here */
 +                              Assert(0);
 +              }
 +      }
 +
 +finish:
 +      return result;
 +}
 +
 +
 +/*
 + * pgxc_check_fk_shippabilily
 + * Check the shippability of a parent and a child relation based on the
 + * distribution of each and the columns that are used to reference to
 + * parent and child relation. This can be used for inheritance or foreign
 + * key shippability evaluation.
 + */
 +bool
 +pgxc_check_fk_shippability(RelationLocInfo *parentLocInfo,
 +                                                 RelationLocInfo *childLocInfo,
 +                                                 List *parentRefs,
 +                                                 List *childRefs)
 +{
 +      bool result = true;
 +
 +      Assert(list_length(parentRefs) == list_length(childRefs));
 +
 +      /*
 +       * If either child or parent have no relation data, shippability makes
 +       * no sense.
 +       */
 +      if (!parentLocInfo || !childLocInfo)
 +              return result;
 +
 +      /* In the case of a child referencing to itself, constraint is shippable */
 +      if (IsLocatorInfoEqual(parentLocInfo, childLocInfo))
 +              return result;
 +
 +      /* Now begin the evaluation */
 +      switch (parentLocInfo->locatorType)
 +      {
 +              case LOCATOR_TYPE_REPLICATED:
 +                      /*
 +                       * If the parent relation is replicated, the child relation can
 +                       * always refer to it on all the nodes.
 +                       */
 +                      result = true;
 +                      break;
 +
 +              case LOCATOR_TYPE_RROBIN:
 +                      /*
 +                       * If the parent relation is based on roundrobin, the child
 +                       * relation cannot be enforced on remote nodes before of the
 +                       * random behavior of data balancing.
 +                       */
 +                      result = false;
 +                      break;
 +
 +              case LOCATOR_TYPE_HASH:
 +              case LOCATOR_TYPE_MODULO:
 +                      /*
 +                       * If parent table is distributed, the child table can reference
 +                       * to its parent safely if the following conditions are satisfied:
 +                       * - parent and child are both hash-based, or both modulo-based
 +                       * - parent reference columns contain the distribution column
 +                       *   of the parent relation
 +                       * - child reference columns contain the distribution column
 +                       *   of the child relation
 +                       * - both child and parent map the same nodes for data location
 +                       */
 +
 +                      /* A replicated child cannot refer to a distributed parent */
 +                      if (IsRelationReplicated(childLocInfo))
 +                      {
 +                              result = false;
 +                              break;
 +                      }
 +
 +                      /*
 +                       * Parent and child need to have the same distribution type:
 +                       * hash or modulo.
 +                       */
 +                      if (parentLocInfo->locatorType != childLocInfo->locatorType)
 +                      {
 +                              result = false;
 +                              break;
 +                      }
 +
 +                      /*
 +                       * Parent and child need to have their data located exactly
 +                       * on the same list of nodes.
 +                       */
 +                      if (list_difference_int(childLocInfo->nodeList, parentLocInfo->nodeList) ||
 +                              list_difference_int(parentLocInfo->nodeList, childLocInfo->nodeList))
 +                      {
 +                              result = false;
 +                              break;
 +                      }
 +
 +                      /*
 +                       * Check that child and parents are referenced using their
 +                       * distribution column.
 +                       */
 +                      if (!list_member_int(childRefs, childLocInfo->partAttrNum) ||
 +                              !list_member_int(parentRefs, parentLocInfo->partAttrNum))
 +                      {
 +                              result = false;
 +                              break;
 +                      }
 +
 +                      /* By being here, parent-child constraint can be shipped correctly */
 +                      break;
 +
 +              case LOCATOR_TYPE_RANGE:
 +              case LOCATOR_TYPE_NONE:
 +              case LOCATOR_TYPE_DISTRIBUTED:
 +              case LOCATOR_TYPE_CUSTOM:
 +              default:
 +                      /* Should not come here */
 +                      Assert(0);
 +      }
 +
 +      return result;
 +}
 +
 +/*
 + * pgxc_is_join_reducible
 + * The shippability of JOIN is decided in following steps
 + * 1. Are the JOIN conditions shippable?
 + *    For INNER JOIN it's possible to apply some of the conditions at the
 + *    Datanodes and others at coordinator. But for other JOINs, JOIN conditions
 + *    decide which tuples on the OUTER side are appended with NULL columns from
 + *    INNER side, we need all the join conditions to be shippable for the join to
 + *    be shippable.
 + * 2. Do the JOIN conditions have quals that will make it shippable?
 + *    When both sides of JOIN are replicated, irrespective of the quals the JOIN
 + *    is shippable.
 + *    INNER joins between replicated and distributed relation are shippable
 + *    irrespective of the quals. OUTER join between replicated and distributed
 + *    relation is shippable if distributed relation is the outer relation.
 + *    All joins between hash/modulo distributed relations are shippable if they
 + *    have equi-join on the distributed column, such that distribution columns
 + *    have same datatype and same distribution strategy.
 + * 3. Are datanodes where the joining relations exist, compatible?
 + *    Joins between replicated relations are shippable if both relations share a
 + *    datanode. Joins between distributed relations are shippable if both
 + *    relations are distributed on same set of Datanodes. Join between replicated
 + *    and distributed relations is shippable is replicated relation is replicated
 + *    on all nodes where distributed relation is distributed.
 + *
 + * The first step is to be applied by the caller of this function.
 + */
 +ExecNodes *
 +pgxc_is_join_shippable(ExecNodes *inner_en, ExecNodes *outer_en, Relids in_relids,
 +                                              Relids out_relids, JoinType jointype, List *join_quals,
 +                                              List *rtables)
 +{
 +      bool    merge_nodes = false;
 +
 +      /*
 +       * If either of inner_en or outer_en is NULL, return NULL. We can't ship the
 +       * join when either of the sides do not have datanodes to ship to.
 +       */
 +      if (!outer_en || !inner_en)
 +              return NULL;
 +      /*
 +       * We only support reduction of INNER, LEFT [OUTER] and FULL [OUTER] joins.
 +       * RIGHT [OUTER] join is converted to LEFT [OUTER] join during join tree
 +       * deconstruction.
 +       */
 +      if (jointype != JOIN_INNER && jointype != JOIN_LEFT && jointype != JOIN_FULL)
 +              return NULL;
 +
 +      /* If both sides are replicated or have single node each, we ship any kind of JOIN */
 +      if ((IsExecNodesReplicated(inner_en) && IsExecNodesReplicated(outer_en)) ||
 +               (list_length(inner_en->nodeList) == 1 &&
 +                      list_length(outer_en->nodeList) == 1))
 +              merge_nodes = true;
 +
 +      /* If both sides are distributed, ... */
 +      else if (IsExecNodesColumnDistributed(inner_en) &&
 +                              IsExecNodesColumnDistributed(outer_en))
 +      {
 +              /*
 +               * If two sides are distributed in the same manner by a value, with an
 +               * equi-join on the distribution column and that condition
 +               * is shippable, ship the join if node lists from both sides can be
 +               * merged.
 +               */
 +              if (inner_en->baselocatortype == outer_en->baselocatortype &&
 +                      IsExecNodesDistributedByValue(inner_en))
 +              {
 +                      Expr *equi_join_expr = pgxc_find_dist_equijoin_qual(in_relids,
 +                                                                                                      out_relids, InvalidOid,
 +                                                                                                      (Node *)join_quals, rtables);
 +                      if (equi_join_expr && pgxc_is_expr_shippable(equi_join_expr, NULL))
 +                              merge_nodes = true;
 +              }
 +      }
 +      /*
 +       * If outer side is distributed and inner side is replicated, we can ship
 +       * LEFT OUTER and INNER join.
 +       */
 +      else if (IsExecNodesColumnDistributed(outer_en) &&
 +                              IsExecNodesReplicated(inner_en) &&
 +                              (jointype == JOIN_INNER || jointype == JOIN_LEFT))
 +                      merge_nodes = true;
 +      /*
 +       * If outer side is replicated and inner side is distributed, we can ship
 +       * only for INNER join.
 +       */
 +      else if (IsExecNodesReplicated(outer_en) &&
 +                              IsExecNodesColumnDistributed(inner_en) &&
 +                              jointype == JOIN_INNER)
 +              merge_nodes = true;
 +      /*
 +       * If the ExecNodes of inner and outer nodes can be merged, the JOIN is
 +       * shippable
 +       */
 +      if (merge_nodes)
 +              return pgxc_merge_exec_nodes(inner_en, outer_en);
 +      else
 +              return NULL;
 +}
 +
 +static
 +bool pgxc_targetlist_has_distcol(Query *query)
 +{
 +      RangeTblEntry   *rte = rt_fetch(query->resultRelation, query->rtable);
 +      RelationLocInfo *rel_loc_info;
 +      ListCell   *lc;
 +      const char *distcol;
 +
 +      /* distribution column only applies to the relations */
 +      if (rte->rtekind != RTE_RELATION ||
 +              rte->relkind != RELKIND_RELATION)
 +              return false;
 +      rel_loc_info = GetRelationLocInfo(rte->relid);
 +      if (!rel_loc_info)
 +              return false;
 +
 +      distcol = GetRelationDistribColumn(rel_loc_info);
 +      if (!distcol)
 +              return false;
 +
 +      foreach(lc, query->targetList)
 +      {
 +              TargetEntry *tle = (TargetEntry *) lfirst(lc);
 +
 +              if (tle->resjunk)
 +                      continue;
 +              if (strcmp(tle->resname, distcol) == 0)
 +                      return true;
 +      }
 +      return false;
 +}
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
index d3ef43122d45468c6aa54236332b48d3c61b0da5,0000000000000000000000000000000000000000..d9c89622a1df975e7ef66b63fd65b87014a570f7
mode 100644,000000..100644
--- /dev/null
@@@ -1,6191 -1,0 +1,6192 @@@
 +/*-------------------------------------------------------------------------
 + *
 + * execRemote.c
 + *
 + *      Functions to execute commands on remote Datanodes
 + *
 + *
 + * This Source Code Form is subject to the terms of the Mozilla Public
 + * License, v. 2.0. If a copy of the MPL was not distributed with this
 + * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 + *
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
 + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
 + *
 + *
 + * IDENTIFICATION
 + *      src/backend/pgxc/pool/execRemote.c
 + *
 + *-------------------------------------------------------------------------
 + */
 +
 +#include <time.h>
 +#include "postgres.h"
 +#include "access/twophase.h"
 +#include "access/gtm.h"
 +#include "access/sysattr.h"
 +#include "access/transam.h"
 +#include "access/xact.h"
 +#include "access/relscan.h"
 +#include "catalog/pg_type.h"
 +#include "catalog/pgxc_node.h"
 +#include "commands/prepare.h"
 +#include "executor/executor.h"
 +#include "gtm/gtm_c.h"
 +#include "libpq/libpq.h"
 +#include "miscadmin.h"
 +#include "pgxc/execRemote.h"
 +#include "tcop/tcopprot.h"
 +#include "executor/nodeSubplan.h"
 +#include "nodes/nodeFuncs.h"
 +#include "pgstat.h"
 +#include "nodes/nodes.h"
 +#include "nodes/nodeFuncs.h"
 +#include "optimizer/var.h"
 +#include "pgxc/copyops.h"
 +#include "pgxc/nodemgr.h"
 +#include "pgxc/poolmgr.h"
 +#include "storage/ipc.h"
 +#include "storage/proc.h"
 +#include "utils/datum.h"
 +#include "utils/lsyscache.h"
 +#include "utils/memutils.h"
 +#include "utils/pg_rusage.h"
 +#include "utils/tuplesort.h"
 +#include "utils/snapmgr.h"
 +#include "utils/builtins.h"
 +#include "pgxc/locator.h"
 +#include "pgxc/pgxc.h"
 +#include "parser/parse_type.h"
 +#include "parser/parsetree.h"
 +#include "pgxc/xc_maintenance_mode.h"
 +
 +/* Enforce the use of two-phase commit when temporary objects are used */
 +bool EnforceTwoPhaseCommit = true;
 +/*
 + * We do not want it too long, when query is terminating abnormally we just
 + * want to read in already available data, if datanode connection will reach a
 + * consistent state after that, we will go normal clean up procedure: send down
 + * ABORT etc., if data node is not responding we will signal pooler to drop
 + * the connection.
 + * It is better to drop and recreate datanode connection then wait for several
 + * seconds while it being cleaned up when, for example, cancelling query.
 + */
 +#define END_QUERY_TIMEOUT     20
 +
 +typedef struct
 +{
 +      xact_callback function;
 +      void *fparams;
 +} abort_callback_type;
 +
 +/*
 + * Buffer size does not affect performance significantly, just do not allow
 + * connection buffer grows infinitely
 + */
 +#define COPY_BUFFER_SIZE 8192
 +#define PRIMARY_NODE_WRITEAHEAD 1024 * 1024
 +
 +/*
 + * Flag to track if a temporary object is accessed by the current transaction
 + */
 +static bool temp_object_included = false;
 +static abort_callback_type dbcleanup_info = { NULL, NULL };
 +
 +static int    pgxc_node_begin(int conn_count, PGXCNodeHandle ** connections,
 +                              GlobalTransactionId gxid, bool need_tran_block,
 +                              bool readOnly, char node_type);
 +
 +static PGXCNodeAllHandles *get_exec_connections(RemoteQueryState *planstate,
 +                                       ExecNodes *exec_nodes,
 +                                       RemoteQueryExecType exec_type,
 +                                       bool is_global_session);
 +
 +
 +static bool pgxc_start_command_on_connection(PGXCNodeHandle *connection,
 +                                      RemoteQueryState *remotestate, Snapshot snapshot);
 +
 +static char *pgxc_node_remote_prepare(char *prepareGID, bool localNode);
 +static bool pgxc_node_remote_finish(char *prepareGID, bool commit,
 +                                              char *nodestring, GlobalTransactionId gxid,
 +                                              GlobalTransactionId prepare_gxid);
 +static void pgxc_node_remote_commit(void);
 +static void pgxc_node_remote_abort(void);
 +static void pgxc_connections_cleanup(ResponseCombiner *combiner);
 +
 +static void pgxc_node_report_error(ResponseCombiner *combiner);
 +
 +#define REMOVE_CURR_CONN(combiner) \
 +      if ((combiner)->current_conn < --((combiner)->conn_count)) \
 +      { \
 +              (combiner)->connections[(combiner)->current_conn] = \
 +                              (combiner)->connections[(combiner)->conn_count]; \
 +      } \
 +      else \
 +              (combiner)->current_conn = 0
 +
 +#define MAX_STATEMENTS_PER_TRAN 10
 +
 +/* Variables to collect statistics */
 +static int    total_transactions = 0;
 +static int    total_statements = 0;
 +static int    total_autocommit = 0;
 +static int    nonautocommit_2pc = 0;
 +static int    autocommit_2pc = 0;
 +static int    current_tran_statements = 0;
 +static int *statements_per_transaction = NULL;
 +static int *nodes_per_transaction = NULL;
 +
 +/*
 + * statistics collection: count a statement
 + */
 +static void
 +stat_statement()
 +{
 +      total_statements++;
 +      current_tran_statements++;
 +}
 +
 +/*
 + * To collect statistics: count a transaction
 + */
 +static void
 +stat_transaction(int node_count)
 +{
 +      total_transactions++;
 +
 +      if (!statements_per_transaction)
 +      {
 +              statements_per_transaction = (int *) malloc((MAX_STATEMENTS_PER_TRAN + 1) * sizeof(int));
 +              memset(statements_per_transaction, 0, (MAX_STATEMENTS_PER_TRAN + 1) * sizeof(int));
 +      }
 +      if (current_tran_statements > MAX_STATEMENTS_PER_TRAN)
 +              statements_per_transaction[MAX_STATEMENTS_PER_TRAN]++;
 +      else
 +              statements_per_transaction[current_tran_statements]++;
 +      current_tran_statements = 0;
 +      if (node_count > 0 && node_count <= NumDataNodes)
 +      {
 +              if (!nodes_per_transaction)
 +              {
 +                      nodes_per_transaction = (int *) malloc(NumDataNodes * sizeof(int));
 +                      memset(nodes_per_transaction, 0, NumDataNodes * sizeof(int));
 +              }
 +              nodes_per_transaction[node_count - 1]++;
 +      }
 +}
 +
 +
 +/*
 + * Output collected statistics to the log
 + */
 +static void
 +stat_log()
 +{
 +      elog(DEBUG1, "Total Transactions: %d Total Statements: %d", total_transactions, total_statements);
 +      elog(DEBUG1, "Autocommit: %d 2PC for Autocommit: %d 2PC for non-Autocommit: %d",
 +               total_autocommit, autocommit_2pc, nonautocommit_2pc);
 +      if (total_transactions)
 +      {
 +              if (statements_per_transaction)
 +              {
 +                      int                     i;
 +
 +                      for (i = 0; i < MAX_STATEMENTS_PER_TRAN; i++)
 +                              elog(DEBUG1, "%d Statements per Transaction: %d (%d%%)",
 +                                       i, statements_per_transaction[i], statements_per_transaction[i] * 100 / total_transactions);
 +              }
 +              elog(DEBUG1, "%d+ Statements per Transaction: %d (%d%%)",
 +                       MAX_STATEMENTS_PER_TRAN, statements_per_transaction[MAX_STATEMENTS_PER_TRAN], statements_per_transaction[MAX_STATEMENTS_PER_TRAN] * 100 / total_transactions);
 +              if (nodes_per_transaction)
 +              {
 +                      int                     i;
 +
 +                      for (i = 0; i < NumDataNodes; i++)
 +                              elog(DEBUG1, "%d Nodes per Transaction: %d (%d%%)",
 +                                       i + 1, nodes_per_transaction[i], nodes_per_transaction[i] * 100 / total_transactions);
 +              }
 +      }
 +}
 +
 +
 +/*
 + * Create a structure to store parameters needed to combine responses from
 + * multiple connections as well as state information
 + */
 +void
 +InitResponseCombiner(ResponseCombiner *combiner, int node_count,
 +                                         CombineType combine_type)
 +{
 +      combiner->node_count = node_count;
 +      combiner->connections = NULL;
 +      combiner->conn_count = 0;
 +      combiner->combine_type = combine_type;
 +      combiner->command_complete_count = 0;
 +      combiner->request_type = REQUEST_TYPE_NOT_DEFINED;
 +      combiner->description_count = 0;
 +      combiner->copy_in_count = 0;
 +      combiner->copy_out_count = 0;
 +      combiner->copy_file = NULL;
 +      combiner->errorMessage = NULL;
 +      combiner->errorDetail = NULL;
 +      combiner->errorHint = NULL;
 +      combiner->tuple_desc = NULL;
 +      combiner->probing_primary = false;
 +      combiner->returning_node = InvalidOid;
 +      combiner->currentRow = NULL;
 +      combiner->rowBuffer = NIL;
 +      combiner->tapenodes = NULL;
 +      combiner->merge_sort = false;
 +      combiner->extended_query = false;
 +      combiner->tapemarks = NULL;
 +      combiner->tuplesortstate = NULL;
 +      combiner->cursor = NULL;
 +      combiner->update_cursor = NULL;
 +      combiner->cursor_count = 0;
 +      combiner->cursor_connections = NULL;
 +      combiner->remoteCopyType = REMOTE_COPY_NONE;
 +}
 +
 +
 +/*
 + * Parse out row count from the command status response and convert it to integer
 + */
 +static int
 +parse_row_count(const char *message, size_t len, uint64 *rowcount)
 +{
 +      int                     digits = 0;
 +      int                     pos;
 +
 +      *rowcount = 0;
 +      /* skip \0 string terminator */
 +      for (pos = 0; pos < len - 1; pos++)
 +      {
 +              if (message[pos] >= '0' && message[pos] <= '9')
 +              {
 +                      *rowcount = *rowcount * 10 + message[pos] - '0';
 +                      digits++;
 +              }
 +              else
 +              {
 +                      *rowcount = 0;
 +                      digits = 0;
 +              }
 +      }
 +      return digits;
 +}
 +
/*
 * Convert a RowDescription ('T') message body to a TupleDesc.
 *
 * msg_body points at the payload after the message-type byte and length;
 * len is accepted for symmetry with the other handlers but the parsing is
 * driven entirely by the attribute count and the embedded NUL terminators.
 *
 * The wire layout parsed here (per attribute, after a leading uint16
 * attribute count) is: name\0, typename\0, table OID (4), column number (2),
 * type OID (4), type length (2), type modifier (4), text/binary flag (2).
 * NOTE(review): this is an XC-extended RowDescription — the extra typename
 * string is not part of the stock libpq protocol; the type is resolved from
 * that name rather than from the raw OID, which need not match locally.
 */
static TupleDesc
create_tuple_desc(char *msg_body, size_t len)
{
	TupleDesc	result;
	int		i, nattr;
	uint16		n16;

	/* get number of attributes (network byte order) */
	memcpy(&n16, msg_body, 2);
	nattr = ntohs(n16);
	msg_body += 2;

	result = CreateTemplateTupleDesc(nattr, false);

	/* decode attributes; attribute numbers are 1-based */
	for (i = 1; i <= nattr; i++)
	{
		AttrNumber	attnum;
		char		*attname;
		char		*typname;
		Oid		oidtypeid;
		int32		typemode, typmod;

		attnum = (AttrNumber) i;

		/* attribute name (NUL-terminated, consumed in place) */
		attname = msg_body;
		msg_body += strlen(attname) + 1;

		/* type name (NUL-terminated) — used below to resolve the type */
		typname = msg_body;
		msg_body += strlen(typname) + 1;

		/* table OID, ignored */
		msg_body += 4;

		/* column no, ignored */
		msg_body += 2;

		/* data type OID, ignored (type resolved by name instead) */
		msg_body += 4;

		/* type len, ignored */
		msg_body += 2;

		/* type mod (network byte order) */
		memcpy(&typemode, msg_body, 4);
		typmod = ntohl(typemode);
		msg_body += 4;

		/* PGXCTODO text/binary flag? */
		msg_body += 2;

		/* Get the OID type and mode type from typename */
		parseTypeString(typname, &oidtypeid, NULL, false);

		TupleDescInitEntry(result, attnum, attname, oidtypeid, typmod, 0);
	}
	return result;
}
 +
 +/*
 + * Handle CopyOutCommandComplete ('c') message from a Datanode connection
 + */
 +static void
 +HandleCopyOutComplete(ResponseCombiner *combiner)
 +{
 +      if (combiner->request_type == REQUEST_TYPE_ERROR)
 +              return;
 +      if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED)
 +              combiner->request_type = REQUEST_TYPE_COPY_OUT;
 +      if (combiner->request_type != REQUEST_TYPE_COPY_OUT)
 +              /* Inconsistent responses */
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_DATA_CORRUPTED),
 +                               errmsg("Unexpected response from the Datanodes for 'c' message, current request type %d", combiner->request_type)));
 +      /* Just do nothing, close message is managed by the Coordinator */
 +      combiner->copy_out_count++;
 +}
 +
/*
 * Handle CommandComplete ('C') message from a Datanode connection.
 *
 * Accumulates the affected-row count from the command tag into the executor
 * state according to the combiner's combine_type, detects inconsistent row
 * counts across replicas, and — when response checking is enabled for the
 * connection — converts an unexpected ROLLBACK tag into a stored error.
 */
static void
HandleCommandComplete(ResponseCombiner *combiner, char *msg_body, size_t len, PGXCNodeHandle *conn)
{
	int			digits = 0;
	EState	   *estate = combiner->ss.ps.state;

	/*
	 * If we did not receive description we are having rowcount or OK response
	 */
	if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED)
		combiner->request_type = REQUEST_TYPE_COMMAND;
	/* Extract rowcount from the command tag (e.g. "DELETE 5") */
	if (combiner->combine_type != COMBINE_TYPE_NONE && estate)
	{
		uint64	rowcount;
		digits = parse_row_count(msg_body, len, &rowcount);
		if (digits > 0)
		{
			/* Replicated write, make sure they are the same */
			if (combiner->combine_type == COMBINE_TYPE_SAME)
			{
				if (combiner->command_complete_count)
				{
					/*
					 * A replicated command may succeed on one node and fail on
					 * another. The example is if a distributed table is referenced
					 * by a foreign key constraint defined on a partitioned
					 * table. If the command deletes rows from the replicated table
					 * they may be referenced on one Datanode but not on another.
					 * So, the replicated command on each Datanode either affects
					 * the proper number of rows, or returns an error. Here if the
					 * combiner got an error already, we allow that to be reported,
					 * not the scary data corruption message.
					 */
					if (combiner->errorMessage == NULL && rowcount != estate->es_processed)
						/* There is a consistency issue in the database with the replicated table */
						ereport(ERROR,
								(errcode(ERRCODE_DATA_CORRUPTED),
								 errmsg("Write to replicated table returned different results from the Datanodes")));
				}
				else
					/* first result */
					estate->es_processed = rowcount;
			}
			else
				/* distributed write: counts from all nodes add up */
				estate->es_processed += rowcount;
		}
		else
			/* tag carries no rowcount; stop combining for this command */
			combiner->combine_type = COMBINE_TYPE_NONE;
	}

	/* If response checking is enabled only then do further processing */
	if (conn->ck_resp_rollback)
	{
		if (strcmp(msg_body, "ROLLBACK") == 0)
		{
			/*
			 * Subsequent clean up routine will be checking this flag
			 * to determine nodes where to send ROLLBACK PREPARED.
			 * On current node PREPARE has failed and the two-phase record
			 * does not exist, so clean this flag as if PREPARE was not sent
			 * to that node and avoid erroneous command.
			 */
			conn->ck_resp_rollback = false;
			/*
			 * Set the error, if none, to force throwing.
			 * If there is error already, it will be thrown anyway, do not add
			 * this potentially confusing message
			 */
			if (combiner->errorMessage == NULL)
			{
				/* error text must survive in ErrorContext until reported */
				MemoryContext oldcontext = MemoryContextSwitchTo(ErrorContext);
				combiner->errorMessage =
								pstrdup("unexpected ROLLBACK from remote node");
				MemoryContextSwitchTo(oldcontext);
				/*
				 * ERRMSG_PRODUCER_ERROR
				 * Messages with this code are replaced by others, if they are
				 * received, so if node will send relevant error message that
				 * one will be replaced.
				 */
				combiner->errorCode[0] = 'X';
				combiner->errorCode[1] = 'X';
				combiner->errorCode[2] = '0';
				combiner->errorCode[3] = '1';
				combiner->errorCode[4] = '0';
			}
		}
	}
	combiner->command_complete_count++;
}
 +
 +/*
 + * Handle RowDescription ('T') message from a Datanode connection
 + */
 +static bool
 +HandleRowDescription(ResponseCombiner *combiner, char *msg_body, size_t len)
 +{
 +      if (combiner->request_type == REQUEST_TYPE_ERROR)
 +              return false;
 +      if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED)
 +              combiner->request_type = REQUEST_TYPE_QUERY;
 +      if (combiner->request_type != REQUEST_TYPE_QUERY)
 +      {
 +              /* Inconsistent responses */
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_DATA_CORRUPTED),
 +                               errmsg("Unexpected response from the Datanodes for 'T' message, current request type %d", combiner->request_type)));
 +      }
 +      /* Increment counter and check if it was first */
 +      if (combiner->description_count++ == 0)
 +      {
 +              combiner->tuple_desc = create_tuple_desc(msg_body, len);
 +              return true;
 +      }
 +      return false;
 +}
 +
 +
 +/*
 + * Handle CopyInResponse ('G') message from a Datanode connection
 + */
 +static void
 +HandleCopyIn(ResponseCombiner *combiner)
 +{
 +      if (combiner->request_type == REQUEST_TYPE_ERROR)
 +              return;
 +      if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED)
 +              combiner->request_type = REQUEST_TYPE_COPY_IN;
 +      if (combiner->request_type != REQUEST_TYPE_COPY_IN)
 +      {
 +              /* Inconsistent responses */
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_DATA_CORRUPTED),
 +                               errmsg("Unexpected response from the Datanodes for 'G' message, current request type %d", combiner->request_type)));
 +      }
 +      /*
 +       * The normal PG code will output an G message when it runs in the
 +       * Coordinator, so do not proxy message here, just count it.
 +       */
 +      combiner->copy_in_count++;
 +}
 +
 +/*
 + * Handle CopyOutResponse ('H') message from a Datanode connection
 + */
 +static void
 +HandleCopyOut(ResponseCombiner *combiner)
 +{
 +      if (combiner->request_type == REQUEST_TYPE_ERROR)
 +              return;
 +      if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED)
 +              combiner->request_type = REQUEST_TYPE_COPY_OUT;
 +      if (combiner->request_type != REQUEST_TYPE_COPY_OUT)
 +      {
 +              /* Inconsistent responses */
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_DATA_CORRUPTED),
 +                               errmsg("Unexpected response from the Datanodes for 'H' message, current request type %d", combiner->request_type)));
 +      }
 +      /*
 +       * The normal PG code will output an H message when it runs in the
 +       * Coordinator, so do not proxy message here, just count it.
 +       */
 +      combiner->copy_out_count++;
 +}
 +
 +/*
 + * Handle CopyOutDataRow ('d') message from a Datanode connection
 + */
 +static void
 +HandleCopyDataRow(ResponseCombiner *combiner, char *msg_body, size_t len)
 +{
 +      if (combiner->request_type == REQUEST_TYPE_ERROR)
 +              return;
 +      if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED)
 +              combiner->request_type = REQUEST_TYPE_COPY_OUT;
 +
 +      /* Inconsistent responses */
 +      if (combiner->request_type != REQUEST_TYPE_COPY_OUT)
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_DATA_CORRUPTED),
 +                               errmsg("Unexpected response from the Datanodes for 'd' message, current request type %d", combiner->request_type)));
 +
 +      /* count the row */
 +      combiner->processed++;
 +
 +      /* Output remote COPY operation to correct location */
 +      switch (combiner->remoteCopyType)
 +      {
 +              case REMOTE_COPY_FILE:
 +                      /* Write data directly to file */
 +                      fwrite(msg_body, 1, len, combiner->copy_file);
 +                      break;
 +              case REMOTE_COPY_STDOUT:
 +                      /* Send back data to client */
 +                      pq_putmessage('d', msg_body, len);
 +                      break;
 +              case REMOTE_COPY_TUPLESTORE:
 +                      /*
 +                       * Do not store trailing \n character.
 +                       * When tuplestore data are loaded to a table it automatically
 +                       * inserts line ends.
 +                       */
 +                      tuplestore_putmessage(combiner->tuplestorestate, len-1, msg_body);
 +                      break;
 +              case REMOTE_COPY_NONE:
 +              default:
 +                      Assert(0); /* Should not happen */
 +      }
 +}
 +
 +/*
 + * Handle DataRow ('D') message from a Datanode connection
 + * The function returns true if data row is accepted and successfully stored
 + * within the combiner.
 + */
 +static bool
 +HandleDataRow(ResponseCombiner *combiner, char *msg_body, size_t len, Oid node)
 +{
 +      /* We expect previous message is consumed */
 +      Assert(combiner->currentRow == NULL);
 +
 +      if (combiner->request_type == REQUEST_TYPE_ERROR)
 +              return false;
 +
 +      if (combiner->request_type != REQUEST_TYPE_QUERY)
 +      {
 +              /* Inconsistent responses */
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_DATA_CORRUPTED),
 +                               errmsg("Unexpected response from the data nodes for 'D' message, current request type %d", combiner->request_type)));
 +      }
 +
 +      /*
 +       * If we got an error already ignore incoming data rows from other nodes
 +       * Still we want to continue reading until get CommandComplete
 +       */
 +      if (combiner->errorMessage)
 +              return false;
 +
 +      /*
 +       * Replicated INSERT/UPDATE/DELETE with RETURNING: receive only tuples
 +       * from one node, skip others as duplicates
 +       */
 +      if (combiner->combine_type == COMBINE_TYPE_SAME)
 +      {
 +              /* Do not return rows when probing primary, instead return when doing
 +               * first normal node. Just save some CPU and traffic in case if
 +               * probing fails.
 +               */
 +              if (combiner->probing_primary)
 +                      return false;
 +              if (OidIsValid(combiner->returning_node))
 +              {
 +                      if (combiner->returning_node != node)
 +                              return false;
 +              }
 +              else
 +                      combiner->returning_node = node;
 +      }
 +
 +      /*
 +       * We are copying message because it points into connection buffer, and
 +       * will be overwritten on next socket read
 +       */
 +      combiner->currentRow = (RemoteDataRow) palloc(sizeof(RemoteDataRowData) + len);
 +      memcpy(combiner->currentRow->msg, msg_body, len);
 +      combiner->currentRow->msglen = len;
 +      combiner->currentRow->msgnode = node;
 +
 +      return true;
 +}
 +
 +/*
 + * Handle ErrorResponse ('E') message from a Datanode connection
 + */
 +static void
 +HandleError(ResponseCombiner *combiner, char *msg_body, size_t len, PGXCNodeHandle *conn)
 +{
 +      /* parse error message */
 +      char *code = NULL;
 +      char *message = NULL;
 +      char *detail = NULL;
 +      char *hint = NULL;
 +      int   offset = 0;
 +
 +      /*
 +       * Scan until point to terminating \0
 +       */
 +      while (offset + 1 < len)
 +      {
 +              /* pointer to the field message */
 +              char *str = msg_body + offset + 1;
 +
 +              switch (msg_body[offset])
 +              {
 +                      case 'C':       /* code */
 +                              code = str;
 +                              break;
 +                      case 'M':       /* message */
 +                              message = str;
 +                              break;
 +                      case 'D':       /* details */
 +                              detail = str;
 +                              break;
 +
 +                      case 'H':       /* hint */
 +                              hint = str;
 +                              break;
 +
 +                      /* Fields not yet in use */
 +                      case 'S':       /* severity */
 +                      case 'R':       /* routine */
 +                      case 'P':       /* position string */
 +                      case 'p':       /* position int */
 +                      case 'q':       /* int query */
 +                      case 'W':       /* where */
 +                      case 'F':       /* file */
 +                      case 'L':       /* line */
 +                      default:
 +                              break;
 +              }
 +
 +              /* code, message and \0 */
 +              offset += strlen(str) + 2;
 +      }
 +
 +      /*
 +       * We may have special handling for some errors, default handling is to
 +       * throw out error with the same message. We can not ereport immediately
 +       * because we should read from this and other connections until
 +       * ReadyForQuery is received, so we just store the error message.
 +       * If multiple connections return errors only first one is reported.
 +       *
 +       * The producer error may be hiding primary error, so if previously received
 +       * error is a producer error allow it to be overwritten.
 +       */
 +      if (combiner->errorMessage == NULL ||
 +                      MAKE_SQLSTATE(combiner->errorCode[0], combiner->errorCode[1],
 +                                                combiner->errorCode[2], combiner->errorCode[3],
 +                                                combiner->errorCode[4]) == ERRCODE_PRODUCER_ERROR)
 +      {
 +              MemoryContext oldcontext = MemoryContextSwitchTo(ErrorContext);
 +              combiner->errorMessage = pstrdup(message);
 +              /* Error Code is exactly 5 significant bytes */
 +              if (code)
 +                      memcpy(combiner->errorCode, code, 5);
 +              if (detail)
 +                      combiner->errorDetail = pstrdup(detail);
 +              if (hint)
 +                      combiner->errorHint = pstrdup(hint);
 +              MemoryContextSwitchTo(oldcontext);
 +      }
 +
 +      /*
 +       * If the PREPARE TRANSACTION command fails for whatever reason, we don't
 +       * want to send down ROLLBACK PREPARED to this node. Otherwise, it may end
 +       * up rolling back an unrelated prepared transaction with the same GID as
 +       * used by this transaction
 +       */
 +      if (conn->ck_resp_rollback)
 +              conn->ck_resp_rollback = false;
 +
 +      /*
 +       * If Datanode have sent ErrorResponse it will never send CommandComplete.
 +       * Increment the counter to prevent endless waiting for it.
 +       */
 +      combiner->command_complete_count++;
 +}
 +
 +/*
 + * HandleCmdComplete -
 + *    combine deparsed sql statements execution results
 + *
 + * Input parameters:
 + *    commandType is dml command type
 + *    combineTag is used to combine the completion result
 + *    msg_body is execution result needed to combine
 + *    len is msg_body size
 + */
 +void
 +HandleCmdComplete(CmdType commandType, CombineTag *combine,
 +                                              const char *msg_body, size_t len)
 +{
 +      int     digits = 0;
 +      uint64  originrowcount = 0;
 +      uint64  rowcount = 0;
 +      uint64  total = 0;
 +
 +      if (msg_body == NULL)
 +              return;
 +
 +      /* if there's nothing in combine, just copy the msg_body */
 +      if (strlen(combine->data) == 0)
 +      {
 +              strcpy(combine->data, msg_body);
 +              combine->cmdType = commandType;
 +              return;
 +      }
 +      else
 +      {
 +              /* commandType is conflict */
 +              if (combine->cmdType != commandType)
 +                      return;
 +
 +              /* get the processed row number from msg_body */
 +              digits = parse_row_count(msg_body, len + 1, &rowcount);
 +              elog(DEBUG1, "digits is %d\n", digits);
 +              Assert(digits >= 0);
 +
 +              /* no need to combine */
 +              if (digits == 0)
 +                      return;
 +
 +              /* combine the processed row number */
 +              parse_row_count(combine->data, strlen(combine->data) + 1, &originrowcount);
 +              elog(DEBUG1, "originrowcount is %lu, rowcount is %lu\n", originrowcount, rowcount);
 +              total = originrowcount + rowcount;
 +
 +      }
 +
 +      /* output command completion tag */
 +      switch (commandType)
 +      {
 +              case CMD_SELECT:
 +                      strcpy(combine->data, "SELECT");
 +                      break;
 +              case CMD_INSERT:
 +                      snprintf(combine->data, COMPLETION_TAG_BUFSIZE,
 +                         "INSERT %u %lu", 0, total);
 +                      break;
 +              case CMD_UPDATE:
 +                      snprintf(combine->data, COMPLETION_TAG_BUFSIZE,
 +                                       "UPDATE %lu", total);
 +                      break;
 +              case CMD_DELETE:
 +                      snprintf(combine->data, COMPLETION_TAG_BUFSIZE,
 +                                       "DELETE %lu", total);
 +                      break;
 +              default:
 +                      strcpy(combine->data, "");
 +                      break;
 +      }
 +
 +}
 +
 +/*
 + * HandleDatanodeCommandId ('M') message from a Datanode connection
 + */
 +static void
 +HandleDatanodeCommandId(ResponseCombiner *combiner, char *msg_body, size_t len)
 +{
 +      uint32          n32;
 +      CommandId       cid;
 +
 +      Assert(msg_body != NULL);
 +      Assert(len >= 2);
 +
 +      /* Get the command Id */
 +      memcpy(&n32, &msg_body[0], 4);
 +      cid = ntohl(n32);
 +
 +      /* If received command Id is higher than current one, set it to a new value */
 +      if (cid > GetReceivedCommandId())
 +              SetReceivedCommandId(cid);
 +}
 +
 +/*
 + * Record waited-for XIDs received from the remote nodes into the transaction
 + * state
 + *
 + * The message body is a packed array of TransactionIds in network byte
 + * order; its length determines how many entries we consume.  Any trailing
 + * bytes that do not form a whole TransactionId are ignored.
 + */
 +static void
 +HandleWaitXids(char *msg_body, size_t len)
 +{
 +      int xid_count;
 +      uint32          n32;
 +      int cur;
 +      int i;
 +
 +      /* Get the xid count */
 +      xid_count = len / sizeof (TransactionId);
 +
 +      cur = 0;
 +      for (i = 0; i < xid_count; i++)
 +      {
 +              Assert(cur < len);
 +              /* memcpy avoids alignment issues when reading from the raw buffer */
 +              memcpy(&n32, &msg_body[cur], sizeof (TransactionId));
 +              cur = cur + sizeof (TransactionId);
 +              /* Convert from network byte order and record the wait */
 +              TransactionRecordXidWait(ntohl(n32));
 +      }
 +}
 +
 +/*
 + * Install the global transaction id carried in the message body as the
 + * top transaction id of the current transaction.
 + */
 +static void
 +HandleGlobalTransactionId(char *msg_body, size_t len)
 +{
 +      GlobalTransactionId xid;
 +
 +      /* The body must carry exactly one GlobalTransactionId */
 +      Assert(len == sizeof (GlobalTransactionId));
 +      /*
 +       * NOTE(review): unlike HandleWaitXids/HandleDatanodeCommandId no ntohl
 +       * conversion is applied here - presumably the sender transmits the xid
 +       * in host byte order; confirm against the sending side.
 +       */
 +      memcpy(&xid, &msg_body[0], sizeof (GlobalTransactionId));
 +
 +      SetTopTransactionId(xid);
 +}
 +
 +/*
 + * Examine the specified combiner state and determine if command was completed
 + * successfully
 + */
 +static bool
 +validate_combiner(ResponseCombiner *combiner)
 +{
 +      /* There was error message while combining */
 +      if (combiner->errorMessage)
 +              return false;
 +      /* Check if state is defined */
 +      if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED)
 +              return false;
 +
 +      /* Check all nodes completed */
 +      if ((combiner->request_type == REQUEST_TYPE_COMMAND
 +              || combiner->request_type == REQUEST_TYPE_QUERY)
 +              && combiner->command_complete_count != combiner->node_count)
 +              return false;
 +
 +      /* Check count of description responses */
 +      if (combiner->request_type == REQUEST_TYPE_QUERY
 +              && combiner->description_count != combiner->node_count)
 +              return false;
 +
 +      /* Check count of copy-in responses */
 +      if (combiner->request_type == REQUEST_TYPE_COPY_IN
 +              && combiner->copy_in_count != combiner->node_count)
 +              return false;
 +
 +      /* Check count of copy-out responses */
 +      if (combiner->request_type == REQUEST_TYPE_COPY_OUT
 +              && combiner->copy_out_count != combiner->node_count)
 +              return false;
 +
 +      /* Add other checks here as needed */
 +
 +      /* All is good if we are here */
 +      return true;
 +}
 +
 +/*
 + * Close combiner and free allocated memory, if it is not needed
 + *
 + * Each member is allocated on demand, so every one is tested before being
 + * released (the guards suggest the allocator's pfree does not accept NULL).
 + * The combiner structure itself is not freed here - only its members.
 + */
 +void
 +CloseCombiner(ResponseCombiner *combiner)
 +{
 +      if (combiner->connections)
 +              pfree(combiner->connections);
 +      /* Tuple descriptors have a dedicated release function */
 +      if (combiner->tuple_desc)
 +              FreeTupleDesc(combiner->tuple_desc);
 +      if (combiner->errorMessage)
 +              pfree(combiner->errorMessage);
 +      if (combiner->errorDetail)
 +              pfree(combiner->errorDetail);
 +      if (combiner->errorHint)
 +              pfree(combiner->errorHint);
 +      if (combiner->cursor_connections)
 +              pfree(combiner->cursor_connections);
 +      if (combiner->tapenodes)
 +              pfree(combiner->tapenodes);
 +      if (combiner->tapemarks)
 +              pfree(combiner->tapemarks);
 +}
 +
 +/*
 + * Validate combiner and release storage freeing allocated memory
 + */
 +static bool
 +ValidateAndCloseCombiner(ResponseCombiner *combiner)
 +{
 +      bool            valid = validate_combiner(combiner);
 +
 +      CloseCombiner(combiner);
 +
 +      return valid;
 +}
 +
 +/*
 + * It is possible if multiple steps share the same Datanode connection, when
 + * executor is running multi-step query or client is running multiple queries
 + * using Extended Query Protocol. After returning next tuple ExecRemoteQuery
 + * function passes execution control to the executor and then it can be given
 + * to the same RemoteQuery or to different one. It is possible that before
 + * returning a tuple the function does not read all Datanode responses. In this
 + * case pending responses should be read in context of original RemoteQueryState
 + * till ReadyForQuery message and data rows should be stored (buffered) to be
 + * available when fetch from that RemoteQueryState is requested again.
 + * BufferConnection function does the job.
 + * If a RemoteQuery is going to use connection it should check connection state.
 + * DN_CONNECTION_STATE_QUERY indicates query has data to read and combiner
 + * points to the original RemoteQueryState. If combiner differs from "this" the
 + * connection should be buffered.
 + */
 +void
 +BufferConnection(PGXCNodeHandle *conn)
 +{
 +      ResponseCombiner *combiner = conn->combiner;
 +      MemoryContext oldcontext;
 +
 +      /* Nothing to buffer unless the connection is owned and mid-query */
 +      if (combiner == NULL || conn->state != DN_CONNECTION_STATE_QUERY)
 +              return;
 +
 +      elog(DEBUG2, "Buffer connection %u to step %s", conn->nodeoid, combiner->cursor);
 +
 +      /*
 +       * When BufferConnection is invoked CurrentContext is related to other
 +       * portal, which is trying to control the connection.
 +       * TODO See if we can find better context to switch to
 +       */
 +      oldcontext = MemoryContextSwitchTo(combiner->ss.ps.ps_ResultTupleSlot->tts_mcxt);
 +
 +      /* Verify the connection is in use by the combiner */
 +      combiner->current_conn = 0;
 +      while (combiner->current_conn < combiner->conn_count)
 +      {
 +              if (combiner->connections[combiner->current_conn] == conn)
 +                      break;
 +              combiner->current_conn++;
 +      }
 +      Assert(combiner->current_conn < combiner->conn_count);
 +
 +      /* Lazily allocate the per-tape bookmark array on first use */
 +      if (combiner->tapemarks == NULL)
 +              combiner->tapemarks = (ListCell**) palloc0(combiner->conn_count * sizeof(ListCell*));
 +
 +      /*
 +       * If current bookmark for the current tape is not set it means either
 +       * first row in the buffer is from the current tape or no rows from
 +       * the tape in the buffer, so if first row is not from current
 +       * connection bookmark the last cell in the list.
 +       */
 +      if (combiner->tapemarks[combiner->current_conn] == NULL &&
 +                      list_length(combiner->rowBuffer) > 0)
 +      {
 +              RemoteDataRow dataRow = (RemoteDataRow) linitial(combiner->rowBuffer);
 +              if (dataRow->msgnode != conn->nodeoid)
 +                      combiner->tapemarks[combiner->current_conn] = list_tail(combiner->rowBuffer);
 +      }
 +
 +      /*
 +       * Buffer data rows until data node return number of rows specified by the
 +       * fetch_size parameter of last Execute message (PortalSuspended message)
 +       * or end of result set is reached (CommandComplete message)
 +       */
 +      while (true)
 +      {
 +              int res;
 +
 +              /* Move to buffer currentRow (received from the data node) */
 +              if (combiner->currentRow)
 +              {
 +                      combiner->rowBuffer = lappend(combiner->rowBuffer,
 +                                                                                combiner->currentRow);
 +                      combiner->currentRow = NULL;
 +              }
 +
 +              res = handle_response(conn, combiner);
 +              /*
 +               * If response message is a DataRow it will be handled on the next
 +               * iteration.
 +               * PortalSuspended will cause connection state change and break the loop
 +               * The same is for CommandComplete, but we need additional handling -
 +               * remove connection from the list of active connections.
 +               * We may need to add handling error response
 +               */
 +
 +              /* Most often result check first */
 +              if (res == RESPONSE_DATAROW)
 +              {
 +                      /*
 +                       * The row is in the combiner->currentRow, on next iteration it will
 +                       * be moved to the buffer
 +                       */
 +                      continue;
 +              }
 +
 +              /* incomplete message, read more */
 +              if (res == RESPONSE_EOF)
 +              {
 +                      /* On receive failure mark the connection dead but keep looping;
 +                       * handle_response is expected to surface the fatal state */
 +                      if (pgxc_node_receive(1, &conn, NULL))
 +                      {
 +                              conn->state = DN_CONNECTION_STATE_ERROR_FATAL;
 +                              add_error_message(conn, "Failed to fetch from data node");
 +                      }
 +              }
 +
 +              /*
 +               * End of result set is reached, so either set the pointer to the
 +               * connection to NULL (combiner with sort) or remove it from the list
 +               * (combiner without sort)
 +               */
 +              else if (res == RESPONSE_COMPLETE)
 +              {
 +                      /*
 +                       * If combiner is doing merge sort we should set reference to the
 +                       * current connection to NULL in the array, indicating the end
 +                       * of the tape is reached. FetchTuple will try to access the buffer
 +                       * first anyway.
 +                       * Since we remove that reference we can not determine what node
 +                       * number was this connection, but we need this info to find proper
 +                       * tuple in the buffer if we are doing merge sort. So store node
 +                       * number in special array.
 +                       * NB: We can not test if combiner->tuplesortstate is set here:
 +                       * connection may require buffering inside tuplesort_begin_merge
 +                       * - while pre-read rows from the tapes, one of the tapes may be
 +                       * the local connection with RemoteSubplan in the tree. The
 +                       * combiner->tuplesortstate is set only after tuplesort_begin_merge
 +                       * returns.
 +                       */
 +                      if (combiner->merge_sort)
 +                      {
 +                              combiner->connections[combiner->current_conn] = NULL;
 +                              if (combiner->tapenodes == NULL)
 +                                      combiner->tapenodes = (Oid *)
 +                                                      palloc0(combiner->conn_count * sizeof(Oid));
 +                              combiner->tapenodes[combiner->current_conn] = conn->nodeoid;
 +                      }
 +                      else
 +                      {
 +                              /* Remove current connection, move last in-place, adjust current_conn */
 +                              if (combiner->current_conn < --combiner->conn_count)
 +                                      combiner->connections[combiner->current_conn] = combiner->connections[combiner->conn_count];
 +                              else
 +                                      combiner->current_conn = 0;
 +                      }
 +                      /*
 +                       * If combiner runs Simple Query Protocol we need to read in
 +                       * ReadyForQuery. In case of Extended Query Protocol it is not
 +                       * sent and we should quit.
 +                       */
 +                      if (combiner->extended_query)
 +                              break;
 +              }
 +              else if (res == RESPONSE_ERROR)
 +              {
 +                      if (combiner->extended_query)
 +                      {
 +                              /*
 +                               * Need to sync connection to enable receiving commands
 +                               * by the datanode
 +                               */
 +                              if (pgxc_node_send_sync(conn) != 0)
 +                              {
 +                                      ereport(ERROR,
 +                                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                                       errmsg("Failed to sync msg to node %u", conn->nodeoid)));
 +                              }
 +                      }
 +              }
 +              else if (res == RESPONSE_SUSPENDED || res == RESPONSE_READY)
 +              {
 +                      /* Now it is OK to quit */
 +                      break;
 +              }
 +      }
 +      /* The buffering loop must have driven the connection out of QUERY state */
 +      Assert(conn->state != DN_CONNECTION_STATE_QUERY);
 +      MemoryContextSwitchTo(oldcontext);
 +      /* Release ownership: the connection no longer feeds this combiner */
 +      conn->combiner = NULL;
 +}
 +
 +/*
 + * copy the datarow from combiner to the given slot, in the slot's memory
 + * context
 + */
 +static void
 +CopyDataRowTupleToSlot(ResponseCombiner *combiner, TupleTableSlot *slot)
 +{
 +      RemoteDataRow   datarow;
 +      MemoryContext   oldcontext;
 +      oldcontext = MemoryContextSwitchTo(slot->tts_mcxt);
 +      datarow = (RemoteDataRow) palloc(sizeof(RemoteDataRowData) + combiner->currentRow->msglen);
 +      datarow->msgnode = combiner->currentRow->msgnode;
 +      datarow->msglen = combiner->currentRow->msglen;
 +      memcpy(datarow->msg, combiner->currentRow->msg, datarow->msglen);
 +      ExecStoreDataRowTuple(datarow, slot, true);
 +      pfree(combiner->currentRow);
 +      combiner->currentRow = NULL;
 +      MemoryContextSwitchTo(oldcontext);
 +}
 +
 +
 +/*
 + * FetchTuple
 + *
 + * Get next tuple from one of the datanode connections.
 + * The connections should be in combiner->connections; if a "local" dummy
 + * connection is present it should be the last active connection in the array.
 + * If the combiner is set up to perform merge sort the function returns a
 + * tuple from the connection defined by combiner->current_conn, or a NULL
 + * slot if no more tuples are available from that connection. Otherwise it
 + * returns a tuple from any connection, or a NULL slot if no more connections
 + * are available.
 + * The function looks into combiner->rowBuffer before accessing a connection
 + * and returns a tuple from there if found.
 + * The function may wait while more data arrive from the data nodes. If there
 + * is a locally executed subplan the function advances it and buffers the
 + * resulting rows instead of waiting.
 + */
 +TupleTableSlot *
 +FetchTuple(ResponseCombiner *combiner)
 +{
 +      PGXCNodeHandle *conn;
 +      TupleTableSlot *slot;
 +      Oid                     nodeOid = -1;
 +
 +      /*
 +       * Case if we run local subplan.
 +       * We do not have remote connections, so just get local tuple and return it
 +       */
 +      if (outerPlanState(combiner))
 +      {
 +              RemoteSubplanState *planstate = (RemoteSubplanState *) combiner;
 +              RemoteSubplan *plan = (RemoteSubplan *) combiner->ss.ps.plan;
 +              /* Advance subplan in a loop until we have something to return */
 +              for (;;)
 +              {
 +                      Datum   value = (Datum) 0;
 +                      bool    isnull;
 +                      int     numnodes;
 +                      int             i;
 +
 +                      slot = ExecProcNode(outerPlanState(combiner));
 +                      /* If locator is not defined deliver all the results */
 +                      if (planstate->locator == NULL)
 +                              return slot;
 +
 +                      /*
 +                       * If NULL tuple is returned we done with the subplan, finish it up and
 +                       * return NULL
 +                       */
 +                      if (TupIsNull(slot))
 +                              return NULL;
 +
 +                      /* Get partitioning value if defined */
 +                      if (plan->distributionKey != InvalidAttrNumber)
 +                              value = slot_getattr(slot, plan->distributionKey, &isnull);
 +
 +                      /* Determine target nodes */
 +                      numnodes = GET_NODES(planstate->locator, value, isnull, NULL);
 +                      for (i = 0; i < numnodes; i++)
 +                      {
 +                              /* Return the tuple only if this node is among its targets */
 +                              if (planstate->dest_nodes[i] == PGXCNodeId-1)
 +                                      return slot;
 +                      }
 +              }
 +      }
 +
 +      /*
 +       * Get current connection
 +       */
 +      if (combiner->conn_count > combiner->current_conn)
 +              conn = combiner->connections[combiner->current_conn];
 +      else
 +              conn = NULL;
 +
 +      /*
 +       * If doing merge sort determine the node number.
 +       * It may be needed to get buffered row.
 +       */
 +      if (combiner->merge_sort)
 +      {
 +              Assert(conn || combiner->tapenodes);
 +              nodeOid = conn ? conn->nodeoid :
 +                                               combiner->tapenodes[combiner->current_conn];
 +              Assert(OidIsValid(nodeOid));
 +      }
 +
 +      /*
 +       * First look into the row buffer.
 +       * When we are performing merge sort we need to get from the buffer record
 +       * from the connection marked as "current". Otherwise get first.
 +       */
 +      if (list_length(combiner->rowBuffer) > 0)
 +      {
 +              RemoteDataRow dataRow;
 +
 +              Assert(combiner->currentRow == NULL);
 +
 +              if (combiner->merge_sort)
 +              {
 +                      ListCell *lc;
 +                      ListCell *prev;
 +
 +                      elog(DEBUG1, "Getting buffered tuple from node %x", nodeOid);
 +
 +                      /* Resume the scan from this tape's bookmark, if one is set */
 +                      prev = combiner->tapemarks[combiner->current_conn];
 +                      if (prev)
 +                      {
 +                              /*
 +                               * Start looking through the list from the bookmark.
 +                               * Probably the first cell we check contains row from the needed
 +                               * node. Otherwise continue scanning until we encounter one,
 +                               * advancing prev pointer as well.
 +                               */
 +                              while((lc = lnext(prev)) != NULL)
 +                              {
 +                                      dataRow = (RemoteDataRow) lfirst(lc);
 +                                      if (dataRow->msgnode == nodeOid)
 +                                      {
 +                                              combiner->currentRow = dataRow;
 +                                              break;
 +                                      }
 +                                      prev = lc;
 +                              }
 +                      }
 +                      else
 +                      {
 +                              /*
 +                               * Either needed row is the first in the buffer or no such row
 +                               */
 +                              lc = list_head(combiner->rowBuffer);
 +                              dataRow = (RemoteDataRow) lfirst(lc);
 +                              if (dataRow->msgnode == nodeOid)
 +                                      combiner->currentRow = dataRow;
 +                              else
 +                                      lc = NULL;
 +                      }
 +                      if (lc)
 +                      {
 +                              /*
 +                               * Delete cell from the buffer. Before we delete we must check
 +                               * the bookmarks, if the cell is a bookmark for any tape.
 +                               * If it is the case we are deleting last row of the current
 +                               * block from the current tape. That tape should have bookmark
 +                               * like current, and current bookmark will be advanced when we
 +                               * read the tape once again.
 +                               */
 +                              int i;
 +                              for (i = 0; i < combiner->conn_count; i++)
 +                              {
 +                                      if (combiner->tapemarks[i] == lc)
 +                                              combiner->tapemarks[i] = prev;
 +                              }
 +                              elog(DEBUG1, "Found buffered tuple from node %x", nodeOid);
 +                              combiner->rowBuffer = list_delete_cell(combiner->rowBuffer,
 +                                                                                                         lc, prev);
 +                      }
 +                      elog(DEBUG1, "Update tapemark");
 +                      combiner->tapemarks[combiner->current_conn] = prev;
 +              }
 +              else
 +              {
 +                      /* Not sorting: any buffered row will do, so take the first */
 +                      dataRow = (RemoteDataRow) linitial(combiner->rowBuffer);
 +                      combiner->currentRow = dataRow;
 +                      combiner->rowBuffer = list_delete_first(combiner->rowBuffer);
 +              }
 +      }
 +
 +      /* If we have node message in the currentRow slot, and it is from a proper
 +       * node, consume it.  */
 +      if (combiner->currentRow)
 +      {
 +              Assert(!combiner->merge_sort ||
 +                         combiner->currentRow->msgnode == nodeOid);
 +              slot = combiner->ss.ps.ps_ResultTupleSlot;
 +              CopyDataRowTupleToSlot(combiner, slot);
 +              return slot;
 +      }
 +
 +      /* No buffered row; pull messages from the current connection */
 +      while (conn)
 +      {
 +              int res;
 +
 +              /* Going to use a connection, buffer it if needed */
 +              CHECK_OWNERSHIP(conn, combiner);
 +
 +              /*
 +               * If current connection is idle it means portal on the data node is
 +               * suspended. Request more and try to get it
 +               */
 +              if (combiner->extended_query &&
 +                              conn->state == DN_CONNECTION_STATE_IDLE)
 +              {
 +                      /*
 +                       * We do not allow to suspend if querying primary node, so that
 +                       * only may mean the current node is secondary and subplan was not
 +                       * executed there yet. Return and go on with second phase.
 +                       */
 +                      if (combiner->probing_primary)
 +                      {
 +                              return NULL;
 +                      }
 +
 +                      if (pgxc_node_send_execute(conn, combiner->cursor, 1000) != 0)
 +                      {
 +                              ereport(ERROR,
 +                                              (errcode(ERRCODE_INTERNAL_ERROR),
 +                                               errmsg("Failed to send execute cursor '%s' to node %u", combiner->cursor, conn->nodeoid)));
 +                      }
 +
 +                      if (pgxc_node_send_flush(conn) != 0)
 +                      {
 +                              ereport(ERROR,
 +                                              (errcode(ERRCODE_INTERNAL_ERROR),
 +                                               errmsg("Failed flush cursor '%s' node %u", combiner->cursor, conn->nodeoid)));
 +                      }
 +
 +                      if (pgxc_node_receive(1, &conn, NULL))
 +                      {
 +                              ereport(ERROR,
 +                                              (errcode(ERRCODE_INTERNAL_ERROR),
 +                                               errmsg("Failed receive data from node %u cursor '%s'", conn->nodeoid, combiner->cursor)));
 +                      }
 +              }
 +
 +              /* read messages */
 +              res = handle_response(conn, combiner);
 +              if (res == RESPONSE_DATAROW)
 +              {
 +                      slot = combiner->ss.ps.ps_ResultTupleSlot;
 +                      CopyDataRowTupleToSlot(combiner, slot);
 +                      return slot;
 +              }
 +              else if (res == RESPONSE_EOF)
 +              {
 +                      /* incomplete message, read more */
 +                      if (pgxc_node_receive(1, &conn, NULL))
 +                              ereport(ERROR,
 +                                              (errcode(ERRCODE_INTERNAL_ERROR),
 +                                               errmsg("Failed to receive more data from data node %u", conn->nodeoid)));
 +                      continue;
 +              }
 +              else if (res == RESPONSE_SUSPENDED)
 +              {
 +                      /*
 +                       * If we are doing merge sort or probing primary node we should
 +                       * remain on the same node, so query next portion immediately.
 +                       * Otherwise leave node suspended and fetch lazily.
 +                       */
 +                      if (combiner->merge_sort || combiner->probing_primary)
 +                      {
 +                              if (pgxc_node_send_execute(conn, combiner->cursor, 1000) != 0)
 +                                      ereport(ERROR,
 +                                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                                       errmsg("Failed to send execute cursor '%s' to node %u", combiner->cursor, conn->nodeoid)));
 +                              if (pgxc_node_send_flush(conn) != 0)
 +                                      ereport(ERROR,
 +                                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                                       errmsg("Failed flush cursor '%s' node %u", combiner->cursor, conn->nodeoid)));
 +                              if (pgxc_node_receive(1, &conn, NULL))
 +                                      ereport(ERROR,
 +                                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                                       errmsg("Failed receive node from node %u cursor '%s'", conn->nodeoid, combiner->cursor)));
 +                              continue;
 +                      }
 +
 +                      /*
 +                       * Tell the node to fetch data in background, next loop when we
 +                       * pgxc_node_receive, data is already there, so we can run faster
 +                       */
 +                      if (pgxc_node_send_execute(conn, combiner->cursor, 1000) != 0)
 +                      {
 +                              ereport(ERROR,
 +                                              (errcode(ERRCODE_INTERNAL_ERROR),
 +                                               errmsg("Failed to send execute cursor '%s' to node %u", combiner->cursor, conn->nodeoid)));
 +                      }
 +
 +                      if (pgxc_node_send_flush(conn) != 0)
 +                      {
 +                              ereport(ERROR,
 +                                              (errcode(ERRCODE_INTERNAL_ERROR),
 +                                               errmsg("Failed flush cursor '%s' node %u", combiner->cursor, conn->nodeoid)));
 +                      }
 +
 +                      /* Round-robin to the next connection while this one works */
 +                      if (++combiner->current_conn >= combiner->conn_count)
 +                              combiner->current_conn = 0;
 +                      conn = combiner->connections[combiner->current_conn];
 +              }
 +              else if (res == RESPONSE_COMPLETE)
 +              {
 +                      /*
 +                       * In case of Simple Query Protocol we should receive ReadyForQuery
 +                       * before removing connection from the list. In case of Extended
 +                       * Query Protocol we may remove connection right away.
 +                       */
 +                      if (combiner->extended_query)
 +                      {
 +                              /* If we are doing merge sort clean current connection and return
 +                               * NULL, otherwise remove current connection, move last in-place,
 +                               * adjust current_conn and continue if it is not last connection */
 +                              if (combiner->merge_sort)
 +                              {
 +                                      combiner->connections[combiner->current_conn] = NULL;
 +                                      return NULL;
 +                              }
 +                              REMOVE_CURR_CONN(combiner);
 +                              if (combiner->conn_count > 0)
 +                                      conn = combiner->connections[combiner->current_conn];
 +                              else
 +                                      return NULL;
 +                      }
 +              }
 +              else if (res == RESPONSE_ERROR)
 +              {
 +                      /*
 +                       * If doing Extended Query Protocol we need to sync connection,
 +                       * otherwise subsequent commands will be ignored.
 +                       */
 +                      if (combiner->extended_query)
 +                      {
 +                              if (pgxc_node_send_sync(conn) != 0)
 +                                      ereport(ERROR,
 +                                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                                       errmsg("Failed to sync msg to node %u", conn->nodeoid)));
 +                      }
 +                      /*
 +                       * Do not wait for response from primary, it needs to wait
 +                       * for other nodes to respond. Instead go ahead and send query to
 +                       * other nodes. It will fail there, but we can continue with
 +                       * normal cleanup.
 +                       */
 +                      if (combiner->probing_primary)
 +                      {
 +                              REMOVE_CURR_CONN(combiner);
 +                              return NULL;
 +                      }
 +              }
 +              else if (res == RESPONSE_READY)
 +              {
 +                      /* If we are doing merge sort clean current connection and return
 +                       * NULL, otherwise remove current connection, move last in-place,
 +                       * adjust current_conn and continue if it is not last connection */
 +                      if (combiner->merge_sort)
 +                      {
 +                              combiner->connections[combiner->current_conn] = NULL;
 +                              return NULL;
 +                      }
 +                      REMOVE_CURR_CONN(combiner);
 +                      if (combiner->conn_count > 0)
 +                              conn = combiner->connections[combiner->current_conn];
 +                      else
 +                              return NULL;
 +              }
 +              else if (res == RESPONSE_TUPDESC)
 +              {
 +                      ExecSetSlotDescriptor(combiner->ss.ps.ps_ResultTupleSlot,
 +                                                                combiner->tuple_desc);
 +                      /* Now slot is responsible for freeing the descriptor */
 +                      combiner->tuple_desc = NULL;
 +              }
 +              else if (res == RESPONSE_ASSIGN_GXID)
 +              {
 +                      /* Do nothing. It must have been handled in handle_response() */
 +              }
 +              else
 +              {
 +                      /* Should not get here - all response codes are handled above */
 +                      Assert(false);
 +              }
 +      }
 +
 +      /* All connections exhausted and nothing buffered - end of result set */
 +      return NULL;
 +}
 +
 +
/*
 * Handle responses from the Datanode connections
 *
 * Polls the given connections and feeds every buffered message through
 * handle_response() until each connection reaches a terminal state
 * (ReadyForQuery, completed fatal-error connection, or CopyIn/CopyOut mode).
 *
 * conn_count  - number of connections to drain (assumed > 0; a zero count
 *               would make the VLA below zero-length — TODO confirm callers)
 * connections - connections to read from (the caller's array is not modified;
 *               we work on a local copy)
 * timeout     - passed through to pgxc_node_receive(); NULL means wait forever
 * combiner    - accumulates rows/status/errors from all connections
 *
 * Returns 0 once every connection is drained, EOF if receiving fails.
 */
static int
pgxc_node_receive_responses(const int conn_count, PGXCNodeHandle ** connections,
						 struct timeval * timeout, ResponseCombiner *combiner)
{
	int			count = conn_count;
	PGXCNodeHandle *to_receive[conn_count];		/* C99 VLA, local work list */

	/* make a copy of the pointers to the connections */
	memcpy(to_receive, connections, conn_count * sizeof(PGXCNodeHandle *));

	/*
	 * Read results.
	 * Note we try and read from Datanode connections even if there is an error on one,
	 * so as to avoid reading incorrect results on the next statement.
	 * Other safegaurds exist to avoid this, however.
	 */
	while (count > 0)
	{
		int i = 0;

		if (pgxc_node_receive(count, to_receive, timeout))
			return EOF;
		while (i < count)
		{
			int result =  handle_response(to_receive[i], combiner);
			switch (result)
			{
				case RESPONSE_EOF: /* have something to read, keep receiving */
					i++;
					break;
				case RESPONSE_COMPLETE:
					/*
					 * A healthy connection still owes us a ReadyForQuery; only
					 * a fatally broken one is dropped from the work list here.
					 */
					if (to_receive[i]->state != DN_CONNECTION_STATE_ERROR_FATAL)
						/* Continue read until ReadyForQuery */
						break;
					/* fallthru */
				case RESPONSE_READY:
					/* fallthru */
				case RESPONSE_COPY:
					/* Handling is done, do not track this connection */
					count--;
					/* Move last connection in place */
					if (i < count)
						to_receive[i] = to_receive[count];
					break;
				case RESPONSE_ERROR:
					/* no handling needed, just wait for ReadyForQuery */
					break;

				case RESPONSE_WAITXIDS:
					break;

				case RESPONSE_ASSIGN_GXID:
					break;

				default:
					/* Inconsistent responses */
					add_error_message(to_receive[i], "Unexpected response from the Datanodes");
					elog(ERROR, "Unexpected response from the Datanodes, result = %d, request type %d", result, combiner->request_type);
					/*
					 * NOTE(review): the statements below are unreachable —
					 * elog(ERROR) does not return. Kept for safety only.
					 */
					/* Stop tracking and move last connection in place */
					count--;
					if (i < count)
						to_receive[i] = to_receive[count];
			}
		}
	}

	return 0;
}
 +
/*
 * Read next message from the connection and update the combiner
 * and connection state accordingly.
 * If we are in an error state we just consume the messages, and do not proxy.
 * Long term, we should look into cancelling executing statements
 * and closing the connections.
 * It returns if states need to be handled.
 * Return values:
 * RESPONSE_EOF - need to receive more data for the connection
 * RESPONSE_READY - got ReadyForQuery
 * RESPONSE_COMPLETE - done with the connection, but not yet ready for query.
 * Also this result is output when the connection is in a fatal error state
 * or an unsupported message type is received.
 * RESPONSE_SUSPENDED - got PortalSuspended
 * RESPONSE_TUPDESC - got tuple description
 * RESPONSE_DATAROW - got data row
 * RESPONSE_COPY - got copy response
 * RESPONSE_BARRIER_OK - barrier command completed successfully
 * RESPONSE_ERROR - got ErrorResponse
 * RESPONSE_WAITXIDS - got a list of transaction ids to wait for
 * RESPONSE_ASSIGN_GXID - got a global transaction id assignment
 */
int
handle_response(PGXCNodeHandle *conn, ResponseCombiner *combiner)
{
	char	   *msg;
	int			msg_len;
	char		msg_type;

	for (;;)
	{
		/*
		 * If we are in the process of shutting down, we
		 * may be rolling back, and the buffer may contain other messages.
		 * We want to avoid a procarray exception
		 * as well as an error stack overflow.
		 */
		if (proc_exit_inprogress)
			conn->state = DN_CONNECTION_STATE_ERROR_FATAL;

		/*
		 * Don't read from the connection if there is a fatal error.
		 * We still return RESPONSE_COMPLETE, not RESPONSE_ERROR, since
		 * handling of RESPONSE_ERROR assumes sending SYNC message, but
		 * state DN_CONNECTION_STATE_ERROR_FATAL indicates connection is
		 * not usable.
		 */
		if (conn->state == DN_CONNECTION_STATE_ERROR_FATAL)
			return RESPONSE_COMPLETE;

		/* No data available, exit */
		if (!HAS_MESSAGE_BUFFERED(conn))
			return RESPONSE_EOF;

		/* The connection must be owned by this combiner, or by nobody */
		Assert(conn->combiner == combiner || conn->combiner == NULL);

		/* TODO handle other possible responses */
		msg_type = get_message(conn, &msg_len, &msg);
		switch (msg_type)
		{
			case '\0':			/* Not enough data in the buffer */
				return RESPONSE_EOF;
			case 'c':			/* CopyToCommandComplete */
				HandleCopyOutComplete(combiner);
				break;
			case 'C':			/* CommandComplete */
				HandleCommandComplete(combiner, msg, msg_len, conn);
				conn->combiner = NULL;
				if (conn->state == DN_CONNECTION_STATE_QUERY)
					conn->state = DN_CONNECTION_STATE_IDLE;
				return RESPONSE_COMPLETE;
			case 'T':			/* RowDescription */
#ifdef DN_CONNECTION_DEBUG
				Assert(!conn->have_row_desc);
				conn->have_row_desc = true;
#endif
				/* Only return if a new descriptor was actually installed */
				if (HandleRowDescription(combiner, msg, msg_len))
					return RESPONSE_TUPDESC;
				break;
			case 'D':			/* DataRow */
#ifdef DN_CONNECTION_DEBUG
				Assert(conn->have_row_desc);
#endif
				/* Do not return if data row has not been actually handled */
				if (HandleDataRow(combiner, msg, msg_len, conn->nodeoid))
					return RESPONSE_DATAROW;
				break;
			case 's':			/* PortalSuspended */
				/* No activity is expected on the connection until next query */
				conn->state = DN_CONNECTION_STATE_IDLE;
				conn->combiner = NULL;
				return RESPONSE_SUSPENDED;
			case '1': /* ParseComplete */
			case '2': /* BindComplete */
			case '3': /* CloseComplete */
			case 'n': /* NoData */
				/* simple notifications, continue reading */
				break;
			case 'G': /* CopyInResponse */
				conn->state = DN_CONNECTION_STATE_COPY_IN;
				HandleCopyIn(combiner);
				/* Done, return to caller to let it know the data can be passed in */
				return RESPONSE_COPY;
			case 'H': /* CopyOutResponse */
				conn->state = DN_CONNECTION_STATE_COPY_OUT;
				HandleCopyOut(combiner);
				return RESPONSE_COPY;
			case 'd': /* CopyOutDataRow */
				conn->state = DN_CONNECTION_STATE_COPY_OUT;
				HandleCopyDataRow(combiner, msg, msg_len);
				break;
			case 'E':			/* ErrorResponse */
				HandleError(combiner, msg, msg_len, conn);
				add_error_message(conn, combiner->errorMessage);
				return RESPONSE_ERROR;
			case 'A':			/* NotificationResponse */
			case 'N':			/* NoticeResponse */
			case 'S':			/* SetCommandComplete */
				/*
				 * Ignore these to prevent multiple messages, one from each
				 * node. Coordinator will send one for DDL anyway
				 */
				break;
			case 'Z':			/* ReadyForQuery */
			{
				/*
				 * Return result depends on previous connection state.
				 * If it was PORTAL_SUSPENDED Coordinator want to send down
				 * another EXECUTE to fetch more rows, otherwise it is done
				 * with the connection
				 */
				conn->transaction_status = msg[0];
				conn->state = DN_CONNECTION_STATE_IDLE;
				conn->combiner = NULL;
#ifdef DN_CONNECTION_DEBUG
				conn->have_row_desc = false;
#endif
				return RESPONSE_READY;
			}
			case 'M':			/* Command Id */
				HandleDatanodeCommandId(combiner, msg, msg_len);
				break;
			case 'b':			/* Barrier completed */
				conn->state = DN_CONNECTION_STATE_IDLE;
				return RESPONSE_BARRIER_OK;
			case 'I':			/* EmptyQuery */
				return RESPONSE_COMPLETE;
			case 'W':			/* Wait-for-XIDs list */
				HandleWaitXids(msg, msg_len);
				return RESPONSE_WAITXIDS;
			case 'x':			/* Global transaction id */
				HandleGlobalTransactionId(msg, msg_len);
				return RESPONSE_ASSIGN_GXID;
			default:
				/* sync lost? */
				elog(WARNING, "Received unsupported message type: %c", msg_type);
				conn->state = DN_CONNECTION_STATE_ERROR_FATAL;
				/* stop reading */
				return RESPONSE_COMPLETE;
		}
	}
	/* never happen, but keep compiler quiet */
	return RESPONSE_EOF;
}
 +
 +/*
 + * Has the data node sent Ready For Query
 + */
 +
 +bool
 +is_data_node_ready(PGXCNodeHandle * conn)
 +{
 +      char            *msg;
 +      int             msg_len;
 +      char            msg_type;
 +
 +      for (;;)
 +      {
 +              /*
 +               * If we are in the process of shutting down, we
 +               * may be rolling back, and the buffer may contain other messages.
 +               * We want to avoid a procarray exception
 +               * as well as an error stack overflow.
 +               */
 +              if (proc_exit_inprogress)
 +                      conn->state = DN_CONNECTION_STATE_ERROR_FATAL;
 +
 +              /* don't read from from the connection if there is a fatal error */
 +              if (conn->state == DN_CONNECTION_STATE_ERROR_FATAL)
 +                      return true;
 +
 +              /* No data available, exit */
 +              if (!HAS_MESSAGE_BUFFERED(conn))
 +                      return false;
 +
 +              msg_type = get_message(conn, &msg_len, &msg);
 +              if (msg_type == 'Z')
 +              {
 +                      /*
 +                       * Return result depends on previous connection state.
 +                       * If it was PORTAL_SUSPENDED Coordinator want to send down
 +                       * another EXECUTE to fetch more rows, otherwise it is done
 +                       * with the connection
 +                       */
 +                      conn->transaction_status = msg[0];
 +                      conn->state = DN_CONNECTION_STATE_IDLE;
 +                      conn->combiner = NULL;
 +                      return true;
 +              }
 +      }
 +      /* never happen, but keep compiler quiet */
 +      return false;
 +}
 +
 +
 +/*
 + * Send BEGIN command to the Datanodes or Coordinators and receive responses.
 + * Also send the GXID for the transaction.
 + */
 +static int
 +pgxc_node_begin(int conn_count, PGXCNodeHandle **connections,
 +                              GlobalTransactionId gxid, bool need_tran_block,
 +                              bool readOnly, char node_type)
 +{
 +      int                     i;
 +      struct timeval *timeout = NULL;
 +      ResponseCombiner combiner;
 +      TimestampTz timestamp = GetCurrentGTMStartTimestamp();
 +      PGXCNodeHandle *new_connections[conn_count];
 +      int new_count = 0;
 +      char               *init_str;
 +      char                    lxid[13];
 +
 +      /*
 +       * If no remote connections, we don't have anything to do
 +       */
 +      if (conn_count == 0)
 +              return 0;
 +
 +      for (i = 0; i < conn_count; i++)
 +      {
 +              if (!readOnly && !IsConnFromDatanode())
 +                      connections[i]->read_only = false;
 +              /*
 +               * PGXC TODO - A connection should not be in DN_CONNECTION_STATE_QUERY
 +               * state when we are about to send a BEGIN TRANSACTION command to the
 +               * node. We should consider changing the following to an assert and fix
 +               * any bugs reported
 +               */
 +              if (connections[i]->state == DN_CONNECTION_STATE_QUERY)
 +                      BufferConnection(connections[i]);
 +
 +              /* Send GXID and check for errors */
 +              if (GlobalTransactionIdIsValid(gxid) && pgxc_node_send_gxid(connections[i], gxid))
 +                      return EOF;
 +
 +              /* Send timestamp and check for errors */
 +              if (GlobalTimestampIsValid(timestamp) && pgxc_node_send_timestamp(connections[i], timestamp))
 +                      return EOF;
 +
 +              if (IS_PGXC_DATANODE && GlobalTransactionIdIsValid(gxid))
 +                      need_tran_block = true;
 +              else if (IS_PGXC_REMOTE_COORDINATOR)
 +                      need_tran_block = false;
 +              /* Send BEGIN if not already in transaction */
 +              if (need_tran_block && connections[i]->transaction_status == 'I')
 +              {
 +                      /* Send the BEGIN TRANSACTION command and check for errors */
 +                      if (pgxc_node_send_query(connections[i], "BEGIN"))
 +                              return EOF;
 +
 +                      new_connections[new_count++] = connections[i];
 +              }
 +      }
 +
 +      /*
 +       * If we did not send a BEGIN command to any node, we are done. Otherwise,
 +       * we need to check for any errors and report them
 +       */
 +      if (new_count == 0)
 +              return 0;
 +
 +      InitResponseCombiner(&combiner, new_count, COMBINE_TYPE_NONE);
 +      /*
 +       * Make sure there are zeroes in unused fields
 +       */
 +      memset(&combiner, 0, sizeof(ScanState));
 +
 +      /* Receive responses */
 +      if (pgxc_node_receive_responses(new_count, new_connections, timeout, &combiner))
 +              return EOF;
 +
 +      /* Verify status */
 +      if (!ValidateAndCloseCombiner(&combiner))
 +              return EOF;
 +
 +      /* Send virtualXID to the remote nodes using SET command */
 +      sprintf(lxid, "%d", MyProc->lxid);
 +      PGXCNodeSetParam(true, "coordinator_lxid", lxid);
 +
 +      /* after transactions are started send down local set commands */
 +      init_str = PGXCNodeGetTransactionParamStr();
 +      if (init_str)
 +      {
 +              for (i = 0; i < new_count; i++)
 +              {
 +                      pgxc_node_set_query(new_connections[i], init_str);
 +              }
 +      }
 +
 +      /* No problem, let's get going */
 +      return 0;
 +}
 +
 +
 +/*
 + * Execute DISCARD ALL command on all allocated nodes to remove all session
 + * specific stuff before releasing them to pool for reuse by other sessions.
 + */
 +static void
 +pgxc_node_remote_cleanup_all(void)
 +{
 +      PGXCNodeAllHandles *handles = get_current_handles();
 +      PGXCNodeHandle *new_connections[handles->co_conn_count + handles->dn_conn_count];
 +      int                             new_conn_count = 0;
 +      int                             i;
 +      char               *resetcmd = "RESET ALL;RESET SESSION AUTHORIZATION;"
 +                                                         "RESET transaction_isolation;";
 +
 +      /*
 +       * We must handle reader and writer connections both since even a read-only
 +       * needs to be cleaned up.
 +       */
 +      if (handles->co_conn_count + handles->dn_conn_count == 0)
 +              return;
 +
 +      /*
 +       * Send down snapshot followed by DISCARD ALL command.
 +       */
 +      for (i = 0; i < handles->co_conn_count; i++)
 +      {
 +              PGXCNodeHandle *handle = handles->coord_handles[i];
 +
 +              /* At this point connection should be in IDLE state */
 +              if (handle->state != DN_CONNECTION_STATE_IDLE)
 +              {
 +                      handle->state = DN_CONNECTION_STATE_ERROR_FATAL;
 +                      continue;
 +              }
 +
 +              /*
 +               * We must go ahead and release connections anyway, so do not throw
 +               * an error if we have a problem here.
 +               */
 +              if (pgxc_node_send_query(handle, resetcmd))
 +              {
 +                      ereport(WARNING,
 +                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                       errmsg("Failed to clean up data nodes")));
 +                      handle->state = DN_CONNECTION_STATE_ERROR_FATAL;
 +                      continue;
 +              }
 +              new_connections[new_conn_count++] = handle;
 +              handle->combiner = NULL;
 +      }
 +      for (i = 0; i < handles->dn_conn_count; i++)
 +      {
 +              PGXCNodeHandle *handle = handles->datanode_handles[i];
 +
 +              /* At this point connection should be in IDLE state */
 +              if (handle->state != DN_CONNECTION_STATE_IDLE)
 +              {
 +                      handle->state = DN_CONNECTION_STATE_ERROR_FATAL;
 +                      continue;
 +              }
 +
 +              /*
 +               * We must go ahead and release connections anyway, so do not throw
 +               * an error if we have a problem here.
 +               */
 +              if (pgxc_node_send_query(handle, resetcmd))
 +              {
 +                      ereport(WARNING,
 +                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                       errmsg("Failed to clean up data nodes")));
 +                      handle->state = DN_CONNECTION_STATE_ERROR_FATAL;
 +                      continue;
 +              }
 +              new_connections[new_conn_count++] = handle;
 +              handle->combiner = NULL;
 +      }
 +
 +      if (new_conn_count)
 +      {
 +              ResponseCombiner combiner;
 +              InitResponseCombiner(&combiner, new_conn_count, COMBINE_TYPE_NONE);
 +              /* Receive responses */
 +              pgxc_node_receive_responses(new_conn_count, new_connections, NULL, &combiner);
 +              CloseCombiner(&combiner);
 +      }
 +      pfree_pgxc_all_handles(handles);
 +}
 +
 +
 +/*
 + * Prepare nodes which ran write operations during the transaction.
 + * Read only remote transactions are committed and connections are released
 + * back to the pool.
 + * Function returns the list of nodes where transaction is prepared, including
 + * local node, if requested, in format expected by the GTM server.
 + * If something went wrong the function tries to abort prepared transactions on
 + * the nodes where it succeeded and throws error. A warning is emitted if abort
 + * prepared fails.
 + * After completion remote connection handles are released.
 + */
 +static char *
 +pgxc_node_remote_prepare(char *prepareGID, bool localNode)
 +{
 +      bool                    isOK = true;
 +      StringInfoData  nodestr;
 +      char                    prepare_cmd[256];
 +      char                    abort_cmd[256];
 +      GlobalTransactionId auxXid;
 +      char               *commit_cmd = "COMMIT TRANSACTION";
 +      int                             i;
 +      ResponseCombiner combiner;
 +      PGXCNodeHandle *connections[MaxDataNodes + MaxCoords];
 +      int                             conn_count = 0;
 +      PGXCNodeAllHandles *handles = get_current_handles();
 +
 +      initStringInfo(&nodestr);
 +      if (localNode)
 +              appendStringInfoString(&nodestr, PGXCNodeName);
 +
 +      sprintf(prepare_cmd, "PREPARE TRANSACTION '%s'", prepareGID);
 +
 +      for (i = 0; i < handles->dn_conn_count; i++)
 +      {
 +              PGXCNodeHandle *conn = handles->datanode_handles[i];
 +
 +              /*
 +               * If something went wrong already we have nothing to do here. The error
 +               * will be reported at the end of the function, and we will rollback
 +               * remotes as part of the error handling.
 +               * Just skip to clean up section and check if we have already prepared
 +               * somewhere, we should abort that prepared transaction.
 +               */
 +              if (!isOK)
 +                      goto prepare_err;
 +
 +              /*
 +               * Skip empty slots
 +               */
 +              if (conn->sock == NO_SOCKET)
 +                      continue;
 +              else if (conn->transaction_status == 'T')
 +              {
 +                      /* Read in any pending input */
 +                      if (conn->state != DN_CONNECTION_STATE_IDLE)
 +                              BufferConnection(conn);
 +
 +                      if (conn->read_only)
 +                      {
 +                              /* Send down prepare command */
 +                              if (pgxc_node_send_query(conn, commit_cmd))
 +                              {
 +                                      /*
 +                                       * not a big deal, it was read only, the connection will be
 +                                       * abandoned later.
 +                                       */
 +                                      ereport(LOG,
 +                                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                                       errmsg("failed to send COMMIT command to "
 +                                                              "the node %u", conn->nodeoid)));
 +                              }
 +                              else
 +                              {
 +                                      /* Read responses from these */
 +                                      connections[conn_count++] = conn;
 +                              }
 +                      }
 +                      else
 +                      {
 +                              /* Send down prepare command */
 +                              if (pgxc_node_send_query(conn, prepare_cmd))
 +                              {
 +                                      /*
 +                                       * That is the trouble, we really want to prepare it.
 +                                       * Just emit warning so far and go to clean up.
 +                                       */
 +                                      isOK = false;
 +                                      ereport(WARNING,
 +                                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                                       errmsg("failed to send PREPARE TRANSACTION command to "
 +                                                              "the node %u", conn->nodeoid)));
 +                              }
 +                              else
 +                              {
 +                                      char *nodename = get_pgxc_nodename(conn->nodeoid);
 +                                      if (nodestr.len > 0)
 +                                              appendStringInfoChar(&nodestr, ',');
 +                                      appendStringInfoString(&nodestr, nodename);
 +                                      /* Read responses from these */
 +                                      connections[conn_count++] = conn;
 +                                      /*
 +                                       * If it fails on remote node it would just return ROLLBACK.
 +                                       * Set the flag for the message handler so the response is
 +                                       * verified.
 +                                       */
 +                                      conn->ck_resp_rollback = true;
 +                              }
 +                      }
 +              }
 +              else if (conn->transaction_status == 'E')
 +              {
 +                      /*
 +                       * Probably can not happen, if there was a error the engine would
 +                       * abort anyway, even in case of explicit PREPARE.
 +                       * Anyway, just in case...
 +                       */
 +                      isOK = false;
 +                      ereport(WARNING,
 +                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                       errmsg("remote node %u is in error state", conn->nodeoid)));
 +              }
 +      }
 +
 +      for (i = 0; i < handles->co_conn_count; i++)
 +      {
 +              PGXCNodeHandle *conn = handles->coord_handles[i];
 +
 +              /*
 +               * If something went wrong already we have nothing to do here. The error
 +               * will be reported at the end of the function, and we will rollback
 +               * remotes as part of the error handling.
 +               * Just skip to clean up section and check if we have already prepared
 +               * somewhere, we should abort that prepared transaction.
 +               */
 +              if (!isOK)
 +                      goto prepare_err;
 +
 +              /*
 +               * Skip empty slots
 +               */
 +              if (conn->sock == NO_SOCKET)
 +                      continue;
 +              else if (conn->transaction_status == 'T')
 +              {
 +                      if (conn->read_only)
 +                      {
 +                              /* Send down prepare command */
 +                              if (pgxc_node_send_query(conn, commit_cmd))
 +                              {
 +                                      /*
 +                                       * not a big deal, it was read only, the connection will be
 +                                       * abandoned later.
 +                                       */
 +                                      ereport(LOG,
 +                                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                                       errmsg("failed to send COMMIT command to "
 +                                                              "the node %u", conn->nodeoid)));
 +                              }
 +                              else
 +                              {
 +                                      /* Read responses from these */
 +                                      connections[conn_count++] = conn;
 +                              }
 +                      }
 +                      else
 +                      {
 +                              /* Send down prepare command */
 +                              if (pgxc_node_send_query(conn, prepare_cmd))
 +                              {
 +                                      /*
 +                                       * That is the trouble, we really want to prepare it.
 +                                       * Just emit warning so far and go to clean up.
 +                                       */
 +                                      isOK = false;
 +                                      ereport(WARNING,
 +                                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                                       errmsg("failed to send PREPARE TRANSACTION command to "
 +                                                              "the node %u", conn->nodeoid)));
 +                              }
 +                              else
 +                              {
 +                                      char *nodename = get_pgxc_nodename(conn->nodeoid);
 +                                      if (nodestr.len > 0)
 +                                              appendStringInfoChar(&nodestr, ',');
 +                                      appendStringInfoString(&nodestr, nodename);
 +                                      /* Read responses from these */
 +                                      connections[conn_count++] = conn;
 +                                      /*
 +                                       * If it fails on remote node it would just return ROLLBACK.
 +                                       * Set the flag for the message handler so the response is
 +                                       * verified.
 +                                       */
 +                                      conn->ck_resp_rollback = true;
 +                              }
 +                      }
 +              }
 +              else if (conn->transaction_status == 'E')
 +              {
 +                      /*
 +                       * Probably can not happen, if there was a error the engine would
 +                       * abort anyway, even in case of explicit PREPARE.
 +                       * Anyway, just in case...
 +                       */
 +                      isOK = false;
 +                      ereport(WARNING,
 +                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                       errmsg("remote node %u is in error state", conn->nodeoid)));
 +              }
 +      }
 +
 +      SetSendCommandId(false);
 +
 +      if (!isOK)
 +              goto prepare_err;
 +
 +      /* exit if nothing has been prepared */
 +      if (conn_count > 0)
 +      {
 +              int result;
 +              /*
 +               * Receive and check for any errors. In case of errors, we don't bail out
 +               * just yet. We first go through the list of connections and look for
 +               * errors on each connection. This is important to ensure that we run
 +               * an appropriate ROLLBACK command later on (prepared transactions must be
 +               * rolled back with ROLLBACK PREPARED commands).
 +               *
 +               * PGXCTODO - There doesn't seem to be a solid mechanism to track errors on
 +               * individual connections. The transaction_status field doesn't get set
 +               * every time there is an error on the connection. The combiner mechanism is
 +               * good for parallel proessing, but I think we should have a leak-proof
 +               * mechanism to track connection status
 +               */
 +              InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE);
 +              /* Receive responses */
 +              result = pgxc_node_receive_responses(conn_count, connections, NULL, &combiner);
 +              if (result || !validate_combiner(&combiner))
 +                      goto prepare_err;
 +              else
 +                      CloseCombiner(&combiner);
 +
 +              /* Before exit clean the flag, to avoid unnecessary checks */
 +              for (i = 0; i < conn_count; i++)
 +                      connections[i]->ck_resp_rollback = false;
 +
 +              pfree_pgxc_all_handles(handles);
 +              if (!temp_object_included && !PersistentConnections)
 +              {
 +                      /* Clean up remote sessions */
 +                      pgxc_node_remote_cleanup_all();
 +                      release_handles();
 +              }
 +      }
 +
 +      return nodestr.data;
 +prepare_err:
 +      sprintf(abort_cmd, "ROLLBACK PREPARED '%s'", prepareGID);
 +
 +      auxXid = GetAuxilliaryTransactionId();
 +      conn_count = 0;
 +      for (i = 0; i < handles->dn_conn_count; i++)
 +      {
 +              PGXCNodeHandle *conn = handles->datanode_handles[i];
 +
 +              /*
 +               * PREPARE succeeded on that node, roll it back there
 +               */
 +              if (conn->ck_resp_rollback)
 +              {
 +                      conn->ck_resp_rollback = false;
 +                      /* sanity checks */
 +                      Assert(conn->sock != NO_SOCKET);
 +                      Assert(conn->state == DN_CONNECTION_STATE_IDLE);
 +                      /* Send down abort prepared command */
 +                      if (pgxc_node_send_gxid(conn, auxXid))
 +                      {
 +                              /*
 +                               * Prepared transaction is left on the node, but we can not
 +                               * do anything with that except warn the user.
 +                               */
 +                              ereport(WARNING,
 +                                              (errcode(ERRCODE_INTERNAL_ERROR),
 +                                               errmsg("failed to send xid to "
 +                                                              "the node %u", conn->nodeoid)));
 +                      }
 +                      if (pgxc_node_send_query(conn, abort_cmd))
 +                      {
 +                              /*
 +                               * Prepared transaction is left on the node, but we can not
 +                               * do anything with that except warn the user.
 +                               */
 +                              ereport(WARNING,
 +                                              (errcode(ERRCODE_INTERNAL_ERROR),
 +                                               errmsg("failed to send ABORT PREPARED command to "
 +                                                              "the node %u", conn->nodeoid)));
 +                      }
 +                      else
 +                      {
 +                              /* Read responses from these */
 +                              connections[conn_count++] = conn;
 +                      }
 +              }
 +      }
 +      for (i = 0; i < handles->co_conn_count; i++)
 +      {
 +              PGXCNodeHandle *conn = handles->coord_handles[i];
 +
 +              if (conn->ck_resp_rollback)
 +              {
 +                      conn->ck_resp_rollback = false;
 +                      /* sanity checks */
 +                      Assert(conn->sock != NO_SOCKET);
 +                      Assert(conn->state = DN_CONNECTION_STATE_IDLE);
 +                      /* Send down abort prepared command */
 +                      if (pgxc_node_send_gxid(conn, auxXid))
 +                      {
 +                              /*
 +                               * Prepared transaction is left on the node, but we can not
 +                               * do anything with that except warn the user.
 +                               */
 +                              ereport(WARNING,
 +                                              (errcode(ERRCODE_INTERNAL_ERROR),
 +                                               errmsg("failed to send xid to "
 +                                                              "the node %u", conn->nodeoid)));
 +                      }
 +                      if (pgxc_node_send_query(conn, abort_cmd))
 +                      {
 +                              /*
 +                               * Prepared transaction is left on the node, but we can not
 +                               * do anything with that except warn the user.
 +                               */
 +                              ereport(WARNING,
 +                                              (errcode(ERRCODE_INTERNAL_ERROR),
 +                                               errmsg("failed to send ABORT PREPARED command to "
 +                                                              "the node %u", conn->nodeoid)));
 +                      }
 +                      else
 +                      {
 +                              /* Read responses from these */
 +                              connections[conn_count++] = conn;
 +                      }
 +              }
 +      }
 +      if (conn_count > 0)
 +      {
 +              /* Just read out responses, throw error from the first combiner */
 +              ResponseCombiner combiner2;
 +              InitResponseCombiner(&combiner2, conn_count, COMBINE_TYPE_NONE);
 +              /* Receive responses */
 +              pgxc_node_receive_responses(conn_count, connections, NULL, &combiner2);
 +              CloseCombiner(&combiner2);
 +      }
 +
 +      if (!temp_object_included && !PersistentConnections)
 +      {
 +              /* Clean up remote sessions */
 +              pgxc_node_remote_cleanup_all();
 +              release_handles();
 +      }
 +
 +      pfree_pgxc_all_handles(handles);
 +
 +      /*
 +       * If the flag is set we are here because combiner carries error message
 +       */
 +      if (isOK)
 +              pgxc_node_report_error(&combiner);
 +      else
 +              elog(ERROR, "failed to PREPARE transaction on one or more nodes");
 +      return NULL;
 +}
 +
 +
/*
 * Commit transactions on remote nodes.
 * If barrier lock is set wait while it is released.
 * Release remote connection after completion.
 *
 * Sends "COMMIT TRANSACTION" to every datanode and coordinator connection
 * that is currently inside a transaction, then collects and validates all
 * responses through a ResponseCombiner.  On any failure this raises ERROR,
 * relying on the transaction abort machinery to clean up; on success the
 * remote sessions are optionally cleaned up and the handles released.
 */
static void
pgxc_node_remote_commit(void)
{
	int				result = 0;
	char		   *commitCmd = "COMMIT TRANSACTION";
	int				i;
	ResponseCombiner combiner;
	/* Room for one pending connection per datanode and per coordinator */
	PGXCNodeHandle *connections[MaxDataNodes + MaxCoords];
	int				conn_count = 0;
	PGXCNodeAllHandles *handles = get_current_handles();

	/* No command id needs to accompany the commit messages */
	SetSendCommandId(false);

	/*
	 * Barrier:
	 *
	 * We should acquire the BarrierLock in SHARE mode here to ensure that
	 * there are no in-progress barrier at this point. This mechanism would
	 * work as long as LWLock mechanism does not starve a EXCLUSIVE lock
	 * requester
	 */
	LWLockAcquire(BarrierLock, LW_SHARED);

	/* First pass: send COMMIT to all datanode connections */
	for (i = 0; i < handles->dn_conn_count; i++)
	{
		PGXCNodeHandle *conn = handles->datanode_handles[i];

		/* Skip empty slots */
		if (conn->sock == NO_SOCKET)
			continue;

		/*
		 * We do not need to commit remote node if it is not in transaction.
		 * If transaction is in error state the commit command will cause
		 * rollback, that is OK
		 */
		if (conn->transaction_status != 'I')
		{
			/* Read in any pending input */
			if (conn->state != DN_CONNECTION_STATE_IDLE)
				BufferConnection(conn);

			if (pgxc_node_send_query(conn, commitCmd))
			{
				/*
				 * Do not bother with clean up, just bomb out. The error handler
				 * will invoke RollbackTransaction which will do the work.
				 *
				 * Note: ERROR here is raised while BarrierLock is held; the
				 * lock is released by the normal abort processing.
				 */
				ereport(ERROR,
						(errcode(ERRCODE_INTERNAL_ERROR),
						 errmsg("failed to send COMMIT command to the node %u",
								conn->nodeoid)));
			}
			else
			{
				/* Read responses from these */
				connections[conn_count++] = conn;
			}
		}
	}

	/* Second pass: send COMMIT to all coordinator connections */
	for (i = 0; i < handles->co_conn_count; i++)
	{
		PGXCNodeHandle *conn = handles->coord_handles[i];

		/* Skip empty slots */
		if (conn->sock == NO_SOCKET)
			continue;

		/*
		 * We do not need to commit remote node if it is not in transaction.
		 * If transaction is in error state the commit command will cause
		 * rollback, that is OK
		 */
		if (conn->transaction_status != 'I')
		{
			/*
			 * NOTE(review): unlike the datanode loop above, no
			 * BufferConnection() is done here for non-idle connections —
			 * presumably coordinator connections cannot be mid-fetch at
			 * commit time; confirm this is intentional.
			 */
			if (pgxc_node_send_query(conn, commitCmd))
			{
				/*
				 * Do not bother with clean up, just bomb out. The error handler
				 * will invoke RollbackTransaction which will do the work.
				 */
				ereport(ERROR,
						(errcode(ERRCODE_INTERNAL_ERROR),
						 errmsg("failed to send COMMIT command to the node %u",
								conn->nodeoid)));
			}
			else
			{
				/* Read responses from these */
				connections[conn_count++] = conn;
			}
		}
	}

	/*
	 * Release the BarrierLock.
	 */
	LWLockRelease(BarrierLock);

	if (conn_count)
	{
		InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE);
		/* Receive responses */
		result = pgxc_node_receive_responses(conn_count, connections, NULL, &combiner);
		if (result || !validate_combiner(&combiner))
			result = EOF;	/* keep combiner open so its error can be reported */
		else
			CloseCombiner(&combiner);
	}

	stat_transaction(conn_count);

	/*
	 * result can only be nonzero when conn_count > 0, so combiner is
	 * guaranteed to have been initialized before it is read here.
	 */
	if (result)
	{
		if (combiner.errorMessage)
			pgxc_node_report_error(&combiner);
		else
			ereport(ERROR,
					(errcode(ERRCODE_INTERNAL_ERROR),
					 errmsg("Failed to COMMIT the transaction on one or more nodes")));
	}

	if (!temp_object_included && !PersistentConnections)
	{
		/* Clean up remote sessions */
		pgxc_node_remote_cleanup_all();
		release_handles();
	}

	pfree_pgxc_all_handles(handles);
}
 +
 +
 +/*
 + * Rollback transactions on remote nodes.
 + * Release remote connection after completion.
 + */
 +static void
 +pgxc_node_remote_abort(void)
 +{
 +      int                             result = 0;
 +      char               *rollbackCmd = "ROLLBACK TRANSACTION";
 +      int                             i;
 +      ResponseCombiner combiner;
 +      PGXCNodeHandle *connections[MaxDataNodes + MaxCoords];
 +      int                             conn_count = 0;
 +      PGXCNodeAllHandles *handles = get_current_handles();
 +
 +      SetSendCommandId(false);
 +
 +      for (i = 0; i < handles->dn_conn_count; i++)
 +      {
 +              PGXCNodeHandle *conn = handles->datanode_handles[i];
 +
 +              /* Skip empty slots */
 +              if (conn->sock == NO_SOCKET)
 +                      continue;
 +
 +              if (conn->transaction_status != 'I')
 +              {
 +                      /* Read in any pending input */
 +                      if (conn->state != DN_CONNECTION_STATE_IDLE)
 +                              BufferConnection(conn);
 +
 +                      /*
 +                       * Do not matter, is there committed or failed transaction,
 +                       * just send down rollback to finish it.
 +                       */
 +                      if (pgxc_node_send_query(conn, rollbackCmd))
 +                      {
 +                              add_error_message(conn,
 +                                              "failed to send ROLLBACK TRANSACTION command");
 +                      }
 +                      else
 +                      {
 +                              /* Read responses from these */
 +                              connections[conn_count++] = conn;
 +                      }
 +              }
 +      }
 +
 +      for (i = 0; i < handles->co_conn_count; i++)
 +      {
 +              PGXCNodeHandle *conn = handles->coord_handles[i];
 +
 +              /* Skip empty slots */
 +              if (conn->sock == NO_SOCKET)
 +                      continue;
 +
 +              if (conn->transaction_status != 'I')
 +              {
 +                      /*
 +                       * Do not matter, is there committed or failed transaction,
 +                       * just send down rollback to finish it.
 +                       */
 +                      if (pgxc_node_send_query(conn, rollbackCmd))
 +                      {
 +                              add_error_message(conn,
 +                                              "failed to send ROLLBACK TRANSACTION command");
 +                      }
 +                      else
 +                      {
 +                              /* Read responses from these */
 +                              connections[conn_count++] = conn;
 +                      }
 +              }
 +      }
 +
 +      if (conn_count)
 +      {
 +              InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE);
 +              /* Receive responses */
 +              result = pgxc_node_receive_responses(conn_count, connections, NULL, &combiner);
 +              if (result || !validate_combiner(&combiner))
 +                      result = EOF;
 +              else
 +                      CloseCombiner(&combiner);
 +      }
 +
 +      stat_transaction(conn_count);
 +
 +      if (result)
 +      {
 +              if (combiner.errorMessage)
 +                      pgxc_node_report_error(&combiner);
 +              else
 +                      ereport(LOG,
 +                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                       errmsg("Failed to ROLLBACK the transaction on one or more nodes")));
 +      }
 +
 +      pfree_pgxc_all_handles(handles);
 +}
 +
/*
 * Begin COPY command
 * The copy_connections array must have room for NumDataNodes items
 *
 * Acquires the datanode connections required by rcstate (a single node for
 * COPY TO on a replicated table, the relation's full node list otherwise),
 * starts a remote transaction on them, creates the locator used to route
 * rows, sends the COPY query down each connection, and consumes the initial
 * responses.  On failure the error is recorded on the offending handle via
 * add_error_message() and rcstate->locator is reset to NULL — this function
 * returns void, so callers presumably detect failure through the handle
 * state / NULL locator (TODO confirm against callers).
 */
void
DataNodeCopyBegin(RemoteCopyData *rcstate)
{
	int i;
	List *nodelist = rcstate->rel_loc->nodeList;
	PGXCNodeHandle **connections;
	bool need_tran_block;
	GlobalTransactionId gxid;
	ResponseCombiner combiner;
	Snapshot snapshot = GetActiveSnapshot();
	int conn_count = list_length(nodelist);

	/* Get needed datanode connections */
	if (!rcstate->is_from && IsLocatorReplicated(rcstate->rel_loc->locatorType))
	{
		/* Connections is a single handle to read from */
		connections = (PGXCNodeHandle **) palloc(sizeof(PGXCNodeHandle *));
		connections[0] = get_any_handle(nodelist);
		conn_count = 1;
	}
	else
	{
		PGXCNodeAllHandles *pgxc_handles;
		pgxc_handles = get_handles(nodelist, NULL, false, true);
		connections = pgxc_handles->datanode_handles;
		Assert(pgxc_handles->dn_conn_count == conn_count);
		/* Keep the handle array, free only the wrapper struct */
		pfree(pgxc_handles);
	}

	/*
	 * If more than one nodes are involved or if we are already in a
	 * transaction block, we must run the remote statements in a transaction
	 * block
	 */
	need_tran_block = (conn_count > 1) || (TransactionBlockStatusCode() == 'T');

	elog(DEBUG1, "conn_count = %d, need_tran_block = %s", conn_count,
			need_tran_block ? "true" : "false");

	/* Gather statistics */
	stat_statement();
	stat_transaction(conn_count);

	gxid = GetCurrentTransactionId();

	/* Start transaction on connections where it is not started */
	if (pgxc_node_begin(conn_count, connections, gxid, need_tran_block, false, PGXC_NODE_DATANODE))
	{
		ereport(ERROR,
				(errcode(ERRCODE_INTERNAL_ERROR),
				 errmsg("Could not begin transaction on data nodes.")));
	}

	/*
	 * COPY TO do not use locator, it just takes connections from it, and
	 * we do not look up distribution data type in this case.
	 * So always use LOCATOR_TYPE_RROBIN to avoid errors because of not
	 * defined partType if real locator type is HASH or MODULO.
	 * Create locator before sending down query, because createLocator may
	 * fail and we leave with dirty connections.
	 * If we get an error now datanode connection will be clean and error
	 * handler will issue transaction abort.
	 */
	rcstate->locator = createLocator(
			rcstate->is_from ? rcstate->rel_loc->locatorType
							: LOCATOR_TYPE_RROBIN,
			rcstate->is_from ? RELATION_ACCESS_INSERT : RELATION_ACCESS_READ,
			rcstate->dist_type,
			LOCATOR_LIST_POINTER,
			conn_count,
			(void *) connections,
			NULL,
			false);

	/* Send query to nodes */
	for (i = 0; i < conn_count; i++)
	{
		CHECK_OWNERSHIP(connections[i], NULL);

		/* Ship the snapshot first, then the COPY statement itself */
		if (snapshot && pgxc_node_send_snapshot(connections[i], snapshot))
		{
			add_error_message(connections[i], "Can not send request");
			pfree(connections);
			freeLocator(rcstate->locator);
			rcstate->locator = NULL;
			return;
		}
		if (pgxc_node_send_query(connections[i], rcstate->query_buf.data) != 0)
		{
			add_error_message(connections[i], "Can not send request");
			pfree(connections);
			freeLocator(rcstate->locator);
			rcstate->locator = NULL;
			return;
		}
	}

	/*
	 * We are expecting CopyIn response, but do not want to send it to client,
	 * caller should take care about this, because here we do not know if
	 * client runs console or file copy
	 */
	InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE);
	/*
	 * Make sure there are zeroes in unused fields
	 *
	 * NOTE(review): this memset runs AFTER InitResponseCombiner and zeroes
	 * only the embedded ScanState header (sizeof(ScanState) bytes), not the
	 * whole combiner.  The same pattern appears in DataNodeCopyIn, so it is
	 * presumably intentional (InitResponseCombiner apparently does not touch
	 * the ScanState portion) — confirm against InitResponseCombiner.
	 */
	memset(&combiner, 0, sizeof(ScanState));

	/* Receive responses */
	if (pgxc_node_receive_responses(conn_count, connections, NULL, &combiner)
			|| !ValidateAndCloseCombiner(&combiner))
	{
		/* Terminate the COPY on all nodes and report failure via locator */
		DataNodeCopyFinish(conn_count, connections);
		freeLocator(rcstate->locator);
		rcstate->locator = NULL;
		return;
	}
	pfree(connections);
}
 +
 +
 +/*
 + * Send a data row to the specified nodes
 + */
 +int
 +DataNodeCopyIn(char *data_row, int len, int conn_count, PGXCNodeHandle** copy_connections)
 +{
 +      /* size + data row + \n */
 +      int msgLen = 4 + len + 1;
 +      int nLen = htonl(msgLen);
 +      int i;
 +
 +      for(i = 0; i < conn_count; i++)
 +      {
 +              PGXCNodeHandle *handle = copy_connections[i];
 +              if (handle->state == DN_CONNECTION_STATE_COPY_IN)
 +              {
 +                      /* precalculate to speed up access */
 +                      int bytes_needed = handle->outEnd + 1 + msgLen;
 +
 +                      /* flush buffer if it is almost full */
 +                      if (bytes_needed > COPY_BUFFER_SIZE)
 +                      {
 +                              int to_send = handle->outEnd;
 +
 +                              /* First look if data node has sent a error message */
 +                              int read_status = pgxc_node_read_data(handle, true);
 +                              if (read_status == EOF || read_status < 0)
 +                              {
 +                                      add_error_message(handle, "failed to read data from data node");
 +                                      return EOF;
 +                              }
 +
 +                              if (handle->inStart < handle->inEnd)
 +                              {
 +                                      ResponseCombiner combiner;
 +                                      InitResponseCombiner(&combiner, 1, COMBINE_TYPE_NONE);
 +                                      /*
 +                                       * Make sure there are zeroes in unused fields
 +                                       */
 +                                      memset(&combiner, 0, sizeof(ScanState));
 +                                      handle_response(handle, &combiner);
 +                                      if (!ValidateAndCloseCombiner(&combiner))
 +                                              return EOF;
 +                              }
 +
 +                              if (DN_CONNECTION_STATE_ERROR(handle))
 +                                      return EOF;
 +
 +                              /*
 +                               * Try to send down buffered data if we have
 +                               */
 +                              if (to_send && send_some(handle, to_send) < 0)
 +                              {
 +                                      add_error_message(handle, "failed to send data to data node");
 +                                      return EOF;
 +                              }
 +                      }
 +
 +                      if (ensure_out_buffer_capacity(bytes_needed, handle) != 0)
 +                      {
 +                              ereport(ERROR,